-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathspider.py
More file actions
107 lines (92 loc) · 3.46 KB
/
Copy pathspider.py
File metadata and controls
107 lines (92 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: UTF-8 -*-
import urllib
from bs4 import BeautifulSoup
from bloomFilter import BloomFilter
import Queue
import socket
socket.setdefaulttimeout(6)
from download import downloadHtml
# File-name suffixes that is_resourceFile() treats as resource files; run()
# does not put URLs ending in one of these on the crawl queue.
endWithArr = ['.pdf','.html','.doc','.docx','.png','.jpg','.gif','.txt','.xml','.ppt','.xls','.xlsx']
# Get the site-root prefix (scheme + host + '/') of the URL popped from the queue
def get_absoluteUrlHead(url):
    """Return the site-root prefix of *url*: scheme, host and one trailing '/'.

    Example: 'http://www.baidu.com/index/ss' -> 'http://www.baidu.com/'.

    A URL that has no path after the host (e.g. 'http://www.baidu.com') gets
    the '/' appended, so the result can always be joined with a relative link.

    :param url: URL string, normally of the form 'scheme://host[/path]'.
    :return: *url* cut after the first '/' that follows the host, or
        ``url + '/'`` when there is no such '/'.
    """
    scheme_sep = url.find('//')
    # The host starts right after '//'; without a '//' it starts at index 0.
    host_start = scheme_sep + 2 if scheme_sep != -1 else 0
    host_end = url.find('/', host_start)
    if host_end == -1:
        # No path component. Slicing by the "not found" index here used to
        # reduce the URL to just 'http://', losing the host.
        return url + '/'
    return url[:host_end + 1]
# Tell relative links from absolute ones and make relative ones absolute (site-root example -- input: http://www.baidu.com/index/ss, output: http://www.baidu.com/ )
def is_relativeURL(url, seed):
    """Turn a link found on page *seed* into an absolute URL, or reject it.

    :param url: raw ``href``/``src`` value taken from the page.
    :param seed: URL of the page the link was found on.
    :return: ``-1`` when the link is useless for crawling (a fragment, a bare
        '/', or a ``javascript``/``mailto:``/``tel:`` pseudo-link); the link
        with the scheme of *seed* prepended when it starts with '//'; the
        link unchanged when it contains '//' elsewhere; otherwise the link
        joined onto the site root of *seed* (not onto the directory of
        *seed*).
    """
    # Pseudo-links must be rejected before joining: once prefixed with the
    # site root they look like ordinary in-site pages to later filters.
    if url == '/' or url.startswith(('#', 'javascript', 'mailto:', 'tel:')):
        return -1
    if url.startswith('//'):
        # Protocol-relative link: it inherits the scheme of the page it is on.
        scheme_end = seed.find('//')
        scheme = seed[:scheme_end] if scheme_end > 0 else 'http:'
        return scheme + url
    if url.find('//') != -1:
        return url
    # The site root already ends with '/', so drop the link's own leading
    # slash(es) to avoid producing 'host//path'.
    return get_absoluteUrlHead(seed) + url.lstrip('/')
# Decide whether a URL should be crawled
def is_needURL(url, domain='computer.hdu.edu.cn'):
    """Return True when *url* belongs to the site being crawled.

    Keeps the crawler from wandering onto unrelated pages. Pass *domain* to
    crawl a different site instead of editing this function.

    :param url: URL string to test.
    :param domain: text that must occur somewhere in *url* (a plain substring
        test, not a host-name comparison).
    :return: ``True`` if *domain* occurs in *url* and *url* does not start
        with 'mailto'; ``False`` otherwise.
    """
    if url.startswith('mailto'):
        return False
    return domain in url
# Decide whether a URL points to a resource file
def is_resourceFile(url, suffixes=None):
    """Return True when *url* ends with one of the resource-file suffixes.

    The comparison ignores case, so 'A.PDF' matches '.pdf'.

    :param url: URL string to test.
    :param suffixes: iterable of suffixes such as '.pdf'; defaults to the
        module-level ``endWithArr`` list.
    :return: ``True`` if *url* ends with any suffix, ``False`` otherwise
        (including when *suffixes* is empty).
    """
    if suffixes is None:
        suffixes = endWithArr
    # str.endswith accepts a tuple, which replaces the manual loop.
    lowered = tuple(suffix.lower() for suffix in suffixes)
    return url.lower().endswith(lowered)
# Encoding conversion
def to_bytestring(s, enc='utf-8'):
    """Return *s* as a ``str``, encoding it with *enc* when it is not one.

    :param s: value to convert; anything falsy (``None``, '') yields ``None``.
    :param enc: encoding passed to ``s.encode`` for non-``str`` values.
    :return: *s* itself if it is a ``str``, ``s.encode(enc)`` otherwise, or
        ``None`` for a falsy *s*.
    """
    if not s:
        return None
    return s if isinstance(s, str) else s.encode(enc)
# Collect URLs breadth-first using a queue
def run(seed):
    """Crawl breadth-first starting at *seed* and download each new in-scope URL.

    Every page taken from the queue is fetched and parsed; the ``href``/``src``
    of each ``<a>`` and ``<img>`` tag is made absolute, filtered with
    ``is_needURL`` and de-duplicated with a Bloom filter. Each URL seen for the
    first time is printed, handed to ``downloadHtml`` and, unless
    ``is_resourceFile`` says it is a resource file, queued for crawling.

    :param seed: URL the crawl starts from.
    :return: None. Progress is printed to standard output.
    """
    # NOTE(review): the nesting of the statements after the resource-file
    # check was reconstructed (download and counters run for every newly seen
    # URL, queued or not) -- confirm against the intended behaviour.
    bf = BloomFilter(0.00001,1000000)   # Bloom filter of URLs already seen
    queue = Queue.Queue(maxsize = 0)    # FIFO crawl queue; maxsize 0 = unbounded
    urlCount = 0                        # number of distinct URLs discovered
    urlList = []                        # URLs dequeued plus URLs discovered
    queue.put(seed)
    while not queue.empty():
        currentURL = queue.get()
        # NOTE(review): queued URLs were already appended when discovered, so
        # they appear in urlList twice -- confirm whether that is intended.
        urlList.append(currentURL)
        print('currentURL %s' % to_bytestring(currentURL))
        try:
            # Open AND read inside the try: with the socket default timeout a
            # slow server can fail during the read as well as the connect.
            response = urllib.urlopen(currentURL)
            try:
                html = response.read()
            finally:
                response.close()
        except Exception:
            # Unreachable or timed-out page: skip it. KeyboardInterrupt and
            # SystemExit are deliberately not caught, so Ctrl-C still stops
            # the crawl.
            continue
        bs_obj = BeautifulSoup(html,'html.parser')
        for tag in bs_obj.findAll('a') + bs_obj.findAll('img'):
            # Prefer href; fall back to src (used by <img>).
            hrefStr = tag.attrs.get('href') or tag.attrs.get('src')
            if not hrefStr:
                continue
            hrefStr = is_relativeURL(hrefStr,currentURL)
            if hrefStr == -1:   # useless link
                continue
            if not is_needURL(hrefStr):   # outside the site being crawled
                continue
            # BloomFilter is project code: keep the original explicit
            # comparison with False rather than a truthiness test.
            if bf.is_element_exist(hrefStr) != False:
                continue
            bf.insert_element(hrefStr)
            print(to_bytestring(hrefStr))
            if not is_resourceFile(hrefStr):
                queue.put(hrefStr)   # only non-resource URLs are crawled further
            urlList.append(hrefStr)
            try:
                downloadHtml(hrefStr)
            except Exception:
                # Best effort: one failed download must not stop the crawl.
                pass
            urlCount = urlCount + 1
            print('所有--当前 %d %d' % (urlCount, len(urlList)))
# Main entry point
if __name__ == "__main__":
    # Script entry point: start the crawl from the hard-coded seed URL.
    start_url = 'http://computer.hdu.edu.cn'
    run(start_url)