forked from wjw12/python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
69 lines (68 loc) · 2.12 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
import re,urllib2,httplib
# Site root; the relative thread links scraped from a listing page are
# appended to it to form the thread URL.
rootURL = 'http://dou.lesile.net/'
# Listing-page URL without its page number; the number is appended as a
# decimal string when a page is requested.
startURL = 'http://dou.lesile.net/thread0806.php?fid=8&page='
class spider(object):
    """Keyword-driven image crawler for a forum board.

    Walks the numbered listing pages starting at ``startURL``, selects the
    threads whose title contains the keyword, and writes every ``.jpg``
    referenced by an ``<input type='image'>`` tag of those threads to the
    current working directory as ``0.jpg``, ``1.jpg``, ...

    The code targets Python 2 (``urllib2``/``httplib``, byte strings).
    ``print`` is always given one parenthesised argument so each statement
    produces the same output under the print statement.
    """

    # Seconds any single network operation may take before it is abandoned.
    TIMEOUT = 20

    # Image tags of a thread page.  ``[^']*`` keeps the match inside one
    # quoted attribute, so several tags on the same line are found
    # individually and no ``<br>``-to-newline rewrite is needed first.
    _IMG_RE = re.compile(r"<input type='image' src='(http[^']*\.jpg)'")

    def __init__(self, keyword):
        """Initialise the crawl state.

        keyword -- UTF-8 encoded byte string to look for in thread titles.
                   It is re-encoded to GB2312 because it is matched against
                   the undecoded bytes of the listing page (presumably the
                   encoding the site serves -- confirm against the site).
        """
        self.rootURL = rootURL
        self.startURL = startURL
        self.page = 1  # number of the next listing page to request
        self.keyword = keyword.decode('utf-8').encode('gb2312')
        self.img = None  # payload of the most recent image download

    def getHrefList(self, page):
        """Return the hrefs of all threads whose title contains the keyword.

        page -- raw listing-page HTML.

        The keyword is escaped so that it is matched literally; characters
        such as ``.`` or ``+`` in it are not treated as regex operators.
        """
        pattern = re.compile(
            r'<h3><a href="(.*html)" target="_blank" id="">.*'
            + re.escape(self.keyword)
            + r'.*</a></h3>'
        )
        return pattern.findall(page)

    def getPage(self):
        """Download the next listing page and advance the page counter.

        Returns the raw body.  When the server cuts the transfer short, the
        part received so far is returned instead.  Errors raised while
        opening the URL propagate to the caller and leave the page counter
        unchanged.
        """
        pageURL = self.startURL + str(self.page)
        print('Opening ' + pageURL)
        resp = urllib2.urlopen(urllib2.Request(pageURL), timeout=self.TIMEOUT)
        # Advance only after the request was accepted, so a failed open can
        # be retried with the same page number.
        self.page += 1
        try:
            return resp.read()
        except httplib.IncompleteRead as e:
            print('IncompleteRead ' + pageURL)
            return e.partial
        finally:
            resp.close()

    def _download(self, url):
        """Return the full body at *url*, always closing the connection."""
        resp = urllib2.urlopen(url, timeout=self.TIMEOUT)
        try:
            return resp.read()
        finally:
            resp.close()

    def _extractImageURLs(self, imgPage):
        """Return the .jpg URLs of the image inputs found in a thread page."""
        return self._IMG_RE.findall(imgPage)

    def _writeImage(self, index):
        """Write the current payload in ``self.img`` to ``<index>.jpg``."""
        filename = str(index) + '.jpg'
        with open(filename, 'wb') as out:
            out.write(self.img)
        print(filename + ' saved.')

    def fetchImageData(self, imgURL):
        """Download *imgURL* into ``self.img``; set it to None on failure.

        Download errors are reported and swallowed so one bad image does not
        stop the crawl.  Only ``Exception`` is caught, which lets
        KeyboardInterrupt and SystemExit through so the crawl can be
        interrupted.
        """
        print('Downloading from ' + imgURL)
        try:
            data = self._download(imgURL)
        except Exception:
            print('Error saving image ' + imgURL)
            data = None
        # An empty body is not a usable image; treat it as a failed download.
        self.img = data or None

    def saveImg(self, maxPages=None):
        """Crawl listing pages and save the images of every matching thread.

        maxPages -- maximum number of listing pages to process; ``None``
                    (the default) keeps crawling until a listing page can
                    no longer be opened, which raises out of this method.

        Returns the number of images written when ``maxPages`` is reached.
        Threads that cannot be fetched and images that cannot be downloaded
        are reported and skipped.
        """
        counter = 0    # images written so far; also the next file number
        pagesDone = 0  # listing pages processed so far
        while maxPages is None or pagesDone < maxPages:
            page = self.getPage()
            pagesDone += 1
            for href in self.getHrefList(page):
                imgPageURL = self.rootURL + href
                print('Opening image page ' + imgPageURL)
                try:
                    # Opening and reading are both guarded: a truncated or
                    # timed-out thread page is skipped like an unreachable one.
                    imgPage = self._download(imgPageURL)
                except Exception:
                    print('Error opening ' + imgPageURL)
                    continue
                for imgURL in self._extractImageURLs(imgPage):
                    self.fetchImageData(imgURL)
                    if self.img is None:
                        print('No image data, nothing is saved')
                        continue
                    self._writeImage(counter)
                    counter += 1
                    self.img = None
        return counter
if __name__ == '__main__':
    # Script entry point: build a crawler for the keyword below and start
    # saving the images of the matching threads.
    spider('少女').saveImg()