import asyncio
import json
import logging
import os
import re
import threading
import time

import aiohttp
import requests
import wget
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from playwright.sync_api import sync_playwright

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
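# First run: no saved cookies yet, so open a visible browser, wait for a
# manual Baidu login, then persist the session cookies for later runs.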
if not os.path.exists('cookies.txt'):
    logger.error('No cookie file found! Manual login required!')
    p = sync_playwright().start()
    browser = p.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto('http://www.baidu.com')
    # Wait (up to 60 s) for the logged-in username element to appear before grabbing cookies
    page.locator('id=s-top-username').click(timeout=60000)
    cookie = context.cookies()
    logger.info('Cookies retrieved successfully!')
    with open('cookies.txt', 'w', encoding='utf-8') as f:
        json.dump(cookie, f, ensure_ascii=False)
    logger.info('Cookies saved')
    context.close()
    browser.close()
    p.stop()
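# Crawl pages s..e of the thread at tzurl into per-page HTML files.
# Each call runs as one of several concurrent asyncio tasks sharing the global
# progress counter (count), stagger index (thread) and HTML buffers (allhtml).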
async def run(s, e, cookies, browser, tzurl, title, total):
    global count
    global thread
    thread += 1
    # Stagger task start-up by the task index to avoid hitting the server all at once
    await asyncio.sleep(thread)
    context = await browser.new_context()
    await context.add_cookies(cookies)
    newpage = await context.new_page()
    pagenum = s
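    # Worker run on a plain thread: pages through all sub-posts (lzl, 楼中楼) of one
    # floor via the getFloorData endpoint and splices the rendered HTML into the buffer.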
    def handle_lzl(tid, pid, s):
        complete = True
        i = 1
        f = ''
        try:
            while complete:
                result = json.loads(requests.get('https://tieba.baidu.com/mg/o/getFloorData?pn=' + str(i) + '&rn=30&tid=' + str(tid) + '&pid=' + str(pid)).text)
                for sub in result['data']['sub_post_list']:
                    mpic = '<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/' + sub['author']['portrait'] + '">'
                    mname = sub['author']['name_show']
                    mcon = sub['content']
                    text = ''
                    for m in mcon:
                        if m['type'] in [0, 4]:  # plain text
                            text += m['text']
                        elif m['type'] in [1, 18]:  # link / @mention
                            text += '<a href="' + m['link'] + '">' + m['text'] + '</a>'
                        elif m['type'] == 2:  # inline image / emoticon
                            text += '<img src="' + m['src'] + '">'
                        else:  # unknown fragment type: dump it verbatim
                            print(m)
                            text += str(m)
                    mcon = text
                    mtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(sub['time'])))
                    f = f + '<div class="mall"><div class="mpic">' + str(mpic) + '</div><div class="mcon">' + mname + ':' + str(mcon) + '</div><div class="mtime">' + mtime + '</div></div>' + '\n'
                complete = result['data']['page']['current_page'] != result['data']['page']['total_page']
                i += 1
        except Exception as err:
            print(pid, str(err))
            f = '<!-- sub-posts for ' + str(pid) + ' could not be fetched -->'  # clear the placeholder so the wait loop can finish
        global allhtml
        allhtml[s] = allhtml[s].replace('---for' + pid + '---', f)
    while pagenum <= e:
        page = newpage
        # Make sure the output directories exist
        os.makedirs(title, exist_ok=True)
        os.makedirs('./' + title + '/image', exist_ok=True)
        os.makedirs('./' + title + '/video', exist_ok=True)
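        # Response hook: save Tieba images and videos from the intercepted traffic as the page loads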
        async def pic_intercept(response):
            # Image responses: write the bytes straight to disk
            if '.baidu.com/forum' in response.url:
                try:
                    name = response.url.split('sign=')[1].split('/')[0]
                except Exception:
                    return ''
                with open('./' + title + '/image/' + name + '.jpg', 'wb') as f:
                    f.write(await response.body())
            # Video responses: hand the URL to wget on a worker thread
            if 'tb-video.bdstatic.com' in response.url:
                try:
                    name = response.url.split('transcode-cae/')[1].split('?')[0]
                except Exception:
                    return ''
                def dovideo(response):
                    wget.download(response.url, out='./' + title + '/video/' + name)
                nt = threading.Thread(target=dovideo, args=(response,))
                nt.start()
                nt.join()
        page.on('response', pic_intercept)
        # Intercept image responses and take the binary stream directly
        # (this largely avoids the IP bans that bulk-downloading images later would trigger)
        excluded_resource_types = ["font", "stylesheet", "script"]
        async def block_aggressively(route):
            if route.request.resource_type in excluded_resource_types:
                await route.abort()
            elif 'baidu' in str(route.request.url) or 'bdstatic' in str(route.request.url):
                await route.continue_()
            else:
                await route.abort()
        await page.route("**/*", block_aggressively)
        # Blocks loading of unneeded resource types and third-party requests
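        # Navigate to the current page, retrying once after a cooldown on failure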
        while True:
            try:
                await page.goto(tzurl + '?pn=' + str(pagenum))
            except Exception as err:
                logger.error('Page load failed... probably a network issue... cooling down for one minute')
                print(str(err))
                await asyncio.sleep(60)
                await page.goto(tzurl + '?pn=' + str(pagenum))
            else:
                break
        # Resize the viewport to the full rendered page
        width = await page.evaluate("document.documentElement.scrollWidth;")
        height = await page.evaluate("document.documentElement.scrollHeight;")  # note: unlike selenium's execute_script, no 'return' prefix here
        await page.set_viewport_size({"width": width, "height": height})
        logger.info('Crawling page ' + str(pagenum) + '...')
        allhtml[s] = ''  # seed for the HTML assembled below
        if pagenum == 1:
            # Prepend the stylesheet link and charset declaration (page 1 only)
            allhtml[s] += '''
<head><link rel="stylesheet" type="text/css" href="../tz.css"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
'''
            # Prepend the thread title and heading
            allhtml[s] += "<title>" + title + "</title>" + "\n"
            allhtml[s] += "<h1>" + title + "</h1><hr>" + '\n'
        pidlist = []  # collect each floor's pid for the sub-post fetches below
        # Walk the main floors on this page, extracting author, content and pid
        floor = 0
        for div in BeautifulSoup(await page.content(), 'lxml').find_all('div', attrs={'class': 'l_post l_post_bright j_l_post clearfix'}):
            floor += 1
            a = div.find('ul', attrs={'class': 'p_author'}).find('li', attrs={'class': 'd_name'}).find('a', {'alog-group': 'p_author'}).get_text()  # this floor's author
            a_face = div.find('a', attrs={'class': 'p_author_face'})  # this floor's author avatar
            if a_face is None:  # fall back to the IP-location style avatar
                a_face = div.find('span', attrs={'class': 'p_author_face p_author_face_ip'})
            try:
                # Swap the lazy-load attribute in for the real image source
                if 'data-tb-lazyload' in a_face.img.attrs:
                    a_face.img['src'] = a_face.img['data-tb-lazyload']
            except Exception as err:
                print(a_face)
                print(div)
            maincon = div.find('div', attrs={'class': 'd_post_content j_d_post_content'})  # this floor's main content
            # Rewrite image sources to point at the locally saved copies
            for img in maincon.find_all('img', {'class': 'BDE_Image'}):
                url = img.attrs['src']
                try:
                    name = url.split('sign=')[1].split('/')[0]
                    img['src'] = './image/' + name + '.jpg'
                except Exception as err:
                    print(url)
                    print(str(err))
            # Rewrite video sources the same way
            for video in maincon.find_all('video'):
                url = video.attrs['src']
                name = url.split('transcode-cae/')[1].split('?')[0]
                video['src'] = './video/' + name
            meg_location = div.find('div', attrs={'class': 'post-tail-wrap'}).find('span').get_text()  # this floor's posting location (IP region)
            try:
                # Normal layout: a "from xxx client" span precedes floor number and time
                meg_floor = div.find('div', attrs={'class': 'post-tail-wrap'}).find_all('span', attrs={'class': 'tail-info'})[1].get_text()  # floor number
                meg_time = div.find('div', attrs={'class': 'post-tail-wrap'}).find_all('span', attrs={'class': 'tail-info'})[2].get_text()  # posting time
            except Exception:
                # No client span: the tail-info indices shift back by one
                meg_floor = div.find('div', attrs={'class': 'post-tail-wrap'}).find('span', attrs={'class': 'tail-info'}).get_text()
                meg_time = div.find('div', attrs={'class': 'post-tail-wrap'}).find_all('span', attrs={'class': 'tail-info'})[1].get_text()
            pid = div.attrs['data-pid']
            pidlist.append(pid)  # this floor's pid
            tid = tzurl.split('/p/')[1]
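            # Probe the sub-post API once: if this floor has replies, a worker thread fetches them all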
            async with aiohttp.ClientSession() as session:
                async with session.get('https://tieba.baidu.com/mg/o/getFloorData?pn=1&rn=30&tid=' + str(tid) + '&pid=' + str(pid)) as response:
                    try:
                        lzlgetresult = json.loads(await response.text())
                        lzlgetresult = lzlgetresult['data']['sub_post_list']
                    except Exception as err:
                        lzlgetresult = 'error'  # probe failed: fall through and let the worker thread retry
                        print(await response.text())
                        print(str(err))
            if lzlgetresult is None or (floor == 1 and pagenum == 1):
                # No sub-posts on this floor (or it is the opening post): emit its HTML directly
                allhtml[s] = allhtml[s] + '\n' + '<table><tr><td class="first-p">' + str(a_face) + '<h4>' + a + '</h4>' + '\n' + '</td><td class="second-p">' + str(maincon) + '\n' + '</td></tr>' + '<tr><td></td><td><div class="msg">' + str(meg_floor) + '|' + str(meg_location) + '|' + str(meg_time) + '</div></td></tr>' + '</table><hr>'
            else:
                # Spawn a worker thread for the sub-posts and leave a '---for<pid>---'
                # placeholder that handle_lzl will replace with the rendered HTML
                allhtml[s] = allhtml[s] + '\n' + '<table><tr><td class="first-p">' + str(a_face) + '<h4>' + a + '</h4>' + '\n' + '</td><td class="second-p">' + str(maincon) + '\n' + '</td></tr>' + '<tr><td></td><td><div class="msg">' + str(meg_floor) + '|' + str(meg_location) + '|' + str(meg_time) + '</div></td></tr>' + '<tr><td></td><td class="review">' + '---for' + str(pid) + '---' + "\n" + '</td>' + '</table><hr>'
                nt = threading.Thread(target=handle_lzl, args=(str(tid), str(pid), s))
                nt.start()
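        # Block until every '---for<pid>---' placeholder has been filled in by its worker thread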
        while '---for' in allhtml[s]:
            await asyncio.sleep(1)
            logger.info('Waiting for the remaining ' + str(len(re.findall('---for(.*?)---', allhtml[s]))) + ' floors to finish')
        with open(title + '/' + str(title) + str(pagenum) + '.html', 'w', encoding='utf-8') as f:
            f.write(allhtml[s].encode('utf-8', 'replace').decode())
        count += 1
        logger.info(str(count) + '/' + str(total) + '|' + str(round(count / total * 100, 2)) + '%')
        # Recycle the page and re-apply the cookies so the next page starts from a clean session
        newpage = await context.new_page()
        await page.close()
        await context.clear_cookies()
        await context.add_cookies(cookies)
        pagenum += 1
    await context.close()
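# Concatenate the per-page HTML files into a single index.html.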
def tidyup(title, page):
    text = ''
    for i in range(page):
        with open('./' + title + '/' + title + str(i + 1) + '.html', 'r', encoding='utf-8') as f:
            text += f.read()
    with open('./' + title + '/' + 'index.html', 'w', encoding='utf-8') as f:
        f.write(text)
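# Verify the saved login, read the thread's title and page count, then fan the
# page range out across up to tn concurrent crawler tasks.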
async def main():
    logger.info('Checking login status...')
    p = await async_playwright().start()
    browser = await p.chromium.launch(headless=True)
    context = await browser.new_context()
    with open('cookies.txt', 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    await context.add_cookies(cookies)
    page = await context.new_page()
    # Verify the login state and read the page count
    tzurl = 'https://tieba.baidu.com/p/1012328833'
    await page.goto(tzurl)
    if await page.locator('id=my_tieba_mod').is_visible(timeout=1000):
        logger.info('Login successful!')
    else:
        logger.critical('Login failed! The cookie file may be corrupt; delete it and log in again!')
        quit()
    pagenum = BeautifulSoup(await page.content(), 'lxml').find('li', {'class': 'l_reply_num'}).find_all('span')[1].get_text()
    logger.info('This thread has ' + str(pagenum) + ' pages in total')
    soup = BeautifulSoup(str(await page.content()), 'lxml')
    title = soup.find('div', id='j_core_title_wrap').find('h3').get_text().replace('回复:', '').replace(' ', '')
    logger.info('Thread title: ' + title)
    await context.close()
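    # Shared crawl state: completed-page counter, task stagger index, per-page HTML buffers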
    global count
    count = 0
    global thread
    thread = 0
    global allhtml
    allhtml = {}
    total_pages = int(pagenum)
    tn = 10  # number of concurrent crawler tasks
    threadpools = []
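    # Split pages 1..total_pages into tn contiguous ranges, one crawler task per range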
    if total_pages < tn:
        await run(title=title, tzurl=tzurl, s=1, e=total_pages, total=total_pages, browser=browser, cookies=cookies)
    else:
        s = 1
        e = s + total_pages // tn - 1
        for i in range(tn):
            threadpools.append(run(title=title, tzurl=tzurl, s=s, e=e, total=total_pages, browser=browser, cookies=cookies))
            s = e + 1
            e = s + total_pages // tn - 1
        if total_pages % tn != 0:
            # Pages left over by uneven division get one extra task
            e = s + total_pages % tn - 1
            threadpools.append(run(title=title, tzurl=tzurl, s=s, e=e, total=total_pages, browser=browser, cookies=cookies))
    await asyncio.gather(*threadpools)
    await browser.close()
    tidyup(title=title, page=total_pages)
#asyncio.run(main())