-
Notifications
You must be signed in to change notification settings - Fork 62
/
spider.py
134 lines (109 loc) · 5.31 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
import re
import gzip
import time
import datetime
import requests
import pymysql
from websocket import WebSocketApp
from urllib.parse import unquote_plus
from protobuf.douyin_pb2 import PushFrame, Response, ChatMessage
from settings import db_conf
# Live room number to crawl. Known rooms:
#   Jiao Ge Peng You ("Make a Friend"): 168465302284
#   Dong Fang Zhen Xuan ("Oriental Select"): 80017709309
live_id = "80017709309"

# Database connection: every parameter is read from settings.db_conf.
db = pymysql.connect(
    host=db_conf["host"],
    port=db_conf["port"],
    user=db_conf["user"],
    password=db_conf["password"],
    db=db_conf["db"],
    charset=db_conf["charset"],
)
# Module-wide cursor shared by save_data_to_db().
cursor = db.cursor()
print("数据库连接成功!")
# Fetch the basic information of a live room.
def fetch_live_room_info(url, timeout=10):
    """Download a live room page and extract what is needed to open its chat socket.

    The page embeds a URL-encoded JSON document inside a
    ``<script id="RENDER_DATA">`` tag; the room id, title and viewer count are
    read from it. The ``ttwid`` cookie set by the response is returned as well.

    Args:
        url: Address of the live room web page.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the spider forever.

    Returns:
        A tuple ``(room_id, room_title, room_user_count, wss_url, ttwid)``.

    Raises:
        ValueError: If the page contains no ``RENDER_DATA`` script tag.
        KeyError: If the decoded JSON lacks one of the expected keys, or the
            response did not set a ``ttwid`` cookie.
    """
    res = requests.get(
        url=url,
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        },
        cookies={
            "__ac_nonce": "063abcffa00ed8507d599"  # can be any value
        },
        timeout=timeout,
    )

    match = re.search(r'<script id="RENDER_DATA" type="application/json">(.*?)</script>', res.text)
    if match is None:
        raise ValueError(f"RENDER_DATA script tag not found in page: {url}")
    data_dict = json.loads(unquote_plus(match.group(1)))

    # All fields of interest live under the same roomInfo node.
    room_info = data_dict['app']['initialState']['roomStore']['roomInfo']
    room_id = room_info['roomId']
    room_title = room_info["room"]['title']
    room_user_count = room_info["room"]['user_count_str']

    # NOTE(review): only the trailing ``room_id`` query parameter is filled in
    # dynamically; ``wss_push_room_id``, the timestamps and ``signature`` are
    # fixed literals -- confirm the server accepts them for other rooms.
    wss_url = f"wss://webcast3-ws-web-lq.douyin.com/webcast/im/push/v2/?app_name=douyin_web&version_code=180800&webcast_sdk_version=1.3.0&update_version_code=1.3.0&compress=gzip&internal_ext=internal_src:dim|wss_push_room_id:7204471273437760314|wss_push_did:7140459943756301854|dim_log_id:202302262321404283BF425CD3004243D4|fetch_time:1677424900407|seq:1|wss_info:0-1677424900407-0-0|wrds_kvs:RoomLinkMicSyncData-1677424899240771392_WebcastRoomStatsMessage-1677424900157809312_InputPanelComponentSyncData-1677423998211004512_RoomLinkMicAnchorSettingsSyncData-1677424182191971552_WebcastRoomRankMessage-1677424870201661066&cursor=r-1_d-1_u-1_h-1_t-1677424900407&host=https://live.douyin.com&aid=6383&live_id=1&did_rule=3&debug=false&endpoint=live_pc&support_wrds=1&im_path=/webcast/im/fetch/&user_unique_id=7140459943756301854&device_platform=web&cookie_enabled=true&screen_width=1920&screen_height=1080&browser_language=zh-CN&browser_platform=MacIntel&browser_name=Mozilla&browser_version=5.0%20(Macintosh;%20Intel%20Mac%20OS%20X%2010_15_7)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/110.0.0.0%20Safari/537.36&browser_online=true&tz_name=Asia/Shanghai&identity=audience&room_id={room_id}&heartbeatDuration=0&signature=RZal/L9xj457uiOG"

    ttwid = res.cookies.get_dict()['ttwid']
    return room_id, room_title, room_user_count, wss_url, ttwid
def save_data_to_db(message):
    """Insert one chat message into the ``t_danmu`` table and echo it to stdout.

    The row holds the module-level ``live_id``, the sender's short id,
    nickname and gender, the message text, and the current local time.
    Any exception is caught and reported with ``print`` instead of being
    propagated to the caller.

    Args:
        message: Parsed chat message exposing ``user.shortId``,
            ``user.nickName``, ``user.gender`` and ``content``.
    """
    insert_sql = "insert into t_danmu(roomId,shortId,nickName,gender,content,createTime) values (%s,%s,%s,%s,%s,%s)"
    try:
        sender = message.user
        row = (
            live_id,
            sender.shortId,
            sender.nickName,
            sender.gender,
            message.content,
            datetime.datetime.now(),
        )
        # Execute the statement with driver-side parameter binding.
        cursor.execute(insert_sql, row)
        # Persist the insert.
        db.commit()
        print(f"{datetime.datetime.now()}【{sender.nickName}】:{message.content} ")
    except Exception as e:
        print("数据存储错误", message.user.nickName, e)
def on_open(ws):
    """Callback registered as ``on_open``; prints a marker line and nothing else.

    Args:
        ws: The WebSocket application object supplied by the library (unused).
    """
    print("on_open")
def on_message(ws, content):
    """Decode one pushed frame, acknowledge it if requested, and store chat messages.

    Args:
        ws: The WebSocket application that received the frame; used to send
            the acknowledgement back.
        content: Raw bytes of a serialized ``PushFrame``.
    """
    # Local import keeps this block self-contained; the file already depends
    # on the ``websocket`` package.
    from websocket import ABNF

    frame = PushFrame()
    frame.ParseFromString(content)

    # The PushFrame payload is gzip-compressed; decompress it first.
    origin_bytes = gzip.decompress(frame.payload)

    # Build the Response object from the decompressed bytes.
    response = Response()
    response.ParseFromString(origin_bytes)

    if response.needAck:
        ack = PushFrame()
        ack.payloadType = "ack"
        ack.payload = response.internalExt.encode('utf-8')
        ack.logId = frame.logId
        # The ack is serialized protobuf (arbitrary bytes), so it must go out
        # as a binary frame; the default opcode of send() is TEXT.
        ws.send(ack.SerializeToString(), ABNF.OPCODE_BINARY)

    # Each item must be parsed with the structure matching its ``method``.
    # Only WebcastChatMessage is handled here; other methods would be
    # handled the same way.
    for item in response.messagesList:
        if item.method != "WebcastChatMessage":
            continue

        message = ChatMessage()
        message.ParseFromString(item.payload)

        # Skip messages without text.
        if message.content == "":
            continue

        # NOTE(review): protobuf scalar fields normally default to 0 rather
        # than None, so this guard probably never fires -- confirm against
        # the .proto definition before removing it.
        if message.user.gender is None:
            message.user.gender = 0

        save_data_to_db(message)
def on_error(ws, content):
    """Report a WebSocket error, wait 60 seconds, then start a fresh connection.

    Args:
        ws: The WebSocket application that reported the error.
        content: The error passed in by the library; printed as-is.
    """
    print(content)
    print("on_error, after 60s, try again")

    # run() below blocks inside this callback, so the failed application would
    # otherwise never be torn down and its socket would stay open alongside
    # the new one. Closing is best-effort: a failure here must not prevent
    # the retry.
    try:
        ws.close()
    except Exception as close_error:
        print(close_error)

    time.sleep(60)
    # NOTE(review): run() is re-entered from inside the callback, so every
    # retry nests one more run_forever() call on the stack; moving the retry
    # loop into run() would avoid that.
    run()
def on_close(*args, **kwargs):
    """Callback registered as ``on_close``; prints its arguments and a marker.

    Accepts any positional and keyword arguments, so it does not depend on
    the exact callback signature used by the library.
    """
    print(args, kwargs)
    print("on_close")
def run():
    """Resolve the configured live room and stream its messages over WebSocket.

    Builds the room page URL from the module-level ``live_id``, fetches the
    room details, prints them, then opens a ``WebSocketApp`` wired to the
    callbacks defined in this module and blocks in ``run_forever()``.
    An exception escaping ``run_forever()`` is printed and the socket closed.
    """
    page_url = "https://live.douyin.com/" + live_id
    room_details = fetch_live_room_info(page_url)
    room_id, room_title, room_user_count, wss_url, ttwid = room_details
    print(room_id, room_title, room_user_count, wss_url, ttwid)

    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
    }
    # The ttwid cookie obtained from the page request is sent with the handshake.
    session_cookie = f"ttwid={ttwid}"

    app = WebSocketApp(
        url=wss_url,
        header=request_headers,
        cookie=session_cookie,
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    try:
        app.run_forever()
    except Exception as err:
        print(err)
        app.close()
# Start the spider only when this file is executed as a script.
if __name__ == "__main__":
    run()