Skip to content

Commit

Permalink
fix: issue NanmiCoder#22
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Jul 30, 2023
1 parent e4c5ed5 commit b71f0e8
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 34 deletions.
40 changes: 27 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,32 @@
## 使用方法

1. 安装依赖库
`pip install -r requirements.txt`

```shell
pip install -r requirements.txt
```

2. 安装playwright浏览器驱动
`playwright install`
3. 是否选择开启保存数据到DB中
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 变量
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
4. 运行爬虫程序
`python main.py --platform xhs --lt qrcode`
5. 打开对应APP扫二维码登录

```shell
playwright install
```

3. 是否保存数据到DB中

如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 变量。然后执行以下命令初始化数据库信息,生成相关的数据库表结构:

```shell
python db.py
```

4. 运行爬虫程序

```shell
python main.py --platform xhs --lt qrcode
```

5. 打开对应APP扫二维码登录

## 项目代码结构

Expand All @@ -46,11 +62,9 @@ MediaCrawler
│ ├── base_crawler.py # 项目的抽象类
│ └── proxy_account_pool.py # 账号与IP代理池
├── config
│ ├── account_config.py # 基础配置
│ └── base_config.py # 账号池配置
├── images
│ ├── douyin.gif
│ └── xiaohongshu.gif
│ ├── account_config.py # 账号代理池配置
│ ├── base_config.py # 基础配置
│ └── db_config.py # 数据库配置
├── libs
│ ├── douyin.js # 抖音Sign函数
│ └── stealth.min.js # 去除浏览器自动化特征的JS
Expand Down
6 changes: 3 additions & 3 deletions config/account_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# account_config.py
import os

PHONE_LIST = [
"13012345671",
Expand All @@ -22,6 +23,5 @@
]

IP_PROXY_PROTOCOL = "http://"
IP_PROXY_USER = "xxxx"
IP_PROXY_PASSWORD = "xxxx"

IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test")
IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456")
9 changes: 6 additions & 3 deletions config/db_config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os

# redis config
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
REDIS_DB_PWD = "123456" # your redis password
REDIS_DB_HOST = "127.0.0.1" # your redis host
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password

# mysql config
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password
RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"

# save data to database option
IS_SAVED_DATABASED = True # if you want to save data to database, set True
9 changes: 4 additions & 5 deletions media_platform/douyin/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
from typing import Optional

import aioredis
import redis
from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
Expand Down Expand Up @@ -121,20 +121,19 @@ async def login_by_mobile(self):

# 检查是否有滑动验证码
await self.check_page_display_slider(move_step=10, slider_level="easy")

redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0:
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue

sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
await sms_code_input_ele.fill(value=sms_code_value)
await sms_code_input_ele.fill(value=sms_code_value.decode())
await asyncio.sleep(0.5)
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
await submit_btn_ele.click() # 点击登录
Expand Down
9 changes: 4 additions & 5 deletions media_platform/xhs/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
from typing import Optional

import aioredis
import redis
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
Expand Down Expand Up @@ -85,15 +85,14 @@ async def login_by_mobile(self):
await send_btn_ele.click() # 点击发送验证码
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")

redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = ""
while max_get_sms_code_time > 0:
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
Expand All @@ -102,7 +101,7 @@ async def login_by_mobile(self):
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")

await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
await asyncio.sleep(0.5)
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
await agree_privacy_ele.click() # 点击同意隐私协议
Expand Down
8 changes: 4 additions & 4 deletions recv_sms_notification.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from typing import List

import aioredis
import redis
import tornado.web

import config
Expand All @@ -15,7 +15,7 @@ def extract_verification_code(message) -> str:
Extract verification code of 6 digits from the SMS.
"""
pattern = re.compile(r'\b[0-9]{6}\b')
codes: List[str]= pattern.findall(message)
codes: List[str] = pattern.findall(message)
return codes[0] if codes and len(codes) > 0 else ""


Expand Down Expand Up @@ -47,15 +47,15 @@ async def post(self):
request_body = self.request.body.decode("utf-8")
req_body_dict = json.loads(request_body)
print("recv sms notification and body content: ", req_body_dict)
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
sms_content = req_body_dict.get("sms_content")
sms_code = extract_verification_code(sms_content)
if sms_code:
# Save the verification code in Redis and set the expiration time to 3 minutes.
# Use Redis string data structure, in the following format:
# xhs_138xxxxxxxx -> 171959
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
await redis_obj.set(name=key, value=sms_code, ex=60 * 3)
redis_obj.set(name=key, value=sms_code, ex=60 * 3)
self.set_status(200)
self.write("ok")

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
httpx==0.24.0
Pillow==9.5.0
playwright==1.33.0
aioredis==2.0.1
tenacity==8.2.2
tornado==6.3.2
PyExecJS==1.5.1
opencv-python==4.7.0.72
tortoise-orm[asyncmy]==0.19.3
aerich==0.7.2
numpy~=1.24.4
redis~=4.6.0

0 comments on commit b71f0e8

Please sign in to comment.