diff --git a/README.md b/README.md index 5a69254b..4fdb488a 100644 --- a/README.md +++ b/README.md @@ -27,16 +27,32 @@ ## 使用方法 1. 安装依赖库 - `pip install -r requirements.txt` + + ```shell + pip install -r requirements.txt + ``` + 2. 安装playwright浏览器驱动 - `playwright install` -3. 是否选择开启保存数据到DB中 - 如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量 -
再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构 -4. 运行爬虫程序 - `python main.py --platform xhs --lt qrcode` -5. 打开对应APP扫二维码登录 + ```shell + playwright install + ``` + +3. 是否保存数据到DB中 + + 如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量。然后执行以下命令初始化数据库信息,生成相关的数据库表结构: + + ```shell + python db.py + ``` + +4. 运行爬虫程序 + + ```shell + python main.py --platform xhs --lt qrcode + ``` + +5. 打开对应APP扫二维码登录 ## 项目代码结构 @@ -46,11 +62,9 @@ MediaCrawler │ ├── base_crawler.py # 项目的抽象类 │ └── proxy_account_pool.py # 账号与IP代理池 ├── config -│ ├── account_config.py # 基础配置 -│ └── base_config.py # 账号池配置 -├── images -│ ├── douyin.gif -│ └── xiaohongshu.git +│ ├── account_config.py # 账号代理池配置 +│ ├── base_config.py # 基础配置 +│ └── db_config.py # 数据库配置 ├── libs │ ├── douyin.js # 抖音Sign函数 │ └── stealth.min.js # 去除浏览器自动化特征的JS diff --git a/config/account_config.py b/config/account_config.py index a0f54b26..2ece5d7e 100644 --- a/config/account_config.py +++ b/config/account_config.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # account_config.py +import os PHONE_LIST = [ "13012345671", @@ -22,6 +23,5 @@ ] IP_PROXY_PROTOCOL = "http://" -IP_PROXY_USER = "xxxx" -IP_PROXY_PASSWORD = "xxxx" - +IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test") +IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456") diff --git a/config/db_config.py b/config/db_config.py index a5dfe98f..1e9d2676 100644 --- a/config/db_config.py +++ b/config/db_config.py @@ -1,9 +1,12 @@ +import os + # redis config -REDIS_DB_HOST = "redis://127.0.0.1" # your redis host -REDIS_DB_PWD = "123456" # your redis password +REDIS_DB_HOST = "127.0.0.1" # your redis host +REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password # mysql config -RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler" +RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password +RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler" # save data to database option IS_SAVED_DATABASED = True # if you want to save data to database, set True diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py index 7a2148a4..f02fa9f3 100644 --- a/media_platform/douyin/login.py +++ b/media_platform/douyin/login.py @@ -3,7 +3,7 @@ import sys from typing import Optional -import aioredis +import redis from playwright.async_api import BrowserContext, Page from playwright.async_api import TimeoutError as PlaywrightTimeoutError from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, @@ -121,20 +121,19 @@ async def login_by_mobile(self): # 检查是否有滑动验证码 await self.check_page_display_slider(move_step=10, slider_level="easy") - - redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) + redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD) max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 while max_get_sms_code_time > 0: utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") await asyncio.sleep(1) sms_code_key = f"dy_{self.login_phone}" - sms_code_value = await redis_obj.get(sms_code_key) + sms_code_value = redis_obj.get(sms_code_key) if not sms_code_value: max_get_sms_code_time -= 1 continue sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']") - await sms_code_input_ele.fill(value=sms_code_value) + await sms_code_input_ele.fill(value=sms_code_value.decode()) await asyncio.sleep(0.5) submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']") await submit_btn_ele.click() # 点击登录 diff --git a/media_platform/xhs/login.py b/media_platform/xhs/login.py index f410ac2c..0e78df13 100644 --- a/media_platform/xhs/login.py +++ b/media_platform/xhs/login.py @@ -3,7 +3,7 @@ import sys from typing import Optional -import aioredis +import redis from playwright.async_api import BrowserContext, Page from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed) @@ -85,15 +85,14 @@ async def login_by_mobile(self): await send_btn_ele.click() # 点击发送验证码 sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input") submit_btn_ele = await login_container_ele.query_selector("div.input-container > button") - - redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) + redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD) max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 no_logged_in_session = "" while max_get_sms_code_time > 0: utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...") await asyncio.sleep(1) sms_code_key = f"xhs_{self.login_phone}" - sms_code_value = await redis_obj.get(sms_code_key) + sms_code_value = redis_obj.get(sms_code_key) if not sms_code_value: max_get_sms_code_time -= 1 continue @@ -102,7 +101,7 @@ async def login_by_mobile(self): _, cookie_dict = utils.convert_cookies(current_cookie) no_logged_in_session = cookie_dict.get("web_session") - await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码 + await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码 await asyncio.sleep(0.5) agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']") await agree_privacy_ele.click() # 点击同意隐私协议 diff --git a/recv_sms_notification.py b/recv_sms_notification.py index 9718df0b..31aa9bec 100644 --- a/recv_sms_notification.py +++ b/recv_sms_notification.py @@ -4,7 +4,7 @@ import re from typing import List -import aioredis +import redis import tornado.web import config @@ -15,7 +15,7 @@ def extract_verification_code(message) -> str: Extract verification code of 6 digits from the SMS. """ pattern = re.compile(r'\b[0-9]{6}\b') - codes: List[str]= pattern.findall(message) + codes: List[str] = pattern.findall(message) return codes[0] if codes and len(codes) > 0 else "" @@ -47,7 +47,7 @@ async def post(self): request_body = self.request.body.decode("utf-8") req_body_dict = json.loads(request_body) print("recv sms notification and body content: ", req_body_dict) - redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) + redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD) sms_content = req_body_dict.get("sms_content") sms_code = extract_verification_code(sms_content) if sms_code: @@ -55,7 +55,7 @@ async def post(self): # Use Redis string data structure, in the following format: # xhs_138xxxxxxxx -> 171959 key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}" - await redis_obj.set(name=key, value=sms_code, ex=60 * 3) + redis_obj.set(name=key, value=sms_code, ex=60 * 3) self.set_status(200) self.write("ok") diff --git a/requirements.txt b/requirements.txt index 8be486a9..12c99795 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ httpx==0.24.0 Pillow==9.5.0 playwright==1.33.0 -aioredis==2.0.1 tenacity==8.2.2 tornado==6.3.2 PyExecJS==1.5.1 opencv-python==4.7.0.72 tortoise-orm[asyncmy]==0.19.3 aerich==0.7.2 +numpy~=1.24.4 +redis~=4.6.0 \ No newline at end of file