diff --git a/.obsidian/hotkeys.json b/.obsidian/hotkeys.json index d8a3787..a2984f2 100644 --- a/.obsidian/hotkeys.json +++ b/.obsidian/hotkeys.json @@ -39,5 +39,13 @@ ], "key": "ArrowDown" } + ], + "editor:toggle-code": [ + { + "modifiers": [ + "Mod" + ], + "key": "`" + } ] } \ No newline at end of file diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json index b5cafb8..cbef22b 100644 --- a/.obsidian/workspace.json +++ b/.obsidian/workspace.json @@ -11,13 +11,14 @@ "id": "22a9ab72237d5df4", "type": "leaf", "state": { - "type": "diff-view", + "type": "markdown", "state": { - "file": "Clippings/使用line_profiler对python代码性能进行评估优化 - DECHIN - 博客园.md", - "staged": false + "file": "_posts/pool/2024-12-18-github_api爬虫.md", + "mode": "source", + "source": false }, - "icon": "git-pull-request", - "title": "Diff View (使用line_profiler对python代码性能进行评估优化 - DECHIN - 博客园)" + "icon": "lucide-file", + "title": "2024-12-18-github_api爬虫" } } ] @@ -188,6 +189,7 @@ }, "active": "22a9ab72237d5df4", "lastOpenFiles": [ + "_posts/pool/2024-12-18-github_api爬虫.md", "_posts/engineering/2024-09-11-git版本管理常见技巧.md", "_posts/pool/2024-12-13.md", "_posts/pool/2024-11-14-深入理解Python异步编程(上).md", @@ -215,7 +217,6 @@ "_posts/engineering/2024-09-13-linux安装全局可用的conda+创建有root权限的新账号.md", "_posts/coding/2024-08-13-json.dumps输出美化版json.md", "_posts/coding/2024-08-13-chatglm-PPO训练路径探索.md", - "_posts/coding/2024-08-13-generator.md", "Clippings", "_posts/language", "assets/img/mrj9tyfxgpwc4ohkdhkq3uu3azxww8g.png", diff --git "a/_posts/pool/2024-12-18-github_api\347\210\254\350\231\253.md" "b/_posts/pool/2024-12-18-github_api\347\210\254\350\231\253.md" new file mode 100644 index 0000000..2a32b34 --- /dev/null +++ "b/_posts/pool/2024-12-18-github_api\347\210\254\350\231\253.md" @@ -0,0 +1,128 @@ +--- +title: 2024-12-18-github_api爬虫 +author: X +date: 2024-12-18 14:08:29 +0800 +categories: +tags: +--- +# 背景 +需要爬github 
repo中的一些特定内容,利用了github提供的api。搞了整整两天才搞出来稍微优化的版本,主要因为github_api有**速率限制**,所以需要准备好token断连后换token续传、并发等处理。 + +# 参考文档 +[关于速率限制的官方最佳实践](https://docs.github.com/en/rest/using-the-rest-api/best-practices-for-using-the-rest-api?apiVersion=2022-11-28) +有关各种请求的endpoints链接可以在文档首页下拉找到。 +[首页](https://docs.github.com/en/rest?apiVersion=2022-11-28) +[pr's endpoints](https://docs.github.com/en/rest/pulls/pulls?apiVersion=2022-11-28) + +# 代码设计 + +## 嵌套文件夹完整下载 +涉及并发,是笔者做得比较用心的一部分: +```python +async def download_file(file_url, file_path, session, semaphore, token, logger): # Fetch the JSON at file_url, base64-decode its "content" field and write the bytes to file_path; a non-200 response counts as a failed attempt. + retries = 0 + while retries < 50: # stop after 50 failed attempts + async with semaphore: + headers = {"Authorization": f"token {token}"} + + async with session.get(file_url, headers=headers) as response: + if response.status == 200: + data = await response.json() + file_content = base64.b64decode(data["content"]) + os.makedirs(os.path.dirname(file_path), exist_ok=True) # create missing parent directories before writing + with open(file_path, "wb") as f: + f.write(file_content) + break + else: + retries += 1 + if retries % 6 == 0: # every 6th failure: pause instead of switching token + await asyncio.sleep(60 * 15) # 60 * 15 seconds = 15 minutes + else: + token = random.choice(token_list) # NOTE(review): token_list is not defined anywhere in this listing; presumably a module-level list of tokens, confirm + if retries >= 50: + logger.error(f"{file_path}下载失败,{file_url}") # reached only when all 50 attempts failed + return True + + +async def async_download_dir( + directory_url, directory, session, semaphore, token, +): # Fetch the listing at directory_url and download every entry of its "tree" below the local path directory, recursing into sub-trees. + headers = {"Authorization": f"token {token}"} if token else {} # no Authorization header when token is empty or None + async with session.get(directory_url, headers=headers) as response: + response.raise_for_status() + data = await response.json() + tree = data["tree"] + + if len(tree) > 2000: # listings with more than 2000 entries are refused + return False + + tasks = [] + for file in tree: + file_path = os.path.join(directory, file["path"]) + file_url = file["url"] + if file["type"] == "tree": # sub-directory: recurse with the same session, semaphore and token + tasks.append( + async_download_dir( + file_url, file_path, session, semaphore, token + ) + ) + else: + if not os.path.exists(file_path): # files already on disk are skipped + tasks.append( + download_file( + file_url, file_path, session, semaphore, token # NOTE(review): download_file declares a sixth parameter, logger, without a default; it is not passed here + ) + ) + + await asyncio.gather(*tasks) + return True + + +def download_dir(directory_url, directory, token=None): # Synchronous entry point for async_download_dir.
+ async def run(): + async with aiohttp.ClientSession() as session: + semaphore = asyncio.Semaphore(10) + result = await async_download_dir( + directory_url, directory, session, semaphore, token + ) + return result + + return asyncio.run(run()) +``` + +有并发,有异常处理,有细粒度的 token 轮换来保证断点续传,还有带 backoff 的重试策略,我觉得还是很不错滴。 + +# 后台运行 + +1. **启动:** + + ```bash + nohup python your_script.py > nohup.out & + ``` + 不做重定向时日志默认写入 `nohup.out`,也可以用 `>` 指定到其他文件;末尾的 `&` 让进程转到后台运行,否则命令会一直占着当前终端。 +2. **打印日志:** + + ```bash + tail -f nohup.out + ``` + +3. **根据脚本名查找进程 ID (PID):** + + 使用 `pgrep` 命令直接根据脚本名查找进程 ID。 + + ```bash + pgrep -f your_script.py + ``` + + `-f` 参数表示搜索全命令行,而不仅仅是进程名称,这样可以确保找到指定脚本的进程。 + +4. **停止进程:** + + 使用 `pkill` 命令根据脚本名来停止进程: + + ```bash + pkill -f your_script.py + ``` + + 同样地,`-f` 参数表示搜索全命令行。 +