-
Notifications
You must be signed in to change notification settings - Fork 0
/
github_fetcher.py
125 lines (96 loc) · 3.38 KB
/
github_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# %%
import os
from git import Repo
import shutil
import requests
from tqdm import tqdm
from joblib import Parallel, delayed
import pandas as pd
# GitHub personal access token, read from the environment (None when unset).
ACCESS_TOKEN = os.environ.get("GITHUB_PERSONAL_ACCESS_TOKEN")
def callback(user: str, repo: dict) -> dict:
    """Per-repo unit of work handed to the parallel runner.

    Clones the repository described by ``repo`` and then collects its stats.

    Args:
        user (str): username
        repo (dict): repo request output from github api; the ``name`` and
            ``clone_url`` entries are read from it

    Returns:
        dict: A dict containing repo popularity and activity stats
    """
    name, url = repo["name"], repo["clone_url"]
    # Clone first so the stats are only gathered for repos that were fetched.
    clone_repo(user, name, url)
    stats = get_repo_stats(user, name)
    return stats
def get_repo_stats(user: str, repo_name: str) -> dict:
    """Collect popularity and activity stats for one repository.

    Stars and forks are read from the counters on the repository resource.
    Pull requests are counted by walking every page of the pulls listing,
    because one listing request only returns a single page of results and
    taking its length would silently cap the count.

    Args:
        user (str): username
        repo_name (str): reponame

    Returns:
        dict: A dict containing repo popularity and activity stats, with the
            keys ``repo``, ``pulls_count``, ``open_pulls_count``,
            ``stars_count``, ``closed_pulls_count`` and ``forks_count`` (in
            that order). ``pulls_count`` is the sum of open and closed pulls.

    Raises:
        requests.HTTPError: If GitHub answers a request with an error status.
    """
    api_url = "https://api.github.com"
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Only send credentials when a token is configured; "Bearer None" would
    # be rejected even for data that is readable anonymously.
    if ACCESS_TOKEN:
        headers["Authorization"] = f"Bearer {ACCESS_TOKEN}"

    repo_url = f"{api_url}/repos/{user}/{repo_name}"
    repo_response = requests.get(repo_url, headers=headers, timeout=30)
    repo_response.raise_for_status()
    repo_info = repo_response.json()

    pulls_url = f"{repo_url}/pulls"
    open_pulls_count = _count_listed_items(pulls_url, headers, {"state": "open"})
    closed_pulls_count = _count_listed_items(
        pulls_url, headers, {"state": "closed"}
    )

    return {
        "repo": repo_name,
        # The listing defaults to open pulls only, so the total is built from
        # both states instead of an unfiltered request.
        "pulls_count": open_pulls_count + closed_pulls_count,
        "open_pulls_count": open_pulls_count,
        "stars_count": repo_info["stargazers_count"],
        "closed_pulls_count": closed_pulls_count,
        "forks_count": repo_info["forks_count"],
    }


def _count_listed_items(url: str, headers: dict, params: dict) -> int:
    """Count the items of a paginated GitHub listing by following its pages."""
    total = 0
    next_url = url
    next_params = {**params, "per_page": 100}
    while next_url:
        response = requests.get(
            next_url, headers=headers, params=next_params, timeout=30
        )
        # Fail loudly: an error payload is a JSON object whose length would
        # otherwise be mistaken for an item count.
        response.raise_for_status()
        total += len(response.json())
        next_url = response.links.get("next", {}).get("url")
        # The "next" link already carries the full query string.
        next_params = None
    return total
def get_all_repos(user: str) -> pd.DataFrame:
    """Clone every repo listed for a user and gather the stats of each one.

    The repo listing is paginated, so every page is followed; a single
    request would only return the first page of repositories.

    Args:
        user (str): username

    Returns:
        pd.DataFrame: A dataframe for repo stats, one row per repo, sorted in
            descending order by every column after the first. Empty when the
            user has no repos.

    Raises:
        requests.HTTPError: If GitHub answers the listing with an error status.
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Authenticate when a token is configured; anonymous requests still work
    # but are subject to stricter rate limits.
    if ACCESS_TOKEN:
        headers["Authorization"] = f"Bearer {ACCESS_TOKEN}"

    repos = []
    url = f"https://api.github.com/users/{user}/repos"
    params = {"per_page": 100}
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        repos.extend(response.json())
        url = response.links.get("next", {}).get("url")
        # The "next" link already carries the full query string.
        params = None

    repo_stats = Parallel(n_jobs=-1, prefer="threads")(
        delayed(callback)(user, repo) for repo in tqdm(repos)
    )
    df = pd.DataFrame(data=repo_stats)
    if df.empty:
        # Nothing to sort, and there are no columns to sort by.
        return df
    return df.sort_values(by=list(df.columns)[1:], ascending=False)
def clone_repo(user: str, repo_name: str, clone_url: str):
    """Clone a repo into ``./repos/<user>/<repo_name>`` unless that path exists.

    Args:
        user (str): User name
        repo_name (str): Repo name
        clone_url (str): Repo URL
    """
    destination = f"./repos/{user}/{repo_name}"
    if os.path.exists(destination):
        # Something is already at the target path; leave it untouched.
        return
    Repo.clone_from(clone_url, destination)
def main(user: str):
    """Script entrypoint: fetch a user's repo stats and dump them as CSV.

    Creates the ``repos/<user>`` and ``dumped`` directories when missing, then
    writes the stats table to ``dumped/<user>.repo_stats.csv``.

    Args:
        user (str): A string for username
    """
    for folder in (f"repos/{user}", "dumped"):
        os.makedirs(folder, exist_ok=True)
    stats = get_all_repos(user)
    stats.to_csv(f"dumped/{user}.repo_stats.csv", index=False)


if __name__ == "__main__":
    main(user="mythrex")
# %%