main.py
import argparse
import csv
import os

from dotenv import load_dotenv

from src.mailchimp_adapter import MailchimpAdapter
from src.mem0_utils import MemorySystem
from src.multion_utils import MultiOnUtils
from src.time_utils import Time

load_dotenv()


def main(
repo_url,
max_stargazers=None,
scrape_linkedin=False,
use_agentops=False,
use_mem0=False,
use_neo4j_kg=False,
use_mailchimp=False
):
"""
Main function to scrape GitHub and LinkedIn data.
This function takes in a GitHub repository URL and several optional parameters:
max_stargazers: Maximum number of stargazers to scrape
scrape_linkedin: Whether to scrape LinkedIn profiles
use_mem0: Whether to use Mem0 memory system
use_neo4j_kg: Whether to use Neo4j knowledge graph
The function scrapes the repository and its stargazers, then scrapes GitHub data for each stargazer. If scrape_linkedin is True, it also scrapes LinkedIn data for users with LinkedIn URLs. The function then writes the collected data to a CSV file.
It prints various debugging messages throughout its execution.
"""
multion_scraper = MultiOnUtils(use_agentops=use_agentops)
    if use_mailchimp:
        mailchimp_adapter = MailchimpAdapter()
agent_name = "StarTracker"
# Step 1: Scrape repo and stargazers
print(f"Scraping repo: {repo_url}")
repo = multion_scraper.scrape_repo(repo_url)
print(f"Scraped repo: {repo}")
print(f"Scraping stargazers: {repo}")
stargazers = multion_scraper.scrape_stargazers(repo_url)
if max_stargazers:
stargazers = stargazers[:max_stargazers]
print(f"Scraped {len(stargazers)} stargazers")
# Step 2: Scrape GitHub data for each stargazer
github_user_data = []
for stargazer in stargazers:
user_data = multion_scraper.scrape_github(stargazer.user_id)
github_user_data.append(user_data)
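    # Deduplicate by GitHub username; the dict comprehension keeps one record per name.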
github_user_data = list({user.name: user for user in github_user_data}.values())
print(f"Scraped GitHub data for {len(github_user_data)} users")
# Step 3: Scrape LinkedIn data for users with LinkedIn URLs
linkedin_data = {}
    if scrape_linkedin:
for user in github_user_data:
if user.linkedin_url:
linkedin_profile = multion_scraper.scrape_linkedin(user.linkedin_url)
if linkedin_profile.name: # Only add if we got a valid name
linkedin_data[user.name] = linkedin_profile
print(f"Scraped LinkedIn data for {len(linkedin_data)} users")
    # Step 4: Print or process the collected data as needed
print("\nGitHub User Data:")
for user in github_user_data:
print(user)
    if scrape_linkedin:
print("\nLinkedIn Data:")
for name, profile in linkedin_data.items():
print(f"{name}: {profile}")
print("---\nScraping completed")
    # Step 5: Initialize Mem0 memory
    if use_mem0:
memory_system = MemorySystem()
memory = memory_system.get_memory()
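        # Mem0 entries are keyed by user_id, agent_id, and a timestamped run_id.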
# memory.reset()
        # Step 5a: Memorize stargazers of each repository on a knowledge graph
        if use_neo4j_kg:
print(f"Updating knowledge graph")
repositories = [repo_url]
# Iterate over the list of repositories
for repository in repositories:
print(f"Upserting repository: {repository}")
result_repository = memory.add(
f"The github user: {stargazer.user_id} starred the repository: {repository}",
user_id=None,
agent_id=agent_name,
run_id=str(Time()),
metadata={'app_id': repository},
)
# Iterate over the list of usernames for each repository
for stargazer in stargazers:
print(f" Upserting username: {stargazer.user_id}")
result_stargazer = memory.add(
f"The github user: {stargazer.user_id} starred the repository: {repository}",
user_id=stargazer.user_id,
agent_id=agent_name,
run_id=str(Time()),
metadata={'app_id': repository},
)
    # Step 6: Combine data and write to CSV
print(f"Writing stargazers data to CSV. Number of GitHub users to write: {len(github_user_data)}")
# Create the 'data' directory if it doesn't exist
os.makedirs('data', exist_ok=True)
    csv_filename = f"data/Stargazers_of_{(repo.description or repo.name).replace(' ', '_')[:30]}__{str(Time())}.csv"
    print(f"Writing to file: {csv_filename}")
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
field_names = ['username', 'email', 'name', 'location', 'github_followers', 'linkedin_headline', 'current_position',
'linkedin_followers']
writer = csv.DictWriter(csvfile, fieldnames=field_names)
writer.writeheader()
row_count = 0
for user in github_user_data:
try:
linkedin_profile = linkedin_data.get(user.name)
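                # Prefer LinkedIn fields when available; getattr tolerates profiles missing an attribute.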
row = {
'username': user.name,
                    'email': user.email or getattr(linkedin_profile, 'email', ''),
'name': linkedin_profile.name if linkedin_profile else user.name,
'location': linkedin_profile.location if linkedin_profile else (user.location or ''),
'github_followers': user.num_followers,
'linkedin_headline': getattr(linkedin_profile, 'headline', ''),
'current_position': getattr(linkedin_profile, 'curr_job', ''),
'linkedin_followers': getattr(linkedin_profile, 'num_followers', '')
}
writer.writerow(row)
row_count += 1
print(f"Writing row {row_count}: {row}")
except Exception as e:
print(f"Error writing CSV row for user: {user.name}: {str(e)}")
print(f"---\nTotal rows written to {csv_filename}: {row_count}")
    # Step 7: Add emails to Mailchimp list
    if use_mailchimp:
        print("Adding emails to Mailchimp list")
scraped_emails = [user.email for user in github_user_data if user.email]
tag_name = repo.name
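        # Tag subscribers with the repo name so each repository's contacts stay grouped.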
if scraped_emails:
print(f"Processing {len(scraped_emails)} emails")
mailchimp_adapter.process_emails(scraped_emails, f"tag_{tag_name}")
else:
print("No emails found to process")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape GitHub and LinkedIn data for repository stargazers.")
parser.add_argument("repo_url", help="URL of the GitHub repository to scrape")
parser.add_argument("-limit","--max-stargazers", type=int, help="Maximum number of stargazers to scrape")
parser.add_argument("-li","--with-linkedin", action="store_true", default=False,
help="Scrape LinkedIn profiles")
parser.add_argument("-aops","--with-agentops", action="store_true", default=False,
help="Use Agentops for tracking and reporting agents' actions")
parser.add_argument("-mem0", "--with-mem0", action="store_true", default=False,
help="Option to include memory usage for scraping")
parser.add_argument("-mailchimp", "--with-mailchimp", action="store_true", default=False,
help="Option to include adding scraped users to mailchimp list")
parser.add_argument("-kg", "--with-neo4j-kg", action="store_true", default=False,
help="Option to use Neo4j knowledge graph. Requires --with-mem0")
args = parser.parse_args()
if args.with_neo4j_kg and not args.with_mem0:
parser.error("--with-neo4j-kg requires --with-mem0 to be present")
    main(
        args.repo_url,
        max_stargazers=args.max_stargazers,
        scrape_linkedin=args.with_linkedin,
        use_agentops=args.with_agentops,
        use_mem0=args.with_mem0,
        use_neo4j_kg=args.with_neo4j_kg,
        use_mailchimp=args.with_mailchimp,
    )
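
# Example invocation (hypothetical repository URL):
#   python main.py https://github.com/example/repo --max-stargazers 50 --with-linkedin --with-mem0 --with-neo4j-kg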