-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
348 lines (278 loc) · 12.4 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import json, requests, os, re
from datetime import datetime
# Snapshot of "now", taken once at import time; reused for last_update stamps.
today = datetime.today()
current_year = today.year
current_month = today.month
current_day = today.day
# Project-local helpers (player pages, season progress, text utilities).
import player_profile
import info
import tools
import random
# global variables: basketball-reference.com URL templates, filled in with str.format
BASE_REG_STATS_URL = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"
BASE_ADV_STATS_URL = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"
BASE_TM_STAND_URL = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"
BASE_TM_URL = "https://www.basketball-reference.com/teams/{}/{}.html"
BASE_SEASON_URL = "https://www.basketball-reference.com/leagues/NBA_{}.html"
def scrape_player(url):
    """Fetch one player's profile; thin wrapper around player_profile."""
    profile = player_profile.get_player_profile(url)
    return profile
def scrape_stats(year):
    """Scrape per-game and advanced stats for a season and merge them.

    Players below a games/minutes threshold are dropped; the threshold is
    scaled by how far into the season we are (via info.season_percentage)
    so early-season data is not wiped out. Returns the cleaned
    {player_id: stats_dict} mapping.
    """
    reg_stats = scrape(BASE_REG_STATS_URL.format(year))
    adv_stats = scrape(BASE_ADV_STATS_URL.format(year))
    # Loop-invariant: compute the cut-off once, not per player.
    min_played = 10 * info.season_percentage()
    for player in list(reg_stats):  # list(): we delete entries while iterating
        # .get(): a player missing from the advanced table should not crash the merge
        reg_stats[player].update(adv_stats.get(player, {}))
        try:
            if float(reg_stats[player]["G"]) < min_played or float(reg_stats[player]["MP"]) < min_played:
                del reg_stats[player]
        except (KeyError, ValueError):
            # "MP" missing or non-numeric: fall back to games played only
            if float(reg_stats[player]["G"]) < min_played:
                del reg_stats[player]
    return clean(reg_stats)
def clean(stats):
    """Stamp every player entry with the time of this scrape and return it."""
    timestamp = today.strftime("%b %d %Y %H:%M:%S")
    for player_id in list(stats):
        stats[player_id]["last_update"] = timestamp
    return stats
def extract_info(text):
    """Parse a team-summary blurb into a dict of record/standing/coach/gm/odds.

    `text` is the whitespace-normalized text of a basketball-reference team
    summary header, e.g.
    "Record: 50-32, 1st in NBA Eastern Conference Coach: ... Executive: ...".
    Only the keys whose keywords appear in the text are present in the result.
    Renamed the local dict (was `info`) so it no longer shadows the imported
    `info` module.
    """
    result = {}
    words = text.split(" ")
    # enumerate from 1 so `nxt` indexes the word AFTER the current keyword
    for nxt, word in enumerate(words, 1):
        key = word.lower()
        if key == "record:":
            result["record"] = words[nxt].replace(",", "")
            result["standing"] = words[nxt + 1]
        if key == "nba":
            # "Eastern"/"Western" -> "East"/"West"
            result["standing"] += f" in {words[nxt].replace('ern', '')}"
        if key == "coach:":
            result["coach"] = " ".join(words[nxt:]).split("Executive")[0].strip()
        if key == "executive:":
            result["gm"] = " ".join(words[nxt:]).split("PTS/G")[0].strip()
        if key == "preseason":
            # Was `" ".join(words[p + 2])...replace(" ", "")`: joining the
            # characters of one word with spaces and then deleting them —
            # equivalent to just taking the word before the first comma.
            result["preseason_odds"] = words[nxt + 2].split(",")[0].strip()
    return result
def scrape_standings(year):
    """Scrape the expanded standings table for one season.

    Returns {team_abbr: row_dict} where row_dict maps the table's column
    headers to cell text, plus 'link' (team page href) and 'img' (logo URL).
    """
    soup = BeautifulSoup(urlopen(BASE_TM_STAND_URL.format(year)), "html.parser")
    # basketball-reference hides some tables inside HTML comments; unwrap them
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.replace_with(BeautifulSoup(comment, 'html.parser'))
    table = soup.find('table', {'id': 'expanded_standings'})
    # Column names live in the table's second header row.
    columns = [th.text.strip() for th in table.find_all('tr')[1].find_all('th')]
    columns.append('link')
    parsed_rows = []
    for tr in table.find('tbody').find_all('tr'):
        values = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        values.append(tr.find('td', {'data-stat': 'team_name'}).find('a')['href'])
        parsed_rows.append(dict(zip(columns, values)))
    logo_template = "https://cdn.ssref.net/req/202312151/tlogo/bbr/{}-{}.png"
    standings = {}
    for row in parsed_rows:
        # The href looks like /teams/BOS/2024.html -> abbreviation "BOS".
        abbr = row["link"].split("teams/")[1].split("/")[0]
        row["img"] = logo_template.format(abbr, year)
        standings[abbr] = row
    return standings
def scrape_team(team, year):
    """Scrape one team's season page into a JSON-ready dict.

    Returns {team: {'info', 'players', 'injuries', 'team_stats',
    'image_url', 'last_update'}}. Fixes: the bare `except:` around the
    injury table is narrowed to AttributeError (the table div is simply
    absent when nobody is injured), the unused `final_stats` local is
    removed, and the local `info` dict no longer shadows the imported
    `info` module.
    """
    url = BASE_TM_URL.format(team, year)
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    # Unwrap tables hidden inside HTML comments.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.replace_with(BeautifulSoup(comment, 'html.parser'))
    # Team summary blurb (record, coach, executive, odds).
    template_content = soup.find_all(attrs={"data-template": "Partials/Teams/Summary"})
    extracted_data = [element.get_text() for element in template_content]
    text = tools.replace_multiple_whitespaces_with_single(extracted_data[0]).strip()
    team_info = extract_info(text)
    # Roster: player names from the roster table's player column links.
    roster_table = soup.find('div', id='all_roster').find('table')
    player_td_tags = roster_table.find_all('td', {'data-stat': 'player'})
    player_names = [td.find('a').text for td in player_td_tags if td.find('a')]
    # Injury report; the div is absent when the team has no injuries.
    injury_reports = []
    try:
        injury_table = soup.find('div', id='div_injuries').find('table')
        for row in injury_table.find('tbody').find_all('tr'):
            injury_reports.append({
                'Player': row.find('th', {'data-stat': 'player'}).text,
                'Team': row.find('td', {'data-stat': 'team_name'}).text,
                'Update': row.find('td', {'data-stat': 'date_update'}).text,
                'Description': row.find('td', {'data-stat': 'note'}).text
            })
    except AttributeError:
        # div_injuries (or an expected cell) missing: report no injuries.
        injury_reports = []
    # Team & opponent per-game stats; a repeated row label is the opponent row.
    team_stats = {}
    for item in scrape_div(soup, 'div_team_and_opponent'):
        label = item['\xa0']
        name = "Opp " + label if label in team_stats else label
        team_stats[name] = item
        del team_stats[name]['\xa0']
    # Misc stats table labels its first column with an empty-string header.
    for item in scrape_div(soup, 'div_team_misc', overhead=True):
        label = item[""]
        name = "Misc " + label if label in team_stats else label
        team_stats[name] = item
        del team_stats[name][""]
    team_logo_img = soup.find('img', class_='teamlogo')
    team_image_url = team_logo_img['src'] if team_logo_img else "No image found"
    return {
        team: {
            'info': team_info,
            'players': player_names,
            'injuries': injury_reports,
            'team_stats': team_stats,
            'image_url': team_image_url,
            'last_update': today.strftime("%b %d %Y %H:%M:%S")
        }
    }
def scrape_div(soup, id, overhead=False):
    """Parse the stats table inside the div with the given id.

    Returns a list of {header: cell_text} dicts, one per body row.
    With overhead=True the table is assumed to carry an extra
    'over_header' decoration row: real headers come from the second
    header row, over_header body rows are skipped, and rows whose width
    does not match the header count are dropped.
    """
    parsed = []
    container = soup.find('div', id=id)
    if not container:
        return parsed
    table = container.find('table')
    if not table:
        return parsed
    if not overhead:
        column_names = [th.get_text() for th in table.find('thead').find_all('th')]
        for tr in table.find('tbody').find_all('tr'):
            # Data cells; the row label usually sits in a leading <th>.
            cells = [td.get_text() for td in tr.find_all('td', recursive=False)]
            label = tr.find('th', recursive=False)
            if label:
                cells.insert(0, label.get_text())
            if cells:  # skip rows with nothing to zip
                parsed.append(dict(zip(column_names, cells)))
        return parsed
    # overhead=True: headers live on the second <tr>.
    column_names = [th.get_text(strip=True) for th in table.find_all('tr')[1].find_all('th')]
    raw_rows = []
    for tr in table.find('tbody').find_all('tr'):
        # .get avoids AttributeError when the row has no class attribute.
        if 'over_header' not in tr.get('class', []):
            raw_rows.append([cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])])
    return [dict(zip(column_names, row)) for row in raw_rows if len(row) == len(column_names)]
def scrape(url):
    """Scrape a basketball-reference league stats table into {player_id: row}.

    Traded players appear first as an aggregate ("TOT"/multi-team) row and
    then once per team: the aggregate row becomes the record (with "Team"
    reset to a list) and each later row appends its team name to that list.
    Fix: the original used a bare `except:` around a KeyError as control
    flow for "player already seen"; a membership test is explicit and no
    longer swallows unrelated errors.
    """
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
    headers = headers[1:]  # drop the rank ("Rk") column; it is a <th> in body rows
    rows = soup.findAll("tr")[1:]
    stats = {}
    for row in rows:
        tds = row.findAll("td")
        if not tds:
            continue  # repeated header rows have no <td> cells
        name_cell = tds[0]
        link = name_cell.find('a')['href'] if name_cell.find('a') else None
        player_id = f"https://www.basketball-reference.com{link}".split("/")[-1].split(".html")[0]
        if "basketball-reference" in player_id:
            # No player link in this row (link is None): nothing to key on.
            continue
        if player_id in stats:
            # Repeat row for a traded player: record the extra team only.
            for header, td in zip(headers, tds):
                if header == "Team":
                    stats[player_id]["Team"].append(td.getText())
        else:
            entry = {
                "link": f"https://www.basketball-reference.com{link}",
                "id": link.split("/")[-1].split(".html")[0],
            }
            entry["img"] = f'https://www.basketball-reference.com/req/202106291/images/headshots/{entry["id"]}.jpg'
            for header, td in zip(headers, tds):
                if header == "MP" and "advanced" in url:
                    header = "TMP"  # keep advanced minutes from clobbering per-game MP on merge
                entry[header] = td.getText()
            if entry.get("Team") == "TOT":
                # Aggregate row for a traded player: collect teams from later rows.
                entry["Team"] = []
            stats[player_id] = entry
    return stats
def scrape_champion_mvp(year):
    """Return ((champion, href), (mvp, href), [(rookie, href), ...]) for a season.

    Any award not found on the season page stays at its default
    (None / None / empty list).
    """
    page = requests.get(BASE_SEASON_URL.format(year))
    soup = BeautifulSoup(page.text, 'html.parser')
    # Unwrap content hidden inside HTML comments.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.replace_with(BeautifulSoup(comment, 'html.parser'))
    champion, mvp, rookies = None, None, []
    for paragraph in soup.find_all('p'):
        body = paragraph.text
        if 'League Champion' in body:
            anchor = paragraph.find('a')
            champion = (anchor.text, anchor['href'])
        elif 'Most Valuable Player' in body:
            anchor = paragraph.find('a')
            mvp = (anchor.text, anchor['href'])
        elif 'Rookie of the Year' in body:
            # Co-winners are possible, so collect every link.
            rookies = [(a.text, a['href']) for a in paragraph.find_all('a')]
    return champion, mvp, rookies
def scrape_all_nba_teams(year):
    """Return {"1st Team": [(player, href), ...], "2nd Team": ..., "3rd Team": ...}.

    Teams missing from the page (e.g. before awards are announced) map to [].
    Fix: keys were built as f"{i}st Team", producing "2st Team" and
    "3st Team"; proper ordinals are used now.
    """
    url = BASE_SEASON_URL.format(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Unwrap content hidden inside HTML comments.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        uncommented_content = BeautifulSoup(comment, 'html.parser')
        comment.replace_with(uncommented_content)
    all_nba_teams = {}
    ordinals = {1: "1st", 2: "2nd", 3: "3rd"}
    for i in range(1, 4):
        key = f"{ordinals[i]} Team"
        team_div = soup.find('div', id=f"all-nba_{i}")
        if team_div is not None:
            all_nba_teams[key] = [(a.text, a['href']) for a in team_div.find_all('a')]
        else:
            all_nba_teams[key] = []
    return all_nba_teams
if __name__ == "__main__":
    # Quick manual smoke test of the season scraper.
    print(scrape_champion_mvp(2024))
    # print(scrape_all_nba_teams(2021))