smayanscrapencaa.py
import requests
from bs4 import BeautifulSoup
import mysql.connector
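# Scrape the NCAA.com recap articles for the 2013-2017 men's tournaments, count how many
# result lines each team appears in as the team listed first (treated as the winner), and
# write one (team, wins) row per school per year into the NCAAB1317_smayan_ranjan MySQL table.
# The SchoolsXX lists below enumerate each year's tournament field and are used to insert
# 0-win rows for teams that never appear as winners.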
Schools13 = [
"Louisville", "North Carolina A&T", "Duke", "Albany", "Michigan State", "Valparaiso",
"Saint Louis", "New Mexico State", "Oregon", "Oklahoma State", "Memphis", "Saint Mary's",
"Creighton", "Cincinnati", "Colorado State", "Missouri", "Gonzaga", "Southern", "Ohio State",
"Iona", "Harvard", "New Mexico", "La Salle", "Kansas State", "Ole Miss", "Wisconsin", "Arizona",
"Belmont", "Iowa State", "Notre Dame", "Wichita State", "Pittsburgh", "Indiana", "James Madison",
"Miami (Fla.)", "Pacific", "Marquette", "Davidson", "Syracuse", "Montana", "California", "UNLV",
"Butler", "Bucknell", "Illinois", "Colorado", "Temple", "NC State", "Kansas", "Western Kentucky",
"Florida Gulf Coast", "Georgetown", "Florida", "Northwestern State", "Michigan", "South Dakota State",
"VCU", "Akron", "Minnesota", "UCLA", "San Diego State", "Oklahoma", "North Carolina", "Villanova"
]
team_dict13 = {team: 0 for team in Schools13}
Schools14 = [
"Wichita State", "Cal Poly", "Michigan", "Wofford", "Mercer", "Duke", "Louisville", "Manhattan",
"Saint Louis", "NC State", "Tennessee", "UMass", "Texas", "Arizona State", "Kentucky", "Kansas State",
"Arizona", "Weber State", "Wisconsin", "American", "Creighton", "Louisiana", "San Diego State",
"New Mexico State", "North Dakota State", "Oklahoma", "Baylor", "Nebraska", "Oregon", "BYU",
"Gonzaga", "Oklahoma State", "Virginia", "Coastal Carolina", "Villanova", "Milwaukee", "Iowa State",
"North Carolina Central", "Michigan State", "Delaware", "Harvard", "Cincinnati", "North Carolina",
"Providence", "UConn", "Saint Joseph's", "Memphis", "George Washington", "Florida", "Albany",
"Kansas", "Eastern Kentucky", "Syracuse", "Western Michigan", "UCLA", "Tulsa", "Stephen F. Austin",
"VCU", "Dayton", "Ohio State", "Stanford", "New Mexico", "Pittsburgh", "Colorado"
]
team_dict14 = {team: 0 for team in Schools14}
Schools15 = [
"Kentucky", "Hampton", "Kansas", "New Mexico State", "Notre Dame", "Northeastern", "Maryland", "Valparaiso",
"West Virginia", "Buffalo", "Butler", "Texas", "Wichita State", "Indiana", "Cincinnati", "Purdue", "Wisconsin",
"Coastal Carolina", "Arizona", "Texas Southern", "Georgia State", "Baylor", "North Carolina", "Harvard", "Arkansas",
"Wofford", "Xavier", "Ole Miss", "Ohio State", "VCU", "Oregon", "Oklahoma State", "Villanova", "Lafayette", "Virginia",
"Belmont", "Oklahoma", "Albany", "Louisville", "UC Irvine", "Northern Iowa", "Wyoming", "Dayton", "Providence",
"Michigan State", "Georgia", "NC State", "LSU", "Duke", "Robert Morris", "Gonzaga", "North Dakota State", "UAB",
"Iowa State", "Georgetown", "Eastern Washington", "Utah", "Stephen F. Austin", "UCLA", "SMU", "Iowa", "Davidson",
"San Diego State", "St. John's"
]
team_dict15 = {team: 0 for team in Schools15}
Schools16 = [
"North Carolina", "Florida Gulf Coast", "Xavier", "Weber State", "Stephen F. Austin", "West Virginia",
"Kentucky", "Stony Brook", "Indiana", "Chattanooga", "Notre Dame", "Michigan",
"Wisconsin", "Pittsburgh", "Providence", "Southern California", "Oregon", "Holy Cross",
"Oklahoma", "Cal State Bakersfield", "Texas A&M", "Green Bay", "Duke", "UNC Wilmington",
"Yale", "Baylor", "Northern Iowa", "Texas", "VCU", "Oregon State",
"Saint Joseph's", "Cincinnati", "Kansas", "Austin Peay", "Villanova", "UNC Asheville",
"Miami (Fla.)", "Buffalo", "Hawai'i", "California", "Maryland", "South Dakota State",
"Wichita State", "Arizona", "Iowa", "Temple", "UConn", "Colorado",
"Virginia", "Hampton", "Middle Tennessee", "Michigan State", "Utah", "Fresno State",
"Iowa State", "Iona", "Little Rock", "Purdue", "Gonzaga", "Seton Hall",
"Syracuse", "Dayton", "Butler", "Texas Tech"
]
team_dict16 = {team: 0 for team in Schools16}
Schools17 = [
"Villanova", "Mount St. Mary's", "Duke", "Troy", "Baylor", "New Mexico State", "Florida", "East Tennessee State",
"Virginia", "UNC Wilmington", "Southern California", "SMU", "South Carolina", "Marquette", "Wisconsin", "Virginia Tech",
"Gonzaga", "South Dakota State", "Arizona", "North Dakota", "Florida State", "Florida Gulf Coast", "West Virginia", "Bucknell",
"Notre Dame", "Princeton", "Xavier", "Maryland", "Saint Mary's", "VCU", "Northwestern", "Vanderbilt",
"North Carolina", "Texas Southern", "Kentucky", "Northern Kentucky", "UCLA", "Kent State", "Butler", "Winthrop",
"Middle Tennessee", "Minnesota", "Cincinnati", "Kansas State", "Wichita State", "Dayton", "Arkansas", "Seton Hall",
"Kansas", "UC Davis", "Louisville", "Jacksonville State", "Oregon", "Iona", "Purdue", "Vermont",
"Iowa State", "Nevada", "Rhode Island", "Creighton", "Michigan", "Oklahoma State", "Michigan State", "Miami (Fla.)"
]
team_dict17 = {team: 0 for team in Schools17}
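# Note: the team_dict13-team_dict17 dictionaries above are initialized but never read below;
# the per-year win counts are tallied in team_frequency instead.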
cnx = mysql.connector.connect(user = "wsa", host = "34.68.250.121", database = "Tutorials-Winter2024", password = "LeBron>MJ!")
cursor = cnx.cursor(buffered = True)
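# All inserts below go into the NCAAB1317_smayan_ranjan table through this one buffered cursor.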
# --- 2017 tournament ---
url = requests.get('https://www.ncaa.com/news/basketball-men/article/2020-05-06/2017-ncaa-tournament-bracket-scores-stats-records')
soup = BeautifulSoup(url.text, 'html.parser')

# Keep only the <li> elements whose text starts with "No." -- these are the game result lines
def starts_with_no(text):
    if text is None:
        return False
    return text.startswith("No.")

li_tags = soup.find_all("li")
litags = [li_tag for li_tag in li_tags if starts_with_no(li_tag.get_text(separator="|", strip=True))]

gameswithupsets = []
for li_tag in litags:
    gameswithupsets.append(li_tag.get_text(separator="|", strip=True))

# Keep only the first "||"-separated segment of each result line
games = []
for game in gameswithupsets:
    games.append(game.split("||")[0].strip())

# The team listed first in each result is treated as the winner
team_names = []
for game in games:
    # Split the result on commas; the winner and its score are in the first part
    parts = game.split(',')
    # Grab everything after "No. " (seed, team name, score)
    first_team_with_score = parts[0].split('No. ')[1].strip()
    # Drop the trailing score token, leaving "<seed> <team name>"
    first_team = ' '.join(first_team_with_score.split()[:-1])
    team_names.append(first_team)
# Drop the leading seed number, leaving just the team name
names = [team.split(' ', 1)[1] for team in team_names]

# Count how many results each team won
team_frequency = {}
for team in names:
    if team in team_frequency:
        team_frequency[team] += 1
    else:
        team_frequency[team] = 1

# Insert one row per team with at least one win...
year = '2017 '
for team, frequency in team_frequency.items():
    yearteam = year + str(team)
    inserts = [yearteam, frequency]
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    cursor.execute(statement, inserts)
    cnx.commit()
# ...and a 0-win row for every other school in the 2017 field
for school in Schools17:
    if school not in team_frequency:
        yearteam = year + str(school)
        inserts = [yearteam, 0]
        statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
        cursor.execute(statement, inserts)
        cnx.commit()
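# The same scrape-and-tally sequence repeats below for the 2016, 2015, 2014, and 2013 recap
# articles; only the article URL, the year prefix, and the Schools list change (plus one manual
# correction in the 2016 block). A factored-out sketch appears at the end of the file.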
# --- 2016 tournament ---
url = requests.get('https://www.ncaa.com/news/basketball-men/article/2020-05-07/2016-ncaa-tournament-bracket-scores-stats-records')
soup = BeautifulSoup(url.text, 'html.parser')

# Same filtering and parsing as the 2017 block, reusing starts_with_no defined above
li_tags = soup.find_all("li")
litags = [li_tag for li_tag in li_tags if starts_with_no(li_tag.get_text(separator="|", strip=True))]

gameswithupsets = []
for li_tag in litags:
    gameswithupsets.append(li_tag.get_text(separator="|", strip=True))

games = []
for game in gameswithupsets:
    games.append(game.split("||")[0].strip())

team_names = []
for game in games:
    parts = game.split(',')
    first_team_with_score = parts[0].split('No. ')[1].strip()
    first_team = ' '.join(first_team_with_score.split()[:-1])
    team_names.append(first_team)
names = [team.split(' ', 1)[1] for team in team_names]

team_frequency = {}
for team in names:
    if team in team_frequency:
        team_frequency[team] += 1
    else:
        team_frequency[team] = 1

year = '2016 '
# Manual correction: one 2016 result line apparently doesn't follow the "No. <seed> <team> <score>"
# pattern the parsing assumes, leaving a stray 'Notre' key, so Notre Dame's win total is set by
# hand and the bad key is dropped.
team_frequency['Notre Dame'] = 3
del team_frequency['Notre']

for team, frequency in team_frequency.items():
    yearteam = year + str(team)
    inserts = [yearteam, frequency]
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    cursor.execute(statement, inserts)
    cnx.commit()
for school in Schools16:
    if school not in team_frequency:
        yearteam = year + str(school)
        inserts = [yearteam, 0]
        statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
        cursor.execute(statement, inserts)
        cnx.commit()
# --- 2015 tournament ---
url = requests.get('https://www.ncaa.com/news/basketball-men/article/2020-05-08/2015-ncaa-tournament-bracket-scores-stats-records')
soup = BeautifulSoup(url.text, 'html.parser')

# Same filtering and parsing as the 2017 block, reusing starts_with_no defined above
li_tags = soup.find_all("li")
litags = [li_tag for li_tag in li_tags if starts_with_no(li_tag.get_text(separator="|", strip=True))]

gameswithupsets = []
for li_tag in litags:
    gameswithupsets.append(li_tag.get_text(separator="|", strip=True))

games = []
for game in gameswithupsets:
    games.append(game.split("||")[0].strip())

team_names = []
for game in games:
    parts = game.split(',')
    first_team_with_score = parts[0].split('No. ')[1].strip()
    first_team = ' '.join(first_team_with_score.split()[:-1])
    team_names.append(first_team)
names = [team.split(' ', 1)[1] for team in team_names]

team_frequency = {}
for team in names:
    if team in team_frequency:
        team_frequency[team] += 1
    else:
        team_frequency[team] = 1

year = '2015 '
for team, frequency in team_frequency.items():
    yearteam = year + str(team)
    inserts = [yearteam, frequency]
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    cursor.execute(statement, inserts)
    cnx.commit()
for school in Schools15:
    if school not in team_frequency:
        yearteam = year + str(school)
        inserts = [yearteam, 0]
        statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
        cursor.execute(statement, inserts)
        cnx.commit()
# --- 2014 tournament ---
url = requests.get('https://www.ncaa.com/news/basketball-men/article/2020-05-10/2014-ncaa-tournament-bracket-scores-stats-records')
soup = BeautifulSoup(url.text, 'html.parser')

# Same filtering and parsing as the 2017 block, reusing starts_with_no defined above
li_tags = soup.find_all("li")
litags = [li_tag for li_tag in li_tags if starts_with_no(li_tag.get_text(separator="|", strip=True))]

gameswithupsets = []
for li_tag in litags:
    gameswithupsets.append(li_tag.get_text(separator="|", strip=True))

games = []
for game in gameswithupsets:
    games.append(game.split("||")[0].strip())

team_names = []
for game in games:
    parts = game.split(',')
    first_team_with_score = parts[0].split('No. ')[1].strip()
    first_team = ' '.join(first_team_with_score.split()[:-1])
    team_names.append(first_team)
names = [team.split(' ', 1)[1] for team in team_names]

team_frequency = {}
for team in names:
    if team in team_frequency:
        team_frequency[team] += 1
    else:
        team_frequency[team] = 1

year = '2014 '
for team, frequency in team_frequency.items():
    yearteam = year + str(team)
    inserts = [yearteam, frequency]
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    cursor.execute(statement, inserts)
    cnx.commit()
for school in Schools14:
    if school not in team_frequency:
        yearteam = year + str(school)
        inserts = [yearteam, 0]
        statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
        cursor.execute(statement, inserts)
        cnx.commit()
# --- 2013 tournament ---
url = requests.get('https://www.ncaa.com/news/basketball-men/article/2020-05-11/2013-ncaa-tournament-bracket-scores-stats-records')
soup = BeautifulSoup(url.text, 'html.parser')

# Same filtering and parsing as the 2017 block, reusing starts_with_no defined above
li_tags = soup.find_all("li")
litags = [li_tag for li_tag in li_tags if starts_with_no(li_tag.get_text(separator="|", strip=True))]

gameswithupsets = []
for li_tag in litags:
    gameswithupsets.append(li_tag.get_text(separator="|", strip=True))

games = []
for game in gameswithupsets:
    games.append(game.split("||")[0].strip())

team_names = []
for game in games:
    parts = game.split(',')
    first_team_with_score = parts[0].split('No. ')[1].strip()
    first_team = ' '.join(first_team_with_score.split()[:-1])
    team_names.append(first_team)
names = [team.split(' ', 1)[1] for team in team_names]

team_frequency = {}
for team in names:
    if team in team_frequency:
        team_frequency[team] += 1
    else:
        team_frequency[team] = 1

year = '2013 '
for team, frequency in team_frequency.items():
    yearteam = year + str(team)
    inserts = [yearteam, frequency]
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    cursor.execute(statement, inserts)
    cnx.commit()
for school in Schools13:
    if school not in team_frequency:
        yearteam = year + str(school)
        inserts = [yearteam, 0]
        statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
        cursor.execute(statement, inserts)
        cnx.commit()
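
# The five blocks above differ only in the article URL, the year prefix, and the Schools list.
# Below is a minimal sketch of how that repeated logic could be factored into one helper. It is
# not called anywhere; the function name and parameter names are illustrative additions, not part
# of the original script, and the 2016 Notre Dame correction would still need to be applied by hand.
def tally_and_insert(article_url, year_prefix, field_schools):
    """Scrape one recap article, count wins per team, and insert one row per school/year."""
    page = BeautifulSoup(requests.get(article_url).text, 'html.parser')
    results = [li.get_text(separator="|", strip=True) for li in page.find_all("li")
               if li.get_text(separator="|", strip=True).startswith("No.")]
    wins = {}
    for result in results:
        # e.g. "No. 1 Team A 76, No. 16 Team B 56||..." -> "1 Team A 76" -> "Team A"
        winner_part = result.split("||")[0].split(',')[0].split('No. ')[1].strip()
        team = ' '.join(winner_part.split()[:-1]).split(' ', 1)[1]
        wins[team] = wins.get(team, 0) + 1
    statement = "INSERT INTO NCAAB1317_smayan_ranjan (team, wins) VALUES (%s, %s)"
    # Mirror the original inserts: every parsed winner, then 0-win rows for the rest of the
    # field, committed once at the end instead of after every execute
    for team, count in wins.items():
        cursor.execute(statement, [year_prefix + team, count])
    for school in field_schools:
        if school not in wins:
            cursor.execute(statement, [year_prefix + school, 0])
    cnx.commit()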