scrapingother.py
import requests
from bs4 import BeautifulSoup
import mysql.connector

# Fetch the landing page that links out to each of TeamRankings' "other" power-rating tables.
url = requests.get("https://www.teamrankings.com/ncaa-basketball/ranking/predictive-by-other")
soup = BeautifulSoup(url.text, 'html.parser')

# The <main> element inside the main wrapper holds the links to the individual ratings tables.
table = soup.find('div', attrs={'class': 'main-wrapper clearfix has-left-sidebar'}).find('main')
links = table.find_all('a')

cnx = mysql.connector.connect(user="wsa", host="34.68.250.121", database="Tutorials-Winter2024", password="LeBron>MJ!")
cursor = cnx.cursor(buffered=True)

# Ranking snapshot dates, one per tournament year (2020 omitted: no tournament that year).
dates = ['2007-03-11', '2008-03-16', '2009-03-15',
         '2010-03-14', '2011-03-13', '2012-03-11', '2013-03-17', '2014-03-16', '2015-03-15',
         '2016-03-13', '2017-03-12', '2018-03-11', '2019-03-17', '2021-03-14', '2022-03-13',
         '2023-03-12', '2024-03-17']

# Schools to keep, spelled the way TeamRankings abbreviates them.
Schools = ["Connecticut", "Houston", "Purdue", "N Carolina", "Tennessee", "Arizona", "Marquette",
           "Iowa St", "Baylor", "Creighton", "Kentucky", "Illinois", "Duke", "Kansas", "Auburn",
           "Alabama", "BYU", "San Diego St", "Wisconsin", "St Marys", "Gonzaga", "Clemson",
           "Texas Tech", "S Carolina", "Florida", "Wash State", "Texas", "Dayton", "Nebraska",
           "Utah State", "Florida Atlantic", "Mississippi State", "Michigan State", "Texas A&M", "TCU",
           "Northwestern", "Nevada", "Boise St", "Colorado", "Drake", "Virginia", "New Mexico",
           "Oregon", "Colorado St", "NC State", "Duquesne", "Grd Canyon", "James Mad",
           "McNeese St", "UAB", "Vermont", "Yale", "Samford", "Col Charlestn", "Oakland", "Akron",
           "Morehead St", "Colgate", "Lg Beach St", "W Kentucky", "S Dakota St",
           "St Peters", "Longwood", "Stetson", "Montana St", "Grambling St", "Howard", "Wagner"]

# Only these ratings tables are scraped; links to anything else are skipped.
stats = ["Predictive Power Rating", "Strength of Schedule Power Rating", "Last 5 Games Power Rating",
         "Last 10 Games Power Rating", "Luck Power Rating", "Consistency Power Rating", "Vs. Teams 1-25 Power Rating",
         "Vs. Teams 26-50 Power Rating", "Vs. Teams 51-100 Power Rating"]
for date in dates:
    for link in links:
        linkstring = link.get('href')
        if linkstring == '#':
            # Section headers link to '#'; note the category name and move on.
            category = link.get_text(strip=True)
            continue
        tablename = link.get_text(strip=True)
        if tablename not in stats:
            continue
        year = date[:4]
        urlstring = f"https://www.teamrankings.com{linkstring}?date={date}"
        url = requests.get(urlstring)
        soup = BeautifulSoup(url.text, 'html.parser')
        table = soup.find('table')
        tableRows = table.find_all('tr')
        for row in tableRows:
            columns = row.find_all("td")
            if len(columns) > 0:
                try:
                    team = columns[1].find('a').text
                except AttributeError:
                    # No 'a' tag in the team cell, so skip this row.
                    continue
                if team not in Schools:
                    continue
                rating = str(columns[2].text)
                inserts = [team, year, tablename, rating]
                statement = "INSERT INTO NCAAB_otherteamstats_0724_nathan_dsouza (team, year, tablename, value) VALUES (%s, %s, %s, %s)"
                cursor.execute(statement, inserts)
    # Commit once per snapshot date and print the date as a progress marker.
    cnx.commit()
    print(date)
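
The INSERT above assumes the destination table NCAAB_otherteamstats_0724_nathan_dsouza already exists. Its table and column names (team, year, tablename, value) come from the query itself; the column types below are guesses, so treat this as a minimal sketch of a compatible schema, not the actual table definition used by the repository.

import mysql.connector

# Minimal sketch of a compatible destination table. Only the table and column
# names are taken from the script above; the VARCHAR types are assumptions.
cnx = mysql.connector.connect(user="wsa", host="34.68.250.121",
                              database="Tutorials-Winter2024", password="LeBron>MJ!")
cursor = cnx.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS NCAAB_otherteamstats_0724_nathan_dsouza (
        team VARCHAR(64),
        year VARCHAR(4),
        tablename VARCHAR(64),
        value VARCHAR(32)
    )
""")
cnx.commit()
cnx.close()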