-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_scraper.py
156 lines (110 loc) · 5.36 KB
/
data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from bs4 import BeautifulSoup
from operator import itemgetter
import requests, string, numpy
# Absolute path of the .npy file that save_raw_player_data() writes and
# get_player_names() reads back.
# NOTE(review): machine-specific location - confirm before running elsewhere.
raw_data_path = "/Users/dennisdeng2002/Documents/Programming/PycharmProjects/nba-comparison/raw_player_data.npy"
# Score assigned to a player whose position equals the requested one
# (see compare_position); later comparisons subtract from it.
starting_points = 100
# Points subtracted from starting_points when a player's position differs
# from the requested one.
position_value_weight = 1
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled connection cannot hang the scraper forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        "html.parser" backend. When the request fails, "Invalid url" is
        printed and an empty document is returned, so lookups made by the
        caller simply find nothing.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # The old bare ``except`` printed this message and then fell through
        # to ``r.text`` with ``r`` unbound, crashing with UnboundLocalError.
        print("Invalid url")
        return BeautifulSoup("", "html.parser")
    return BeautifulSoup(r.text, "html.parser")
def get_raw_player_data():
    """Scrape basketball-reference.com for the basic data of active players.

    Every letter index page is fetched, and for each bolded (active) player
    the player's own page is fetched as well to read position and shooting
    hand.

    Returns:
        A list of ``(name, position, height_in_inches, weight, shooting_hand)``
        tuples. Height is converted with ``convert_height_to_inches``; weight
        is kept as the text found in the table cell. Players whose data
        cannot be read completely are skipped.
    """
    player_data = []
    # Since b-ball reference organizes players by the first letter of last name
    # get all pages by iterating through all letters in alphabet
    for letter in string.ascii_lowercase:
        letter_page = get_soup('http://www.basketball-reference.com/players/%s/' % letter)
        # Active players are denoted in bold - html: strong
        active_players_names = letter_page.findAll('strong')
        # Example HTML
        # <tr class="">
        #   <td align="left" ><a href="/players/a/abdulka01.html">Kareem Abdul-Jabbar</a>*</td>
        #   <td align="right" >1970</td>
        #   <td align="right" >1989</td>
        #   <td align="center" >C</td>
        #   <td align="right" csk="86.0">7-2</td>
        #   <td align="right" >225</td>
        #   <td align="left" csk="19470416"><a href="/friv/birthdays.cgi?month=4&day=16">April 16, 1947</a></td>
        #   <td align="left" ><a href="/friv/colleges.cgi?college=ucla">University of California, Los Angeles</a></td>
        # </tr>
        for names in active_players_names:
            # An empty <strong> tag has no child to read; skip it instead of
            # letting StopIteration abort the whole scrape.
            name_data = next(names.children, None)
            if name_data is None:
                continue
            try:
                # The height cell is the fourth <td> after the name link.
                height_data = name_data.findNext('td').findNext('td').findNext('td').findNext('td')
                weight_data = height_data.findNext('td')
                # hrefs on the index page start with "/"; strip it so the
                # joined URL does not contain a double slash.
                url = 'http://www.basketball-reference.com/' + name_data.attrs['href'].lstrip('/')
                player_page = get_soup(url)
                # subtracting the last 3 characters removes the extra dot that accompanies the position
                position = player_page.find(text='Position:').parent.next_sibling[1:-3]
                shooting_hand = player_page.find(text='Shoots:').parent.next_sibling[1:]
                player_data.append((
                    name_data.contents[0],
                    position,
                    convert_height_to_inches(height_data.contents[0]),
                    weight_data.contents[0],
                    shooting_hand,
                ))
            except Exception:
                # Exceptions usually arise when data is missing from BballRef
                # (results in Nonetype); skip that player. Unlike the former
                # bare ``except``, this lets KeyboardInterrupt and SystemExit
                # stop a long-running scrape.
                continue
    return player_data
def save_raw_player_data():
    """Scrape the player data and store it at ``raw_data_path`` as a NumPy array."""
    scraped_rows = get_raw_player_data()
    numpy.save(raw_data_path, numpy.array(scraped_rows))
def convert_height_to_inches(height):
    """Convert a "feet-inches" string such as ``"7-2"`` to total inches.

    Args:
        height: Text made of two integers separated by a dash.

    Returns:
        The height in inches as an ``int`` (``"7-2"`` gives 86).
    """
    parts = height.split("-")
    whole_feet, extra_inches = int(parts[0]), int(parts[1])
    return 12 * whole_feet + extra_inches
def get_statistics_for_height(position, raw_player_data):
    """Return the mean and standard deviation of height for a position.

    Args:
        position: Text looked for inside each row's position field (index 1).
        raw_player_data: Iterable of player rows with the height at index 2.

    Returns:
        A two-item list ``[average, standard_deviation]`` computed over the
        heights of every row whose position field contains ``position``.
    """
    matching_heights = numpy.array(
        [int(row[2]) for row in raw_player_data if position in row[1]]
    )
    return [numpy.average(matching_heights), numpy.std(matching_heights)]
def get_statistics_for_weight(position, raw_player_data):
    """Return the mean and standard deviation of weight for a position.

    Args:
        position: Text looked for inside each row's position field (index 1).
        raw_player_data: Iterable of player rows with the weight at index 3.

    Returns:
        A two-item list ``[average, standard_deviation]`` computed over the
        weights of every row whose position field contains ``position``.
    """
    matching_weights = numpy.array(
        [int(row[3]) for row in raw_player_data if position in row[1]]
    )
    return [numpy.average(matching_weights), numpy.std(matching_weights)]
def get_player_names(my_data):
    """Rank the saved players by how closely they match the given person.

    Args:
        my_data: Sequence whose first three items are position, height and
            weight, in that order. Height is compared against player heights
            stored in inches.

    Returns:
        A list of ``[player_name, points]`` pairs sorted from the highest
        score to the lowest.
    """
    my_position, my_height, my_weight = my_data[0], my_data[1], my_data[2]
    # path must be specified for this to work in Flask
    raw_player_data = numpy.load(raw_data_path)
    comparison_data = compare_position(my_position, [], raw_player_data)
    comparison_data = compare_height(my_position, my_height, comparison_data, raw_player_data)
    # Weight must go through compare_weight; it was previously passed to
    # compare_height a second time, so weight was scored as if it were height.
    comparison_data = compare_weight(my_position, my_weight, comparison_data, raw_player_data)
    comparison_data.sort(key=itemgetter(1), reverse=True)
    return comparison_data
def compare_position(my_position, comparison_data, raw_player_data):
    """Append a ``[name, points]`` entry for every player to ``comparison_data``.

    A player whose position (index 1) equals ``my_position`` receives
    ``starting_points``; every other player receives ``starting_points``
    minus ``position_value_weight``.

    Args:
        my_position: Position to compare each player against.
        comparison_data: List that is extended in place.
        raw_player_data: Iterable of player rows with the name at index 0.

    Returns:
        The same ``comparison_data`` list, after the entries were added.
    """
    mismatch_points = starting_points - position_value_weight
    comparison_data.extend(
        [row[0], starting_points if my_position == row[1] else mismatch_points]
        for row in raw_player_data
    )
    return comparison_data
def compare_height(my_position, my_height, comparison_data, raw_player_data):
    """Subtract a height-difference penalty from every player's score.

    ``my_height`` is first shifted onto the players' scale: its offset from
    the local reference height is added to the average height for
    ``my_position``. Each player then loses the absolute difference between
    that value and their own height, divided by the standard deviation.

    Args:
        my_position: Position whose height statistics are used.
        my_height: Height of the person being compared, in inches.
        comparison_data: List of ``[name, points]`` entries, updated in place
            and index-aligned with ``raw_player_data``.
        raw_player_data: Player rows with the height at index 2.

    Returns:
        The same ``comparison_data`` list with updated points.
    """
    # arbitrarily define average PG height at local gym as 5'8
    local_height = 68
    mean_height, height_spread = get_statistics_for_height(my_position, raw_player_data)
    target_height = mean_height + (my_height - local_height)
    for index, entry in enumerate(comparison_data):
        entry[1] = entry[1] - numpy.abs(target_height - int(raw_player_data[index][2])) / height_spread
    return comparison_data
def compare_weight(my_position, my_weight, comparison_data, raw_player_data):
    """Subtract a weight-difference penalty from every player's score.

    ``my_weight`` is first shifted onto the players' scale: its offset from
    the local reference weight is added to the average weight for
    ``my_position``. Each player then loses the absolute difference between
    that value and their own weight, divided by the standard deviation.

    Args:
        my_position: Position whose weight statistics are used.
        my_weight: Weight of the person being compared, in the same unit as
            the stored player weights.
        comparison_data: List of ``[name, points]`` entries, updated in place
            and index-aligned with ``raw_player_data``.
        raw_player_data: Player rows with the weight at index 3.

    Returns:
        The same ``comparison_data`` list with updated points.
    """
    # Reference weight the caller's weight is measured against, mirroring
    # the local reference height used by compare_height.
    local_weight = 150
    average_weight = get_statistics_for_weight(my_position, raw_player_data)
    converted_weight = average_weight[0] + (my_weight - local_weight)
    for i, entry in enumerate(comparison_data):
        # Weight lives in column 3; column 2 (height) was read here before,
        # which compared the converted weight against player heights.
        player_weight = int(raw_player_data[i][3])
        entry[1] = entry[1] - numpy.abs(converted_weight - player_weight) / average_weight[1]
    return comparison_data
# Sample input in the order get_player_names() unpacks it:
# position, height, weight.
data = ["Point Guard", 62, 180]
# Module-level call: loads the saved player array and prints the ranked
# [name, points] list for the sample input whenever this file is executed.
print(get_player_names(data))