-
Notifications
You must be signed in to change notification settings - Fork 80
/
create_clean_table.py
141 lines (125 loc) · 5.76 KB
/
create_clean_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Create clean scraped table
# encoding : 'utf-8'
# python 3.6
import numpy as np
import pandas as pd
import os
print(os.getcwd())
def create_clean_table_two_ways(df):
# (a) Count the number of bookmakers
nbooks = df['Bookmaker'].nunique()
# (b) Assign a number to each game
L = [0 for i in range(df['Bookmaker'].size)]
for i in range(1,df['Bookmaker'].size):
if (df['Date'][i] != df['Date'][i-1]) | (df['Home_id'][i] != df['Home_id'][i-1]) | (df['Away_id'][i] != df['Away_id'][i-1]):
L[i] = 1
df['MatchId'] = np.cumsum(L) + 1
# (c) Create final dataframe containing only one line per match
df_final = pd.DataFrame(index=range(max(df['MatchId'])), columns=range(7)) # Home_id, Away_id, Date, Score_home, Score_away, Season, MatchId
df_final.columns = ['MatchId','Season','Home_id', 'Away_id', 'Date', 'Score_home', 'Score_away']
c = 0
for book in df['Bookmaker'].unique():
print(book)
df_final['{}_H'.format(book)] = 'Na' # Home victory odds
df_final['{}_A'.format(book)] = 'Na' # Away victory odds
for id in range(1, max(df['MatchId']) + 1):
new_df = df[(df['Bookmaker'] == book) & (df['MatchId'] == id)]
if new_df.shape[0] > 0:
#print(id)
df_final['{}_H'.format(book)].iloc[id-1] = new_df['OddHome'].iloc[0]
df_final['{}_A'.format(book)].iloc[id-1] = new_df['OddAway'].iloc[0]
df_final['MatchId'].iloc[id-1] = new_df['MatchId'].iloc[0]
df_final['Season'].iloc[id-1] = new_df['Season'].iloc[0]
df_final['Home_id'].iloc[id-1] = new_df['Home_id'].iloc[0]
df_final['Away_id'].iloc[id-1] = new_df['Away_id'].iloc[0]
df_final['Date'].iloc[id-1] = new_df['Date'].iloc[0]
df_final['Score_home'].iloc[id-1] = new_df['Score_home'].iloc[0]
df_final['Score_away'].iloc[id-1] = new_df['Score_away'].iloc[0]
c+=1
try :
days = df_final['Date'].str[:2]
months = df_final['Date'].str[3:6]
years = df_final['Date'].str[7:]
months[months == 'Jan'] = '01'
months[months == 'Feb'] = '02'
months[months == 'Mar'] = '03'
months[months == 'Apr'] = '04'
months[months == 'May'] = '05'
months[months == 'Jun'] = '06'
months[months == 'Jul'] = '07'
months[months == 'Aug'] = '08'
months[months == 'Sep'] = '09'
months[months == 'Oct'] = '10'
months[months == 'Nov'] = '11'
months[months == 'Dec'] = '12'
date = days + '/' + months + '/' + years
df_final['Date'] = pd.to_datetime(date, format='%d/%m/%Y')
df.sort_values(by=['Date'])
except:
print('Cannot convert Date into regular Date format')
return(df_final)
def create_clean_table_three_ways(df):
# (a) Count the number of bookmakers
nbooks = df['Bookmaker'].nunique()
# (b) Assign a number to each game
L = [0 for i in range(df['Bookmaker'].size)]
for i in range(1,df['Bookmaker'].size):
if (df['Date'][i] != df['Date'][i-1]) | (df['Home_id'][i] != df['Home_id'][i-1]) | (df['Away_id'][i] != df['Away_id'][i-1]):
L[i] = 1
df['MatchId'] = np.cumsum(L) + 1
# (c) Create final dataframe containing only one line per match
df_final = pd.DataFrame(index=range(max(df['MatchId'])), columns=range(7)) # Home_id, Away_id, Date, Score_home, Score_away, Season, MatchId
df_final.columns = ['MatchId','Season','Home_id', 'Away_id', 'Date', 'Score_home', 'Score_away']
c = 0
for book in df['Bookmaker'].unique():
print(book)
df_final['{}_H'.format(book)] = 'Na' # Home victory odds
df_final['{}_D'.format(book)] = 'Na' # Draw odds
df_final['{}_A'.format(book)] = 'Na' # Away victory odds
for id in range(1, max(df['MatchId']) + 1):
new_df = df[(df['Bookmaker'] == book) & (df['MatchId'] == id)]
if new_df.shape[0] > 0:
#print(id)
df_final['{}_H'.format(book)].iloc[id-1] = new_df['OddHome'].iloc[0]
df_final['{}_D'.format(book)].iloc[id-1] = new_df['OddDraw'].iloc[0]
df_final['{}_A'.format(book)].iloc[id-1] = new_df['OddAway'].iloc[0]
df_final['MatchId'].iloc[id-1] = new_df['MatchId'].iloc[0]
df_final['Season'].iloc[id-1] = new_df['Season'].iloc[0]
df_final['Home_id'].iloc[id-1] = new_df['Home_id'].iloc[0]
df_final['Away_id'].iloc[id-1] = new_df['Away_id'].iloc[0]
df_final['Date'].iloc[id-1] = new_df['Date'].iloc[0]
df_final['Score_home'].iloc[id-1] = new_df['Score_home'].iloc[0]
df_final['Score_away'].iloc[id-1] = new_df['Score_away'].iloc[0]
c+=1
try :
days = df_final['Date'].str[:2]
months = df_final['Date'].str[3:6]
years = df_final['Date'].str[7:]
months[months == 'Jan'] = '01'
months[months == 'Feb'] = '02'
months[months == 'Mar'] = '03'
months[months == 'Apr'] = '04'
months[months == 'May'] = '05'
months[months == 'Jun'] = '06'
months[months == 'Jul'] = '07'
months[months == 'Aug'] = '08'
months[months == 'Sep'] = '09'
months[months == 'Oct'] = '10'
months[months == 'Nov'] = '11'
months[months == 'Dec'] = '12'
date = days + '/' + months + '/' + years
df_final['Date'] = pd.to_datetime(date, format='%d/%m/%Y')
df.sort_values(by=['Date'])
except:
print('Cannot convert Date into regular Date format')
return(df_final)
def create_clean_df(fileloc, sep = ";", ways = 2):
df = pd.read_csv(fileloc + '.csv', sep = sep)
if ways == 2:
df = create_clean_table_two_ways(df)
if ways == 3:
df = create_clean_table_three_ways(df)
df.to_csv(fileloc +'_CLEAN' + '.csv', index = False)
print('Finished cleaning table!')
return(0)
#fileloc = "C:\\Users\\Sébastien CARARO\\Desktop\\ATP& &Others\\WebScraping\\DATA\\Treated data\\All_data_champions-league_foot";create_clean_df(fileloc, sep = ",", ways = 3)