Skip to content

Commit

Permalink
Added data testing framework, improved estimates for promoted teams, …
Browse files Browse the repository at this point in the history
…improved formatting for expected points (#17)

* #13 Added non-cumulative future expected points columns to player_gw_next_eps_ext dataset.

* #15, Fixes #16: Added data testing framework. Fixed goals stats estimation issue.

* #15 Trying to fix component test.

* Fixes #16: Fixed comp test.

* Removed hard coded component test mode setting.
  • Loading branch information
177arc authored Jan 3, 2021
1 parent d623e1f commit 4c156bf
Show file tree
Hide file tree
Showing 46 changed files with 3,646 additions and 734 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/ci_cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ jobs:
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run tests
- name: Run component tests
run: |
python -m unittest discover -s "tests/comp"
- name: Run integration tests
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
/.vscode/
/layer/python/
/layer/layer.zip
/*.zip
/*.log
/tests/comp/actual_*.csv
6 changes: 3 additions & 3 deletions data/team_goals_stats_estimates.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Team Code,Team Short Name,Total Team Goals Scored Home~,Total Team Goals Scored Away~,Total Team Goals Conceded Home~,Total Team Goals Conceded Away~,Total Team Clean Sheets Home~,Total Team Clean Sheets Away~,Team Fixture Count Home~,Team Fixture Count Away~
2,LEE,23,14,17,24,7,5,19,19
35,WBA,30,19,30,30,3,4,19,19
54,FUL,30,19,30,30,3,4,19,19
2,LEE,11.5,7,8.5,12,3.5,2.5,9.5,9.5
35,WBA,15,9.5,15,15,1.5,2,9.5,9.5
54,FUL,15,9.5,15,15,1.5,2,9.5,9.5
49 changes: 26 additions & 23 deletions fpldata/backtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,36 @@
DF = pd.DataFrame
S = pd.Series


# Adds a 'GWs Ago' column that ranks rows by recency.
#
# The frame is sorted by 'Season' and then 'Game Week', both descending, so the
# most recent row comes first. 'GWs Ago' is the running count of rows whose
# 'Expected Points' is not null: the newest non-null row gets 1, the next gets 2,
# and so on. Rows with a null 'Expected Points' do not advance the counter, so
# they carry the count reached so far (0 if no non-null row precedes them).
#
# The result stays in descending order; callers re-sort if they need otherwise.
def add_gws_ago(players_gw_team: DF) -> DF:
return (players_gw_team
.sort_values(['Season', 'Game Week'], ascending=False)
.assign(**{'GWs Ago': lambda df: (~df['Expected Points'].isnull()).cumsum()}))


# Builds a per-game-week back-test of the expected points estimates.
#
# Steps performed by the pipeline:
#   1. Keep rows with 'Fixture Minutes Played' > 0 and 'Fixtures Played To Fixture' > 4.
#      For ctx.current_season only game weeks before ctx.next_gw are kept; rows of
#      any other season are kept regardless of game week (`&` binds tighter than `|`).
#   2. Per row, compute the absolute and squared difference between
#      'Fixture Total Points' and each of 'Expected Points' / 'Expected Points Simple'.
#   3. Group by ('Season', 'Game Week'): sum the errors and the points columns and
#      count 'Player Code' (renamed to 'Player Count' further down).
#   4. Number the game weeks by recency with add_gws_ago and keep those with
#      'GWs Ago' <= ctx.fixtures_look_back, then sort ascending again.
#   5. Divide the sums by 'Player Count' to get per-player averages:
#      'Error' / 'Error Simple' are mean absolute errors, 'Sq Error' / 'Sq Error Simple'
#      are mean squared errors.
#   6. Add a 'Season Game Week' label of the form '<Season>, GW <n>'.
#
# NOTE(review): in this view the pipeline steps appear twice after the opening
# `return (` line, first with compact operator spacing and then with spaces around
# the operators; both runs apply the same operations in the same order.
def get_gw_points_backtest(players_gw_team_eps: DF, ctx: Context) -> DF:
return (players_gw_team_eps
.reset_index()
[lambda df: (df['Fixture Minutes Played'] > 0) & (df['Fixtures Played To Fixture'] > 4) & # (df['Total Points To Fixture'] > 48) &
((df['Season'] == ctx.current_season) & (df['Game Week'] < ctx.next_gw) | (df['Season'] != ctx.current_season))]
.assign(**{'Player Fixture Error': lambda df: np.abs(df['Expected Points']-df['Fixture Total Points'])})
.assign(**{'Player Fixture Error Simple': lambda df: np.abs(df['Expected Points Simple']-df['Fixture Total Points'])})
.assign(**{'Player Fixture Sq Error': lambda df: df['Player Fixture Error']**2})
.assign(**{'Player Fixture Sq Error Simple': lambda df: df['Player Fixture Error Simple']**2})
.groupby(['Season', 'Game Week'])
[['Player Fixture Error', 'Player Fixture Sq Error', 'Expected Points', 'Player Fixture Error Simple', 'Player Fixture Sq Error Simple', 'Expected Points Simple', 'Fixture Total Points', 'Player Code']]
.agg({'Player Fixture Error': 'sum', 'Player Fixture Sq Error': 'sum', 'Expected Points': 'sum', 'Player Fixture Error Simple': 'sum', 'Player Fixture Sq Error Simple': 'sum', 'Expected Points Simple': 'sum', 'Fixture Total Points': 'sum', 'Player Code': 'count'})
.reset_index()
.pipe(add_gws_ago)
[lambda df: df['GWs Ago'] <= ctx.fixtures_look_back]
.sort_values(['Season', 'Game Week'])
.rename(columns={'Player Code': 'Player Count'})
.assign(**{'Avg Expected Points': lambda df: df['Expected Points']/df['Player Count']})
.assign(**{'Avg Expected Points Simple': lambda df: df['Expected Points Simple']/df['Player Count']})
.assign(**{'Avg Fixture Total Points': lambda df: df['Fixture Total Points']/df['Player Count']})
.assign(**{'Error': lambda df: df['Player Fixture Error']/df['Player Count']})
.assign(**{'Error Simple': lambda df: df['Player Fixture Error Simple']/df['Player Count']})
.assign(**{'Sq Error': lambda df: df['Player Fixture Sq Error']/df['Player Count']})
.assign(**{'Sq Error Simple': lambda df: df['Player Fixture Sq Error Simple']/df['Player Count']})
.assign(**{'Season Game Week': lambda df: df['Season']+', GW '+df['Game Week'].apply('{:.0f}'.format)}))
.reset_index()
[lambda df: (df['Fixture Minutes Played'] > 0) & (df['Fixtures Played To Fixture'] > 4) & # (df['Total Points To Fixture'] > 48) &
((df['Season'] == ctx.current_season) & (df['Game Week'] < ctx.next_gw) | (df['Season'] != ctx.current_season))]
.assign(**{'Player Fixture Error': lambda df: np.abs(df['Expected Points'] - df['Fixture Total Points'])})
.assign(**{'Player Fixture Error Simple': lambda df: np.abs(df['Expected Points Simple'] - df['Fixture Total Points'])})
.assign(**{'Player Fixture Sq Error': lambda df: df['Player Fixture Error'] ** 2})
.assign(**{'Player Fixture Sq Error Simple': lambda df: df['Player Fixture Error Simple'] ** 2})
.groupby(['Season', 'Game Week'])
[['Player Fixture Error', 'Player Fixture Sq Error', 'Expected Points', 'Player Fixture Error Simple', 'Player Fixture Sq Error Simple', 'Expected Points Simple', 'Fixture Total Points', 'Player Code']]
.agg({'Player Fixture Error': 'sum', 'Player Fixture Sq Error': 'sum', 'Expected Points': 'sum', 'Player Fixture Error Simple': 'sum', 'Player Fixture Sq Error Simple': 'sum', 'Expected Points Simple': 'sum',
'Fixture Total Points': 'sum', 'Player Code': 'count'})
.reset_index()
.pipe(add_gws_ago)
[lambda df: df['GWs Ago'] <= ctx.fixtures_look_back]
.sort_values(['Season', 'Game Week'])
.rename(columns={'Player Code': 'Player Count'})
.assign(**{'Avg Expected Points': lambda df: df['Expected Points'] / df['Player Count']})
.assign(**{'Avg Expected Points Simple': lambda df: df['Expected Points Simple'] / df['Player Count']})
.assign(**{'Avg Fixture Total Points': lambda df: df['Fixture Total Points'] / df['Player Count']})
.assign(**{'Error': lambda df: df['Player Fixture Error'] / df['Player Count']})
.assign(**{'Error Simple': lambda df: df['Player Fixture Error Simple'] / df['Player Count']})
.assign(**{'Sq Error': lambda df: df['Player Fixture Sq Error'] / df['Player Count']})
.assign(**{'Sq Error Simple': lambda df: df['Player Fixture Sq Error Simple'] / df['Player Count']})
.assign(**{'Season Game Week': lambda df: df['Season'] + ', GW ' + df['Game Week'].apply('{:.0f}'.format)}))
75 changes: 11 additions & 64 deletions fpldata/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,22 @@

import pandas as pd
import numpy as np
import datetime as dt
from datadict import DataDict
from typing import Tuple, NoReturn
from fplpandas import FPLPandas

# Define type aliases
DF = pd.DataFrame
S = pd.Series

POSITION_BY_TYPE: dict = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
FIXTURE_TYPES: list = ['Home', 'Away', '']
STATS_TYPES: list = ['Goals Scored', 'Goals Conceded', 'Clean Sheets']
FIXTURE_STATS_TYPES: list = [STATS_TYPES, FIXTURE_TYPES]
LOCAL_COL_PREFIX = '_'
PRIVATE_COL_PREFIX = '__'

class Context:
POSITION_BY_TYPE: dict = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
FIXTURE_TYPES: list = ['Home', 'Away', '']
STATS_TYPES: list = ['Goals Scored', 'Goals Conceded', 'Clean Sheets']
FIXTURE_STATS_TYPES: list = [STATS_TYPES, FIXTURE_TYPES]
LOCAL_COL_PREFIX = '_'
PRIVATE_COL_PREFIX = '__'

class Context:
total_gws: int # The number of game weeks in a season.
next_gw: int # The upcoming game week.
def_next_gws: str # The default forecast time horizon, e.g. 'Next 8 GWs'
Expand All @@ -30,59 +29,7 @@ class Context:
last_season: str # The name of the last season, e.g. '2019-20'.
current_season: str # The name of the current season, e.g. '2020-21'.
dd: DataDict # The data dictionary to use for column remapping, formatting and descriptions.


# Interface for the FPL data sources used by this module.
#
# Every method here only raises NotImplementedError, so calling one on a bare FPL
# instance fails; a usable implementation has to come from a subclass or from
# another base class that precedes FPL in the method resolution order.
class FPL:
def get_game_weeks(self) -> DF:
raise NotImplementedError

def get_teams(self) -> DF:
raise NotImplementedError

def get_teams_last_season(self) -> DF:
raise NotImplementedError

def get_fixtures(self) -> DF:
raise NotImplementedError

def get_fixtures_last_season(self) -> DF:
raise NotImplementedError

# Returns a 3-tuple of data frames; the meaning of each element is not visible
# here. NOTE(review): confirm the element order against the implementation.
def get_players(self) -> Tuple[DF, DF, DF]:
raise NotImplementedError

def get_players_last_season(self) -> Tuple[DF, DF, DF]:
raise NotImplementedError

# NOTE(review): typing.NoReturn declares that the function never returns normally.
# That is true for this stub (it always raises), but the FPLPandasEx override
# returns after a passing assert, so `None` may be the intended annotation — confirm.
def assert_context(self, ctx: Context) -> NoReturn:
raise NotImplementedError


# FPL data source that adds last-season data read from CSV files under
# `last_season_path`.
#
# FPLPandas is listed before FPL in the bases, so any method FPLPandas defines
# takes precedence over the NotImplementedError stubs in FPL.
class FPLPandasEx(FPLPandas, FPL):
# NOTE(review): the f-prefix on the three literals below has no effect because
# they contain no placeholders.
TEAMS_FILE = 'teams.csv' # File with team data for last season
PLAYERS_FILE = f'players.csv' # File with player data for last season
PLAYERS_HISTORY_FILE = f'players_history.csv' # File with player fixture data for last season
FIXTURES_FILE = f'fixtures.csv' # File with fixture data for last season

# Directory that contains the last-season CSV files named above.
last_season_path: str

def __init__(self, last_season_path):
self.last_season_path = last_season_path
super().__init__()

# Reads the teams CSV, indexed by 'id'; the literal string 'None' is parsed as NaN.
def get_teams_last_season(self) -> DF:
return pd.read_csv(f'{self.last_season_path}/{self.TEAMS_FILE}', index_col=['id'], na_values='None')

# Reads the fixtures CSV, indexed by 'id'; the literal string 'None' is parsed as NaN.
def get_fixtures_last_season(self) -> DF:
return pd.read_csv(f'{self.last_season_path}/{self.FIXTURES_FILE}', index_col=['id'], na_values='None')

# Returns (players, None, players_history). The middle element is always None,
# although the annotation promises three data frames. The history frame is
# indexed by ('player_id', 'fixture').
def get_players_last_season(self) -> Tuple[DF, DF, DF]:
return (pd.read_csv(f'{self.last_season_path}/{self.PLAYERS_FILE}', index_col=['id'], na_values='None'),
None,
pd.read_csv(f'{self.last_season_path}/{self.PLAYERS_HISTORY_FILE}', index_col=['player_id', 'fixture'], na_values='None'))

# Checks that next_gw plus the number of entries in next_gw_counts equals total_gws.
# Uses `assert`, so the check is skipped when Python runs with -O.
# NOTE(review): returns normally when the assertion holds, which does not match
# the NoReturn annotation.
def assert_context(self, ctx: Context) -> NoReturn:
assert ctx.next_gw + len(ctx.next_gw_counts.keys()) == ctx.total_gws
now: dt.datetime # The current date/time. This is required for fixing time when running test mode.


def validate_df(df: DF, df_name: str, required_columns: list):
Expand Down Expand Up @@ -149,5 +96,5 @@ def is_notebook():
return False


# Returns df restricted to the columns whose names do not start with the local
# column prefix. Two variants of the function appear in this view.
#
# Variant taking `ctx`: reads the prefix from ctx.LOCAL_COL_PREFIX.
def remove_temp_cols(df: DF, ctx: Context) -> DF:
return df[[col for col in df.columns if not col.startswith(ctx.LOCAL_COL_PREFIX)]]
# Variant without `ctx`: reads the module-level LOCAL_COL_PREFIX ('_').
# Because PRIVATE_COL_PREFIX ('__') also starts with '_', columns carrying the
# private prefix are dropped as well.
def remove_temp_cols(df: DF) -> DF:
return df[[col for col in df.columns if not col.startswith(LOCAL_COL_PREFIX)]]
Loading

0 comments on commit 4c156bf

Please sign in to comment.