Added data testing framework, improved estimates for promoted teams, …

…improved formatting for expected points (#17) * #13 Added non-cumulative future expected points columns to player_gw_next_eps_ext dataset. * #15, Fixes #16: Added data testing framework. Fixed goals stats estimation issue. * #15 Trying to fix component test. * Fixes #16: Fixed comp test. * Removed hard coded component test mode setting.
177arc · Jan 3, 2021 · 4c156bf · 4c156bf
1 parent d623e1f
commit 4c156bf
Show file tree

Hide file tree

Showing 46 changed files with 3,646 additions and 734 deletions.
diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
@@ -43,7 +43,10 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Run tests
+    - name: Run component tests
+      run: |
+        python -m unittest discover -s "tests/comp"
+    - name: Run integration tests
       env:
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,6 @@
 /.vscode/
 /layer/python/
 /layer/layer.zip
+/*.zip
+/*.log
+/tests/comp/actual_*.csv
diff --git a/data/team_goals_stats_estimates.csv b/data/team_goals_stats_estimates.csv
@@ -1,4 +1,4 @@
 Team Code,Team Short Name,Total Team Goals Scored Home~,Total Team Goals Scored Away~,Total Team Goals Conceded Home~,Total Team Goals Conceded Away~,Total Team Clean Sheets Home~,Total Team Clean Sheets Away~,Team Fixture Count Home~,Team Fixture Count Away~
-2,LEE,23,14,17,24,7,5,19,19
-35,WBA,30,19,30,30,3,4,19,19
-54,FUL,30,19,30,30,3,4,19,19
+2,LEE,11.5,7,8.5,12,3.5,2.5,9.5,9.5
+35,WBA,15,9.5,15,15,1.5,2,9.5,9.5
+54,FUL,15,9.5,15,15,1.5,2,9.5,9.5
diff --git a/fpldata/backtest.py b/fpldata/backtest.py
@@ -6,33 +6,36 @@
 DF = pd.DataFrame
 S = pd.Series
 
+
 def add_gws_ago(players_gw_team: DF) -> DF:
     return (players_gw_team
             .sort_values(['Season', 'Game Week'], ascending=False)
             .assign(**{'GWs Ago': lambda df: (~df['Expected Points'].isnull()).cumsum()}))
 
+
 def get_gw_points_backtest(players_gw_team_eps: DF, ctx: Context) -> DF:
     return (players_gw_team_eps
-         .reset_index()
-         [lambda df: (df['Fixture Minutes Played'] > 0) & (df['Fixtures Played To Fixture'] > 4) &  # (df['Total Points To Fixture'] > 48) &
-              ((df['Season'] == ctx.current_season) & (df['Game Week'] < ctx.next_gw) | (df['Season'] != ctx.current_season))]
-        .assign(**{'Player Fixture Error': lambda df: np.abs(df['Expected Points']-df['Fixture Total Points'])})
-        .assign(**{'Player Fixture Error Simple': lambda df: np.abs(df['Expected Points Simple']-df['Fixture Total Points'])})
-        .assign(**{'Player Fixture Sq Error': lambda df: df['Player Fixture Error']**2})
-        .assign(**{'Player Fixture Sq Error Simple': lambda df: df['Player Fixture Error Simple']**2})
-        .groupby(['Season', 'Game Week'])
-         [['Player Fixture Error', 'Player Fixture Sq Error', 'Expected Points', 'Player Fixture Error Simple', 'Player Fixture Sq Error Simple', 'Expected Points Simple', 'Fixture Total Points', 'Player Code']]
-         .agg({'Player Fixture Error': 'sum', 'Player Fixture Sq Error': 'sum', 'Expected Points': 'sum', 'Player Fixture Error Simple': 'sum', 'Player Fixture Sq Error Simple': 'sum', 'Expected Points Simple': 'sum', 'Fixture Total Points': 'sum', 'Player Code': 'count'})
-         .reset_index()
-         .pipe(add_gws_ago)
-         [lambda df: df['GWs Ago'] <= ctx.fixtures_look_back]
-         .sort_values(['Season', 'Game Week'])
-         .rename(columns={'Player Code': 'Player Count'})           
-         .assign(**{'Avg Expected Points': lambda df: df['Expected Points']/df['Player Count']})
-         .assign(**{'Avg Expected Points Simple': lambda df: df['Expected Points Simple']/df['Player Count']})
-         .assign(**{'Avg Fixture Total Points': lambda df: df['Fixture Total Points']/df['Player Count']})
-         .assign(**{'Error': lambda df: df['Player Fixture Error']/df['Player Count']})
-         .assign(**{'Error Simple': lambda df: df['Player Fixture Error Simple']/df['Player Count']})
-         .assign(**{'Sq Error': lambda df: df['Player Fixture Sq Error']/df['Player Count']})
-         .assign(**{'Sq Error Simple': lambda df: df['Player Fixture Sq Error Simple']/df['Player Count']})
-         .assign(**{'Season Game Week': lambda df: df['Season']+', GW '+df['Game Week'].apply('{:.0f}'.format)}))
+            .reset_index()
+            [lambda df: (df['Fixture Minutes Played'] > 0) & (df['Fixtures Played To Fixture'] > 4) &  # (df['Total Points To Fixture'] > 48) &
+                        ((df['Season'] == ctx.current_season) & (df['Game Week'] < ctx.next_gw) | (df['Season'] != ctx.current_season))]
+            .assign(**{'Player Fixture Error': lambda df: np.abs(df['Expected Points'] - df['Fixture Total Points'])})
+            .assign(**{'Player Fixture Error Simple': lambda df: np.abs(df['Expected Points Simple'] - df['Fixture Total Points'])})
+            .assign(**{'Player Fixture Sq Error': lambda df: df['Player Fixture Error'] ** 2})
+            .assign(**{'Player Fixture Sq Error Simple': lambda df: df['Player Fixture Error Simple'] ** 2})
+            .groupby(['Season', 'Game Week'])
+            [['Player Fixture Error', 'Player Fixture Sq Error', 'Expected Points', 'Player Fixture Error Simple', 'Player Fixture Sq Error Simple', 'Expected Points Simple', 'Fixture Total Points', 'Player Code']]
+            .agg({'Player Fixture Error': 'sum', 'Player Fixture Sq Error': 'sum', 'Expected Points': 'sum', 'Player Fixture Error Simple': 'sum', 'Player Fixture Sq Error Simple': 'sum', 'Expected Points Simple': 'sum',
+                  'Fixture Total Points': 'sum', 'Player Code': 'count'})
+            .reset_index()
+            .pipe(add_gws_ago)
+            [lambda df: df['GWs Ago'] <= ctx.fixtures_look_back]
+            .sort_values(['Season', 'Game Week'])
+            .rename(columns={'Player Code': 'Player Count'})
+            .assign(**{'Avg Expected Points': lambda df: df['Expected Points'] / df['Player Count']})
+            .assign(**{'Avg Expected Points Simple': lambda df: df['Expected Points Simple'] / df['Player Count']})
+            .assign(**{'Avg Fixture Total Points': lambda df: df['Fixture Total Points'] / df['Player Count']})
+            .assign(**{'Error': lambda df: df['Player Fixture Error'] / df['Player Count']})
+            .assign(**{'Error Simple': lambda df: df['Player Fixture Error Simple'] / df['Player Count']})
+            .assign(**{'Sq Error': lambda df: df['Player Fixture Sq Error'] / df['Player Count']})
+            .assign(**{'Sq Error Simple': lambda df: df['Player Fixture Sq Error Simple'] / df['Player Count']})
+            .assign(**{'Season Game Week': lambda df: df['Season'] + ', GW ' + df['Game Week'].apply('{:.0f}'.format)}))
diff --git a/fpldata/common.py b/fpldata/common.py
@@ -4,23 +4,22 @@
 
 import pandas as pd
 import numpy as np
+import datetime as dt
 from datadict import DataDict
-from typing import Tuple, NoReturn
-from fplpandas import FPLPandas
 
 # Define type aliases
 DF = pd.DataFrame
 S = pd.Series
 
+POSITION_BY_TYPE: dict = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
+FIXTURE_TYPES: list = ['Home', 'Away', '']
+STATS_TYPES: list = ['Goals Scored', 'Goals Conceded', 'Clean Sheets']
+FIXTURE_STATS_TYPES: list = [STATS_TYPES, FIXTURE_TYPES]
+LOCAL_COL_PREFIX = '_'
+PRIVATE_COL_PREFIX = '__'
 
-class Context:
-    POSITION_BY_TYPE: dict = {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
-    FIXTURE_TYPES: list = ['Home', 'Away', '']
-    STATS_TYPES: list = ['Goals Scored', 'Goals Conceded', 'Clean Sheets']
-    FIXTURE_STATS_TYPES: list = [STATS_TYPES, FIXTURE_TYPES]
-    LOCAL_COL_PREFIX = '_'
-    PRIVATE_COL_PREFIX = '__'
 
+class Context:
     total_gws: int                      # The number game weeks in a season.
     next_gw: int                        # The upcoming game week.
     def_next_gws: str                   # The default forecast time horizon, e.g. 'Next 8 GWs'
@@ -30,59 +29,7 @@ class Context:
     last_season: str                    # The name of the last season, e.g. '2019-20'.
     current_season: str                 # The name of the current season, e.g. '2020-21'.
     dd: DataDict                        # The data dictionary to use for column remapping, formatting and descriptions.
-
-
-class FPL:
-    def get_game_weeks(self) -> DF:
-        raise NotImplementedError
-
-    def get_teams(self) -> DF:
-        raise NotImplementedError
-
-    def get_teams_last_season(self) -> DF:
-        raise NotImplementedError
-
-    def get_fixtures(self) -> DF:
-        raise NotImplementedError
-
-    def get_fixtures_last_season(self) -> DF:
-        raise NotImplementedError
-
-    def get_players(self) -> Tuple[DF, DF, DF]:
-        raise NotImplementedError
-
-    def get_players_last_season(self) -> Tuple[DF, DF, DF]:
-        raise NotImplementedError
-
-    def assert_context(self, ctx: Context) -> NoReturn:
-        raise NotImplementedError
-
-
-class FPLPandasEx(FPLPandas, FPL):
-    TEAMS_FILE = 'teams.csv'  # File with team data for last season
-    PLAYERS_FILE = f'players.csv'  # File with player data for last season
-    PLAYERS_HISTORY_FILE = f'players_history.csv'  # File with player fixture data for last season
-    FIXTURES_FILE = f'fixtures.csv'  # File with fixture data for last season
-
-    last_season_path: str
-
-    def __init__(self, last_season_path):
-        self.last_season_path = last_season_path
-        super().__init__()
-
-    def get_teams_last_season(self) -> DF:
-        return pd.read_csv(f'{self.last_season_path}/{self.TEAMS_FILE}', index_col=['id'], na_values='None')
-
-    def get_fixtures_last_season(self) -> DF:
-        return pd.read_csv(f'{self.last_season_path}/{self.FIXTURES_FILE}', index_col=['id'], na_values='None')
-
-    def get_players_last_season(self) -> Tuple[DF, DF, DF]:
-        return (pd.read_csv(f'{self.last_season_path}/{self.PLAYERS_FILE}', index_col=['id'], na_values='None'),
-                None,
-                pd.read_csv(f'{self.last_season_path}/{self.PLAYERS_HISTORY_FILE}', index_col=['player_id', 'fixture'], na_values='None'))
-
-    def assert_context(self, ctx: Context) -> NoReturn:
-        assert ctx.next_gw + len(ctx.next_gw_counts.keys()) == ctx.total_gws
+    now: dt.datetime                    # The current date/time. Required for pinning the time when running in test mode.
 
 
 def validate_df(df: DF, df_name: str, required_columns: list):
@@ -149,5 +96,5 @@ def is_notebook():
         return False
 
 
-def remove_temp_cols(df: DF, ctx: Context) -> DF:
-    return df[[col for col in df.columns if not col.startswith(ctx.LOCAL_COL_PREFIX)]]
+def remove_temp_cols(df: DF) -> DF:
+    return df[[col for col in df.columns if not col.startswith(LOCAL_COL_PREFIX)]]