feat: improve download,train,predict flow by saving/loading model,scaler,df by timestamp
BlairCurrey committed Feb 4, 2024
1 parent dbb4a84 commit 2cccfa0
Showing 11 changed files with 117 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
data
assets
__pycache__
15 changes: 10 additions & 5 deletions README.md
@@ -80,9 +80,9 @@ Then I would train the model on all the games I can on a game-by-game basis. So

score differential is wrong? look at first game. the numbers for the 2 teams don't match

- [ ] cleanup
- [ ] the get_data and load_data are duplicated in data.py and get_data.py/load_data.py. just use one or the other.
- [ ] move notebook code to python files. think about a manageable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
- [x] cleanup
- [x] the get_data and load_data are duplicated in data.py and get_data.py/load_data.py. just use one or the other.
- [x] move notebook code to python files. think about a manageable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
  - probably just put everything in functions that are imported into the python file and notebook?
- [x] simple model to predict spread
- [x] use sklearn to train model
@@ -99,9 +99,14 @@ score differential is wrong? look at first game. the numbers for the 2 teams don't
- what to do with it? save configuration then recreate it when needed? pickle?
- [x] predict spread
- [ ] github workflow
- [ ] periodically update the data (and release?)
- [ ] periodically train the model (and release? what? the configuration... as what filetype? json?)
- [ ] periodically train the model (and release model, scaler, running_avg_df all w/ same timestamp)
- [ ] add save functionality to --train flag that saves the running_avg_df to assets
- [ ] update predict fn to only predict from this saved df. should ensure it's always using the latest data the model was trained with (instead of using new data the model wasn't trained with when building from csv).
- [ ] periodically get upcoming games and make predictions. publish on github pages. get bookie spread too?
- Quality of Life Improvements
- [ ] add cli doc generator. look into `argparse.HelpFormatter` to generate a markdown file.
- [ ] add types
- [ ] unit tests
- [ ] improve features/model. either at game aggregation level or team @ week aggregation level
- [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
- [ ] success rate (calculate success (0 or 1) from each play).
Binary file removed nfl_analytics/assets/trained_model.joblib
Binary file removed nfl_analytics/assets/trained_scaler.joblib
3 changes: 3 additions & 0 deletions nfl_analytics/config.py
@@ -51,3 +51,6 @@
"NO",
"TEN",
]
RUNNING_AVG_DF_FILENAME = "running_average"
TRAINED_MODEL_FILENAME = "trained_model"
TRAINED_SCALER_FILENAME = "trained_scaler"
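
These basenames get a Unix timestamp appended at save time, so the model, scaler, and dataframe from one training run can be matched up later. A minimal sketch of the naming scheme (mirroring how main.py and model.py below compose the filenames; the example timestamp is hypothetical):

```python
# Sketch of the timestamped naming convention built from these constants.
# main.py takes the timestamp once via int(time.time()) and reuses it for
# every artifact of the training run.
import time

from nfl_analytics.config import (
    RUNNING_AVG_DF_FILENAME,
    TRAINED_MODEL_FILENAME,
    TRAINED_SCALER_FILENAME,
)

timestamp = int(time.time())
print(f"{RUNNING_AVG_DF_FILENAME}-{timestamp}.csv.gz")  # running_average-1707004800.csv.gz
print(f"{TRAINED_MODEL_FILENAME}-{timestamp}.joblib")   # trained_model-1707004800.joblib
print(f"{TRAINED_SCALER_FILENAME}-{timestamp}.joblib")  # trained_scaler-1707004800.joblib
```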
26 changes: 21 additions & 5 deletions nfl_analytics/data.py
@@ -10,7 +10,14 @@

import pandas as pd

from nfl_analytics.config import DATA_DIR
from nfl_analytics.config import (
DATA_DIR,
ASSET_DIR as ASSET_DIR_,
)


THIS_DIR = os.path.dirname(os.path.abspath(__file__))
ASSET_DIR = os.path.join(THIS_DIR, ASSET_DIR_)


def download_data(years=range(1999, 2024)):
@@ -32,9 +39,8 @@ def download_data(years=range(1999, 2024)):
)


def load_dataframe():
script_dir = os.path.dirname(os.path.abspath(__file__))
data_directory = os.path.join(script_dir, DATA_DIR)
def load_dataframe_from_raw():
data_directory = os.path.join(THIS_DIR, DATA_DIR)

if not os.path.exists(data_directory):
raise FileNotFoundError(f"Data directory '{data_directory}' not found.")
@@ -65,6 +71,7 @@ def load_dataframe():
file_path = os.path.join(data_directory, filename)

df = pd.read_csv(file_path, compression="gzip", low_memory=False)

# Save year from filename on dataframe
year = get_year_from_filename(filename)
df["year"] = year
@@ -84,7 +91,7 @@ def get_year_from_filename(filename):
def load_sqlite():
db_dir = "/tmp/nfl-analytics.db"
# load into pandas first and use to_sql to infer datatypes
df = load_dataframe()
df = load_dataframe_from_raw()

print(f"Loading into SQLite database: {db_dir}")

@@ -97,6 +104,15 @@ def load_sqlite():
print(cursor.fetchall())


def save_dataframe(df, filename_):
os.makedirs(ASSET_DIR, exist_ok=True)
filename = f"{filename_}.csv.gz"

save_path = os.path.join(ASSET_DIR, filename)
df.to_csv(save_path, index=False, compression="gzip")
print(f"Running average dataframe saved to {filename}")


if __name__ == "__main__":
download_data()
load_sqlite()
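
A round-trip sketch of the new save_dataframe helper. The asset path shown is an assumption (ASSET_DIR resolved relative to the package directory, per the join above), and the timestamped name is hypothetical:

```python
# save_dataframe gzips the csv; pandas infers the gzip compression back from
# the .csv.gz extension when the predict path reloads it (see main.py below).
import pandas as pd

from nfl_analytics.data import save_dataframe

df = pd.DataFrame({"team": ["DET", "KC"], "week": [1, 1]})
save_dataframe(df, "running_average-1707004800")

df_loaded = pd.read_csv(
    "nfl_analytics/assets/running_average-1707004800.csv.gz", low_memory=False
)
```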
4 changes: 2 additions & 2 deletions nfl_analytics/dataframes.py
@@ -3,7 +3,7 @@
Handles everything between getting the data and training/using the model.
"""

from nfl_analytics.data import load_dataframe
from nfl_analytics.data import load_dataframe_from_raw
import pandas as pd


@@ -67,7 +67,7 @@ def build_running_avg_dataframe(df_raw=None):
Used to create prediction inputs and build the training dataset
"""
if df_raw is None:
df_raw = load_dataframe()
df_raw = load_dataframe_from_raw()

df_sacks = add_sack_yards(df_raw)
# df_game is team games stats by team: week 1, DET, 250 pass, 120 run, etc.
4 changes: 2 additions & 2 deletions nfl_analytics/dev_notebook.ipynb
@@ -11,7 +11,7 @@
"metadata": {},
"outputs": [],
"source": [
"from nfl_analytics.data import load_dataframe\n",
"from nfl_analytics.data import load_dataframe_from_raw\n",
"import pandas as pd"
]
},
@@ -34,7 +34,7 @@
}
],
"source": [
"df = load_dataframe()\n",
"df = load_dataframe_from_raw()\n",
"df[df['year'] == 2023]['posteam'].unique()"
]
},
69 changes: 52 additions & 17 deletions nfl_analytics/main.py
@@ -1,19 +1,30 @@
import argparse
import time

from nfl_analytics.data import download_data, load_dataframe
import pandas as pd
from joblib import load

from nfl_analytics.data import (
download_data,
load_dataframe_from_raw,
save_dataframe,
)
from nfl_analytics.model import (
train_model,
predict,
save_model_and_scaler,
load_model_and_scaler,
)
from nfl_analytics.dataframes import (
build_running_avg_dataframe,
build_training_dataframe,
)
from nfl_analytics.utils import is_valid_year
from nfl_analytics.config import TEAMS
from nfl_analytics.utils import is_valid_year, get_latest_timestamped_filepath
from nfl_analytics.config import (
TEAMS,
RUNNING_AVG_DF_FILENAME,
TRAINED_MODEL_FILENAME,
TRAINED_SCALER_FILENAME,
)


# ROUGH CLI docs:
@@ -50,39 +61,45 @@ def main():
invalid_years = [year for year in year_set if not is_valid_year(year)]

if invalid_years:
print(
f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded."
)
print(f"Invalid year(s) provided: {invalid_years}. No data downloaded.")
else:
download_data(year_set)
else:
download_data()

if args.train:
print("Training model...")

start_time = time.time()
df_raw = load_dataframe()
try:
df_raw = load_dataframe_from_raw()
except FileNotFoundError:
print("No data loaded from the files. Please run with --download first.")
return
end_time = time.time()
print(f"Loaded dataframe in {end_time - start_time} seconds")

print("Training model...")

# This won't pick up updated data (downloaded new data but still have combined, so it will use that)
# Save combined dataframe to disk
# save_dir = os.path.join("data", "combined")
# os.makedirs(save_dir, exist_ok=True)
# save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip")
# df_raw.to_parquet(save_path, compression="gzip")

timestamp = int(time.time())

df_running_avg = build_running_avg_dataframe(df_raw)
save_dataframe(df_running_avg, f"{RUNNING_AVG_DF_FILENAME}-{timestamp}")

df_training = build_training_dataframe(df_running_avg)
model, scaler = train_model(df_training)

save_model_and_scaler(model, scaler)
save_model_and_scaler(model, scaler, timestamp)

if args.predict:
# TODO: this will silently predict based off old data if that's all we have.
# Perhaps I should require the week/year in the predict fn? Or at least log
# year/week in predict?
# year/week in predict? Or maybe aligning everything by timestamp will resolve this?
home_team = args.predict[0].upper()
away_team = args.predict[1].upper()

@@ -92,13 +109,31 @@
return

if home_team == away_team:
print("Error: Home and away team cannot be the same.")
print("Home and away team cannot be the same.")
return

model, scaler = load_model_and_scaler()

# TODO: load directly from somewhere instead?
df_running_avg = build_running_avg_dataframe()
try:
latest_model_filepath = get_latest_timestamped_filepath(
TRAINED_MODEL_FILENAME, ".joblib"
)
latest_scaler_filepath = get_latest_timestamped_filepath(
TRAINED_SCALER_FILENAME, ".joblib"
)
except FileNotFoundError:
print(
"No trained model and/or scaler found. Please run with --train first."
)
return
model, scaler = load(latest_model_filepath), load(latest_scaler_filepath)

try:
latest_running_avg_filename = get_latest_timestamped_filepath(
RUNNING_AVG_DF_FILENAME, ".csv.gz"
)
except FileNotFoundError:
print("No running average dataframe found. Please run with --train first.")
return
df_running_avg = pd.read_csv(latest_running_avg_filename, low_memory=False)

predicted_spread = predict(model, scaler, df_running_avg, home_team, away_team)

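For reference, a condensed sketch of the predict path main() now follows, runnable from a REPL once --train has produced a set of timestamped assets:

```python
# Load the newest timestamped model, scaler, and running-average dataframe,
# then predict a spread; this mirrors the --predict branch above.
import pandas as pd
from joblib import load

from nfl_analytics.config import (
    RUNNING_AVG_DF_FILENAME,
    TRAINED_MODEL_FILENAME,
    TRAINED_SCALER_FILENAME,
)
from nfl_analytics.model import predict
from nfl_analytics.utils import get_latest_timestamped_filepath

model = load(get_latest_timestamped_filepath(TRAINED_MODEL_FILENAME, ".joblib"))
scaler = load(get_latest_timestamped_filepath(TRAINED_SCALER_FILENAME, ".joblib"))
df_running_avg = pd.read_csv(
    get_latest_timestamped_filepath(RUNNING_AVG_DF_FILENAME, ".csv.gz"),
    low_memory=False,
)

print(predict(model, scaler, df_running_avg, "DET", "KC"))
```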
21 changes: 8 additions & 13 deletions nfl_analytics/model.py
@@ -6,7 +6,7 @@
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from joblib import dump

from nfl_analytics.config import FEATURES, ASSET_DIR

@@ -51,23 +51,18 @@ def train_model(df_training):
return model, scaler


def save_model_and_scaler(model, scaler):
def save_model_and_scaler(model, scaler, timestamp):
script_dir = os.path.dirname(os.path.abspath(__file__))
asset_dir = os.path.join(script_dir, ASSET_DIR)
os.makedirs(asset_dir, exist_ok=True)

dump(model, os.path.join(asset_dir, "trained_model.joblib"))
dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
print("Model and scaler saved")
model_filename = f"trained_model-{timestamp}.joblib"
scaler_filename = f"trained_scaler-{timestamp}.joblib"


def load_model_and_scaler():
script_dir = os.path.dirname(os.path.abspath(__file__))
asset_dir = os.path.join(script_dir, ASSET_DIR)

model = load(os.path.join(asset_dir, "trained_model.joblib"))
scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
return model, scaler
dump(model, os.path.join(asset_dir, model_filename))
dump(scaler, os.path.join(asset_dir, scaler_filename))
print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")


def predict(model, scaler, df_running_avg, home_team, away_team):
19 changes: 18 additions & 1 deletion nfl_analytics/utils.py
@@ -1,13 +1,30 @@
import datetime
import os

from nfl_analytics.config import START_YEAR
from nfl_analytics.config import START_YEAR, ASSET_DIR


def is_valid_year(year):
current_year = datetime.datetime.now().year
return START_YEAR <= year <= current_year


def get_latest_timestamped_filepath(starts_with, ends_with):
matching_files = [
file
for file in os.listdir(ASSET_DIR)
if file.startswith(starts_with) and file.endswith(ends_with)
]

if not matching_files:
raise FileNotFoundError(f"No files matching '{starts_with}*{ends_with}' found in {ASSET_DIR}")

sorted_files = sorted(matching_files)
latest_filename = sorted_files[-1]

return os.path.join(ASSET_DIR, latest_filename)


if __name__ == "__main__":
print(is_valid_year(1998)) # False
print(is_valid_year(1999)) # True
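One note on the sort in get_latest_timestamped_filepath: plain sorted() is lexicographic, which matches chronological order here because ten-digit Unix timestamps (good through the year 2286) sort the same way as strings. A tiny sketch with hypothetical filenames:

```python
# Lexicographic sort picks the newest file while all timestamps share a digit count.
files = [
    "trained_model-1706000000.joblib",
    "trained_model-1707004800.joblib",
    "trained_model-1706500000.joblib",
]
print(sorted(files)[-1])  # trained_model-1707004800.joblib
```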
