diff --git a/.gitignore b/.gitignore
index ce99277..1111669 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 data
+assets
 __pycache__
\ No newline at end of file
diff --git a/README.md b/README.md
index 40602f7..05d59fb 100644
--- a/README.md
+++ b/README.md
@@ -80,9 +80,9 @@ Then I would train the model on all the games I can on a game-by-game basis. So
 score differential is wrong? look at first game. the number for the 2 teams dont match
 
-- [ ] cleanup
-  - [ ] the get_data and load_data is duplicated in data.py and get_data.py/load_data.py. just use one or the other.
-  - [ ] move notebook code to python files. think about a managable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
+- [x] cleanup
+  - [x] the get_data and load_data are duplicated in data.py and get_data.py/load_data.py. just use one or the other.
+  - [x] move notebook code to python files. think about a manageable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
     - probably just put everything in functions that are imported into the python file and notebook?
 - [x] simple model to predict spread
   - [x] use sklearn to train model
@@ -99,9 +99,14 @@ score differential is wrong? look at first game. the number for the 2 teams dont
   - what to do with it? save configuration then recreate it when needed? pickle?
   - [x] predict spread
 - [ ] github workflow
-  - [ ] periodically update the data (and release?)
-  - [ ] periodically train the model (and release? what? the configuration... as what filetype? json?)
+  - [ ] periodically train the model (and release model, scaler, and running_avg_df, all w/ the same timestamp)
+    - [ ] add save functionality to the --train flag that saves the running_avg_df to assets
+    - [ ] update the predict fn to only predict from this saved df. should ensure it's always using the latest data the model was trained with (instead of using new data the model wasn't trained with when building from csv).
   - [ ] periodically get upcoming games and make predictions. publish on github pages. get bookie spread too?
+- Quality of Life Improvements
+  - [ ] add cli doc generator. look into `argparse.HelpFormatter` to generate a markdown file (see the sketch after this diff).
+  - [ ] add types
+  - [ ] unit tests
 - [ ] improve features/model. either at game aggregation level or team @ week aggregation level
   - [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
   - [ ] success rate (calculate success (0 or 1) from each play).
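The `argparse.HelpFormatter` TODO above could start as small as the following sketch. `build_parser()` is hypothetical — this diff keeps parser construction inline in `main()` — and the flags simply mirror the `--download`/`--train`/`--predict` options that `main.py` handles:

```python
# Hypothetical sketch for the README's "cli doc generator" TODO.
# build_parser() does not exist in the repo yet; it stands in for the
# argparse setup currently built inline in main().
import argparse


def build_parser():
    parser = argparse.ArgumentParser(prog="nfl-analytics")
    parser.add_argument("--download", nargs="*", type=int, metavar="YEAR")
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--predict", nargs=2, metavar=("HOME", "AWAY"))
    return parser


def write_cli_docs(path="CLI.md"):
    # format_help() renders the same text argparse prints for --help;
    # indenting it by four spaces turns it into a markdown code block.
    help_text = build_parser().format_help()
    indented = "".join(
        f"    {line}" for line in help_text.splitlines(keepends=True)
    )
    with open(path, "w") as f:
        f.write("# CLI\n\n")
        f.write(indented)


if __name__ == "__main__":
    write_cli_docs()
```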
diff --git a/nfl_analytics/assets/trained_model.joblib b/nfl_analytics/assets/trained_model.joblib
deleted file mode 100644
index 4df8b1a..0000000
Binary files a/nfl_analytics/assets/trained_model.joblib and /dev/null differ
diff --git a/nfl_analytics/assets/trained_scaler.joblib b/nfl_analytics/assets/trained_scaler.joblib
deleted file mode 100644
index 0fd9fb7..0000000
Binary files a/nfl_analytics/assets/trained_scaler.joblib and /dev/null differ
diff --git a/nfl_analytics/config.py b/nfl_analytics/config.py
index 01b025f..811702c 100644
--- a/nfl_analytics/config.py
+++ b/nfl_analytics/config.py
@@ -51,3 +51,6 @@
     "NO",
     "TEN",
 ]
+RUNNING_AVG_DF_FILENAME = "running_average"
+TRAINED_MODEL_FILENAME = "trained_model"
+TRAINED_SCALER_FILENAME = "trained_scaler"
diff --git a/nfl_analytics/data.py b/nfl_analytics/data.py
index 5f855c5..60c1957 100644
--- a/nfl_analytics/data.py
+++ b/nfl_analytics/data.py
@@ -10,7 +10,14 @@
 
 import pandas as pd
 
-from nfl_analytics.config import DATA_DIR
+from nfl_analytics.config import (
+    DATA_DIR,
+    ASSET_DIR as ASSET_DIR_,
+)
+
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+ASSET_DIR = os.path.join(THIS_DIR, ASSET_DIR_)
 
 
 def download_data(years=range(1999, 2024)):
@@ -32,9 +39,8 @@ def download_data(years=range(1999, 2024)):
     )
 
 
-def load_dataframe():
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    data_directory = os.path.join(script_dir, DATA_DIR)
+def load_dataframe_from_raw():
+    data_directory = os.path.join(THIS_DIR, DATA_DIR)
 
     if not os.path.exists(data_directory):
         raise FileNotFoundError(f"Data directory '{data_directory}' not found.")
@@ -65,6 +71,7 @@ def load_dataframe():
             file_path = os.path.join(data_directory, filename)
             df = pd.read_csv(file_path, compression="gzip", low_memory=False)
 
+            # Save the year from the filename on the dataframe
             year = get_year_from_filename(filename)
             df["year"] = year
 
@@ -84,7 +91,7 @@ def get_year_from_filename(filename):
 def load_sqlite():
     db_dir = "/tmp/nfl-analytics.db"
     # load into pandas first and use to_sql to infer datatypes
-    df = load_dataframe()
+    df = load_dataframe_from_raw()
 
     print(f"Loading into SQLite database: {db_dir}")
 
@@ -97,6 +104,15 @@ def load_sqlite():
     print(cursor.fetchall())
 
 
+def save_dataframe(df, basename):
+    os.makedirs(ASSET_DIR, exist_ok=True)
+    filename = f"{basename}.csv.gz"
+
+    save_path = os.path.join(ASSET_DIR, filename)
+    df.to_csv(save_path, index=False, compression="gzip")
+    print(f"Dataframe saved to {save_path}")
+
+
 if __name__ == "__main__":
     download_data()
     load_sqlite()
diff --git a/nfl_analytics/dataframes.py b/nfl_analytics/dataframes.py
index 70d461e..b62469a 100644
--- a/nfl_analytics/dataframes.py
+++ b/nfl_analytics/dataframes.py
@@ -3,7 +3,7 @@
 Handles everything between getting the data and training/using the model.
 """
 
-from nfl_analytics.data import load_dataframe
+from nfl_analytics.data import load_dataframe_from_raw
 
 import pandas as pd
 
@@ -67,7 +67,7 @@ def build_running_avg_dataframe(df_raw=None):
     Used to create prediction inputs and build the training dataset
     """
     if df_raw is None:
-        df_raw = load_dataframe()
+        df_raw = load_dataframe_from_raw()
 
     df_sacks = add_sack_yards(df_raw)
     # df_game is team game stats by team: week 1, DET, 250 pass, 120 run, etc.
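For context on how the pieces added to `config.py` and `data.py` combine: each `--train` run writes a gzip-compressed CSV whose name carries an epoch timestamp. A minimal usage sketch — the dataframe contents here are illustrative, not the real running-average frame:

```python
import time

import pandas as pd

from nfl_analytics.config import RUNNING_AVG_DF_FILENAME
from nfl_analytics.data import save_dataframe

# Tag the asset with epoch seconds, matching what main.py --train now does.
timestamp = int(time.time())

# Illustrative stand-in for the real running-average dataframe.
df_running_avg = pd.DataFrame(
    {"team": ["DET", "KC"], "week": [1, 1], "passing_yards": [250.0, 310.0]}
)

# Writes nfl_analytics/assets/running_average-<timestamp>.csv.gz
save_dataframe(df_running_avg, f"{RUNNING_AVG_DF_FILENAME}-{timestamp}")
```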
diff --git a/nfl_analytics/dev_notebook.ipynb b/nfl_analytics/dev_notebook.ipynb
index fa6ad43..74191c5 100644
--- a/nfl_analytics/dev_notebook.ipynb
+++ b/nfl_analytics/dev_notebook.ipynb
@@ -11,7 +11,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from nfl_analytics.data import load_dataframe\n",
+    "from nfl_analytics.data import load_dataframe_from_raw\n",
     "import pandas as pd"
    ]
   },
@@ -34,7 +34,7 @@
     }
    ],
    "source": [
-    "df = load_dataframe()\n",
+    "df = load_dataframe_from_raw()\n",
     "df[df['year'] == 2023]['posteam'].unique()"
    ]
   },
diff --git a/nfl_analytics/main.py b/nfl_analytics/main.py
index 3de1b5f..132e2c7 100644
--- a/nfl_analytics/main.py
+++ b/nfl_analytics/main.py
@@ -1,19 +1,30 @@
 import argparse
 import time
 
-from nfl_analytics.data import download_data, load_dataframe
+import pandas as pd
+from joblib import load
+
+from nfl_analytics.data import (
+    download_data,
+    load_dataframe_from_raw,
+    save_dataframe,
+)
 from nfl_analytics.model import (
     train_model,
     predict,
     save_model_and_scaler,
-    load_model_and_scaler,
 )
 from nfl_analytics.dataframes import (
     build_running_avg_dataframe,
     build_training_dataframe,
 )
-from nfl_analytics.utils import is_valid_year
-from nfl_analytics.config import TEAMS
+from nfl_analytics.utils import is_valid_year, get_latest_timestamped_filepath
+from nfl_analytics.config import (
+    TEAMS,
+    RUNNING_AVG_DF_FILENAME,
+    TRAINED_MODEL_FILENAME,
+    TRAINED_SCALER_FILENAME,
+)
 
 
 # ROUGH CLI docs:
@@ -50,22 +61,26 @@ def main():
         invalid_years = [year for year in year_set if not is_valid_year(year)]
 
         if invalid_years:
-            print(
-                f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded."
-            )
+            print(f"Invalid year(s) provided: {invalid_years}. No data downloaded.")
         else:
            download_data(year_set)
     else:
         download_data()
 
     if args.train:
-        print("Training model...")
-        start_time = time.time()
-
-        df_raw = load_dataframe()
+        start_time = time.time()
+
+        try:
+            df_raw = load_dataframe_from_raw()
+        except FileNotFoundError:
+            print("No data found. Please run with --download first.")
+            return
 
         end_time = time.time()
         print(f"Loaded dataframe in {end_time - start_time} seconds")
 
+        print("Training model...")
+
         # This won't pick up updated data (downloaded new data but still have the combined file, so it will use that)
         # Save combined dataframe to disk
         # save_dir = os.path.join("data", "combined")
         # os.makedirs(save_dir, exist_ok=True)
         # save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip")
         # df_raw.to_parquet(save_path, compression="gzip")
 
+        timestamp = int(time.time())
+
         df_running_avg = build_running_avg_dataframe(df_raw)
+        save_dataframe(df_running_avg, f"{RUNNING_AVG_DF_FILENAME}-{timestamp}")
+
         df_training = build_training_dataframe(df_running_avg)
 
         model, scaler = train_model(df_training)
-        save_model_and_scaler(model, scaler)
+        save_model_and_scaler(model, scaler, timestamp)
 
     if args.predict:
         # TODO: this will silently predict based off old data if that's all we have.
         # Perhaps I should require the week/year in the predict fn? Or at least log
-        # year/week in predict?
+        # year/week in predict? Or maybe aligning everything by timestamp will resolve this?
         home_team = args.predict[0].upper()
         away_team = args.predict[1].upper()
 
             return
 
         if home_team == away_team:
-            print("Error: Home and away team cannot be the same.")
+            print("Home and away team cannot be the same.")
             return
 
-        model, scaler = load_model_and_scaler()
-
-        # TODO: load directly from somewhere instead?
-        df_running_avg = build_running_avg_dataframe()
+        try:
+            latest_model_filepath = get_latest_timestamped_filepath(
+                TRAINED_MODEL_FILENAME, ".joblib"
+            )
+            latest_scaler_filepath = get_latest_timestamped_filepath(
+                TRAINED_SCALER_FILENAME, ".joblib"
+            )
+        except FileNotFoundError:
+            print(
+                "No trained model and/or scaler found. Please run with --train first."
+            )
+            return
+
+        model, scaler = load(latest_model_filepath), load(latest_scaler_filepath)
+
+        try:
+            latest_running_avg_filepath = get_latest_timestamped_filepath(
+                RUNNING_AVG_DF_FILENAME, ".csv.gz"
+            )
+        except FileNotFoundError:
+            print("No running average dataframe found. Please run with --train first.")
+            return
+
+        df_running_avg = pd.read_csv(latest_running_avg_filepath, low_memory=False)
 
         predicted_spread = predict(model, scaler, df_running_avg, home_team, away_team)
diff --git a/nfl_analytics/model.py b/nfl_analytics/model.py
index 9122d5c..0402e4f 100644
--- a/nfl_analytics/model.py
+++ b/nfl_analytics/model.py
@@ -6,7 +6,12 @@
 from sklearn.metrics import mean_squared_error, mean_absolute_error
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler
-from joblib import dump, load
+from joblib import dump
 
-from nfl_analytics.config import FEATURES, ASSET_DIR
+from nfl_analytics.config import (
+    FEATURES,
+    ASSET_DIR,
+    TRAINED_MODEL_FILENAME,
+    TRAINED_SCALER_FILENAME,
+)
 
@@ -51,23 +51,18 @@ def train_model(df_training):
     return model, scaler
 
 
-def save_model_and_scaler(model, scaler):
+def save_model_and_scaler(model, scaler, timestamp):
     script_dir = os.path.dirname(os.path.abspath(__file__))
     asset_dir = os.path.join(script_dir, ASSET_DIR)
     os.makedirs(asset_dir, exist_ok=True)
 
-    dump(model, os.path.join(asset_dir, "trained_model.joblib"))
-    dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
-    print("Model and scaler saved")
+    model_filename = f"{TRAINED_MODEL_FILENAME}-{timestamp}.joblib"
+    scaler_filename = f"{TRAINED_SCALER_FILENAME}-{timestamp}.joblib"
 
-
-def load_model_and_scaler():
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    asset_dir = os.path.join(script_dir, ASSET_DIR)
-
-    model = load(os.path.join(asset_dir, "trained_model.joblib"))
-    scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
-    return model, scaler
+    dump(model, os.path.join(asset_dir, model_filename))
+    dump(scaler, os.path.join(asset_dir, scaler_filename))
+    print(f"Model saved to {model_filename}")
+    print(f"Scaler saved to {scaler_filename}")
 
 
 def predict(model, scaler, df_running_avg, home_team, away_team):
diff --git a/nfl_analytics/utils.py b/nfl_analytics/utils.py
index ab02bc7..409de9a 100644
--- a/nfl_analytics/utils.py
+++ b/nfl_analytics/utils.py
@@ -1,6 +1,7 @@
 import datetime
+import os
 
-from nfl_analytics.config import START_YEAR
+from nfl_analytics.config import START_YEAR, ASSET_DIR
 
 
 def is_valid_year(year):
@@ -8,6 +9,30 @@ def is_valid_year(year):
     return START_YEAR <= year <= current_year
 
 
+def get_latest_timestamped_filepath(starts_with, ends_with):
+    # Resolve the asset directory relative to this package so the lookup
+    # works no matter the current working directory.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    asset_dir = os.path.join(script_dir, ASSET_DIR)
+
+    matching_files = [
+        file
+        for file in os.listdir(asset_dir)
+        if file.startswith(starts_with) and file.endswith(ends_with)
+    ]
+
+    if not matching_files:
+        raise FileNotFoundError(
+            f"No '{starts_with}*{ends_with}' files found in {asset_dir}"
+        )
+
+    # Epoch-second timestamps are fixed width, so lexicographic order
+    # matches chronological order and the last entry is the newest.
+    latest_filename = sorted(matching_files)[-1]
+
+    return os.path.join(asset_dir, latest_filename)
+
+
 if __name__ == "__main__":
     print(is_valid_year(1998))  # False
     print(is_valid_year(1999))  # True
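Taken together, the `--predict` path now resolves all three assets via the timestamped filenames. A caller outside `main.py` could do the same with the new helpers — this sketch assumes all three files were written by the same `--train` run, which the shared timestamp is meant to guarantee:

```python
import pandas as pd
from joblib import load

from nfl_analytics.config import (
    RUNNING_AVG_DF_FILENAME,
    TRAINED_MODEL_FILENAME,
    TRAINED_SCALER_FILENAME,
)
from nfl_analytics.utils import get_latest_timestamped_filepath

# Each lookup picks the newest asset: the epoch-second timestamps are
# fixed width, so the helper's lexicographic sort is also chronological.
model = load(get_latest_timestamped_filepath(TRAINED_MODEL_FILENAME, ".joblib"))
scaler = load(get_latest_timestamped_filepath(TRAINED_SCALER_FILENAME, ".joblib"))
df_running_avg = pd.read_csv(
    get_latest_timestamped_filepath(RUNNING_AVG_DF_FILENAME, ".csv.gz"),
    low_memory=False,
)
```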