feat: improve download,train,predict flow by saving/loading model,scaler,df by timestamp
BlairCurrey committed Feb 4, 2024
1 parent dbb4a84 commit 2cccfa0
Showing 11 changed files with 117 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
data
assets
__pycache__
15 changes: 10 additions & 5 deletions README.md
@@ -80,9 +80,9 @@ Then I would train the model on all the games I can on a game-by-game basis. So

score differential is wrong? look at first game. the numbers for the 2 teams don't match

- [ ] cleanup
- [ ] the get_data and load_data are duplicated in data.py and get_data.py/load_data.py. just use one or the other.
- [ ] move notebook code to python files. think about a manageable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
- [x] cleanup
- [x] the get_data and load_data are duplicated in data.py and get_data.py/load_data.py. just use one or the other.
- [x] move notebook code to python files. think about a manageable way to share logic between notebook and python files so I can drop into the pipeline and inspect as needed.
  - probably just put everything in functions that are imported into the python file and notebook?
- [x] simple model to predict spread
- [x] use sklearn to train model
@@ -99,9 +99,14 @@ score differential is wrong? look at first game. the numbers for the 2 teams don't
- what to do with it? save configuration then recreate it when needed? pickle?
- [x] predict spread
- [ ] github workflow
- [ ] periodically update the data (and release?)
- [ ] periodically train the model (and release? what? the configuration... as what filetype? json?)
- [ ] periodically train the model (and release model, scaler, running_avg_df all w/ same timestamp)
- [ ] add save functionality to --train flag that saves the running_avg_df to assets
- [ ] update predict fn to only predict from this saved df. should ensure it's always using the latest data the model was trained with (instead of using new data the model wasn't trained with when building from csv).
- [ ] periodically get upcoming games and make predictions. publish on github pages. get bookie spread too?
- Quality of Life Improvements
- [ ] add cli doc generator. look into `argparse.HelpFormatter` to generate a markdown file.
- [ ] add types
- [ ] unit tests
- [ ] improve features/model. either at game aggregation level or team @ week aggregation level
- [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
- [ ] success rate (calculate success (0 or 1) from each play).
Binary file removed nfl_analytics/assets/trained_model.joblib
Binary file removed nfl_analytics/assets/trained_scaler.joblib
3 changes: 3 additions & 0 deletions nfl_analytics/config.py
@@ -51,3 +51,6 @@
"NO",
"TEN",
]
RUNNING_AVG_DF_FILENAME = "running_average"
TRAINED_MODEL_FILENAME = "trained_model"
TRAINED_SCALER_FILENAME = "trained_scaler"
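
These basenames get a Unix timestamp appended at save time, so the model, scaler, and dataframe from one training run can be matched up later. A minimal sketch of the naming scheme (mirroring how main.py and model.py below compose the filenames; the example timestamp is hypothetical):

```python
# Sketch of the timestamped naming convention built from these constants.
# main.py takes the timestamp once via int(time.time()) and reuses it for
# every artifact of the training run.
import time

from nfl_analytics.config import (
    RUNNING_AVG_DF_FILENAME,
    TRAINED_MODEL_FILENAME,
    TRAINED_SCALER_FILENAME,
)

timestamp = int(time.time())
print(f"{RUNNING_AVG_DF_FILENAME}-{timestamp}.csv.gz")  # running_average-1707004800.csv.gz
print(f"{TRAINED_MODEL_FILENAME}-{timestamp}.joblib")   # trained_model-1707004800.joblib
print(f"{TRAINED_SCALER_FILENAME}-{timestamp}.joblib")  # trained_scaler-1707004800.joblib
```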
26 changes: 21 additions & 5 deletions nfl_analytics/data.py
@@ -10,7 +10,14 @@

import pandas as pd

from nfl_analytics.config import DATA_DIR
from nfl_analytics.config import (
DATA_DIR,
ASSET_DIR as ASSET_DIR_,
)


THIS_DIR = os.path.dirname(os.path.abspath(__file__))
ASSET_DIR = os.path.join(THIS_DIR, ASSET_DIR_)


def download_data(years=range(1999, 2024)):
@@ -32,9 +39,8 @@ def download_data(years=range(1999, 2024)):
)


def load_dataframe():
script_dir = os.path.dirname(os.path.abspath(__file__))
data_directory = os.path.join(script_dir, DATA_DIR)
def load_dataframe_from_raw():
data_directory = os.path.join(THIS_DIR, DATA_DIR)

if not os.path.exists(data_directory):
raise FileNotFoundError(f"Data directory '{data_directory}' not found.")
@@ -65,6 +71,7 @@ def load_dataframe():
file_path = os.path.join(data_directory, filename)

df = pd.read_csv(file_path, compression="gzip", low_memory=False)

# Save year from filename on dataframe
year = get_year_from_filename(filename)
df["year"] = year
@@ -84,7 +91,7 @@ def get_year_from_filename(filename):
def load_sqlite():
db_dir = "/tmp/nfl-analytics.db"
# load into pandas first and use to_sql to infer datatypes
df = load_dataframe()
df = load_dataframe_from_raw()

print(f"Loading into SQLite database: {db_dir}")

@@ -97,6 +104,15 @@ def load_sqlite():
print(cursor.fetchall())


def save_dataframe(df, filename_):
os.makedirs(ASSET_DIR, exist_ok=True)
filename = f"{filename_}.csv.gz"

save_path = os.path.join(ASSET_DIR, filename)
df.to_csv(save_path, index=False, compression="gzip")
print(f"Running average dataframe saved to {filename}")


if __name__ == "__main__":
download_data()
load_sqlite()
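
A round-trip sketch of the new save_dataframe helper. The asset path shown is an assumption (ASSET_DIR resolved relative to the package directory, per the join above), and the timestamped name is hypothetical:

```python
# save_dataframe gzips the csv; pandas infers the gzip compression back from
# the .csv.gz extension when the predict path reloads it (see main.py below).
import pandas as pd

from nfl_analytics.data import save_dataframe

df = pd.DataFrame({"team": ["DET", "KC"], "week": [1, 1]})
save_dataframe(df, "running_average-1707004800")

df_loaded = pd.read_csv(
    "nfl_analytics/assets/running_average-1707004800.csv.gz", low_memory=False
)
```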
4 changes: 2 additions & 2 deletions nfl_analytics/dataframes.py
@@ -3,7 +3,7 @@
Handles everything between getting the data and training/using the model.
"""

from nfl_analytics.data import load_dataframe
from nfl_analytics.data import load_dataframe_from_raw
import pandas as pd


@@ -67,7 +67,7 @@ def build_running_avg_dataframe(df_raw=None):
Used to create prediction inputs and build the training dataset
"""
if df_raw is None:
df_raw = load_dataframe()
df_raw = load_dataframe_from_raw()

df_sacks = add_sack_yards(df_raw)
# df_game is team games stats by team: week 1, DET, 250 pass, 120 run, etc.
4 changes: 2 additions & 2 deletions nfl_analytics/dev_notebook.ipynb
@@ -11,7 +11,7 @@
"metadata": {},
"outputs": [],
"source": [
"from nfl_analytics.data import load_dataframe\n",
"from nfl_analytics.data import load_dataframe_from_raw\n",
"import pandas as pd"
]
},
@@ -34,7 +34,7 @@
}
],
"source": [
"df = load_dataframe()\n",
"df = load_dataframe_from_raw()\n",
"df[df['year'] == 2023]['posteam'].unique()"
]
},
69 changes: 52 additions & 17 deletions nfl_analytics/main.py
@@ -1,19 +1,30 @@
import argparse
import time

from nfl_analytics.data import download_data, load_dataframe
import pandas as pd
from joblib import load

from nfl_analytics.data import (
download_data,
load_dataframe_from_raw,
save_dataframe,
)
from nfl_analytics.model import (
train_model,
predict,
save_model_and_scaler,
load_model_and_scaler,
)
from nfl_analytics.dataframes import (
build_running_avg_dataframe,
build_training_dataframe,
)
from nfl_analytics.utils import is_valid_year
from nfl_analytics.config import TEAMS
from nfl_analytics.utils import is_valid_year, get_latest_timestamped_filepath
from nfl_analytics.config import (
TEAMS,
RUNNING_AVG_DF_FILENAME,
TRAINED_MODEL_FILENAME,
TRAINED_SCALER_FILENAME,
)


# ROUGH CLI docs:
@@ -50,39 +61,45 @@ def main():
invalid_years = [year for year in year_set if not is_valid_year(year)]

if invalid_years:
print(
f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded."
)
print(f"Invalid year(s) provided: {invalid_years}. No data downloaded.")
else:
download_data(year_set)
else:
download_data()

if args.train:
print("Training model...")

start_time = time.time()
df_raw = load_dataframe()
try:
df_raw = load_dataframe_from_raw()
except FileNotFoundError:
print("No data loaded from the files. Please run with --download first.")
return
end_time = time.time()
print(f"Loaded dataframe in {end_time - start_time} seconds")

print("Training model...")

# This won't pick up updated data (downloaded new data but still have combined, so it will use that)
# Save combined dataframe to disk
# save_dir = os.path.join("data", "combined")
# os.makedirs(save_dir, exist_ok=True)
# save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip")
# df_raw.to_parquet(save_path, compression="gzip")

timestamp = int(time.time())

df_running_avg = build_running_avg_dataframe(df_raw)
save_dataframe(df_running_avg, f"{RUNNING_AVG_DF_FILENAME}-{timestamp}")

df_training = build_training_dataframe(df_running_avg)
model, scaler = train_model(df_training)

save_model_and_scaler(model, scaler)
save_model_and_scaler(model, scaler, timestamp)

if args.predict:
# TODO: this will silently predict based off old data if that's all we have.
# Perhaps I should require the week/year in the predict fn? Or at least log
# year/week in predict?
# year/week in predict? Or maybe aligning everything by timestamp will resolve this?
home_team = args.predict[0].upper()
away_team = args.predict[1].upper()

@@ -92,13 +109,31 @@
return

if home_team == away_team:
print("Error: Home and away team cannot be the same.")
print("Home and away team cannot be the same.")
return

model, scaler = load_model_and_scaler()

# TODO: load directly from somewhere instead?
df_running_avg = build_running_avg_dataframe()
try:
latest_model_filepath = get_latest_timestamped_filepath(
TRAINED_MODEL_FILENAME, ".joblib"
)
latest_scaler_filepath = get_latest_timestamped_filepath(
TRAINED_SCALER_FILENAME, ".joblib"
)
except FileNotFoundError:
print(
"No trained model and/or scaler found. Please run with --train first."
)
return
model, scaler = load(latest_model_filepath), load(latest_scaler_filepath)

try:
latest_running_avg_filename = get_latest_timestamped_filepath(
RUNNING_AVG_DF_FILENAME, ".csv.gz"
)
except FileNotFoundError:
print("No running average dataframe found. Please run with --train first.")
return
df_running_avg = pd.read_csv(latest_running_avg_filename, low_memory=False)

predicted_spread = predict(model, scaler, df_running_avg, home_team, away_team)

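For reference, a condensed sketch of the predict path main() now follows, runnable from a REPL once --train has produced a set of timestamped assets:

```python
# Load the newest timestamped model, scaler, and running-average dataframe,
# then predict a spread; this mirrors the --predict branch above.
import pandas as pd
from joblib import load

from nfl_analytics.config import (
    RUNNING_AVG_DF_FILENAME,
    TRAINED_MODEL_FILENAME,
    TRAINED_SCALER_FILENAME,
)
from nfl_analytics.model import predict
from nfl_analytics.utils import get_latest_timestamped_filepath

model = load(get_latest_timestamped_filepath(TRAINED_MODEL_FILENAME, ".joblib"))
scaler = load(get_latest_timestamped_filepath(TRAINED_SCALER_FILENAME, ".joblib"))
df_running_avg = pd.read_csv(
    get_latest_timestamped_filepath(RUNNING_AVG_DF_FILENAME, ".csv.gz"),
    low_memory=False,
)

print(predict(model, scaler, df_running_avg, "DET", "KC"))
```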
21 changes: 8 additions & 13 deletions nfl_analytics/model.py
@@ -6,7 +6,7 @@
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from joblib import dump

from nfl_analytics.config import FEATURES, ASSET_DIR

@@ -51,23 +51,18 @@ def train_model(df_training):
return model, scaler


def save_model_and_scaler(model, scaler):
def save_model_and_scaler(model, scaler, timestamp):
script_dir = os.path.dirname(os.path.abspath(__file__))
asset_dir = os.path.join(script_dir, ASSET_DIR)
os.makedirs(asset_dir, exist_ok=True)

dump(model, os.path.join(asset_dir, "trained_model.joblib"))
dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
print("Model and scaler saved")
model_filename = f"trained_model-{timestamp}.joblib"
scaler_filename = f"trained_scaler-{timestamp}.joblib"


def load_model_and_scaler():
script_dir = os.path.dirname(os.path.abspath(__file__))
asset_dir = os.path.join(script_dir, ASSET_DIR)

model = load(os.path.join(asset_dir, "trained_model.joblib"))
scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
return model, scaler
dump(model, os.path.join(asset_dir, model_filename))
dump(scaler, os.path.join(asset_dir, scaler_filename))
print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")


def predict(model, scaler, df_running_avg, home_team, away_team):
19 changes: 18 additions & 1 deletion nfl_analytics/utils.py
@@ -1,13 +1,30 @@
import datetime
import os

from nfl_analytics.config import START_YEAR
from nfl_analytics.config import START_YEAR, ASSET_DIR


def is_valid_year(year):
current_year = datetime.datetime.now().year
return START_YEAR <= year <= current_year


def get_latest_timestamped_filepath(starts_with, ends_with):
matching_files = [
file
for file in os.listdir(ASSET_DIR)
if file.startswith(starts_with) and file.endswith(ends_with)
]

if not matching_files:
raise FileNotFoundError(f"No files matching '{starts_with}*{ends_with}' found in {ASSET_DIR}")

sorted_files = sorted(matching_files)
latest_filename = sorted_files[-1]

return os.path.join(ASSET_DIR, latest_filename)


if __name__ == "__main__":
print(is_valid_year(1998)) # False
print(is_valid_year(1999)) # True
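One note on the sort in get_latest_timestamped_filepath: plain sorted() is lexicographic, which matches chronological order here because ten-digit Unix timestamps (good through the year 2286) sort the same way as strings. A tiny sketch with hypothetical filenames:

```python
# Lexicographic sort picks the newest file while all timestamps share a digit count.
files = [
    "trained_model-1706000000.joblib",
    "trained_model-1707004800.joblib",
    "trained_model-1706500000.joblib",
]
print(sorted(files)[-1])  # trained_model-1707004800.joblib
```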
