diff --git a/README.md b/README.md
index d2c65b5..40602f7 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Then I would train the model on all the games I can on a game-by-game basis. So
 - could be useful for comparing the accuracy of my model. in particular "Distribution of the deviation of the final margin of victory from the Vegas spread"
 - for example, perhaps the avg spread difference between Vegas and reality is ~10, so a model with an average difference of 8 would be good
 - a concise little overview on features from a datascience.exchange comment about predicting matches (NOTE: not spread): https://datascience.stackexchange.com/questions/102827/how-to-predict-the-winner-of-a-future-sports-match
+- article on when you need to scale data for ML: https://www.baeldung.com/cs/normalization-vs-standardization

# TODO:

@@ -89,6 +90,18 @@ score differential is wrong? look at first game. the number for the 2 teams don't
 - gradient boosting (better with non-linear)?
 - [x] some sort of basic analysis to see how it performed, including manually comparing to the Vegas spread (maybe I can find an average difference? https://www.theonlycolors.com/2020/9/29/21492301/vegas-always-knows-a-mathematical-deep-dive)
   - 9-10 pt avg difference (?). a normal distribution means ~68% will be within 1 std deviation (identified as 14-15). could be a little lower because 1, 2, etc. are within 14-15, but could be higher because ~32% will be more than 14-15.
+- [x] add function to create matchups from 2 teams so we can predict next week's games.
+  - using the running_avg df to merge, similar to how we're merging on game_id to get the final training df
+  - in practice the merged records should share a week, but in theory they could be different (week 12 Detroit vs. week 6 Ravens, etc.).
+- [x] cli
+  - [x] download data
+  - [x] train model
+    - what to do with it? save the configuration then recreate it when needed? pickle?
+  - [x] predict spread
+- [ ] github workflow
+  - [ ] periodically update the data (and release?)
+  - [ ] periodically train the model (and release? release what? the configuration... as what filetype? json?)
+  - [ ] periodically get upcoming games and make predictions. publish on github pages. get the bookie spread too?
 - [ ] improve features/model. either at the game aggregation level or the team @ week aggregation level
   - [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
   - [ ] success rate (calculate success (0 or 1) from each play).
@@ -97,9 +110,11 @@ score differential is wrong? look at first game. the number for the 2 teams don't
   - [x] total points scored/allowed
   - [ ] maybe don't use the first ~3 games? small sample size, but don't want to throw out too much data.
   - [ ] games played (could be used as confidence in record/stats)
-- [x] add function to create matchups from 2 teams so we can predict next week's games.
-  - using the running_avg df to merge, similar to how we're merging on game_id to get the final training df
-  - in practice the merged records should share a week, but in theory they could be different (week 12 Detroit vs. week 6 Ravens, etc.).
+- [ ] rethink exposing build_running_avg_dataframe, build_training_dataframe instead of doing that inside train_model (with the side effect of saving the build_running_avg_dataframe result (to disk?) somewhere).
+  - just need to see how it's actually used
+  - I guess it's good for development purposes? maybe just make the df arg in train_model(df) optional and build it from scratch when not provided; the CLI/deployment path would omit it, while development can pass a prebuilt df (see the sketch below).
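The optional-df idea in that last TODO could look like this; a minimal sketch, assuming `train_model` keeps the signature from `model.py` and reuses `build_training_dataframe` from `dataframes.py` (the build-when-missing default is the proposal, not existing code):

```python
# Hypothetical variant of train_model(df): build the training frame only
# when the caller (e.g. the CLI/deployment path) does not supply one.
from nfl_analytics.dataframes import build_training_dataframe


def train_model(df_training=None):
    if df_training is None:
        df_training = build_training_dataframe()
    # ... training logic unchanged: impute, scale, fit, report MSE/MAE ...
```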
+- [ ] write script that gets upcoming games and makes predictions from the model.
+  - try to find a good source for the schedule (nflfastR for that too maybe?).

# Current status:

@@ -123,6 +138,29 @@ score differential is wrong? look at first game. the number for the 2 teams don't

# Stray thoughts:

+- model name idea: caliper. (like measuring the "spread")
+- save the model by pickling with joblib/dump, or save the configuration like:
+
+```python
+    # Save essential components (assumes linreg - does it work the same for others?)
+    coefficients = model.coef_
+    intercept = model.intercept_
+    # assumes using minmaxscaler (but maybe I'm not)
+    scaler_params = {'min_values': scaler.min_, 'scale_values': scaler.scale_}
+
+    # Recreate the model
+    recreated_model = LinearRegression()
+    recreated_model.coef_ = coefficients
+    recreated_model.intercept_ = intercept
+
+    # Recreate the scaler
+    recreated_scaler = MinMaxScaler()
+    recreated_scaler.min_ = scaler_params['min_values']
+    recreated_scaler.scale_ = scaler_params['scale_values']
+```
+
+- I think saving the configuration is probably better if I can (see the JSON sketch below).
+
 - What should the model guess _exactly_, and what does that say about how the teams are modeled in the input? the spread consists of 2 numbers (usually the inverse of each), 1 for each team. Maybe just predict the home team?
 - probably need to squash 2 teams into 1 line like: home_team_pass_off, home_team_pass_def, away_team_pass_off, away_team_pass_def, etc.
 - Are lots of features bad? What about redundant or mostly redundant features (pass yards, rush yards, total yards (total yards are either equal or very similar to pass+rush yards))? Which should I pick in that case (probably the less aggregated ones)?
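If the configuration route wins out over joblib pickles, here is a hedged sketch of what the round trip might look like with the `StandardScaler` + `LinearRegression` pair that `model.py` actually trains; the `model_config.json` filename and both helper names are made up for illustration:

```python
import json

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def save_config(model, scaler, path="model_config.json"):
    # Everything needed to rebuild the fitted pair, as plain JSON.
    config = {
        "coef": model.coef_.tolist(),
        "intercept": float(model.intercept_),
        "scaler_mean": scaler.mean_.tolist(),
        "scaler_scale": scaler.scale_.tolist(),
    }
    with open(path, "w") as f:
        json.dump(config, f)


def load_config(path="model_config.json"):
    with open(path) as f:
        config = json.load(f)
    model = LinearRegression()
    model.coef_ = np.array(config["coef"])
    model.intercept_ = config["intercept"]
    scaler = StandardScaler()
    scaler.mean_ = np.array(config["scaler_mean"])
    scaler.scale_ = np.array(config["scaler_scale"])
    return model, scaler
```

A JSON file would also answer the "release what? as what filetype?" question in the workflow TODO, since it diffs and versions more cleanly than a binary joblib artifact.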
+""" + import urllib.request +from urllib.error import HTTPError import os -import pandas as pd import sqlite3 +import pandas as pd -def get(): - years = range(1999, 2024) +from nfl_analytics.config import DATA_DIR - save_directory = "data" - os.makedirs(save_directory, exist_ok=True) + +def download_data(years=range(1999, 2024)): + os.makedirs(DATA_DIR, exist_ok=True) for year in years: # year gets parsed from this filename and depends on this format filename = f"play_by_play_{year}.csv.gz" url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}" - save_path = os.path.join(save_directory, filename) + save_path = os.path.join(DATA_DIR, filename) print(f"Downloading {url}") - urllib.request.urlretrieve(url, save_path) + + try: + urllib.request.urlretrieve(url, save_path) + except HTTPError as e: + print( + f"Error: Failed to download data for {year}. HTTP Error {e.code}: {e.reason}. Season for that year may not exist yet." + ) -def load_pandas(): +def load_dataframe(): script_dir = os.path.dirname(os.path.abspath(__file__)) - data_directory = os.path.join(script_dir, "data") + data_directory = os.path.join(script_dir, DATA_DIR) + if not os.path.exists(data_directory): + raise FileNotFoundError(f"Data directory '{data_directory}' not found.") + + files = os.listdir(data_directory) + + if not files: + raise FileNotFoundError(f"No data files found in the data directory.") + + # This wont pick on updated data (downlaoded new data but still have combined, so it will use that) + # # load saved combined from disk if exists + # combined_file_path = os.path.join( + # data_directory, "combined", "play_by_play_combined.parquet.gzip" + # ) + # if not skip_combined and os.path.exists(combined_file_path): + # print(f"Reading combined file {combined_file_path}") + # combined_df = pd.read_parquet(combined_file_path) + # return combined_df + # else: + # print("Combined file does not exist. Loading individual files.") + + # make combined dataframe from individual files combined_df = pd.DataFrame() - for filename in os.listdir(data_directory): + for filename in files: if filename.endswith(".csv.gz"): print(f"Reading {filename}") file_path = os.path.join(data_directory, filename) @@ -37,6 +70,9 @@ def load_pandas(): df["year"] = year combined_df = pd.concat([combined_df, df], ignore_index=True) + if combined_df.empty: + raise FileNotFoundError("No data loaded from the files.") + return combined_df @@ -46,12 +82,14 @@ def get_year_from_filename(filename): def load_sqlite(): + db_dir = "/tmp/nfl-analytics.db" # load into pandas first and use to_sql to infer datatypes - df = load_pandas() + df = load_dataframe() + + print(f"Loading into SQLite database: {db_dir}") table_name = "plays" - db_conn = sqlite3.connect(database="/tmp/nfl-analytics.db") - # TODO: remove drop table after developing? + db_conn = sqlite3.connect(database=db_dir) db_conn.execute(f"DROP TABLE IF EXISTS {table_name}") df.to_sql(table_name, db_conn, index=False) @@ -59,30 +97,6 @@ def load_sqlite(): print(cursor.fetchall()) -# def build(): -# # TODO: do all the things the dev notebook is doing. splitting into nice functions as necessary -# # For example, could make a function for each time in notebook we are initializing a new dataframe (just a rough guide). 
diff --git a/nfl_analytics/dataframes.py b/nfl_analytics/dataframes.py
new file mode 100644
index 0000000..70d461e
--- /dev/null
+++ b/nfl_analytics/dataframes.py
@@ -0,0 +1,233 @@
+"""
+Builds the dataframes used for training and prediction.
+Handles everything between getting the data and training/using the model.
+"""
+
+from nfl_analytics.data import load_dataframe
+import pandas as pd
+
+
+def build_training_dataframe(df_running_avg=None):
+    if df_running_avg is None:
+        df_running_avg = build_running_avg_dataframe()
+
+    # Create a new column 'is_home' to indicate whether the team is playing at home
+    df_running_avg["is_home"] = df_running_avg.apply(
+        lambda row: True if row["team"] == row["home_team"] else False, axis=1
+    )
+
+    # Group by game_id and is_home and aggregate using the first value
+    squashed_df = (
+        df_running_avg.groupby(["game_id", "is_home"])[
+            [
+                "rushing_avg",
+                "passing_avg",
+                "yards_gained_avg",
+                "sack_yards_avg",
+                "passing_yards_defense_avg",
+                "rushing_yards_defense_avg",
+                "yards_gained_defense_avg",
+                "sack_yards_defense_avg",
+                "score_differential_post_avg",
+                "points_scored_avg",
+                "points_allowed_avg",
+                "mean_epa_avg",
+            ]
+        ]
+        .first()
+        .unstack()
+    )
+
+    squashed_df.columns = [
+        f"{'home' if is_home else 'away'}_{col}" for col, is_home in squashed_df.columns
+    ]
+    squashed_df.reset_index(inplace=True)
+
+    # Merge with the original DataFrame to get the rest of the columns
+    return pd.merge(
+        df_running_avg[
+            [
+                "game_id",
+                "week",
+                "year",
+                "team",
+                "home_team",
+                "away_team",
+                "home_spread",
+            ]
+        ],
+        squashed_df,
+        on="game_id",
+    )
+
+
+def build_running_avg_dataframe(df_raw=None):
+    """
+    Builds a dataframe with weekly running averages for each team by year.
+    Used to create prediction inputs and to build the training dataset.
+    """
+    if df_raw is None:
+        df_raw = load_dataframe()
+
+    df_sacks = add_sack_yards(df_raw)
+    # df_game is per-team game stats: week 1, DET, 250 pass, 120 run, etc.
+    df_game_posteam = df_sacks.groupby(["game_id", "posteam"])
+    df_game = aggregate_game_stats(df_sacks, df_game_posteam)
+    df_game = adjust_game_dataframe(df_game, df_game_posteam)
+    df_running_avg = df_game[
+        [
+            "game_id",
+            "team",
+            "week",
+            "year",
+            "home_team",
+            "away_team",
+            "score_differential_post",
+        ]
+    ].copy()
+
+    # Set the home_spread. This will be our target variable: the spread
+    # relative to the home team. We want this because we need to predict a
+    # single spread value (which we can then invert for the away team's spread).
+ df_running_avg["home_spread"] = df_game.apply( + lambda row: -row["score_differential_post"] + if row["team"] != row["home_team"] + else row["score_differential_post"], + axis=1, + ) + + # Get the running average for each team by team and year + # Uses lambda and shift to not include current row in running average + # Expand is an expanding window function that gets everything from the first to current row + df_running_avg[ + [ + "rushing_avg", + "passing_avg", + "yards_gained_avg", + "sack_yards_avg", + "passing_yards_defense_avg", + "rushing_yards_defense_avg", + "yards_gained_defense_avg", + "sack_yards_defense_avg", + "score_differential_post_avg", + "points_scored_avg", + "points_allowed_avg", + "mean_epa_avg", + ] + ] = ( + df_game.groupby(["team", "year"])[ + [ + "rushing_yards", + "passing_yards", + "yards_gained", + "sack_yards", + "passing_yards_defense", + "rushing_yards_defense", + "yards_gained_defense", + "sack_yards_defense", + "score_differential_post", + "points_scored", + "points_allowed", + "mean_epa", + ] + ] + .apply(lambda x: x.shift().expanding().mean()) + .reset_index(level=[0, 1], drop=True) + ) + + return df_running_avg + + +def add_sack_yards(df_raw): + df = df_raw.copy() + # Sack yards would be necessary to get accurate TEAM passing stats. + # Team passing yards are sum(passing_yards) - sum(sack_yards) + # Player passing stats are simply sum(passing_yards). + df["sack_yards"] = pd.NA + + # Set sack_yards to yards_gained for rows where sack is not equal to 0 + df.loc[df["sack"] != 0, "sack_yards"] = df["yards_gained"] + + return df + + +def aggregate_game_stats(df_sacks, df_game_posteam): + # Group by game and team and combine offensive and defensive stats into single record + + # Separate offensive and defensive stats + offensive_stats = ( + df_game_posteam[ + ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"] + ] + .sum() + .reset_index() + ) + defensive_stats = ( + df_sacks.groupby(["game_id", "defteam"])[ + ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"] + ] + .sum() + .reset_index() + ) + + # Rename columns for defensive stats to distinguish them + defensive_stats.rename( + columns={ + "defteam": "team", + "passing_yards": "passing_yards_defense", + "rushing_yards": "rushing_yards_defense", + "yards_gained": "yards_gained_defense", + "sack_yards": "sack_yards_defense", + }, + inplace=True, + ) + + return pd.merge( + offensive_stats, + defensive_stats, + left_on=["game_id", "posteam"], + right_on=["game_id", "team"], + ) + + +def adjust_game_dataframe(df_game, df_game_posteam): + df = df_game.copy() + + # Add home_team, away_team, home_score, away_score + df[["home_team", "away_team", "home_score", "away_score"]] = ( + df_game_posteam[["home_team", "away_team", "home_score", "away_score"]] + .first() + .reset_index(drop=True) + ) + + df["points_scored"] = df.apply( + lambda row: row["home_score"] + if row["posteam"] == row["home_team"] + else row["away_score"], + axis=1, + ) + df["points_allowed"] = df.apply( + lambda row: row["away_score"] + if row["posteam"] == row["home_team"] + else row["home_score"], + axis=1, + ) + + df.drop(["posteam"], axis=1, inplace=True) + + # sets score differential to last value for each game and team + df[["score_differential_post", "week", "year"]] = ( + df_game_posteam[["score_differential_post", "week", "year"]] + .last() + .reset_index(drop=True) + ) + + df["mean_epa"] = df_game_posteam["epa"].mean().reset_index(drop=True) + + return df + + +if __name__ == "__main__": + df_running_avg = 
build_running_avg_dataframe() + print(df_running_avg.tail()) + df_train = build_training_dataframe() + print(df_train.tail()) diff --git a/nfl_analytics/data_dev.ipynb b/nfl_analytics/dev_notebook.ipynb similarity index 91% rename from nfl_analytics/data_dev.ipynb rename to nfl_analytics/dev_notebook.ipynb index edccb38..fa6ad43 100644 --- a/nfl_analytics/data_dev.ipynb +++ b/nfl_analytics/dev_notebook.ipynb @@ -7,17 +7,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from nfl_analytics.data import load_pandas\n", + "from nfl_analytics.data import load_dataframe\n", "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -29,494 +29,13 @@ "Reading play_by_play_2006.csv.gz\n", "Reading play_by_play_2014.csv.gz\n", "Reading play_by_play_2020.csv.gz\n", - "Reading play_by_play_2016.csv.gz\n", - "Reading play_by_play_2008.csv.gz\n", - "Reading play_by_play_2004.csv.gz\n", - "Reading play_by_play_2000.csv.gz\n", - "Reading play_by_play_2012.csv.gz\n", - "Reading play_by_play_2010.csv.gz\n", - "Reading play_by_play_2002.csv.gz\n", - "Reading play_by_play_2007.csv.gz\n", - "Reading play_by_play_2019.csv.gz\n", - "Reading play_by_play_2015.csv.gz\n", - "Reading play_by_play_2023.csv.gz\n", - "Reading play_by_play_2009.csv.gz\n", - "Reading play_by_play_2017.csv.gz\n", - "Reading play_by_play_2005.csv.gz\n", - "Reading play_by_play_2021.csv.gz\n", - "Reading play_by_play_1999.csv.gz\n", - "Reading play_by_play_2001.csv.gz\n", - "Reading play_by_play_2013.csv.gz\n", - "Reading play_by_play_2011.csv.gz\n", - "Reading play_by_play_2003.csv.gz\n" + "Reading play_by_play_2016.csv.gz\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
play_idgame_idold_game_idhome_teamaway_teamseason_typeweekposteamposteam_typedefteam...home_opening_kickoffqb_epaxyac_epaxyac_mean_yardagexyac_median_yardagexyac_successxyac_fdxpasspass_oeyear
012022_01_BAL_NYJ2022091107NYJBALREG1NaNNaNNaN...10.000000NaNNaNNaNNaNNaNNaNNaN2022
1432022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.443521NaNNaNNaNNaNNaNNaNNaN2022
2682022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...11.468819NaNNaNNaNNaNNaN0.440373-44.0372912022
3892022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.4921920.7272616.9881256.00.606930.2275980.38990461.0095982022
41152022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.325931NaNNaNNaNNaNNaN0.443575-44.3574942022
\n", - "

5 rows × 373 columns

\n", - "
" - ], - "text/plain": [ - " play_id game_id old_game_id home_team away_team season_type \\\n", - "0 1 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "1 43 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "2 68 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "3 89 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "4 115 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "\n", - " week posteam posteam_type defteam ... home_opening_kickoff qb_epa \\\n", - "0 1 NaN NaN NaN ... 1 0.000000 \n", - "1 1 NYJ home BAL ... 1 -0.443521 \n", - "2 1 NYJ home BAL ... 1 1.468819 \n", - "3 1 NYJ home BAL ... 1 -0.492192 \n", - "4 1 NYJ home BAL ... 1 -0.325931 \n", - "\n", - " xyac_epa xyac_mean_yardage xyac_median_yardage xyac_success xyac_fd \\\n", - "0 NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN \n", - "3 0.727261 6.988125 6.0 0.60693 0.227598 \n", - "4 NaN NaN NaN NaN NaN \n", - "\n", - " xpass pass_oe year \n", - "0 NaN NaN 2022 \n", - "1 NaN NaN 2022 \n", - "2 0.440373 -44.037291 2022 \n", - "3 0.389904 61.009598 2022 \n", - "4 0.443575 -44.357494 2022 \n", - "\n", - "[5 rows x 373 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = load_pandas()\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
game_idold_game_idyearweekposteam
02022_01_BAL_NYJ202209110720221NaN
12022_01_BAL_NYJ202209110720221NYJ
22022_01_BAL_NYJ202209110720221NYJ
32022_01_BAL_NYJ202209110720221NYJ
42022_01_BAL_NYJ202209110720221NYJ
52022_01_BAL_NYJ202209110720221NYJ
62022_01_BAL_NYJ202209110720221NYJ
72022_01_BAL_NYJ202209110720221BAL
82022_01_BAL_NYJ202209110720221BAL
92022_01_BAL_NYJ202209110720221BAL
\n", - "
" - ], - "text/plain": [ - " game_id old_game_id year week posteam\n", - "0 2022_01_BAL_NYJ 2022091107 2022 1 NaN\n", - "1 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "2 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "3 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "4 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "5 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "6 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "7 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n", - "8 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n", - "9 2022_01_BAL_NYJ 2022091107 2022 1 BAL" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[['game_id', 'old_game_id', 'year', 'week', 'posteam']].head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epa
yearposteam
1999ARI-0.100310
ATL-0.073998
BAL-0.043631
BUF-0.014478
CAR0.059887
.........
2023SEA0.015927
SF0.119782
TB-0.007973
TEN-0.031331
WAS-0.084474
\n", - "

797 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " epa\n", - "year posteam \n", - "1999 ARI -0.100310\n", - " ATL -0.073998\n", - " BAL -0.043631\n", - " BUF -0.014478\n", - " CAR 0.059887\n", - "... ...\n", - "2023 SEA 0.015927\n", - " SF 0.119782\n", - " TB -0.007973\n", - " TEN -0.031331\n", - " WAS -0.084474\n", - "\n", - "[797 rows x 1 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "df.groupby(['year', 'posteam'])[['epa']].mean()" + "df = load_dataframe()\n", + "df[df['year'] == 2023]['posteam'].unique()" ] }, { @@ -4602,7 +4121,6 @@ "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", "\n", - "# Assuming your DataFrame is named df\n", "# Drop rows with NaN for week 1\n", "df_train = result_df[result_df['week'] > 1]\n", "\n", @@ -4637,7 +4155,6 @@ "# Split the data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", - "# TODO: do i need to scale the data? https://www.baeldung.com/cs/normalization-vs-standardization\n", "# Standardize the features\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train)\n", @@ -4690,8 +4207,7 @@ } ], "source": [ - "# Gut-checking the model with future (as of writing) conference titles games.\n", - "# TODO: negate the target variable? I think training w/ positive spread for winner might not be right.\n", + "# Gut-checking the model with future (as of writing) conference titles games 01/28/2024.\n", "\n", "# Odds are -7 SF (7 DET). Some places -7.5 SF (7.5 DET).\n", "# Very close - nice. This suggests taking Detroit to beat the spread although not by much.\n", diff --git a/nfl_analytics/get_data.py b/nfl_analytics/get_data.py deleted file mode 100644 index 7175bb1..0000000 --- a/nfl_analytics/get_data.py +++ /dev/null @@ -1,14 +0,0 @@ -import urllib.request -import os - -years = range(1999, 2024) - -save_directory = "data" -os.makedirs(save_directory, exist_ok=True) - -for year in years: - filename = f"play_by_play_{year}.csv.gz" - url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}" - save_path = os.path.join(save_directory, filename) - - urllib.request.urlretrieve(url, save_path) diff --git a/nfl_analytics/load_data.py b/nfl_analytics/load_data.py deleted file mode 100644 index b067057..0000000 --- a/nfl_analytics/load_data.py +++ /dev/null @@ -1,27 +0,0 @@ -# Loads csvs into pandas dataframe and sqlite db -import os -import pandas as pd -import sqlite3 - -data_directory = "nfl_analytics/data" -combined_df = pd.DataFrame() - -for filename in os.listdir(data_directory): - if filename.endswith(".csv.gz"): - print(f"Reading {filename}") - file_path = os.path.join(data_directory, filename) - - # Read the CSV file into a DataFrame and concat to combined df - df = pd.read_csv(file_path, compression="gzip", low_memory=False) - combined_df = pd.concat([combined_df, df], ignore_index=True) - -print(combined_df.head()) - -table_name = "plays" -db_conn = sqlite3.connect(database="/tmp/my.db") -# TODO: remove drop table after developing? 
-db_conn.execute(f"DROP TABLE IF EXISTS {table_name}") -num_rows_inserted = combined_df.to_sql(table_name, db_conn, index=False) - -cursor = db_conn.execute(f"SELECT * from {table_name} LIMIT 10") -print(cursor.fetchall()) diff --git a/nfl_analytics/main.py b/nfl_analytics/main.py new file mode 100644 index 0000000..3de1b5f --- /dev/null +++ b/nfl_analytics/main.py @@ -0,0 +1,111 @@ +import argparse +import time + +from nfl_analytics.data import download_data, load_dataframe +from nfl_analytics.model import ( + train_model, + predict, + save_model_and_scaler, + load_model_and_scaler, +) +from nfl_analytics.dataframes import ( + build_running_avg_dataframe, + build_training_dataframe, +) +from nfl_analytics.utils import is_valid_year +from nfl_analytics.config import TEAMS + + +# ROUGH CLI docs: +# --download: optional. takes list of years. or if empty, defaults to downloading all years. usage: python main.py --download 2021 2022 +# --train: optional. if present, trains the model. usage: python main.py --train +# --predict: optional. takes two arguments, home team and away team. usage: python main.py --predict "CHI" "MIN" + + +def main(): + parser = argparse.ArgumentParser(description="Manage NFL Spread Predictor Pipeline") + parser.add_argument( + "--download", + nargs="*", + type=int, + metavar="year", + help="Download data for the specified years. The year corresponds to the season start.", + ) + parser.add_argument( + "--train", + action="store_true", + help="Train the model using the downloaded data.", + ) + parser.add_argument( + "--predict", + nargs=2, + metavar=("home_team", "away_team"), + help="Specify the home and away teams for prediction.", + ) + args = parser.parse_args() + + if args.download is not None: + if args.download: + year_set = set(args.download) + invalid_years = [year for year in year_set if not is_valid_year(year)] + + if invalid_years: + print( + f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded." + ) + else: + download_data(year_set) + else: + download_data() + + if args.train: + print("Training model...") + + start_time = time.time() + df_raw = load_dataframe() + end_time = time.time() + print(f"Loaded dataframe in {end_time - start_time} seconds") + + # This wont pick on updated data (downlaoded new data but still have combined, so it will use that) + # Save combined dataframe to disk + # save_dir = os.path.join("data", "combined") + # os.makedirs(save_dir, exist_ok=True) + # save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip") + # df_raw.to_parquet(save_path, compression="gzip") + + df_running_avg = build_running_avg_dataframe(df_raw) + df_training = build_training_dataframe(df_running_avg) + model, scaler = train_model(df_training) + + save_model_and_scaler(model, scaler) + + if args.predict: + # TODO: this will silently predict based off old data if thats all we have. + # Perhaps I should require the week/year in the predict fn? Or at least log + # year/week in predict? + home_team = args.predict[0].upper() + away_team = args.predict[1].upper() + + for team in [home_team, away_team]: + if team not in TEAMS: + print(f"Invalid team: {team}") + return + + if home_team == away_team: + print("Error: Home and away team cannot be the same.") + return + + model, scaler = load_model_and_scaler() + + # TODO: load directly from somewhere instead? 
diff --git a/nfl_analytics/model.py b/nfl_analytics/model.py
new file mode 100644
index 0000000..9122d5c
--- /dev/null
+++ b/nfl_analytics/model.py
@@ -0,0 +1,152 @@
+import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from joblib import dump, load
+
+from nfl_analytics.config import FEATURES, ASSET_DIR
+
+
+def train_model(df_training):
+    # Drop week 1 because it is all NaN
+    df_train = df_training[df_training["week"] > 1]
+
+    # Don't use unnecessary columns like 'game_id', 'week', 'year', 'team', 'home_team', 'away_team'
+    # Keep only relevant columns for prediction
+    target = "home_spread"
+    select_columns = FEATURES + [target]
+
+    df_train = df_train[select_columns]
+
+    # TODO: why are there missing values?
+    imputer = SimpleImputer(strategy="mean")
+    df_imputed = pd.DataFrame(imputer.fit_transform(df_train), columns=df_train.columns)
+
+    X = df_imputed.drop(target, axis=1)
+    y = df_imputed[target]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # Note: the scaler is fitted by fit_transform. The same scaler must be
+    # reused for prediction.
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    model = LinearRegression()
+    model.fit(X_train_scaled, y_train)
+
+    y_pred = model.predict(X_test_scaled)
+
+    mse = mean_squared_error(y_test, y_pred)
+    mae = mean_absolute_error(y_test, y_pred)
+    print(f"Mean Squared Error: {mse}")
+    print(f"Mean Absolute Error: {mae}")
+
+    return model, scaler
+
+
+def save_model_and_scaler(model, scaler):
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    asset_dir = os.path.join(script_dir, ASSET_DIR)
+    os.makedirs(asset_dir, exist_ok=True)
+
+    dump(model, os.path.join(asset_dir, "trained_model.joblib"))
+    dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
+    print("Model and scaler saved")
+
+
+def load_model_and_scaler():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    asset_dir = os.path.join(script_dir, ASSET_DIR)
+
+    model = load(os.path.join(asset_dir, "trained_model.joblib"))
+    scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
+    return model, scaler
+
+
+def predict(model, scaler, df_running_avg, home_team, away_team):
+    matchup = make_matchup(df_running_avg, home_team, away_team)
+    matchup_input = get_matchup_input(scaler, matchup)
+
+    return model.predict(matchup_input)[0]
+
+
+def make_matchup(df_running_avg, home_team, away_team, week=None, year=None):
+    """Merge the given teams' week/year stats into a single row.
+    To be used for predicting spreads for future games."""
+
+    df = df_running_avg.copy()
+
+    if year is None:
+        year = df["year"].max()
+
+    if week is None:
+        last_week = df[df["year"] == year]["week"].max()
+        week = last_week
+
+    # df_running_avg includes running averages prior to that week, plus data
+    # about that week itself (teams, final scores, etc.). Basically (and
+    # literally at the time of writing) anything not suffixed with `_avg`.
The data about the + # week itself are necessary for training the model but dont make sense in + # the context of predicting future games so they are not included here. + cols = [ + "rushing_avg", + "passing_avg", + "yards_gained_avg", + "sack_yards_avg", + "passing_yards_defense_avg", + "rushing_yards_defense_avg", + "yards_gained_defense_avg", + "sack_yards_defense_avg", + "score_differential_post_avg", + "points_scored_avg", + "points_allowed_avg", + "mean_epa_avg", + ] + + # Select data for the specified week, home team, and away team in the specified year + home_data = ( + df[(df["year"] == year) & (df["week"] == week) & (df["team"] == home_team)][ + cols + ] + .add_prefix("home_") + .reset_index(drop=True) + ) + away_data = ( + df[(df["year"] == year) & (df["week"] == week) & (df["team"] == away_team)][ + cols + ] + .add_prefix("away_") + .reset_index(drop=True) + ) + + return pd.concat([home_data, away_data], axis=1) + + +def get_matchup_input(scaler, matchup): + reshaped_matchup = matchup[FEATURES].values.reshape(1, -1) + return scaler.transform(reshaped_matchup) + + +if __name__ == "__main__": + from nfl_analytics.dataframes import ( + build_running_avg_dataframe, + build_training_dataframe, + ) + + df_running_avg = build_running_avg_dataframe() + df_training = build_training_dataframe() + model, scaler = train_model(df_training) + print(make_matchup(df_running_avg, "KC", "SF").tail()) + # first team is home but this is superbowl so neither is technically home + # week 22 (? its the superbowl) 2023 (2023 SEASON, year is 2024) + kc_sf = predict(model, scaler, df_running_avg, "KC", "SF") + print(f"Prediction: {kc_sf}") + sf_kc = predict(model, scaler, df_running_avg, "SF", "KC") + print(f"Prediction: {sf_kc}") diff --git a/nfl_analytics/train_dev.ipynb b/nfl_analytics/train_dev.ipynb deleted file mode 100644 index e69de29..0000000 diff --git a/nfl_analytics/utils.py b/nfl_analytics/utils.py new file mode 100644 index 0000000..ab02bc7 --- /dev/null +++ b/nfl_analytics/utils.py @@ -0,0 +1,17 @@ +import datetime + +from nfl_analytics.config import START_YEAR + + +def is_valid_year(year): + current_year = datetime.datetime.now().year + return START_YEAR <= year <= current_year + + +if __name__ == "__main__": + print(is_valid_year(1998)) # False + print(is_valid_year(1999)) # True + print(is_valid_year(2000)) # True + print(is_valid_year(2023)) # True + print(is_valid_year(2024)) # True + print(is_valid_year(2025)) # False (if current year is 2024) diff --git a/poetry.lock b/poetry.lock index b9eef52..d0e3e18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -121,6 +121,87 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "cramjam" +version = "2.8.1" +description = "Thin Python bindings to de/compression algorithms in Rust" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:1afc66031e5165f4eae27e6e1f89c0def6c2ece826903ebb0194ee2f467ff8e6"}, + {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4af4b74b16c22d6a0092d6a7db642ee097f4b0bfa0389d5a07552a2fc48eb0b6"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e858a56d12a61d0e19b2286a7661b36b52b3cad4fa84d8aaeb0b0ed0b2338d36"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc25199e44ee8ca55c62d6da2a74ea48e759058f2c96ca1e5d512aad6ce6005b"}, + {file = 
"cramjam-2.8.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:130aee295a77256aa867a4562961e99db56ddf8081df1e9bfb1607a409dcc4df"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ccfa252b800d6cdae8cc20728c41b94a39f5cadee8693ab2539ea02285e8e015"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8bd1cc5706b235655b742e8e350c13ffc2036efc098a192bd9f4df8b153c03e"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dedae22ed4f8997ce584256b4a790354f314baa209a9015aa1ab6e383f6177c5"}, + {file = "cramjam-2.8.1-cp310-none-win32.whl", hash = "sha256:f1af221507fbcd5bd46a92d41ca60410400624328b5c92ec2efb13deca86b6e9"}, + {file = "cramjam-2.8.1-cp310-none-win_amd64.whl", hash = "sha256:94017d3057d53bad33ec4ef46409a2b410a67c0905eb094b441c1687ca1f738a"}, + {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:033b1720718c68cacab1dbeee6d8e8e7ed851ac9abb8aca6ae9cc869e7087df8"}, + {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4597d720efdd92f5a4f09289d08f0fcf690ae8a16295dd356dd66fcc4aa10e30"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ddafdb489ad4b84cb7e8c8423a21f37a5e6dfe7e32847bd0048f680d703184cf"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbf9df8f296422410722fb5349159432eea11ac5fec7bf59213c7148f3efa04"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e588b11966152234f8ffa95089dd447a7ec299ea319acee15a78387cfc267ba9"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9050a662c484752b18a07ba0bd01601f229f40152d374ea79c91da239d6b929b"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a909ef9b6099e4df03a7b602d75e0c43c23359f21cd138886a51f7ae268fffc9"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a6b06704e99a116dbdae78c4fef47d41bec1ef08a911c50807cde608d99764b"}, + {file = "cramjam-2.8.1-cp311-none-win32.whl", hash = "sha256:9f069be2b8cfa16d2ee5c1fc7cb6864f8d5db03e31d0f07a38a227dbd470d1ad"}, + {file = "cramjam-2.8.1-cp311-none-win_amd64.whl", hash = "sha256:0f50414cc2f216d0c4d311e81412bf32fda6e4d1e867de2f4bdd2c1d05319d00"}, + {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:000e2321226b83fee2fa6b353d649f034266fa6d42d5aa3521c7457ca7fa3118"}, + {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:80373dccc3dc25446d5d5ae7c8641daa2dbd4973c98bd2ab3e113fa57a653647"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bc2d6ccaa164ba122b724f4958821d08ed7c49a90662ce800c24021c5900cbbe"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a726fb5ee28e4258215c327d65bf6791f25e7742a5d262352e2c344ac683a0f8"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d087048ef9244cf42b8958ec9eabacbf3c337fa0697a4573e99b189253f312d4"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4230d22a249ae91459ceae15e55abac5cef983042ddbd0484e2863034dd63725"}, 
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd52a1767750b3a83308179e3ff0b3465551c8f4419676847300989582e55a61"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16f327dd8d299ac137bf3288e89772187533d9da367bd3d270f5546273e7ce8"}, + {file = "cramjam-2.8.1-cp312-none-win32.whl", hash = "sha256:d7118d61d7a037f80ef6a2d809cd01492b574983947593570ac757d2a87eae6e"}, + {file = "cramjam-2.8.1-cp312-none-win_amd64.whl", hash = "sha256:465f868c14b921af0cb66f7bfa4f3c7a131cc47de1c9390dfa1d96fbe90778de"}, + {file = "cramjam-2.8.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77c952be40131c47ebbcb534862530ba67a80c86a7099532cace517109c2b428"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8e5b8f4d23e0a4a97381ded35a6a7fe839eba1b0372c1e74ee3713adc59f5c1c"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d1c2b51b0754736f20c1ea3638118a7acd60cc5ab32c6751e247cd6ada15c79"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d3b3d1167928bf0ad846239758a8fd6536d560886b63174359392686721e8902"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f1b2fd1108b040a822e8d6c32aebe46275095128783f944971c3b37671c6af0"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c676bb517a2edd809684e3495eccb7868bd5f1be0684853f226166aa54ed072"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133bc01739c897ebf7dcfb3e0fe11b4ca5851bcda1e021c4877e11cf1712a4e5"}, + {file = "cramjam-2.8.1-cp37-none-win32.whl", hash = "sha256:b85e8c7380ee7a1b6b530764f929a0766b5ee1d8cbbfa0c0fca2c08415a7532f"}, + {file = "cramjam-2.8.1-cp37-none-win_amd64.whl", hash = "sha256:6611b41ff76ff252ce67acc13a83aae17ea3131e5926fa1a96e266ff803d4d67"}, + {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:db7086436d50d6fb2c2b38cfbec5a860a2595cbed0e7a3e541c05180f5e26005"}, + {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2332835c2db911cc572e6604860e5f2da439833f855ed1c1dac96fd5fc1025a"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4b2bdcaeaa649efa4c41e7add81a205359c7540c58766151488955ebe12afa9f"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3f8d44cbf2f680a39f677cafbadb3353ee09ce9d2ed51b52eba5261cb3b935a"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6d283676a5979236011bd3daed39560e5d0f39df6cceab31a291d5203a36cde"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dd3787a6f9bbfea1a74aa9457d73eae18393de67cab03f89ac20beaaed43cb8d"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0200d1530c9ae9c0988e2878a08624de1cb2b841eea2dcb9a10a236ff43747c"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acba9d6fe6653e1f387aee53cc8d6d9d90d5416b29a233e40dcdf84e6b1002b9"}, + {file = "cramjam-2.8.1-cp38-none-win32.whl", hash = "sha256:6d4e357d98d5a4ac7291b88a4e54adcb85506a00aa9e72fa222a2caa3b881828"}, + {file = "cramjam-2.8.1-cp38-none-win_amd64.whl", hash = 
"sha256:960f0a65db3de5f92762c0dfcad514982ea888f300677d83a1d3eb46eee6b109"}, + {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:9ccbc9b23b42aa65cf6878908663e73aeb55971b9e3d0ce3c7733b387934f030"}, + {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bfac3506ed0c21b4b2a27af50de9ad2bdd5cc02aed977782fa01d21cc6f54de9"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1f3ef0d616fb8f4d8eb75b3b22653385b88fe493895d763f5f235c7c6d64e570"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f18a8d637ac9497e0be2a8fa49cca52259196d18ca73b36a385b6d925cb21b7"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d21a370320b6959d7bd581ca286439a113e325477cc92aefb0be378bd351323a"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c2d49a2f8e2a4501c460ad8452ce269c9ddd5c4671ad41cf41104a3fa6ca2e7"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8366b00f578ea1b202f20b5ebab622ac10599d1b08c36ed9089e27a452c76d2e"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ab8228f4b343e08dd2f4e2a714427979dfd25184cde08198c1d149249ab29a9"}, + {file = "cramjam-2.8.1-cp39-none-win32.whl", hash = "sha256:a280e51ea157bc2dd9dae2751acfa51bccc0453ce6d99521c18b73719d724b56"}, + {file = "cramjam-2.8.1-cp39-none-win_amd64.whl", hash = "sha256:2a209e5b7f1e62d8dc27278948176391d35defd0202cd9b0d577126073a781a5"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3ca01ea39e0b9c7938fe4a5b73dbcd0f43c2b0aaa04de39fe8deb1b69d4a59c0"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a83ae10a853fe93882d32da53f50a62927b6408cf486f7475077b04a93c3475"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645a226ae669c26b2232ed30ec0ed34688da3fe6473b3998edc3e11af9a6cb3"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:02dc04049680ab834feb092d6d1b6632eb03ba9069b2415a7fe6515d42fe10e0"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c447fb8a01ae435fd7ed3d1bebeb168aa35c5d9edb0326d6207381b54b8f558"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c085d020fc15e284640d2ed3a8b121a26a06c0ddfcba87b968460421a565753a"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:19af180433b5c25ce25698edf28f26a9f66626a2803a2f1682fbc83d38a7842f"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4a259336a60c8c2d4067e19ea78903e55b1e2963831f9f5a5b6d9a395088974"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af6331df89641d3dd2fa40b32a7d22353ac6bb62c87baa28ce4073bebfc8f686"}, + {file = "cramjam-2.8.1.tar.gz", hash = "sha256:f7e233791761b61e3d85598b25db75868d9f1cd35b2bd10eb34ec4755ff2add1"}, +] + +[package.extras] +dev = ["black (==22.3.0)", "hypothesis", "numpy", "pytest (>=5.30)", "pytest-xdist"] + [[package]] name = "debugpy" version = "1.8.0" @@ -173,6 +254,98 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", 
"coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +[[package]] +name = "fastparquet" +version = "2023.10.1" +description = "Python support for Parquet file format" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:75a00475e96d26214dace147b27ab782da7a0ae230cade05ea9181c3aec2e637"}, + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0c1d5559aa0a4fff8eb3b301c8177b6813bb15fe9d2007ad0dc89f8fa519c5"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b798cdfa8f01cd573b135a493a4d0686ebbcd3a412d6e59889a7ae41ff90efeb"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a746f521da7459707899fc33b334b2d21f590380f472fc27642f3ef28ee451d2"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e86d64898e846ed0f5745427436e5772fd7bb5d9a930f1dca8233e90385e126b"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5c3afafd4a0907216f5ee4f835f47ad16b84c5dede4c5ca4c0754dffe3eb72d7"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:68d26a1172be5b02018f6c28603f195807955d8262b913349385d977f3ae081f"}, + {file = "fastparquet-2023.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:b7086ca3a0d8ae8680b380da9b7057a1491d629945b1dd228eba5b362e2e39aa"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f8d53f5e5049b21893964cd27154c2a7c8180f3ffd1f2693f80e0f834a3a35e"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ea74f28494fda892641a564f728d046a074fdea5b9ff664ef9554c0da563bad4"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab978612d21630033df0a0b12423ed826fe36e83a1710b155968c3c6e2b3174a"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc30c502feaa67c058c496eb4a734eba8bd373f0d24a32cc69360c79f7220ef"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99568ae6bbbd973b32d796cb664ba156b101e5d1931dba780fe2dc0d9b227dfd"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:53b9ad8d646c2609854cbe7d7b17be343664cabae1cd0eb119011e389df8484d"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b2a9ee49039223a1e216c409c25072be1f362de27197cbec5f90cf2e736df3b0"}, + {file = "fastparquet-2023.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:9133d2f975c6e05187be4b558060e6a4aafeba02dceaf849cf6ad46d32e59405"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b826696cd48f1defb6fcafb4c9798102233e54f3f3491251c034dde3d94f420a"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bf2d58bee17e0eea8565c2bcd2b339ee032472751651e21f000eb564ad3cd5cf"}, + {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9296098d06c6692ee477fe491381eda39fc0dcfe2fce210496491fe16ce27ef8"}, + {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c975d648ea491e684135e9e3c0a15b440d66d0772fe497269e5c9c4eaaeb62a2"}, + {file = "fastparquet-2023.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5208db1f38c8ac5f50f309f77bdb828fa7f247b82e2df88d847ad3bec38903"}, + {file = "fastparquet-2023.10.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:118d1832ed2098f313936044012083c8c1b07da85ee11612895f3c4ef27bfd8a"}, + {file = "fastparquet-2023.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:35cff597d2778b6fe8ef7dc36cba056cd7337151dbfc2fb08abaa6b109c75140"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da633a0bd1889f30acb1b6dffa99832739802d0ae5f455b4e5eb720ab701e09"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8aff041900ebcb4c2510bede80695fed35fb7c24dfd83b60ba8b56d7ede4e0fe"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62aabf43b6bbbc074b89f9a4769f7276204b6585d2d8fae770a0b782da5b9fc9"}, + {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ba85d9e5c298515a347f94bc65c0b570391b344d765dc349bafb35137466ddb2"}, + {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2be7d33969e724c8aa777122d6032845a362cb2075f6e6f2c5b2150bd6223cc8"}, + {file = "fastparquet-2023.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:2a0c1f485d3085fe98dbae9ead2e97a886deb99d3db7af635296bfd3f4f2f814"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1fac5319aabcbc4acc2feb5df68336de755de7d60a2ee9329fef178ac016e236"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c61d26705e9a2ad2d52ed1d527c75e96e6a9a04be35bd4c8d6f4accd778f9b05"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2635f0f37a983e35be9b8013b84361e3d0cdd4f514b822016445c029b1c6e007"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde8f6798d37e2af38ada058fc7018c2157d90a8dd728c0c59fab85b8adb9215"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20c17c5e7186723a175c9e7da94285bdef3cb477cb7cca0e2812b1e245279671"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:91ee6b5b0efc18586e61da6662119de92fc7bf552c3a08a13eb2af16bc12f16a"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:332cb3b204e1de64dcfc4c5d0b517ea665856d19c139f693e8c9efc11992e19e"}, + {file = "fastparquet-2023.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:5eb06a70daf50d70290b87f3a5ca6f25eb24ad850bcc68197b5438d92b11c763"}, + {file = "fastparquet-2023.10.1.tar.gz", hash = "sha256:076fedfba2b56782b4823c1d351424425cfeaa5b8644c542416ca1363fe6d921"}, +] + +[package.dependencies] +cramjam = ">=2.3" +fsspec = "*" +numpy = ">=1.20.3" +packaging = "*" +pandas = ">=1.5.0" + +[package.extras] +lzo = ["python-lzo"] + +[[package]] +name = "fsspec" +version = "2023.12.2" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2023.12.2-py3-none-any.whl", hash = "sha256:d800d87f72189a745fa3d6b033b9dc4a34ad069f60ca60b943a63599f5501960"}, + {file = "fsspec-2023.12.2.tar.gz", hash = "sha256:8548d39e8810b59c38014934f6b31e57f40c1b20f911f4cc2b85389c7e9bf0cb"}, +] + 
+[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + [[package]] name = "ipykernel" version = "6.29.0" @@ -957,4 +1130,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "e9e2ee4b6fdb1aba77a21d7e986f962a6e87ed9bec9353ad443ad7129a575d6b" +content-hash = "56a680440d968ef6670819b80fb63d6a7806417f8582fbe94b467a8f6d4a886b" diff --git a/pyproject.toml b/pyproject.toml index bfc36a8..27c9a58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ python = "^3.12" pandas = "^2.2.0" ipykernel = "^6.29.0" scikit-learn = "^1.4.0" +fastparquet = "^2023.10.1" [build-system]
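Finally, to act on the evaluation note near the top of the README (Vegas reportedly misses the final margin by roughly 9-10 points on average), a hedged sketch of how that benchmark might be used; the threshold is the README's rough figure, not a measured one:

```python
from nfl_analytics.dataframes import build_training_dataframe
from nfl_analytics.model import train_model

# Approximate average Vegas miss discussed in the README; a held-out MAE
# comfortably below this would suggest the model is worth publishing.
VEGAS_AVG_MISS = 10.0

df_training = build_training_dataframe()
model, scaler = train_model(df_training)  # prints held-out MSE and MAE
```

Comparing the printed Mean Absolute Error against `VEGAS_AVG_MISS` gives a quick sanity check until a proper game-by-game comparison against historical closing lines is wired up.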