diff --git a/README.md b/README.md
index d2c65b5..40602f7 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Then I would train the model on all the games I can on a game-by-game basis. So
- could be useful for comparing the accuracy of my model. in particular "Distribution of the deviation of the final margin of victory from the Vegas spread"
- for example, perhaps avg spread difference between vegas and reality is ~10 so a model with an average difference of 8 would be good
- a concise little overview on features from a datascience.exchange comment about predicting matches (NOTE: not spread): https://datascience.stackexchange.com/questions/102827/how-to-predict-the-winner-of-a-future-sports-match
+- article on when you need to scale data for ml: https://www.baeldung.com/cs/normalization-vs-standardization
# TODO:
@@ -89,6 +90,18 @@ score differential is wrong? look at first game. the number for the 2 teams dont
- gradient boosting (better with non-linear)?
- [x] some sort of basic analysis to see how it performed. including manually comparing to vegas spread (maybe I can find an average difference? https://www.theonlycolors.com/2020/9/29/21492301/vegas-always-knows-a-mathematical-deep-dive)
 - 9-10 pt avg difference (?). a normal distribution means ~68% will be within 1 std deviation (identified as 14-15). could be a little lower because 1, 2, etc. are within 14-15, but could be higher because ~32% will be more than 14-15.
+- [x] add function to create matchups from 2 teams so we can predict next week's games.
+ - using the running_avg df to merge, similar to how we're merging the game_id to get the final training df
+ - in practice the merged records should share a week but in theory they could be different (week 12 detroit vs. week 6 ravens etc.).
+- [x] cli
+ - [x] download data
+ - [x] train model
+ - what to do with it? save configuration then recreate it when needed? pickle?
+ - [x] predict spread
+- [ ] github workflow
+ - [ ] periodically update the data (and release?)
+ - [ ] periodically train the model (and release? what? the configuration... as what filetype? json?)
+  - [ ] periodically get upcoming games and make predictions. publish on github pages. get the bookie spread too?
- [ ] improve features/model. either at game aggregation level or team @ week aggregation level
- [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
- [ ] success rate (calculate success (0 or 1) from each play).
@@ -97,9 +110,11 @@ score differential is wrong? look at first game. the number for the 2 teams dont
- [x] total points scored/allowed
- [ ] maybe dont use first ~3 games? small sample size but dont want to throw out too much data.
- [ ] games played (could be used as confidence in record/stats)
-- [x] add function to create matchups from 2 teams so we can predict next week's games.
- - using the running_avg df to merge, similar to how we're merging the game_id to get the final training df
- - in practice the merged records should share a week but in theory they could be different (week 12 detroit vs. week 6 ravens etc.).
+- [ ] rethink exposing build_running_avg_dataframe and build_training_dataframe instead of doing that inside train_model (with the side effect of saving the running-avg dataframe somewhere, maybe to disk).
+  - just need to see how it's actually used
+  - I guess it's good for development purposes? maybe make the df arg in train_model(df) optional and build it from scratch when not provided: CLI/deployment would use that path, while development can pass in a prebuilt df. see the rough sketch after this list.
+- [ ] write script that gets upcoming games and makes predictions from the model.
+ - try to find a good source for the schedule (nflfastR for that too maybe?).
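+
+A rough sketch of the optional df argument idea above (hypothetical signature, not how train_model currently works):
+
+```python
+from nfl_analytics.dataframes import build_running_avg_dataframe, build_training_dataframe
+
+def train_model(df_training=None):
+    # Development: pass in a prebuilt dataframe.
+    # CLI/deployment: build everything from scratch when no df is provided.
+    if df_training is None:
+        df_training = build_training_dataframe(build_running_avg_dataframe())
+    # ... existing training/scaling code unchanged ...
+```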
# Current status:
@@ -123,6 +138,29 @@ score differential is wrong? look at first game. the number for the 2 teams dont
# Stray thoughts:
+- model name idea: caliper. (like measuring the "spread")
+- save the model by pickling with joblib dump/load, or save just the configuration like:
+
+```python
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import MinMaxScaler
+
+# Save essential components (assumes linear regression - does it work the same for others?)
+coefficients = model.coef_
+intercept = model.intercept_
+# assumes MinMaxScaler (but maybe I'm not using that)
+scaler_params = {'min_values': scaler.min_, 'scale_values': scaler.scale_}
+
+# Recreate the model from the saved configuration
+recreated_model = LinearRegression()
+recreated_model.coef_ = coefficients
+recreated_model.intercept_ = intercept
+
+# Recreate the scaler from the saved configuration
+recreated_scaler = MinMaxScaler()
+recreated_scaler.min_ = scaler_params['min_values']
+recreated_scaler.scale_ = scaler_params['scale_values']
+```
+
+- I think saving the configuration is probably better if I can.
+
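+- for reference, the joblib route (roughly what `save_model_and_scaler`/`load_model_and_scaler` in model.py do) would look something like:
+
+```python
+from joblib import dump, load
+
+# Save
+dump(model, "trained_model.joblib")
+dump(scaler, "trained_scaler.joblib")
+
+# Load later
+model = load("trained_model.joblib")
+scaler = load("trained_scaler.joblib")
+```
+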
 - What should the model guess _exactly_, and what does that say about how the teams are modeled in the input? the spread consists of 2 numbers (usually inverses of each other), 1 for each team. Maybe just predict the home team's spread?
- probably need to squash 2 teams into 1 line like: home_team_pass_off, home_team_pass_def, away_team_pass_off, away_team_pass_def, etc.
- Are lots of features bad? What about redundant or mostly redundant features (pass yards, rush yards, total yards (total yards are either equal or very similar to pass+rush yards)). Which should I pick in that case (probably the less aggregated ones)?
diff --git a/nfl_analytics/assets/trained_model.joblib b/nfl_analytics/assets/trained_model.joblib
new file mode 100644
index 0000000..4df8b1a
Binary files /dev/null and b/nfl_analytics/assets/trained_model.joblib differ
diff --git a/nfl_analytics/assets/trained_scaler.joblib b/nfl_analytics/assets/trained_scaler.joblib
new file mode 100644
index 0000000..0fd9fb7
Binary files /dev/null and b/nfl_analytics/assets/trained_scaler.joblib differ
diff --git a/nfl_analytics/config.py b/nfl_analytics/config.py
new file mode 100644
index 0000000..01b025f
--- /dev/null
+++ b/nfl_analytics/config.py
@@ -0,0 +1,53 @@
+DATA_DIR = "data"
+ASSET_DIR = "assets"
+START_YEAR = 1999
+FEATURES = [
+ "away_rushing_avg",
+ "home_rushing_avg",
+ "away_passing_avg",
+ "home_passing_avg",
+ "away_sack_yards_avg",
+ "home_sack_yards_avg",
+ "away_score_differential_post_avg",
+ "home_score_differential_post_avg",
+ "away_points_scored_avg",
+ "home_points_scored_avg",
+ "away_points_allowed_avg",
+ "home_points_allowed_avg",
+ "away_mean_epa_avg",
+ "home_mean_epa_avg",
+]
+TEAMS = [
+ "WAS",
+ "ARI",
+ "BUF",
+ "NYJ",
+ "ATL",
+ "CAR",
+ "CIN",
+ "CLE",
+ "NYG",
+ "DAL",
+ "DET",
+ "KC",
+ "CHI",
+ "GB",
+ "BAL",
+ "HOU",
+ "IND",
+ "JAX",
+ "SEA",
+ "LA",
+ "LV",
+ "DEN",
+ "MIA",
+ "LAC",
+ "PHI",
+ "NE",
+ "PIT",
+ "SF",
+ "MIN",
+ "TB",
+ "NO",
+ "TEN",
+]
diff --git a/nfl_analytics/data.py b/nfl_analytics/data.py
index 7795aae..5f855c5 100644
--- a/nfl_analytics/data.py
+++ b/nfl_analytics/data.py
@@ -1,32 +1,65 @@
+"""
+Handles fetching and loading the play-by-play data. Essentially,
+everything before transforming it.
+"""
+
import urllib.request
+from urllib.error import HTTPError
import os
-import pandas as pd
import sqlite3
+import pandas as pd
-def get():
- years = range(1999, 2024)
+from nfl_analytics.config import DATA_DIR
- save_directory = "data"
- os.makedirs(save_directory, exist_ok=True)
+
+def download_data(years=range(1999, 2024)):
+ os.makedirs(DATA_DIR, exist_ok=True)
for year in years:
# year gets parsed from this filename and depends on this format
filename = f"play_by_play_{year}.csv.gz"
url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}"
- save_path = os.path.join(save_directory, filename)
+ save_path = os.path.join(DATA_DIR, filename)
print(f"Downloading {url}")
- urllib.request.urlretrieve(url, save_path)
+
+ try:
+ urllib.request.urlretrieve(url, save_path)
+ except HTTPError as e:
+ print(
+ f"Error: Failed to download data for {year}. HTTP Error {e.code}: {e.reason}. Season for that year may not exist yet."
+ )
-def load_pandas():
+def load_dataframe():
script_dir = os.path.dirname(os.path.abspath(__file__))
- data_directory = os.path.join(script_dir, "data")
+ data_directory = os.path.join(script_dir, DATA_DIR)
+ if not os.path.exists(data_directory):
+ raise FileNotFoundError(f"Data directory '{data_directory}' not found.")
+
+ files = os.listdir(data_directory)
+
+ if not files:
+        raise FileNotFoundError("No data files found in the data directory.")
+
+    # This won't pick up updated data (downloaded new data but the combined file still exists, so it would use that)
+ # # load saved combined from disk if exists
+ # combined_file_path = os.path.join(
+ # data_directory, "combined", "play_by_play_combined.parquet.gzip"
+ # )
+ # if not skip_combined and os.path.exists(combined_file_path):
+ # print(f"Reading combined file {combined_file_path}")
+ # combined_df = pd.read_parquet(combined_file_path)
+ # return combined_df
+ # else:
+ # print("Combined file does not exist. Loading individual files.")
+
+ # make combined dataframe from individual files
combined_df = pd.DataFrame()
- for filename in os.listdir(data_directory):
+ for filename in files:
if filename.endswith(".csv.gz"):
print(f"Reading {filename}")
file_path = os.path.join(data_directory, filename)
@@ -37,6 +70,9 @@ def load_pandas():
df["year"] = year
combined_df = pd.concat([combined_df, df], ignore_index=True)
+ if combined_df.empty:
+ raise FileNotFoundError("No data loaded from the files.")
+
return combined_df
@@ -46,12 +82,14 @@ def get_year_from_filename(filename):
def load_sqlite():
+    db_path = "/tmp/nfl-analytics.db"
# load into pandas first and use to_sql to infer datatypes
- df = load_pandas()
+ df = load_dataframe()
+
+    print(f"Loading into SQLite database: {db_path}")
table_name = "plays"
- db_conn = sqlite3.connect(database="/tmp/nfl-analytics.db")
- # TODO: remove drop table after developing?
+    db_conn = sqlite3.connect(database=db_path)
db_conn.execute(f"DROP TABLE IF EXISTS {table_name}")
df.to_sql(table_name, db_conn, index=False)
@@ -59,30 +97,6 @@ def load_sqlite():
print(cursor.fetchall())
-# def build():
-# # TODO: do all the things the dev notebook is doing. splitting into nice functions as necessary
-# # For example, could make a function for each time in notebook we are initializing a new dataframe (just a rough guide).
-# pass
-
-
-class Pipeline:
- def __init__(self, debug=False):
- self.debug = debug
- # self.df = pd.DataFrame()
-
- def _fetch_play_by_play(self, years=range(1999, 2024)):
- pass
-
- def _load(self):
- pass
-
- def _build(self):
- pass
-
- # def stuffthatbuildcalls (so I can run in the dev notebook)
- # if debug: true, print stuff
-
-
if __name__ == "__main__":
- get()
+ download_data()
load_sqlite()
diff --git a/nfl_analytics/dataframes.py b/nfl_analytics/dataframes.py
new file mode 100644
index 0000000..70d461e
--- /dev/null
+++ b/nfl_analytics/dataframes.py
@@ -0,0 +1,233 @@
+"""
+Builds the dataframes used for training and prediction.
+Handles everything between getting the data and training/using the model.
+"""
+
+import pandas as pd
+
+from nfl_analytics.data import load_dataframe
+
+
+def build_training_dataframe(df_running_avg=None):
+ if df_running_avg is None:
+ df_running_avg = build_running_avg_dataframe()
+
+ # Create a new column 'is_home' to indicate whether the team is playing at home
+ df_running_avg["is_home"] = df_running_avg.apply(
+ lambda row: True if row["team"] == row["home_team"] else False, axis=1
+ )
+
+ # Group by game_id and is_home and aggregate using the first value
+ squashed_df = (
+ df_running_avg.groupby(["game_id", "is_home"])[
+ [
+ "rushing_avg",
+ "passing_avg",
+ "yards_gained_avg",
+ "sack_yards_avg",
+ "passing_yards_defense_avg",
+ "rushing_yards_defense_avg",
+ "yards_gained_defense_avg",
+ "sack_yards_defense_avg",
+ "score_differential_post_avg",
+ "points_scored_avg",
+ "points_allowed_avg",
+ "mean_epa_avg",
+ ]
+ ]
+ .first()
+ .unstack()
+ )
+
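+    # Flatten the MultiIndex columns, e.g. ("rushing_avg", True) -> "home_rushing_avg"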
+ squashed_df.columns = [
+ f"{'home' if is_home else 'away'}_{col}" for col, is_home in squashed_df.columns
+ ]
+ squashed_df.reset_index(inplace=True)
+
+ # Merge with the original DataFrame to get the rest of the columns
+ return pd.merge(
+ df_running_avg[
+ [
+ "game_id",
+ "week",
+ "year",
+ "team",
+ "home_team",
+ "away_team",
+ "home_spread",
+ ]
+ ],
+ squashed_df,
+ on="game_id",
+ )
+
+
+def build_running_avg_dataframe(df_raw=None):
+ """
+    Builds a dataframe with weekly running averages for each team by year.
+    Used to create prediction inputs and to build the training dataset.
+ """
+ if df_raw is None:
+ df_raw = load_dataframe()
+
+ df_sacks = add_sack_yards(df_raw)
+ # df_game is team games stats by team: week 1, DET, 250 pass, 120 run, etc.
+ df_game_posteam = df_sacks.groupby(["game_id", "posteam"])
+ df_game = aggregate_game_stats(df_sacks, df_game_posteam)
+ df_game = adjust_game_dataframe(df_game, df_game_posteam)
+ df_running_avg = df_game[
+ [
+ "game_id",
+ "team",
+ "week",
+ "year",
+ "home_team",
+ "away_team",
+ "score_differential_post",
+ ]
+ ].copy()
+
+ # Set the home_spread
+ # This will be our target variable. It's the spread relative to the home team. We want this because we need to predict a single spread value (which we can then invert for the away team's spread).
+ df_running_avg["home_spread"] = df_game.apply(
+ lambda row: -row["score_differential_post"]
+ if row["team"] != row["home_team"]
+ else row["score_differential_post"],
+ axis=1,
+ )
+
+ # Get the running average for each team by team and year
+    # Uses shift() so the current row is not included in its own running average
+    # expanding() is a window that covers everything from the first row up to the current row
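+    # e.g. for weekly values [10, 20, 30] this yields [NaN, 10.0, 15.0] (the average of prior weeks only)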
+ df_running_avg[
+ [
+ "rushing_avg",
+ "passing_avg",
+ "yards_gained_avg",
+ "sack_yards_avg",
+ "passing_yards_defense_avg",
+ "rushing_yards_defense_avg",
+ "yards_gained_defense_avg",
+ "sack_yards_defense_avg",
+ "score_differential_post_avg",
+ "points_scored_avg",
+ "points_allowed_avg",
+ "mean_epa_avg",
+ ]
+ ] = (
+ df_game.groupby(["team", "year"])[
+ [
+ "rushing_yards",
+ "passing_yards",
+ "yards_gained",
+ "sack_yards",
+ "passing_yards_defense",
+ "rushing_yards_defense",
+ "yards_gained_defense",
+ "sack_yards_defense",
+ "score_differential_post",
+ "points_scored",
+ "points_allowed",
+ "mean_epa",
+ ]
+ ]
+ .apply(lambda x: x.shift().expanding().mean())
+ .reset_index(level=[0, 1], drop=True)
+ )
+
+ return df_running_avg
+
+
+def add_sack_yards(df_raw):
+ df = df_raw.copy()
+ # Sack yards would be necessary to get accurate TEAM passing stats.
+ # Team passing yards are sum(passing_yards) - sum(sack_yards)
+ # Player passing stats are simply sum(passing_yards).
+ df["sack_yards"] = pd.NA
+
+ # Set sack_yards to yards_gained for rows where sack is not equal to 0
+ df.loc[df["sack"] != 0, "sack_yards"] = df["yards_gained"]
+
+ return df
+
+
+def aggregate_game_stats(df_sacks, df_game_posteam):
+ # Group by game and team and combine offensive and defensive stats into single record
+
+ # Separate offensive and defensive stats
+ offensive_stats = (
+ df_game_posteam[
+ ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"]
+ ]
+ .sum()
+ .reset_index()
+ )
+ defensive_stats = (
+ df_sacks.groupby(["game_id", "defteam"])[
+ ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"]
+ ]
+ .sum()
+ .reset_index()
+ )
+
+ # Rename columns for defensive stats to distinguish them
+ defensive_stats.rename(
+ columns={
+ "defteam": "team",
+ "passing_yards": "passing_yards_defense",
+ "rushing_yards": "rushing_yards_defense",
+ "yards_gained": "yards_gained_defense",
+ "sack_yards": "sack_yards_defense",
+ },
+ inplace=True,
+ )
+
+ return pd.merge(
+ offensive_stats,
+ defensive_stats,
+ left_on=["game_id", "posteam"],
+ right_on=["game_id", "team"],
+ )
+
+
+def adjust_game_dataframe(df_game, df_game_posteam):
+ df = df_game.copy()
+
+ # Add home_team, away_team, home_score, away_score
+ df[["home_team", "away_team", "home_score", "away_score"]] = (
+ df_game_posteam[["home_team", "away_team", "home_score", "away_score"]]
+ .first()
+ .reset_index(drop=True)
+ )
+
+ df["points_scored"] = df.apply(
+ lambda row: row["home_score"]
+ if row["posteam"] == row["home_team"]
+ else row["away_score"],
+ axis=1,
+ )
+ df["points_allowed"] = df.apply(
+ lambda row: row["away_score"]
+ if row["posteam"] == row["home_team"]
+ else row["home_score"],
+ axis=1,
+ )
+
+ df.drop(["posteam"], axis=1, inplace=True)
+
+ # sets score differential to last value for each game and team
+ df[["score_differential_post", "week", "year"]] = (
+ df_game_posteam[["score_differential_post", "week", "year"]]
+ .last()
+ .reset_index(drop=True)
+ )
+
+ df["mean_epa"] = df_game_posteam["epa"].mean().reset_index(drop=True)
+
+ return df
+
+
+if __name__ == "__main__":
+ df_running_avg = build_running_avg_dataframe()
+ print(df_running_avg.tail())
+ df_train = build_training_dataframe()
+ print(df_train.tail())
diff --git a/nfl_analytics/data_dev.ipynb b/nfl_analytics/dev_notebook.ipynb
similarity index 91%
rename from nfl_analytics/data_dev.ipynb
rename to nfl_analytics/dev_notebook.ipynb
index edccb38..fa6ad43 100644
--- a/nfl_analytics/data_dev.ipynb
+++ b/nfl_analytics/dev_notebook.ipynb
@@ -7,17 +7,17 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "from nfl_analytics.data import load_pandas\n",
+ "from nfl_analytics.data import load_dataframe\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -29,494 +29,13 @@
"Reading play_by_play_2006.csv.gz\n",
"Reading play_by_play_2014.csv.gz\n",
"Reading play_by_play_2020.csv.gz\n",
- "Reading play_by_play_2016.csv.gz\n",
- "Reading play_by_play_2008.csv.gz\n",
- "Reading play_by_play_2004.csv.gz\n",
- "Reading play_by_play_2000.csv.gz\n",
- "Reading play_by_play_2012.csv.gz\n",
- "Reading play_by_play_2010.csv.gz\n",
- "Reading play_by_play_2002.csv.gz\n",
- "Reading play_by_play_2007.csv.gz\n",
- "Reading play_by_play_2019.csv.gz\n",
- "Reading play_by_play_2015.csv.gz\n",
- "Reading play_by_play_2023.csv.gz\n",
- "Reading play_by_play_2009.csv.gz\n",
- "Reading play_by_play_2017.csv.gz\n",
- "Reading play_by_play_2005.csv.gz\n",
- "Reading play_by_play_2021.csv.gz\n",
- "Reading play_by_play_1999.csv.gz\n",
- "Reading play_by_play_2001.csv.gz\n",
- "Reading play_by_play_2013.csv.gz\n",
- "Reading play_by_play_2011.csv.gz\n",
- "Reading play_by_play_2003.csv.gz\n"
+ "Reading play_by_play_2016.csv.gz\n"
]
- },
- {
- "data": {
- "text/plain": [
- " play_id game_id old_game_id home_team away_team season_type \\\n",
- "0 1 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n",
- "1 43 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n",
- "2 68 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n",
- "3 89 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n",
- "4 115 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n",
- "\n",
- " week posteam posteam_type defteam ... home_opening_kickoff qb_epa \\\n",
- "0 1 NaN NaN NaN ... 1 0.000000 \n",
- "1 1 NYJ home BAL ... 1 -0.443521 \n",
- "2 1 NYJ home BAL ... 1 1.468819 \n",
- "3 1 NYJ home BAL ... 1 -0.492192 \n",
- "4 1 NYJ home BAL ... 1 -0.325931 \n",
- "\n",
- " xyac_epa xyac_mean_yardage xyac_median_yardage xyac_success xyac_fd \\\n",
- "0 NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN \n",
- "3 0.727261 6.988125 6.0 0.60693 0.227598 \n",
- "4 NaN NaN NaN NaN NaN \n",
- "\n",
- " xpass pass_oe year \n",
- "0 NaN NaN 2022 \n",
- "1 NaN NaN 2022 \n",
- "2 0.440373 -44.037291 2022 \n",
- "3 0.389904 61.009598 2022 \n",
- "4 0.443575 -44.357494 2022 \n",
- "\n",
- "[5 rows x 373 columns]"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df = load_pandas()\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- " game_id old_game_id year week posteam\n",
- "0 2022_01_BAL_NYJ 2022091107 2022 1 NaN\n",
- "1 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "2 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "3 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "4 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "5 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "6 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n",
- "7 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n",
- "8 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n",
- "9 2022_01_BAL_NYJ 2022091107 2022 1 BAL"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[['game_id', 'old_game_id', 'year', 'week', 'posteam']].head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- " epa\n",
- "year posteam \n",
- "1999 ARI -0.100310\n",
- " ATL -0.073998\n",
- " BAL -0.043631\n",
- " BUF -0.014478\n",
- " CAR 0.059887\n",
- "... ...\n",
- "2023 SEA 0.015927\n",
- " SF 0.119782\n",
- " TB -0.007973\n",
- " TEN -0.031331\n",
- " WAS -0.084474\n",
- "\n",
- "[797 rows x 1 columns]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
- "df.groupby(['year', 'posteam'])[['epa']].mean()"
+ "df = load_dataframe()\n",
+ "df[df['year'] == 2023]['posteam'].unique()"
]
},
{
@@ -4602,7 +4121,6 @@
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
- "# Assuming your DataFrame is named df\n",
"# Drop rows with NaN for week 1\n",
"df_train = result_df[result_df['week'] > 1]\n",
"\n",
@@ -4637,7 +4155,6 @@
"# Split the data into training and testing sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
- "# TODO: do i need to scale the data? https://www.baeldung.com/cs/normalization-vs-standardization\n",
"# Standardize the features\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
@@ -4690,8 +4207,7 @@
}
],
"source": [
- "# Gut-checking the model with future (as of writing) conference titles games.\n",
- "# TODO: negate the target variable? I think training w/ positive spread for winner might not be right.\n",
+    "# Gut-checking the model with future (as of writing) conference title games, 01/28/2024.\n",
"\n",
"# Odds are -7 SF (7 DET). Some places -7.5 SF (7.5 DET).\n",
"# Very close - nice. This suggests taking Detroit to beat the spread although not by much.\n",
diff --git a/nfl_analytics/get_data.py b/nfl_analytics/get_data.py
deleted file mode 100644
index 7175bb1..0000000
--- a/nfl_analytics/get_data.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import urllib.request
-import os
-
-years = range(1999, 2024)
-
-save_directory = "data"
-os.makedirs(save_directory, exist_ok=True)
-
-for year in years:
- filename = f"play_by_play_{year}.csv.gz"
- url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}"
- save_path = os.path.join(save_directory, filename)
-
- urllib.request.urlretrieve(url, save_path)
diff --git a/nfl_analytics/load_data.py b/nfl_analytics/load_data.py
deleted file mode 100644
index b067057..0000000
--- a/nfl_analytics/load_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Loads csvs into pandas dataframe and sqlite db
-import os
-import pandas as pd
-import sqlite3
-
-data_directory = "nfl_analytics/data"
-combined_df = pd.DataFrame()
-
-for filename in os.listdir(data_directory):
- if filename.endswith(".csv.gz"):
- print(f"Reading {filename}")
- file_path = os.path.join(data_directory, filename)
-
- # Read the CSV file into a DataFrame and concat to combined df
- df = pd.read_csv(file_path, compression="gzip", low_memory=False)
- combined_df = pd.concat([combined_df, df], ignore_index=True)
-
-print(combined_df.head())
-
-table_name = "plays"
-db_conn = sqlite3.connect(database="/tmp/my.db")
-# TODO: remove drop table after developing?
-db_conn.execute(f"DROP TABLE IF EXISTS {table_name}")
-num_rows_inserted = combined_df.to_sql(table_name, db_conn, index=False)
-
-cursor = db_conn.execute(f"SELECT * from {table_name} LIMIT 10")
-print(cursor.fetchall())
diff --git a/nfl_analytics/main.py b/nfl_analytics/main.py
new file mode 100644
index 0000000..3de1b5f
--- /dev/null
+++ b/nfl_analytics/main.py
@@ -0,0 +1,111 @@
+import argparse
+import time
+
+from nfl_analytics.data import download_data, load_dataframe
+from nfl_analytics.model import (
+ train_model,
+ predict,
+ save_model_and_scaler,
+ load_model_and_scaler,
+)
+from nfl_analytics.dataframes import (
+ build_running_avg_dataframe,
+ build_training_dataframe,
+)
+from nfl_analytics.utils import is_valid_year
+from nfl_analytics.config import TEAMS
+
+
+# ROUGH CLI docs:
+# --download: optional. takes list of years. or if empty, defaults to downloading all years. usage: python main.py --download 2021 2022
+# --train: optional. if present, trains the model. usage: python main.py --train
+# --predict: optional. takes two arguments, home team and away team. usage: python main.py --predict "CHI" "MIN"
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Manage NFL Spread Predictor Pipeline")
+ parser.add_argument(
+ "--download",
+ nargs="*",
+ type=int,
+ metavar="year",
+ help="Download data for the specified years. The year corresponds to the season start.",
+ )
+ parser.add_argument(
+ "--train",
+ action="store_true",
+ help="Train the model using the downloaded data.",
+ )
+ parser.add_argument(
+ "--predict",
+ nargs=2,
+ metavar=("home_team", "away_team"),
+ help="Specify the home and away teams for prediction.",
+ )
+ args = parser.parse_args()
+
+ if args.download is not None:
+ if args.download:
+ year_set = set(args.download)
+ invalid_years = [year for year in year_set if not is_valid_year(year)]
+
+ if invalid_years:
+ print(
+ f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded."
+ )
+ else:
+ download_data(year_set)
+ else:
+ download_data()
+
+ if args.train:
+ print("Training model...")
+
+ start_time = time.time()
+ df_raw = load_dataframe()
+ end_time = time.time()
+ print(f"Loaded dataframe in {end_time - start_time} seconds")
+
+        # This won't pick up updated data (downloaded new data but the combined file still exists, so it would use that)
+ # Save combined dataframe to disk
+ # save_dir = os.path.join("data", "combined")
+ # os.makedirs(save_dir, exist_ok=True)
+ # save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip")
+ # df_raw.to_parquet(save_path, compression="gzip")
+
+ df_running_avg = build_running_avg_dataframe(df_raw)
+ df_training = build_training_dataframe(df_running_avg)
+ model, scaler = train_model(df_training)
+
+ save_model_and_scaler(model, scaler)
+
+ if args.predict:
+        # TODO: this will silently predict based on old data if that's all we have.
+ # Perhaps I should require the week/year in the predict fn? Or at least log
+ # year/week in predict?
+ home_team = args.predict[0].upper()
+ away_team = args.predict[1].upper()
+
+ for team in [home_team, away_team]:
+ if team not in TEAMS:
+ print(f"Invalid team: {team}")
+ return
+
+ if home_team == away_team:
+ print("Error: Home and away team cannot be the same.")
+ return
+
+ model, scaler = load_model_and_scaler()
+
+ # TODO: load directly from somewhere instead?
+ df_running_avg = build_running_avg_dataframe()
+
+ predicted_spread = predict(model, scaler, df_running_avg, home_team, away_team)
+
+ print(
+ f"Predicted spread for {home_team} (home) vs {away_team} (away): {predicted_spread}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/nfl_analytics/model.py b/nfl_analytics/model.py
new file mode 100644
index 0000000..9122d5c
--- /dev/null
+++ b/nfl_analytics/model.py
@@ -0,0 +1,152 @@
+import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from joblib import dump, load
+
+from nfl_analytics.config import FEATURES, ASSET_DIR
+
+
+def train_model(df_training):
+    # Drop week 1 because it is all NaN (there are no prior games to average yet)
+ df_train = df_training[df_training["week"] > 1]
+
+    # Don't use unnecessary columns like 'game_id', 'week', 'year', 'team', 'home_team', 'away_team'.
+    # Keep only the columns relevant for prediction.
+ target = "home_spread"
+ select_columns = FEATURES + [target]
+
+ df_train = df_train[select_columns]
+
+ # TODO: why are there missing values?
+ imputer = SimpleImputer(strategy="mean")
+ df_imputed = pd.DataFrame(imputer.fit_transform(df_train), columns=df_train.columns)
+
+ X = df_imputed.drop(target, axis=1)
+ y = df_imputed[target]
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ X, y, test_size=0.2, random_state=42
+ )
+
+ # Note: scaler is transformed by fit_transform. Must re-use the same scaler for prediction.
+ scaler = StandardScaler()
+ X_train_scaled = scaler.fit_transform(X_train)
+ X_test_scaled = scaler.transform(X_test)
+
+ model = LinearRegression()
+ model.fit(X_train_scaled, y_train)
+
+ y_pred = model.predict(X_test_scaled)
+
+ mse = mean_squared_error(y_test, y_pred)
+ mae = mean_absolute_error(y_test, y_pred)
+ print(f"Mean Squared Error: {mse}")
+ print(f"Mean Absolute Error: {mae}")
+
+ return model, scaler
+
+
+def save_model_and_scaler(model, scaler):
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ asset_dir = os.path.join(script_dir, ASSET_DIR)
+ os.makedirs(asset_dir, exist_ok=True)
+
+ dump(model, os.path.join(asset_dir, "trained_model.joblib"))
+ dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
+ print("Model and scaler saved")
+
+
+def load_model_and_scaler():
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ asset_dir = os.path.join(script_dir, ASSET_DIR)
+
+ model = load(os.path.join(asset_dir, "trained_model.joblib"))
+ scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
+ return model, scaler
+
+
+def predict(model, scaler, df_running_avg, home_team, away_team):
+ matchup = make_matchup(df_running_avg, home_team, away_team)
+ matchup_input = get_matchup_input(scaler, matchup)
+
+ return model.predict(matchup_input)[0]
+
+
+def make_matchup(df_running_avg, home_team, away_team, week=None, year=None):
+ """Merge given team/week/years stats into a single row.
+ To be used for predicting spreads for future games."""
+
+ df = df_running_avg.copy()
+
+ if year is None:
+ year = df["year"].max()
+
+ if week is None:
+ last_week = df[df["year"] == year]["week"].max()
+ week = last_week
+
+    # df_running_avg includes running averages prior to that week, and data about
+    # that week itself (teams, final scores, etc.) - basically (and literally at
+    # the time of writing) anything not suffixed with `_avg`. The data about the
+    # week itself is necessary for training the model but doesn't make sense in
+    # the context of predicting future games, so it is not included here.
+ cols = [
+ "rushing_avg",
+ "passing_avg",
+ "yards_gained_avg",
+ "sack_yards_avg",
+ "passing_yards_defense_avg",
+ "rushing_yards_defense_avg",
+ "yards_gained_defense_avg",
+ "sack_yards_defense_avg",
+ "score_differential_post_avg",
+ "points_scored_avg",
+ "points_allowed_avg",
+ "mean_epa_avg",
+ ]
+
+ # Select data for the specified week, home team, and away team in the specified year
+ home_data = (
+ df[(df["year"] == year) & (df["week"] == week) & (df["team"] == home_team)][
+ cols
+ ]
+ .add_prefix("home_")
+ .reset_index(drop=True)
+ )
+ away_data = (
+ df[(df["year"] == year) & (df["week"] == week) & (df["team"] == away_team)][
+ cols
+ ]
+ .add_prefix("away_")
+ .reset_index(drop=True)
+ )
+
+ return pd.concat([home_data, away_data], axis=1)
+
+
+def get_matchup_input(scaler, matchup):
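+    # scaler.transform expects a 2D array: a single row with len(FEATURES) columns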
+ reshaped_matchup = matchup[FEATURES].values.reshape(1, -1)
+ return scaler.transform(reshaped_matchup)
+
+
+if __name__ == "__main__":
+ from nfl_analytics.dataframes import (
+ build_running_avg_dataframe,
+ build_training_dataframe,
+ )
+
+ df_running_avg = build_running_avg_dataframe()
+ df_training = build_training_dataframe()
+ model, scaler = train_model(df_training)
+ print(make_matchup(df_running_avg, "KC", "SF").tail())
+    # First team is home, but this is the Super Bowl so neither is technically home.
+    # Week 22 (? it's the Super Bowl), 2023 (the 2023 SEASON; the calendar year is 2024)
+ kc_sf = predict(model, scaler, df_running_avg, "KC", "SF")
+ print(f"Prediction: {kc_sf}")
+ sf_kc = predict(model, scaler, df_running_avg, "SF", "KC")
+ print(f"Prediction: {sf_kc}")
diff --git a/nfl_analytics/train_dev.ipynb b/nfl_analytics/train_dev.ipynb
deleted file mode 100644
index e69de29..0000000
diff --git a/nfl_analytics/utils.py b/nfl_analytics/utils.py
new file mode 100644
index 0000000..ab02bc7
--- /dev/null
+++ b/nfl_analytics/utils.py
@@ -0,0 +1,17 @@
+import datetime
+
+from nfl_analytics.config import START_YEAR
+
+
+def is_valid_year(year):
+ current_year = datetime.datetime.now().year
+ return START_YEAR <= year <= current_year
+
+
+if __name__ == "__main__":
+ print(is_valid_year(1998)) # False
+ print(is_valid_year(1999)) # True
+ print(is_valid_year(2000)) # True
+ print(is_valid_year(2023)) # True
+ print(is_valid_year(2024)) # True
+ print(is_valid_year(2025)) # False (if current year is 2024)
diff --git a/poetry.lock b/poetry.lock
index b9eef52..d0e3e18 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -121,6 +121,87 @@ traitlets = ">=4"
[package.extras]
test = ["pytest"]
+[[package]]
+name = "cramjam"
+version = "2.8.1"
+description = "Thin Python bindings to de/compression algorithms in Rust"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:1afc66031e5165f4eae27e6e1f89c0def6c2ece826903ebb0194ee2f467ff8e6"},
+ {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4af4b74b16c22d6a0092d6a7db642ee097f4b0bfa0389d5a07552a2fc48eb0b6"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e858a56d12a61d0e19b2286a7661b36b52b3cad4fa84d8aaeb0b0ed0b2338d36"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc25199e44ee8ca55c62d6da2a74ea48e759058f2c96ca1e5d512aad6ce6005b"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:130aee295a77256aa867a4562961e99db56ddf8081df1e9bfb1607a409dcc4df"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ccfa252b800d6cdae8cc20728c41b94a39f5cadee8693ab2539ea02285e8e015"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8bd1cc5706b235655b742e8e350c13ffc2036efc098a192bd9f4df8b153c03e"},
+ {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dedae22ed4f8997ce584256b4a790354f314baa209a9015aa1ab6e383f6177c5"},
+ {file = "cramjam-2.8.1-cp310-none-win32.whl", hash = "sha256:f1af221507fbcd5bd46a92d41ca60410400624328b5c92ec2efb13deca86b6e9"},
+ {file = "cramjam-2.8.1-cp310-none-win_amd64.whl", hash = "sha256:94017d3057d53bad33ec4ef46409a2b410a67c0905eb094b441c1687ca1f738a"},
+ {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:033b1720718c68cacab1dbeee6d8e8e7ed851ac9abb8aca6ae9cc869e7087df8"},
+ {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4597d720efdd92f5a4f09289d08f0fcf690ae8a16295dd356dd66fcc4aa10e30"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ddafdb489ad4b84cb7e8c8423a21f37a5e6dfe7e32847bd0048f680d703184cf"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbf9df8f296422410722fb5349159432eea11ac5fec7bf59213c7148f3efa04"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e588b11966152234f8ffa95089dd447a7ec299ea319acee15a78387cfc267ba9"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9050a662c484752b18a07ba0bd01601f229f40152d374ea79c91da239d6b929b"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a909ef9b6099e4df03a7b602d75e0c43c23359f21cd138886a51f7ae268fffc9"},
+ {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a6b06704e99a116dbdae78c4fef47d41bec1ef08a911c50807cde608d99764b"},
+ {file = "cramjam-2.8.1-cp311-none-win32.whl", hash = "sha256:9f069be2b8cfa16d2ee5c1fc7cb6864f8d5db03e31d0f07a38a227dbd470d1ad"},
+ {file = "cramjam-2.8.1-cp311-none-win_amd64.whl", hash = "sha256:0f50414cc2f216d0c4d311e81412bf32fda6e4d1e867de2f4bdd2c1d05319d00"},
+ {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:000e2321226b83fee2fa6b353d649f034266fa6d42d5aa3521c7457ca7fa3118"},
+ {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:80373dccc3dc25446d5d5ae7c8641daa2dbd4973c98bd2ab3e113fa57a653647"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bc2d6ccaa164ba122b724f4958821d08ed7c49a90662ce800c24021c5900cbbe"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a726fb5ee28e4258215c327d65bf6791f25e7742a5d262352e2c344ac683a0f8"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d087048ef9244cf42b8958ec9eabacbf3c337fa0697a4573e99b189253f312d4"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4230d22a249ae91459ceae15e55abac5cef983042ddbd0484e2863034dd63725"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd52a1767750b3a83308179e3ff0b3465551c8f4419676847300989582e55a61"},
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16f327dd8d299ac137bf3288e89772187533d9da367bd3d270f5546273e7ce8"},
+ {file = "cramjam-2.8.1-cp312-none-win32.whl", hash = "sha256:d7118d61d7a037f80ef6a2d809cd01492b574983947593570ac757d2a87eae6e"},
+ {file = "cramjam-2.8.1-cp312-none-win_amd64.whl", hash = "sha256:465f868c14b921af0cb66f7bfa4f3c7a131cc47de1c9390dfa1d96fbe90778de"},
+ {file = "cramjam-2.8.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77c952be40131c47ebbcb534862530ba67a80c86a7099532cace517109c2b428"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8e5b8f4d23e0a4a97381ded35a6a7fe839eba1b0372c1e74ee3713adc59f5c1c"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d1c2b51b0754736f20c1ea3638118a7acd60cc5ab32c6751e247cd6ada15c79"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d3b3d1167928bf0ad846239758a8fd6536d560886b63174359392686721e8902"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f1b2fd1108b040a822e8d6c32aebe46275095128783f944971c3b37671c6af0"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c676bb517a2edd809684e3495eccb7868bd5f1be0684853f226166aa54ed072"},
+ {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133bc01739c897ebf7dcfb3e0fe11b4ca5851bcda1e021c4877e11cf1712a4e5"},
+ {file = "cramjam-2.8.1-cp37-none-win32.whl", hash = "sha256:b85e8c7380ee7a1b6b530764f929a0766b5ee1d8cbbfa0c0fca2c08415a7532f"},
+ {file = "cramjam-2.8.1-cp37-none-win_amd64.whl", hash = "sha256:6611b41ff76ff252ce67acc13a83aae17ea3131e5926fa1a96e266ff803d4d67"},
+ {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:db7086436d50d6fb2c2b38cfbec5a860a2595cbed0e7a3e541c05180f5e26005"},
+ {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2332835c2db911cc572e6604860e5f2da439833f855ed1c1dac96fd5fc1025a"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4b2bdcaeaa649efa4c41e7add81a205359c7540c58766151488955ebe12afa9f"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3f8d44cbf2f680a39f677cafbadb3353ee09ce9d2ed51b52eba5261cb3b935a"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6d283676a5979236011bd3daed39560e5d0f39df6cceab31a291d5203a36cde"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dd3787a6f9bbfea1a74aa9457d73eae18393de67cab03f89ac20beaaed43cb8d"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0200d1530c9ae9c0988e2878a08624de1cb2b841eea2dcb9a10a236ff43747c"},
+ {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acba9d6fe6653e1f387aee53cc8d6d9d90d5416b29a233e40dcdf84e6b1002b9"},
+ {file = "cramjam-2.8.1-cp38-none-win32.whl", hash = "sha256:6d4e357d98d5a4ac7291b88a4e54adcb85506a00aa9e72fa222a2caa3b881828"},
+ {file = "cramjam-2.8.1-cp38-none-win_amd64.whl", hash = "sha256:960f0a65db3de5f92762c0dfcad514982ea888f300677d83a1d3eb46eee6b109"},
+ {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:9ccbc9b23b42aa65cf6878908663e73aeb55971b9e3d0ce3c7733b387934f030"},
+ {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bfac3506ed0c21b4b2a27af50de9ad2bdd5cc02aed977782fa01d21cc6f54de9"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1f3ef0d616fb8f4d8eb75b3b22653385b88fe493895d763f5f235c7c6d64e570"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f18a8d637ac9497e0be2a8fa49cca52259196d18ca73b36a385b6d925cb21b7"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d21a370320b6959d7bd581ca286439a113e325477cc92aefb0be378bd351323a"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c2d49a2f8e2a4501c460ad8452ce269c9ddd5c4671ad41cf41104a3fa6ca2e7"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8366b00f578ea1b202f20b5ebab622ac10599d1b08c36ed9089e27a452c76d2e"},
+ {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ab8228f4b343e08dd2f4e2a714427979dfd25184cde08198c1d149249ab29a9"},
+ {file = "cramjam-2.8.1-cp39-none-win32.whl", hash = "sha256:a280e51ea157bc2dd9dae2751acfa51bccc0453ce6d99521c18b73719d724b56"},
+ {file = "cramjam-2.8.1-cp39-none-win_amd64.whl", hash = "sha256:2a209e5b7f1e62d8dc27278948176391d35defd0202cd9b0d577126073a781a5"},
+ {file = "cramjam-2.8.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3ca01ea39e0b9c7938fe4a5b73dbcd0f43c2b0aaa04de39fe8deb1b69d4a59c0"},
+ {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a83ae10a853fe93882d32da53f50a62927b6408cf486f7475077b04a93c3475"},
+ {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645a226ae669c26b2232ed30ec0ed34688da3fe6473b3998edc3e11af9a6cb3"},
+ {file = "cramjam-2.8.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:02dc04049680ab834feb092d6d1b6632eb03ba9069b2415a7fe6515d42fe10e0"},
+ {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c447fb8a01ae435fd7ed3d1bebeb168aa35c5d9edb0326d6207381b54b8f558"},
+ {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c085d020fc15e284640d2ed3a8b121a26a06c0ddfcba87b968460421a565753a"},
+ {file = "cramjam-2.8.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:19af180433b5c25ce25698edf28f26a9f66626a2803a2f1682fbc83d38a7842f"},
+ {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4a259336a60c8c2d4067e19ea78903e55b1e2963831f9f5a5b6d9a395088974"},
+ {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af6331df89641d3dd2fa40b32a7d22353ac6bb62c87baa28ce4073bebfc8f686"},
+ {file = "cramjam-2.8.1.tar.gz", hash = "sha256:f7e233791761b61e3d85598b25db75868d9f1cd35b2bd10eb34ec4755ff2add1"},
+]
+
+[package.extras]
+dev = ["black (==22.3.0)", "hypothesis", "numpy", "pytest (>=5.30)", "pytest-xdist"]
+
[[package]]
name = "debugpy"
version = "1.8.0"
@@ -173,6 +254,98 @@ files = [
[package.extras]
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
+[[package]]
+name = "fastparquet"
+version = "2023.10.1"
+description = "Python support for Parquet file format"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "fastparquet-2023.10.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:75a00475e96d26214dace147b27ab782da7a0ae230cade05ea9181c3aec2e637"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0c1d5559aa0a4fff8eb3b301c8177b6813bb15fe9d2007ad0dc89f8fa519c5"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b798cdfa8f01cd573b135a493a4d0686ebbcd3a412d6e59889a7ae41ff90efeb"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a746f521da7459707899fc33b334b2d21f590380f472fc27642f3ef28ee451d2"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e86d64898e846ed0f5745427436e5772fd7bb5d9a930f1dca8233e90385e126b"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5c3afafd4a0907216f5ee4f835f47ad16b84c5dede4c5ca4c0754dffe3eb72d7"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:68d26a1172be5b02018f6c28603f195807955d8262b913349385d977f3ae081f"},
+ {file = "fastparquet-2023.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:b7086ca3a0d8ae8680b380da9b7057a1491d629945b1dd228eba5b362e2e39aa"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f8d53f5e5049b21893964cd27154c2a7c8180f3ffd1f2693f80e0f834a3a35e"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ea74f28494fda892641a564f728d046a074fdea5b9ff664ef9554c0da563bad4"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab978612d21630033df0a0b12423ed826fe36e83a1710b155968c3c6e2b3174a"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc30c502feaa67c058c496eb4a734eba8bd373f0d24a32cc69360c79f7220ef"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99568ae6bbbd973b32d796cb664ba156b101e5d1931dba780fe2dc0d9b227dfd"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:53b9ad8d646c2609854cbe7d7b17be343664cabae1cd0eb119011e389df8484d"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b2a9ee49039223a1e216c409c25072be1f362de27197cbec5f90cf2e736df3b0"},
+ {file = "fastparquet-2023.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:9133d2f975c6e05187be4b558060e6a4aafeba02dceaf849cf6ad46d32e59405"},
+ {file = "fastparquet-2023.10.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b826696cd48f1defb6fcafb4c9798102233e54f3f3491251c034dde3d94f420a"},
+ {file = "fastparquet-2023.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bf2d58bee17e0eea8565c2bcd2b339ee032472751651e21f000eb564ad3cd5cf"},
+ {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9296098d06c6692ee477fe491381eda39fc0dcfe2fce210496491fe16ce27ef8"},
+ {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c975d648ea491e684135e9e3c0a15b440d66d0772fe497269e5c9c4eaaeb62a2"},
+ {file = "fastparquet-2023.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5208db1f38c8ac5f50f309f77bdb828fa7f247b82e2df88d847ad3bec38903"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:118d1832ed2098f313936044012083c8c1b07da85ee11612895f3c4ef27bfd8a"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:35cff597d2778b6fe8ef7dc36cba056cd7337151dbfc2fb08abaa6b109c75140"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da633a0bd1889f30acb1b6dffa99832739802d0ae5f455b4e5eb720ab701e09"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8aff041900ebcb4c2510bede80695fed35fb7c24dfd83b60ba8b56d7ede4e0fe"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62aabf43b6bbbc074b89f9a4769f7276204b6585d2d8fae770a0b782da5b9fc9"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ba85d9e5c298515a347f94bc65c0b570391b344d765dc349bafb35137466ddb2"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2be7d33969e724c8aa777122d6032845a362cb2075f6e6f2c5b2150bd6223cc8"},
+ {file = "fastparquet-2023.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:2a0c1f485d3085fe98dbae9ead2e97a886deb99d3db7af635296bfd3f4f2f814"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1fac5319aabcbc4acc2feb5df68336de755de7d60a2ee9329fef178ac016e236"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c61d26705e9a2ad2d52ed1d527c75e96e6a9a04be35bd4c8d6f4accd778f9b05"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2635f0f37a983e35be9b8013b84361e3d0cdd4f514b822016445c029b1c6e007"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde8f6798d37e2af38ada058fc7018c2157d90a8dd728c0c59fab85b8adb9215"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20c17c5e7186723a175c9e7da94285bdef3cb477cb7cca0e2812b1e245279671"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:91ee6b5b0efc18586e61da6662119de92fc7bf552c3a08a13eb2af16bc12f16a"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:332cb3b204e1de64dcfc4c5d0b517ea665856d19c139f693e8c9efc11992e19e"},
+ {file = "fastparquet-2023.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:5eb06a70daf50d70290b87f3a5ca6f25eb24ad850bcc68197b5438d92b11c763"},
+ {file = "fastparquet-2023.10.1.tar.gz", hash = "sha256:076fedfba2b56782b4823c1d351424425cfeaa5b8644c542416ca1363fe6d921"},
+]
+
+[package.dependencies]
+cramjam = ">=2.3"
+fsspec = "*"
+numpy = ">=1.20.3"
+packaging = "*"
+pandas = ">=1.5.0"
+
+[package.extras]
+lzo = ["python-lzo"]
+
+[[package]]
+name = "fsspec"
+version = "2023.12.2"
+description = "File-system specification"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "fsspec-2023.12.2-py3-none-any.whl", hash = "sha256:d800d87f72189a745fa3d6b033b9dc4a34ad069f60ca60b943a63599f5501960"},
+ {file = "fsspec-2023.12.2.tar.gz", hash = "sha256:8548d39e8810b59c38014934f6b31e57f40c1b20f911f4cc2b85389c7e9bf0cb"},
+]
+
+[package.extras]
+abfs = ["adlfs"]
+adl = ["adlfs"]
+arrow = ["pyarrow (>=1)"]
+dask = ["dask", "distributed"]
+devel = ["pytest", "pytest-cov"]
+dropbox = ["dropbox", "dropboxdrivefs", "requests"]
+full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"]
+fuse = ["fusepy"]
+gcs = ["gcsfs"]
+git = ["pygit2"]
+github = ["requests"]
+gs = ["gcsfs"]
+gui = ["panel"]
+hdfs = ["pyarrow (>=1)"]
+http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"]
+libarchive = ["libarchive-c"]
+oci = ["ocifs"]
+s3 = ["s3fs"]
+sftp = ["paramiko"]
+smb = ["smbprotocol"]
+ssh = ["paramiko"]
+tqdm = ["tqdm"]
+
[[package]]
name = "ipykernel"
version = "6.29.0"
@@ -957,4 +1130,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.12"
-content-hash = "e9e2ee4b6fdb1aba77a21d7e986f962a6e87ed9bec9353ad443ad7129a575d6b"
+content-hash = "56a680440d968ef6670819b80fb63d6a7806417f8582fbe94b467a8f6d4a886b"
diff --git a/pyproject.toml b/pyproject.toml
index bfc36a8..27c9a58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ python = "^3.12"
pandas = "^2.2.0"
ipykernel = "^6.29.0"
scikit-learn = "^1.4.0"
+fastparquet = "^2023.10.1"
[build-system]