diff --git a/README.md b/README.md
index d2c65b5..40602f7 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Then I would train the model on all the games I can on a game-by-game basis. So
 - could be useful for comparing the accuracy of my model. in particular "Distribution of the deviation of the final margin of victory from the Vegas spread"
 - for example, perhaps the avg spread difference between Vegas and reality is ~10, so a model with an average difference of 8 would be good
 - a concise little overview on features from a datascience.exchange comment about predicting matches (NOTE: not spread): https://datascience.stackexchange.com/questions/102827/how-to-predict-the-winner-of-a-future-sports-match
+- article on when you need to scale data for ML: https://www.baeldung.com/cs/normalization-vs-standardization

# TODO:

@@ -89,6 +90,18 @@ score differential is wrong? look at first game. the number for the 2 teams don't
 - gradient boosting (better with non-linear)?
 - [x] some sort of basic analysis to see how it performed, including manually comparing to the Vegas spread (maybe I can find an average difference? https://www.theonlycolors.com/2020/9/29/21492301/vegas-always-knows-a-mathematical-deep-dive)
   - 9-10 pt avg difference (?). a normal distribution means ~68% will be within 1 std deviation (identified as 14-15). could be a little lower because 1, 2, etc. are within 14-15, but could be higher because ~32% will be more than 14-15.
+- [x] add function to create matchups from 2 teams so we can predict next week's games.
+  - using the running_avg df to merge, similar to how we're merging on game_id to get the final training df
+  - in practice the merged records should share a week, but in theory they could be different (week 12 Detroit vs. week 6 Ravens, etc.).
+- [x] cli
+  - [x] download data
+  - [x] train model
+    - what to do with it? save the configuration then recreate it when needed? pickle?
+  - [x] predict spread
+- [ ] github workflow
+  - [ ] periodically update the data (and release?)
+  - [ ] periodically train the model (and release? release what? the configuration... as what filetype? json?)
+  - [ ] periodically get upcoming games and make predictions. publish on github pages. get the bookie spread too?
 - [ ] improve features/model. either at the game aggregation level or the team @ week aggregation level
   - [ ] W/L record or games played and win pct? (win and loss column on game aggregation)
   - [ ] success rate (calculate success (0 or 1) from each play).
@@ -97,9 +110,11 @@ score differential is wrong? look at first game. the number for the 2 teams don't
   - [x] total points scored/allowed
   - [ ] maybe don't use the first ~3 games? small sample size, but don't want to throw out too much data.
   - [ ] games played (could be used as confidence in record/stats)
-- [x] add function to create matchups from 2 teams so we can predict next week's games.
-  - using the running_avg df to merge, similar to how we're merging on game_id to get the final training df
-  - in practice the merged records should share a week, but in theory they could be different (week 12 Detroit vs. week 6 Ravens, etc.).
+- [ ] rethink exposing build_running_avg_dataframe, build_training_dataframe instead of doing that inside train_model (with the side effect of saving the build_running_avg_dataframe result (to disk?) somewhere).
+  - just need to see how it's actually used
+  - I guess it's good for development purposes? maybe just make the df arg in train_model(df) optional and build it from scratch when not provided; the CLI/deployment path would omit it, while development can pass a prebuilt df (see the sketch below).
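The optional-df idea in that last TODO could look like this; a minimal sketch, assuming `train_model` keeps the signature from `model.py` and reuses `build_training_dataframe` from `dataframes.py` (the build-when-missing default is the proposal, not existing code):

```python
# Hypothetical variant of train_model(df): build the training frame only
# when the caller (e.g. the CLI/deployment path) does not supply one.
from nfl_analytics.dataframes import build_training_dataframe


def train_model(df_training=None):
    if df_training is None:
        df_training = build_training_dataframe()
    # ... training logic unchanged: impute, scale, fit, report MSE/MAE ...
```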
+- [ ] write script that gets upcoming games and makes predictions from the model.
+  - try to find a good source for the schedule (nflfastR for that too maybe?).

# Current status:

@@ -123,6 +138,29 @@ score differential is wrong? look at first game. the number for the 2 teams don't

# Stray thoughts:

+- model name idea: caliper. (like measuring the "spread")
+- save the model by pickling with joblib/dump, or save the configuration like:
+
+```python
+    # Save essential components (assumes linreg - does it work the same for others?)
+    coefficients = model.coef_
+    intercept = model.intercept_
+    # assumes using minmaxscaler (but maybe I'm not)
+    scaler_params = {'min_values': scaler.min_, 'scale_values': scaler.scale_}
+
+    # Recreate the model
+    recreated_model = LinearRegression()
+    recreated_model.coef_ = coefficients
+    recreated_model.intercept_ = intercept
+
+    # Recreate the scaler
+    recreated_scaler = MinMaxScaler()
+    recreated_scaler.min_ = scaler_params['min_values']
+    recreated_scaler.scale_ = scaler_params['scale_values']
+```
+
+- I think saving the configuration is probably better if I can (see the JSON sketch below).
+
 - What should the model guess _exactly_, and what does that say about how the teams are modeled in the input? the spread consists of 2 numbers (usually the inverse of each), 1 for each team. Maybe just predict the home team?
 - probably need to squash 2 teams into 1 line like: home_team_pass_off, home_team_pass_def, away_team_pass_off, away_team_pass_def, etc.
 - Are lots of features bad? What about redundant or mostly redundant features (pass yards, rush yards, total yards (total yards are either equal or very similar to pass+rush yards))? Which should I pick in that case (probably the less aggregated ones)?
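If the configuration route wins out over joblib pickles, here is a hedged sketch of what the round trip might look like with the `StandardScaler` + `LinearRegression` pair that `model.py` actually trains; the `model_config.json` filename and both helper names are made up for illustration:

```python
import json

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def save_config(model, scaler, path="model_config.json"):
    # Everything needed to rebuild the fitted pair, as plain JSON.
    config = {
        "coef": model.coef_.tolist(),
        "intercept": float(model.intercept_),
        "scaler_mean": scaler.mean_.tolist(),
        "scaler_scale": scaler.scale_.tolist(),
    }
    with open(path, "w") as f:
        json.dump(config, f)


def load_config(path="model_config.json"):
    with open(path) as f:
        config = json.load(f)
    model = LinearRegression()
    model.coef_ = np.array(config["coef"])
    model.intercept_ = config["intercept"]
    scaler = StandardScaler()
    scaler.mean_ = np.array(config["scaler_mean"])
    scaler.scale_ = np.array(config["scaler_scale"])
    return model, scaler
```

A JSON file would also answer the "release what? as what filetype?" question in the workflow TODO, since it diffs and versions more cleanly than a binary joblib artifact.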
+""" + import urllib.request +from urllib.error import HTTPError import os -import pandas as pd import sqlite3 +import pandas as pd -def get(): - years = range(1999, 2024) +from nfl_analytics.config import DATA_DIR - save_directory = "data" - os.makedirs(save_directory, exist_ok=True) + +def download_data(years=range(1999, 2024)): + os.makedirs(DATA_DIR, exist_ok=True) for year in years: # year gets parsed from this filename and depends on this format filename = f"play_by_play_{year}.csv.gz" url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}" - save_path = os.path.join(save_directory, filename) + save_path = os.path.join(DATA_DIR, filename) print(f"Downloading {url}") - urllib.request.urlretrieve(url, save_path) + + try: + urllib.request.urlretrieve(url, save_path) + except HTTPError as e: + print( + f"Error: Failed to download data for {year}. HTTP Error {e.code}: {e.reason}. Season for that year may not exist yet." + ) -def load_pandas(): +def load_dataframe(): script_dir = os.path.dirname(os.path.abspath(__file__)) - data_directory = os.path.join(script_dir, "data") + data_directory = os.path.join(script_dir, DATA_DIR) + if not os.path.exists(data_directory): + raise FileNotFoundError(f"Data directory '{data_directory}' not found.") + + files = os.listdir(data_directory) + + if not files: + raise FileNotFoundError(f"No data files found in the data directory.") + + # This wont pick on updated data (downlaoded new data but still have combined, so it will use that) + # # load saved combined from disk if exists + # combined_file_path = os.path.join( + # data_directory, "combined", "play_by_play_combined.parquet.gzip" + # ) + # if not skip_combined and os.path.exists(combined_file_path): + # print(f"Reading combined file {combined_file_path}") + # combined_df = pd.read_parquet(combined_file_path) + # return combined_df + # else: + # print("Combined file does not exist. Loading individual files.") + + # make combined dataframe from individual files combined_df = pd.DataFrame() - for filename in os.listdir(data_directory): + for filename in files: if filename.endswith(".csv.gz"): print(f"Reading {filename}") file_path = os.path.join(data_directory, filename) @@ -37,6 +70,9 @@ def load_pandas(): df["year"] = year combined_df = pd.concat([combined_df, df], ignore_index=True) + if combined_df.empty: + raise FileNotFoundError("No data loaded from the files.") + return combined_df @@ -46,12 +82,14 @@ def get_year_from_filename(filename): def load_sqlite(): + db_dir = "/tmp/nfl-analytics.db" # load into pandas first and use to_sql to infer datatypes - df = load_pandas() + df = load_dataframe() + + print(f"Loading into SQLite database: {db_dir}") table_name = "plays" - db_conn = sqlite3.connect(database="/tmp/nfl-analytics.db") - # TODO: remove drop table after developing? + db_conn = sqlite3.connect(database=db_dir) db_conn.execute(f"DROP TABLE IF EXISTS {table_name}") df.to_sql(table_name, db_conn, index=False) @@ -59,30 +97,6 @@ def load_sqlite(): print(cursor.fetchall()) -# def build(): -# # TODO: do all the things the dev notebook is doing. splitting into nice functions as necessary -# # For example, could make a function for each time in notebook we are initializing a new dataframe (just a rough guide). 
diff --git a/nfl_analytics/dataframes.py b/nfl_analytics/dataframes.py
new file mode 100644
index 0000000..70d461e
--- /dev/null
+++ b/nfl_analytics/dataframes.py
@@ -0,0 +1,233 @@
+"""
+Builds the dataframes used for training and prediction.
+Handles everything between getting the data and training/using the model.
+"""
+
+from nfl_analytics.data import load_dataframe
+import pandas as pd
+
+
+def build_training_dataframe(df_running_avg=None):
+    if df_running_avg is None:
+        df_running_avg = build_running_avg_dataframe()
+
+    # Create a new column 'is_home' to indicate whether the team is playing at home
+    df_running_avg["is_home"] = df_running_avg.apply(
+        lambda row: True if row["team"] == row["home_team"] else False, axis=1
+    )
+
+    # Group by game_id and is_home and aggregate using the first value
+    squashed_df = (
+        df_running_avg.groupby(["game_id", "is_home"])[
+            [
+                "rushing_avg",
+                "passing_avg",
+                "yards_gained_avg",
+                "sack_yards_avg",
+                "passing_yards_defense_avg",
+                "rushing_yards_defense_avg",
+                "yards_gained_defense_avg",
+                "sack_yards_defense_avg",
+                "score_differential_post_avg",
+                "points_scored_avg",
+                "points_allowed_avg",
+                "mean_epa_avg",
+            ]
+        ]
+        .first()
+        .unstack()
+    )
+
+    squashed_df.columns = [
+        f"{'home' if is_home else 'away'}_{col}" for col, is_home in squashed_df.columns
+    ]
+    squashed_df.reset_index(inplace=True)
+
+    # Merge with the original DataFrame to get the rest of the columns
+    return pd.merge(
+        df_running_avg[
+            [
+                "game_id",
+                "week",
+                "year",
+                "team",
+                "home_team",
+                "away_team",
+                "home_spread",
+            ]
+        ],
+        squashed_df,
+        on="game_id",
+    )
+
+
+def build_running_avg_dataframe(df_raw=None):
+    """
+    Builds a dataframe with weekly running averages for each team by year.
+    Used to create prediction inputs and to build the training dataset.
+    """
+    if df_raw is None:
+        df_raw = load_dataframe()
+
+    df_sacks = add_sack_yards(df_raw)
+    # df_game is per-team game stats: week 1, DET, 250 pass, 120 run, etc.
+    df_game_posteam = df_sacks.groupby(["game_id", "posteam"])
+    df_game = aggregate_game_stats(df_sacks, df_game_posteam)
+    df_game = adjust_game_dataframe(df_game, df_game_posteam)
+    df_running_avg = df_game[
+        [
+            "game_id",
+            "team",
+            "week",
+            "year",
+            "home_team",
+            "away_team",
+            "score_differential_post",
+        ]
+    ].copy()
+
+    # Set the home_spread. This will be our target variable: the spread
+    # relative to the home team. We want this because we need to predict a
+    # single spread value (which we can then invert for the away team's spread).
+ df_running_avg["home_spread"] = df_game.apply( + lambda row: -row["score_differential_post"] + if row["team"] != row["home_team"] + else row["score_differential_post"], + axis=1, + ) + + # Get the running average for each team by team and year + # Uses lambda and shift to not include current row in running average + # Expand is an expanding window function that gets everything from the first to current row + df_running_avg[ + [ + "rushing_avg", + "passing_avg", + "yards_gained_avg", + "sack_yards_avg", + "passing_yards_defense_avg", + "rushing_yards_defense_avg", + "yards_gained_defense_avg", + "sack_yards_defense_avg", + "score_differential_post_avg", + "points_scored_avg", + "points_allowed_avg", + "mean_epa_avg", + ] + ] = ( + df_game.groupby(["team", "year"])[ + [ + "rushing_yards", + "passing_yards", + "yards_gained", + "sack_yards", + "passing_yards_defense", + "rushing_yards_defense", + "yards_gained_defense", + "sack_yards_defense", + "score_differential_post", + "points_scored", + "points_allowed", + "mean_epa", + ] + ] + .apply(lambda x: x.shift().expanding().mean()) + .reset_index(level=[0, 1], drop=True) + ) + + return df_running_avg + + +def add_sack_yards(df_raw): + df = df_raw.copy() + # Sack yards would be necessary to get accurate TEAM passing stats. + # Team passing yards are sum(passing_yards) - sum(sack_yards) + # Player passing stats are simply sum(passing_yards). + df["sack_yards"] = pd.NA + + # Set sack_yards to yards_gained for rows where sack is not equal to 0 + df.loc[df["sack"] != 0, "sack_yards"] = df["yards_gained"] + + return df + + +def aggregate_game_stats(df_sacks, df_game_posteam): + # Group by game and team and combine offensive and defensive stats into single record + + # Separate offensive and defensive stats + offensive_stats = ( + df_game_posteam[ + ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"] + ] + .sum() + .reset_index() + ) + defensive_stats = ( + df_sacks.groupby(["game_id", "defteam"])[ + ["passing_yards", "rushing_yards", "yards_gained", "sack_yards"] + ] + .sum() + .reset_index() + ) + + # Rename columns for defensive stats to distinguish them + defensive_stats.rename( + columns={ + "defteam": "team", + "passing_yards": "passing_yards_defense", + "rushing_yards": "rushing_yards_defense", + "yards_gained": "yards_gained_defense", + "sack_yards": "sack_yards_defense", + }, + inplace=True, + ) + + return pd.merge( + offensive_stats, + defensive_stats, + left_on=["game_id", "posteam"], + right_on=["game_id", "team"], + ) + + +def adjust_game_dataframe(df_game, df_game_posteam): + df = df_game.copy() + + # Add home_team, away_team, home_score, away_score + df[["home_team", "away_team", "home_score", "away_score"]] = ( + df_game_posteam[["home_team", "away_team", "home_score", "away_score"]] + .first() + .reset_index(drop=True) + ) + + df["points_scored"] = df.apply( + lambda row: row["home_score"] + if row["posteam"] == row["home_team"] + else row["away_score"], + axis=1, + ) + df["points_allowed"] = df.apply( + lambda row: row["away_score"] + if row["posteam"] == row["home_team"] + else row["home_score"], + axis=1, + ) + + df.drop(["posteam"], axis=1, inplace=True) + + # sets score differential to last value for each game and team + df[["score_differential_post", "week", "year"]] = ( + df_game_posteam[["score_differential_post", "week", "year"]] + .last() + .reset_index(drop=True) + ) + + df["mean_epa"] = df_game_posteam["epa"].mean().reset_index(drop=True) + + return df + + +if __name__ == "__main__": + df_running_avg = 
build_running_avg_dataframe() + print(df_running_avg.tail()) + df_train = build_training_dataframe() + print(df_train.tail()) diff --git a/nfl_analytics/data_dev.ipynb b/nfl_analytics/dev_notebook.ipynb similarity index 91% rename from nfl_analytics/data_dev.ipynb rename to nfl_analytics/dev_notebook.ipynb index edccb38..fa6ad43 100644 --- a/nfl_analytics/data_dev.ipynb +++ b/nfl_analytics/dev_notebook.ipynb @@ -7,17 +7,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from nfl_analytics.data import load_pandas\n", + "from nfl_analytics.data import load_dataframe\n", "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -29,494 +29,13 @@ "Reading play_by_play_2006.csv.gz\n", "Reading play_by_play_2014.csv.gz\n", "Reading play_by_play_2020.csv.gz\n", - "Reading play_by_play_2016.csv.gz\n", - "Reading play_by_play_2008.csv.gz\n", - "Reading play_by_play_2004.csv.gz\n", - "Reading play_by_play_2000.csv.gz\n", - "Reading play_by_play_2012.csv.gz\n", - "Reading play_by_play_2010.csv.gz\n", - "Reading play_by_play_2002.csv.gz\n", - "Reading play_by_play_2007.csv.gz\n", - "Reading play_by_play_2019.csv.gz\n", - "Reading play_by_play_2015.csv.gz\n", - "Reading play_by_play_2023.csv.gz\n", - "Reading play_by_play_2009.csv.gz\n", - "Reading play_by_play_2017.csv.gz\n", - "Reading play_by_play_2005.csv.gz\n", - "Reading play_by_play_2021.csv.gz\n", - "Reading play_by_play_1999.csv.gz\n", - "Reading play_by_play_2001.csv.gz\n", - "Reading play_by_play_2013.csv.gz\n", - "Reading play_by_play_2011.csv.gz\n", - "Reading play_by_play_2003.csv.gz\n" + "Reading play_by_play_2016.csv.gz\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
play_idgame_idold_game_idhome_teamaway_teamseason_typeweekposteamposteam_typedefteam...home_opening_kickoffqb_epaxyac_epaxyac_mean_yardagexyac_median_yardagexyac_successxyac_fdxpasspass_oeyear
012022_01_BAL_NYJ2022091107NYJBALREG1NaNNaNNaN...10.000000NaNNaNNaNNaNNaNNaNNaN2022
1432022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.443521NaNNaNNaNNaNNaNNaNNaN2022
2682022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...11.468819NaNNaNNaNNaNNaN0.440373-44.0372912022
3892022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.4921920.7272616.9881256.00.606930.2275980.38990461.0095982022
41152022_01_BAL_NYJ2022091107NYJBALREG1NYJhomeBAL...1-0.325931NaNNaNNaNNaNNaN0.443575-44.3574942022
\n", - "

5 rows × 373 columns

\n", - "
" - ], - "text/plain": [ - " play_id game_id old_game_id home_team away_team season_type \\\n", - "0 1 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "1 43 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "2 68 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "3 89 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "4 115 2022_01_BAL_NYJ 2022091107 NYJ BAL REG \n", - "\n", - " week posteam posteam_type defteam ... home_opening_kickoff qb_epa \\\n", - "0 1 NaN NaN NaN ... 1 0.000000 \n", - "1 1 NYJ home BAL ... 1 -0.443521 \n", - "2 1 NYJ home BAL ... 1 1.468819 \n", - "3 1 NYJ home BAL ... 1 -0.492192 \n", - "4 1 NYJ home BAL ... 1 -0.325931 \n", - "\n", - " xyac_epa xyac_mean_yardage xyac_median_yardage xyac_success xyac_fd \\\n", - "0 NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN \n", - "3 0.727261 6.988125 6.0 0.60693 0.227598 \n", - "4 NaN NaN NaN NaN NaN \n", - "\n", - " xpass pass_oe year \n", - "0 NaN NaN 2022 \n", - "1 NaN NaN 2022 \n", - "2 0.440373 -44.037291 2022 \n", - "3 0.389904 61.009598 2022 \n", - "4 0.443575 -44.357494 2022 \n", - "\n", - "[5 rows x 373 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = load_pandas()\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
game_idold_game_idyearweekposteam
02022_01_BAL_NYJ202209110720221NaN
12022_01_BAL_NYJ202209110720221NYJ
22022_01_BAL_NYJ202209110720221NYJ
32022_01_BAL_NYJ202209110720221NYJ
42022_01_BAL_NYJ202209110720221NYJ
52022_01_BAL_NYJ202209110720221NYJ
62022_01_BAL_NYJ202209110720221NYJ
72022_01_BAL_NYJ202209110720221BAL
82022_01_BAL_NYJ202209110720221BAL
92022_01_BAL_NYJ202209110720221BAL
\n", - "
" - ], - "text/plain": [ - " game_id old_game_id year week posteam\n", - "0 2022_01_BAL_NYJ 2022091107 2022 1 NaN\n", - "1 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "2 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "3 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "4 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "5 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "6 2022_01_BAL_NYJ 2022091107 2022 1 NYJ\n", - "7 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n", - "8 2022_01_BAL_NYJ 2022091107 2022 1 BAL\n", - "9 2022_01_BAL_NYJ 2022091107 2022 1 BAL" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[['game_id', 'old_game_id', 'year', 'week', 'posteam']].head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epa
yearposteam
1999ARI-0.100310
ATL-0.073998
BAL-0.043631
BUF-0.014478
CAR0.059887
.........
2023SEA0.015927
SF0.119782
TB-0.007973
TEN-0.031331
WAS-0.084474
\n", - "

797 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " epa\n", - "year posteam \n", - "1999 ARI -0.100310\n", - " ATL -0.073998\n", - " BAL -0.043631\n", - " BUF -0.014478\n", - " CAR 0.059887\n", - "... ...\n", - "2023 SEA 0.015927\n", - " SF 0.119782\n", - " TB -0.007973\n", - " TEN -0.031331\n", - " WAS -0.084474\n", - "\n", - "[797 rows x 1 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "df.groupby(['year', 'posteam'])[['epa']].mean()" + "df = load_dataframe()\n", + "df[df['year'] == 2023]['posteam'].unique()" ] }, { @@ -4602,7 +4121,6 @@ "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", "\n", - "# Assuming your DataFrame is named df\n", "# Drop rows with NaN for week 1\n", "df_train = result_df[result_df['week'] > 1]\n", "\n", @@ -4637,7 +4155,6 @@ "# Split the data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", - "# TODO: do i need to scale the data? https://www.baeldung.com/cs/normalization-vs-standardization\n", "# Standardize the features\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train)\n", @@ -4690,8 +4207,7 @@ } ], "source": [ - "# Gut-checking the model with future (as of writing) conference titles games.\n", - "# TODO: negate the target variable? I think training w/ positive spread for winner might not be right.\n", + "# Gut-checking the model with future (as of writing) conference titles games 01/28/2024.\n", "\n", "# Odds are -7 SF (7 DET). Some places -7.5 SF (7.5 DET).\n", "# Very close - nice. This suggests taking Detroit to beat the spread although not by much.\n", diff --git a/nfl_analytics/get_data.py b/nfl_analytics/get_data.py deleted file mode 100644 index 7175bb1..0000000 --- a/nfl_analytics/get_data.py +++ /dev/null @@ -1,14 +0,0 @@ -import urllib.request -import os - -years = range(1999, 2024) - -save_directory = "data" -os.makedirs(save_directory, exist_ok=True) - -for year in years: - filename = f"play_by_play_{year}.csv.gz" - url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/{filename}" - save_path = os.path.join(save_directory, filename) - - urllib.request.urlretrieve(url, save_path) diff --git a/nfl_analytics/load_data.py b/nfl_analytics/load_data.py deleted file mode 100644 index b067057..0000000 --- a/nfl_analytics/load_data.py +++ /dev/null @@ -1,27 +0,0 @@ -# Loads csvs into pandas dataframe and sqlite db -import os -import pandas as pd -import sqlite3 - -data_directory = "nfl_analytics/data" -combined_df = pd.DataFrame() - -for filename in os.listdir(data_directory): - if filename.endswith(".csv.gz"): - print(f"Reading {filename}") - file_path = os.path.join(data_directory, filename) - - # Read the CSV file into a DataFrame and concat to combined df - df = pd.read_csv(file_path, compression="gzip", low_memory=False) - combined_df = pd.concat([combined_df, df], ignore_index=True) - -print(combined_df.head()) - -table_name = "plays" -db_conn = sqlite3.connect(database="/tmp/my.db") -# TODO: remove drop table after developing? 
-db_conn.execute(f"DROP TABLE IF EXISTS {table_name}") -num_rows_inserted = combined_df.to_sql(table_name, db_conn, index=False) - -cursor = db_conn.execute(f"SELECT * from {table_name} LIMIT 10") -print(cursor.fetchall()) diff --git a/nfl_analytics/main.py b/nfl_analytics/main.py new file mode 100644 index 0000000..3de1b5f --- /dev/null +++ b/nfl_analytics/main.py @@ -0,0 +1,111 @@ +import argparse +import time + +from nfl_analytics.data import download_data, load_dataframe +from nfl_analytics.model import ( + train_model, + predict, + save_model_and_scaler, + load_model_and_scaler, +) +from nfl_analytics.dataframes import ( + build_running_avg_dataframe, + build_training_dataframe, +) +from nfl_analytics.utils import is_valid_year +from nfl_analytics.config import TEAMS + + +# ROUGH CLI docs: +# --download: optional. takes list of years. or if empty, defaults to downloading all years. usage: python main.py --download 2021 2022 +# --train: optional. if present, trains the model. usage: python main.py --train +# --predict: optional. takes two arguments, home team and away team. usage: python main.py --predict "CHI" "MIN" + + +def main(): + parser = argparse.ArgumentParser(description="Manage NFL Spread Predictor Pipeline") + parser.add_argument( + "--download", + nargs="*", + type=int, + metavar="year", + help="Download data for the specified years. The year corresponds to the season start.", + ) + parser.add_argument( + "--train", + action="store_true", + help="Train the model using the downloaded data.", + ) + parser.add_argument( + "--predict", + nargs=2, + metavar=("home_team", "away_team"), + help="Specify the home and away teams for prediction.", + ) + args = parser.parse_args() + + if args.download is not None: + if args.download: + year_set = set(args.download) + invalid_years = [year for year in year_set if not is_valid_year(year)] + + if invalid_years: + print( + f"Error: Invalid year(s) provided: {invalid_years}. No data downloaded." + ) + else: + download_data(year_set) + else: + download_data() + + if args.train: + print("Training model...") + + start_time = time.time() + df_raw = load_dataframe() + end_time = time.time() + print(f"Loaded dataframe in {end_time - start_time} seconds") + + # This wont pick on updated data (downlaoded new data but still have combined, so it will use that) + # Save combined dataframe to disk + # save_dir = os.path.join("data", "combined") + # os.makedirs(save_dir, exist_ok=True) + # save_path = os.path.join(save_dir, "play_by_play_combined.parquet.gzip") + # df_raw.to_parquet(save_path, compression="gzip") + + df_running_avg = build_running_avg_dataframe(df_raw) + df_training = build_training_dataframe(df_running_avg) + model, scaler = train_model(df_training) + + save_model_and_scaler(model, scaler) + + if args.predict: + # TODO: this will silently predict based off old data if thats all we have. + # Perhaps I should require the week/year in the predict fn? Or at least log + # year/week in predict? + home_team = args.predict[0].upper() + away_team = args.predict[1].upper() + + for team in [home_team, away_team]: + if team not in TEAMS: + print(f"Invalid team: {team}") + return + + if home_team == away_team: + print("Error: Home and away team cannot be the same.") + return + + model, scaler = load_model_and_scaler() + + # TODO: load directly from somewhere instead? 
diff --git a/nfl_analytics/model.py b/nfl_analytics/model.py
new file mode 100644
index 0000000..9122d5c
--- /dev/null
+++ b/nfl_analytics/model.py
@@ -0,0 +1,152 @@
+import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from joblib import dump, load
+
+from nfl_analytics.config import FEATURES, ASSET_DIR
+
+
+def train_model(df_training):
+    # Drop week 1 because it is all NaN
+    df_train = df_training[df_training["week"] > 1]
+
+    # Don't use unnecessary columns like 'game_id', 'week', 'year', 'team', 'home_team', 'away_team'
+    # Keep only relevant columns for prediction
+    target = "home_spread"
+    select_columns = FEATURES + [target]
+
+    df_train = df_train[select_columns]
+
+    # TODO: why are there missing values?
+    imputer = SimpleImputer(strategy="mean")
+    df_imputed = pd.DataFrame(imputer.fit_transform(df_train), columns=df_train.columns)
+
+    X = df_imputed.drop(target, axis=1)
+    y = df_imputed[target]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # Note: the scaler is fitted by fit_transform. The same scaler must be
+    # reused for prediction.
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    model = LinearRegression()
+    model.fit(X_train_scaled, y_train)
+
+    y_pred = model.predict(X_test_scaled)
+
+    mse = mean_squared_error(y_test, y_pred)
+    mae = mean_absolute_error(y_test, y_pred)
+    print(f"Mean Squared Error: {mse}")
+    print(f"Mean Absolute Error: {mae}")
+
+    return model, scaler
+
+
+def save_model_and_scaler(model, scaler):
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    asset_dir = os.path.join(script_dir, ASSET_DIR)
+    os.makedirs(asset_dir, exist_ok=True)
+
+    dump(model, os.path.join(asset_dir, "trained_model.joblib"))
+    dump(scaler, os.path.join(asset_dir, "trained_scaler.joblib"))
+    print("Model and scaler saved")
+
+
+def load_model_and_scaler():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    asset_dir = os.path.join(script_dir, ASSET_DIR)
+
+    model = load(os.path.join(asset_dir, "trained_model.joblib"))
+    scaler = load(os.path.join(asset_dir, "trained_scaler.joblib"))
+    return model, scaler
+
+
+def predict(model, scaler, df_running_avg, home_team, away_team):
+    matchup = make_matchup(df_running_avg, home_team, away_team)
+    matchup_input = get_matchup_input(scaler, matchup)
+
+    return model.predict(matchup_input)[0]
+
+
+def make_matchup(df_running_avg, home_team, away_team, week=None, year=None):
+    """Merge the given teams' week/year stats into a single row.
+    To be used for predicting spreads for future games."""
+
+    df = df_running_avg.copy()
+
+    if year is None:
+        year = df["year"].max()
+
+    if week is None:
+        last_week = df[df["year"] == year]["week"].max()
+        week = last_week
+
+    # df_running_avg includes running averages prior to that week, plus data
+    # about that week itself (teams, final scores, etc.). Basically (and
+    # literally at the time of writing) anything not suffixed with `_avg`.
The data about the + # week itself are necessary for training the model but dont make sense in + # the context of predicting future games so they are not included here. + cols = [ + "rushing_avg", + "passing_avg", + "yards_gained_avg", + "sack_yards_avg", + "passing_yards_defense_avg", + "rushing_yards_defense_avg", + "yards_gained_defense_avg", + "sack_yards_defense_avg", + "score_differential_post_avg", + "points_scored_avg", + "points_allowed_avg", + "mean_epa_avg", + ] + + # Select data for the specified week, home team, and away team in the specified year + home_data = ( + df[(df["year"] == year) & (df["week"] == week) & (df["team"] == home_team)][ + cols + ] + .add_prefix("home_") + .reset_index(drop=True) + ) + away_data = ( + df[(df["year"] == year) & (df["week"] == week) & (df["team"] == away_team)][ + cols + ] + .add_prefix("away_") + .reset_index(drop=True) + ) + + return pd.concat([home_data, away_data], axis=1) + + +def get_matchup_input(scaler, matchup): + reshaped_matchup = matchup[FEATURES].values.reshape(1, -1) + return scaler.transform(reshaped_matchup) + + +if __name__ == "__main__": + from nfl_analytics.dataframes import ( + build_running_avg_dataframe, + build_training_dataframe, + ) + + df_running_avg = build_running_avg_dataframe() + df_training = build_training_dataframe() + model, scaler = train_model(df_training) + print(make_matchup(df_running_avg, "KC", "SF").tail()) + # first team is home but this is superbowl so neither is technically home + # week 22 (? its the superbowl) 2023 (2023 SEASON, year is 2024) + kc_sf = predict(model, scaler, df_running_avg, "KC", "SF") + print(f"Prediction: {kc_sf}") + sf_kc = predict(model, scaler, df_running_avg, "SF", "KC") + print(f"Prediction: {sf_kc}") diff --git a/nfl_analytics/train_dev.ipynb b/nfl_analytics/train_dev.ipynb deleted file mode 100644 index e69de29..0000000 diff --git a/nfl_analytics/utils.py b/nfl_analytics/utils.py new file mode 100644 index 0000000..ab02bc7 --- /dev/null +++ b/nfl_analytics/utils.py @@ -0,0 +1,17 @@ +import datetime + +from nfl_analytics.config import START_YEAR + + +def is_valid_year(year): + current_year = datetime.datetime.now().year + return START_YEAR <= year <= current_year + + +if __name__ == "__main__": + print(is_valid_year(1998)) # False + print(is_valid_year(1999)) # True + print(is_valid_year(2000)) # True + print(is_valid_year(2023)) # True + print(is_valid_year(2024)) # True + print(is_valid_year(2025)) # False (if current year is 2024) diff --git a/poetry.lock b/poetry.lock index b9eef52..d0e3e18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -121,6 +121,87 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "cramjam" +version = "2.8.1" +description = "Thin Python bindings to de/compression algorithms in Rust" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:1afc66031e5165f4eae27e6e1f89c0def6c2ece826903ebb0194ee2f467ff8e6"}, + {file = "cramjam-2.8.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4af4b74b16c22d6a0092d6a7db642ee097f4b0bfa0389d5a07552a2fc48eb0b6"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e858a56d12a61d0e19b2286a7661b36b52b3cad4fa84d8aaeb0b0ed0b2338d36"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc25199e44ee8ca55c62d6da2a74ea48e759058f2c96ca1e5d512aad6ce6005b"}, + {file = 
"cramjam-2.8.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:130aee295a77256aa867a4562961e99db56ddf8081df1e9bfb1607a409dcc4df"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ccfa252b800d6cdae8cc20728c41b94a39f5cadee8693ab2539ea02285e8e015"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8bd1cc5706b235655b742e8e350c13ffc2036efc098a192bd9f4df8b153c03e"}, + {file = "cramjam-2.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dedae22ed4f8997ce584256b4a790354f314baa209a9015aa1ab6e383f6177c5"}, + {file = "cramjam-2.8.1-cp310-none-win32.whl", hash = "sha256:f1af221507fbcd5bd46a92d41ca60410400624328b5c92ec2efb13deca86b6e9"}, + {file = "cramjam-2.8.1-cp310-none-win_amd64.whl", hash = "sha256:94017d3057d53bad33ec4ef46409a2b410a67c0905eb094b441c1687ca1f738a"}, + {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:033b1720718c68cacab1dbeee6d8e8e7ed851ac9abb8aca6ae9cc869e7087df8"}, + {file = "cramjam-2.8.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4597d720efdd92f5a4f09289d08f0fcf690ae8a16295dd356dd66fcc4aa10e30"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ddafdb489ad4b84cb7e8c8423a21f37a5e6dfe7e32847bd0048f680d703184cf"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbf9df8f296422410722fb5349159432eea11ac5fec7bf59213c7148f3efa04"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e588b11966152234f8ffa95089dd447a7ec299ea319acee15a78387cfc267ba9"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9050a662c484752b18a07ba0bd01601f229f40152d374ea79c91da239d6b929b"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a909ef9b6099e4df03a7b602d75e0c43c23359f21cd138886a51f7ae268fffc9"}, + {file = "cramjam-2.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a6b06704e99a116dbdae78c4fef47d41bec1ef08a911c50807cde608d99764b"}, + {file = "cramjam-2.8.1-cp311-none-win32.whl", hash = "sha256:9f069be2b8cfa16d2ee5c1fc7cb6864f8d5db03e31d0f07a38a227dbd470d1ad"}, + {file = "cramjam-2.8.1-cp311-none-win_amd64.whl", hash = "sha256:0f50414cc2f216d0c4d311e81412bf32fda6e4d1e867de2f4bdd2c1d05319d00"}, + {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:000e2321226b83fee2fa6b353d649f034266fa6d42d5aa3521c7457ca7fa3118"}, + {file = "cramjam-2.8.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:80373dccc3dc25446d5d5ae7c8641daa2dbd4973c98bd2ab3e113fa57a653647"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bc2d6ccaa164ba122b724f4958821d08ed7c49a90662ce800c24021c5900cbbe"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a726fb5ee28e4258215c327d65bf6791f25e7742a5d262352e2c344ac683a0f8"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d087048ef9244cf42b8958ec9eabacbf3c337fa0697a4573e99b189253f312d4"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4230d22a249ae91459ceae15e55abac5cef983042ddbd0484e2863034dd63725"}, 
+ {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd52a1767750b3a83308179e3ff0b3465551c8f4419676847300989582e55a61"}, + {file = "cramjam-2.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16f327dd8d299ac137bf3288e89772187533d9da367bd3d270f5546273e7ce8"}, + {file = "cramjam-2.8.1-cp312-none-win32.whl", hash = "sha256:d7118d61d7a037f80ef6a2d809cd01492b574983947593570ac757d2a87eae6e"}, + {file = "cramjam-2.8.1-cp312-none-win_amd64.whl", hash = "sha256:465f868c14b921af0cb66f7bfa4f3c7a131cc47de1c9390dfa1d96fbe90778de"}, + {file = "cramjam-2.8.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77c952be40131c47ebbcb534862530ba67a80c86a7099532cace517109c2b428"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8e5b8f4d23e0a4a97381ded35a6a7fe839eba1b0372c1e74ee3713adc59f5c1c"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d1c2b51b0754736f20c1ea3638118a7acd60cc5ab32c6751e247cd6ada15c79"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d3b3d1167928bf0ad846239758a8fd6536d560886b63174359392686721e8902"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f1b2fd1108b040a822e8d6c32aebe46275095128783f944971c3b37671c6af0"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c676bb517a2edd809684e3495eccb7868bd5f1be0684853f226166aa54ed072"}, + {file = "cramjam-2.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133bc01739c897ebf7dcfb3e0fe11b4ca5851bcda1e021c4877e11cf1712a4e5"}, + {file = "cramjam-2.8.1-cp37-none-win32.whl", hash = "sha256:b85e8c7380ee7a1b6b530764f929a0766b5ee1d8cbbfa0c0fca2c08415a7532f"}, + {file = "cramjam-2.8.1-cp37-none-win_amd64.whl", hash = "sha256:6611b41ff76ff252ce67acc13a83aae17ea3131e5926fa1a96e266ff803d4d67"}, + {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:db7086436d50d6fb2c2b38cfbec5a860a2595cbed0e7a3e541c05180f5e26005"}, + {file = "cramjam-2.8.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2332835c2db911cc572e6604860e5f2da439833f855ed1c1dac96fd5fc1025a"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4b2bdcaeaa649efa4c41e7add81a205359c7540c58766151488955ebe12afa9f"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3f8d44cbf2f680a39f677cafbadb3353ee09ce9d2ed51b52eba5261cb3b935a"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6d283676a5979236011bd3daed39560e5d0f39df6cceab31a291d5203a36cde"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dd3787a6f9bbfea1a74aa9457d73eae18393de67cab03f89ac20beaaed43cb8d"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0200d1530c9ae9c0988e2878a08624de1cb2b841eea2dcb9a10a236ff43747c"}, + {file = "cramjam-2.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acba9d6fe6653e1f387aee53cc8d6d9d90d5416b29a233e40dcdf84e6b1002b9"}, + {file = "cramjam-2.8.1-cp38-none-win32.whl", hash = "sha256:6d4e357d98d5a4ac7291b88a4e54adcb85506a00aa9e72fa222a2caa3b881828"}, + {file = "cramjam-2.8.1-cp38-none-win_amd64.whl", hash = 
"sha256:960f0a65db3de5f92762c0dfcad514982ea888f300677d83a1d3eb46eee6b109"}, + {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:9ccbc9b23b42aa65cf6878908663e73aeb55971b9e3d0ce3c7733b387934f030"}, + {file = "cramjam-2.8.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bfac3506ed0c21b4b2a27af50de9ad2bdd5cc02aed977782fa01d21cc6f54de9"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1f3ef0d616fb8f4d8eb75b3b22653385b88fe493895d763f5f235c7c6d64e570"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f18a8d637ac9497e0be2a8fa49cca52259196d18ca73b36a385b6d925cb21b7"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d21a370320b6959d7bd581ca286439a113e325477cc92aefb0be378bd351323a"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c2d49a2f8e2a4501c460ad8452ce269c9ddd5c4671ad41cf41104a3fa6ca2e7"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8366b00f578ea1b202f20b5ebab622ac10599d1b08c36ed9089e27a452c76d2e"}, + {file = "cramjam-2.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ab8228f4b343e08dd2f4e2a714427979dfd25184cde08198c1d149249ab29a9"}, + {file = "cramjam-2.8.1-cp39-none-win32.whl", hash = "sha256:a280e51ea157bc2dd9dae2751acfa51bccc0453ce6d99521c18b73719d724b56"}, + {file = "cramjam-2.8.1-cp39-none-win_amd64.whl", hash = "sha256:2a209e5b7f1e62d8dc27278948176391d35defd0202cd9b0d577126073a781a5"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3ca01ea39e0b9c7938fe4a5b73dbcd0f43c2b0aaa04de39fe8deb1b69d4a59c0"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a83ae10a853fe93882d32da53f50a62927b6408cf486f7475077b04a93c3475"}, + {file = "cramjam-2.8.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645a226ae669c26b2232ed30ec0ed34688da3fe6473b3998edc3e11af9a6cb3"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:02dc04049680ab834feb092d6d1b6632eb03ba9069b2415a7fe6515d42fe10e0"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c447fb8a01ae435fd7ed3d1bebeb168aa35c5d9edb0326d6207381b54b8f558"}, + {file = "cramjam-2.8.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c085d020fc15e284640d2ed3a8b121a26a06c0ddfcba87b968460421a565753a"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:19af180433b5c25ce25698edf28f26a9f66626a2803a2f1682fbc83d38a7842f"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4a259336a60c8c2d4067e19ea78903e55b1e2963831f9f5a5b6d9a395088974"}, + {file = "cramjam-2.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af6331df89641d3dd2fa40b32a7d22353ac6bb62c87baa28ce4073bebfc8f686"}, + {file = "cramjam-2.8.1.tar.gz", hash = "sha256:f7e233791761b61e3d85598b25db75868d9f1cd35b2bd10eb34ec4755ff2add1"}, +] + +[package.extras] +dev = ["black (==22.3.0)", "hypothesis", "numpy", "pytest (>=5.30)", "pytest-xdist"] + [[package]] name = "debugpy" version = "1.8.0" @@ -173,6 +254,98 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", 
"coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +[[package]] +name = "fastparquet" +version = "2023.10.1" +description = "Python support for Parquet file format" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:75a00475e96d26214dace147b27ab782da7a0ae230cade05ea9181c3aec2e637"}, + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0c1d5559aa0a4fff8eb3b301c8177b6813bb15fe9d2007ad0dc89f8fa519c5"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b798cdfa8f01cd573b135a493a4d0686ebbcd3a412d6e59889a7ae41ff90efeb"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a746f521da7459707899fc33b334b2d21f590380f472fc27642f3ef28ee451d2"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e86d64898e846ed0f5745427436e5772fd7bb5d9a930f1dca8233e90385e126b"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5c3afafd4a0907216f5ee4f835f47ad16b84c5dede4c5ca4c0754dffe3eb72d7"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:68d26a1172be5b02018f6c28603f195807955d8262b913349385d977f3ae081f"}, + {file = "fastparquet-2023.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:b7086ca3a0d8ae8680b380da9b7057a1491d629945b1dd228eba5b362e2e39aa"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f8d53f5e5049b21893964cd27154c2a7c8180f3ffd1f2693f80e0f834a3a35e"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ea74f28494fda892641a564f728d046a074fdea5b9ff664ef9554c0da563bad4"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab978612d21630033df0a0b12423ed826fe36e83a1710b155968c3c6e2b3174a"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc30c502feaa67c058c496eb4a734eba8bd373f0d24a32cc69360c79f7220ef"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99568ae6bbbd973b32d796cb664ba156b101e5d1931dba780fe2dc0d9b227dfd"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:53b9ad8d646c2609854cbe7d7b17be343664cabae1cd0eb119011e389df8484d"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b2a9ee49039223a1e216c409c25072be1f362de27197cbec5f90cf2e736df3b0"}, + {file = "fastparquet-2023.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:9133d2f975c6e05187be4b558060e6a4aafeba02dceaf849cf6ad46d32e59405"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b826696cd48f1defb6fcafb4c9798102233e54f3f3491251c034dde3d94f420a"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bf2d58bee17e0eea8565c2bcd2b339ee032472751651e21f000eb564ad3cd5cf"}, + {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9296098d06c6692ee477fe491381eda39fc0dcfe2fce210496491fe16ce27ef8"}, + {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c975d648ea491e684135e9e3c0a15b440d66d0772fe497269e5c9c4eaaeb62a2"}, + {file = "fastparquet-2023.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5208db1f38c8ac5f50f309f77bdb828fa7f247b82e2df88d847ad3bec38903"}, + {file = "fastparquet-2023.10.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:118d1832ed2098f313936044012083c8c1b07da85ee11612895f3c4ef27bfd8a"}, + {file = "fastparquet-2023.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:35cff597d2778b6fe8ef7dc36cba056cd7337151dbfc2fb08abaa6b109c75140"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da633a0bd1889f30acb1b6dffa99832739802d0ae5f455b4e5eb720ab701e09"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8aff041900ebcb4c2510bede80695fed35fb7c24dfd83b60ba8b56d7ede4e0fe"}, + {file = "fastparquet-2023.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62aabf43b6bbbc074b89f9a4769f7276204b6585d2d8fae770a0b782da5b9fc9"}, + {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ba85d9e5c298515a347f94bc65c0b570391b344d765dc349bafb35137466ddb2"}, + {file = "fastparquet-2023.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2be7d33969e724c8aa777122d6032845a362cb2075f6e6f2c5b2150bd6223cc8"}, + {file = "fastparquet-2023.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:2a0c1f485d3085fe98dbae9ead2e97a886deb99d3db7af635296bfd3f4f2f814"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1fac5319aabcbc4acc2feb5df68336de755de7d60a2ee9329fef178ac016e236"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c61d26705e9a2ad2d52ed1d527c75e96e6a9a04be35bd4c8d6f4accd778f9b05"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2635f0f37a983e35be9b8013b84361e3d0cdd4f514b822016445c029b1c6e007"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde8f6798d37e2af38ada058fc7018c2157d90a8dd728c0c59fab85b8adb9215"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20c17c5e7186723a175c9e7da94285bdef3cb477cb7cca0e2812b1e245279671"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:91ee6b5b0efc18586e61da6662119de92fc7bf552c3a08a13eb2af16bc12f16a"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:332cb3b204e1de64dcfc4c5d0b517ea665856d19c139f693e8c9efc11992e19e"}, + {file = "fastparquet-2023.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:5eb06a70daf50d70290b87f3a5ca6f25eb24ad850bcc68197b5438d92b11c763"}, + {file = "fastparquet-2023.10.1.tar.gz", hash = "sha256:076fedfba2b56782b4823c1d351424425cfeaa5b8644c542416ca1363fe6d921"}, +] + +[package.dependencies] +cramjam = ">=2.3" +fsspec = "*" +numpy = ">=1.20.3" +packaging = "*" +pandas = ">=1.5.0" + +[package.extras] +lzo = ["python-lzo"] + +[[package]] +name = "fsspec" +version = "2023.12.2" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2023.12.2-py3-none-any.whl", hash = "sha256:d800d87f72189a745fa3d6b033b9dc4a34ad069f60ca60b943a63599f5501960"}, + {file = "fsspec-2023.12.2.tar.gz", hash = "sha256:8548d39e8810b59c38014934f6b31e57f40c1b20f911f4cc2b85389c7e9bf0cb"}, +] + 
+[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +tqdm = ["tqdm"] + [[package]] name = "ipykernel" version = "6.29.0" @@ -957,4 +1130,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "e9e2ee4b6fdb1aba77a21d7e986f962a6e87ed9bec9353ad443ad7129a575d6b" +content-hash = "56a680440d968ef6670819b80fb63d6a7806417f8582fbe94b467a8f6d4a886b" diff --git a/pyproject.toml b/pyproject.toml index bfc36a8..27c9a58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ python = "^3.12" pandas = "^2.2.0" ipykernel = "^6.29.0" scikit-learn = "^1.4.0" +fastparquet = "^2023.10.1" [build-system]
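Finally, to act on the evaluation note near the top of the README (Vegas reportedly misses the final margin by roughly 9-10 points on average), a hedged sketch of how that benchmark might be used; the threshold is the README's rough figure, not a measured one:

```python
from nfl_analytics.dataframes import build_training_dataframe
from nfl_analytics.model import train_model

# Approximate average Vegas miss discussed in the README; a held-out MAE
# comfortably below this would suggest the model is worth publishing.
VEGAS_AVG_MISS = 10.0

df_training = build_training_dataframe()
model, scaler = train_model(df_training)  # prints held-out MSE and MAE
```

Comparing the printed Mean Absolute Error against `VEGAS_AVG_MISS` gives a quick sanity check until a proper game-by-game comparison against historical closing lines is wired up.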