From 96cc2e7d316f78b4edb1fae4fa503673a0bae614 Mon Sep 17 00:00:00 2001
From: Blair Currey <12960453+BlairCurrey@users.noreply.github.com>
Date: Tue, 6 Feb 2024 22:33:06 -0500
Subject: [PATCH] chore: update comment

---
 README.md             | 2 ++
 nfl_analytics/data.py | 7 +------
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d416906..a47f154 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,8 @@ score differential is wrong? look at first game. the number for the 2 teams dont
 - [0] (maybe) if there are any hardcoded paths (like asset dir?), think about how to not hardcode them.
   - punting on this one. not really important to make this configurable.
 - Quality of Life Improvements
+  - [ ] rename model? LinRegSpreadPredictor? at least in the release, not sure if anywhere else
+    - LinReg is descriptive, but it is an implementation detail. Do I want a DecisionTreeSpreadPredictor in the future? Or would I only have a decision-tree-based model if it replaced the lin reg one? Maybe that's a "wait until (if) you actually have another model" problem.
   - [ ] suppress pandas warnings?? "import pandas as pd"
   - [ ] add cli doc generator. look into `argparse.HelpFormatter` to generate a markdown file.
   - [ ] add types
diff --git a/nfl_analytics/data.py b/nfl_analytics/data.py
index 2de0295..3650c03 100644
--- a/nfl_analytics/data.py
+++ b/nfl_analytics/data.py
@@ -88,13 +88,8 @@ def load_dataframe_from_raw():
         print(f"Reading {filename}")
         file_path = os.path.join(DATA_DIR, filename)

-        # TODO: Throws DtypeWarning about mixed types and says "Specify dtype option on import or set low_memory=False.""
-        # However, model training results are unchanged and this is required to run
-        # in gh actions without timing out. Perhaps an alternative solution to gh actions
-        # timeing out would enable using low_memory=False. Like: https://github.com/actions/runner-images/discussions/7188#discussioncomment-6750749
-        # Or maybe using chunksize and iterator? https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
+        # FWIW, low_memory seems to work fine (no change in model performance), but it does warn about differing column types
         df = pd.read_csv(file_path, compression="gzip", low_memory=False)
-        # df = pd.read_csv(file_path, compression="gzip", low_memory=True)

         # Save year from filename on dataframe
         year = get_year_from_filename(filename)
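
The removed TODO floated chunksize/iterator reading as an alternative to the current single read_csv call. A minimal sketch of that idea (the helper name and chunk size are illustrative, not part of the repo):

import pandas as pd

def read_csv_chunked(file_path):
    # Read the gzipped CSV in chunks to keep peak memory low, then combine.
    # With low_memory=False, dtypes are inferred from each whole chunk rather
    # than from small internal blocks.
    chunks = pd.read_csv(
        file_path,
        compression="gzip",
        chunksize=100_000,  # rows per chunk; tune for available memory
        low_memory=False,
    )
    return pd.concat(chunks, ignore_index=True)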
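
For the README item about suppressing pandas warnings, one narrowly scoped option (a sketch, not something this patch adds) is to filter only the mixed-dtype warning rather than silencing pandas globally:

import warnings
from pandas.errors import DtypeWarning

# Ignore only the DtypeWarning raised by read_csv on mixed-type columns;
# other pandas warnings still surface.
warnings.simplefilter("ignore", category=DtypeWarning)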
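
For the README item about a CLI doc generator, one simple approach (function name and output path are hypothetical) is to dump argparse's generated help into a markdown file:

import argparse

def write_cli_doc(parser: argparse.ArgumentParser, out_path: str = "docs/cli.md") -> None:
    # Write the parser's auto-generated help text into a fenced block
    # so it renders cleanly in a markdown file.
    with open(out_path, "w") as f:
        f.write("# CLI reference\n\n```text\n")
        f.write(parser.format_help())
        f.write("```\n")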