Skip to content

Commit

Permalink
test: gh action using remote loading instead of downloading
Browse files Browse the repository at this point in the history
  • Loading branch information
BlairCurrey committed Feb 6, 2024
1 parent a7e4791 commit c61e4ab
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
18 changes: 18 additions & 0 deletions nfl_analytics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,24 @@ def download_data(years=range(1999, 2024)):
)


def load_dataframe_from_remote(years=range(1999, 2024)):
    """Load play-by-play data for the given years straight from the remote URLs.

    For each year, the gzipped CSV at the nflverse-data release URL is read
    with ``pd.read_csv`` (nothing is written to disk by this function), a
    ``year`` column is set on the resulting frame, and all per-year frames
    are combined into one dataframe.

    Args:
        years: Iterable of years to fetch; each value is interpolated into
            the ``play_by_play_{year}.csv.gz`` file name.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of every requested
        year, re-indexed from 0 (``ignore_index=True``), with an added
        ``year`` column.

    Raises:
        FileNotFoundError: If the combined dataframe is empty, e.g. when
            ``years`` yields nothing.

    Note:
        Exceptions raised by ``pd.read_csv`` itself are not caught here and
        propagate to the caller unchanged.
    """
    frames = []

    for year in years:
        url = f"https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_{year}.csv.gz"
        print(f"Reading from remote: {url}")
        df = pd.read_csv(url, low_memory=False)

        # Save year on dataframe
        df["year"] = year
        frames.append(df)

    # Concatenate once at the end: calling pd.concat inside the loop copies
    # every previously loaded row again on each iteration.
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame and let the check below report the problem as FileNotFoundError.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if combined_df.empty:
        raise FileNotFoundError("No data loaded from the remote files.")

    return combined_df


def load_dataframe_from_raw():
if not os.path.exists(DATA_DIR):
raise FileNotFoundError(f"Data directory '{DATA_DIR}' not found.")
Expand Down
4 changes: 3 additions & 1 deletion nfl_analytics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
download_data,
load_dataframe_from_raw,
save_dataframe,
load_dataframe_from_remote,
)
from nfl_analytics.model import (
train_model,
Expand Down Expand Up @@ -70,7 +71,8 @@ def main():
if args.train:
start_time = time.time()
try:
df_raw = load_dataframe_from_raw()
# df_raw = load_dataframe_from_raw()
df_raw = load_dataframe_from_remote()
except FileNotFoundError as e:
print(f"Error loading data: {e}")
print("Please run with --download first.")
Expand Down

0 comments on commit c61e4ab

Please sign in to comment.