diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 18bbfd59a..048622e7d 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -22,7 +22,7 @@ def standardize_columns(df):
     return df.rename(columns=dict(rename_pairs))
 
 
-def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None):
+def pull_nchs_mortality_data(socrata_token: str, backup_dir: str, custom_run: bool, test_file: Optional[str] = None):
     """Pull the latest NCHS Mortality data, and conforms it into a dataset.
 
     The output dataset has:
@@ -40,6 +40,10 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
     ----------
     socrata_token: str
         My App Token for pulling the NCHS mortality data
+    backup_dir: str
+        Directory to which to save raw backup data
+    custom_run: bool
+        Flag indicating if the current run is a patch. If so, don't save any data to disk
     test_file: Optional[str]
         When not null, name of file from which to read test data
 
@@ -60,6 +64,9 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
         client = Socrata("data.cdc.gov", socrata_token)
         results = client.get("r8kw-7aab", limit=10**10)
         df = pd.DataFrame.from_records(results)
+
+    create_backup_csv(df, backup_dir, custom_run=custom_run)
+
     # drop "By Total" rows
     df = df[df["group"].transform(str.lower) == "by week"]
 
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 50ce46cfb..6454a009b 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -59,6 +59,8 @@ def run_module(params: Dict[str, Any]):
         days=date.today().weekday() + 2)
     export_start_date = export_start_date.strftime('%Y-%m-%d')
     daily_export_dir = params["common"]["daily_export_dir"]
+    backup_dir = params["common"]["backup_dir"]
+    custom_run = params["common"].get("custom_run", False)
     socrata_token = params["indicator"]["socrata_token"]
     test_file = params["indicator"].get("test_file", None)
 
@@ -70,7 +72,8 @@ def run_module(params: Dict[str, Any]):
     daily_arch_diff.update_cache()
 
     stats = []
-    df_pull = pull_nchs_mortality_data(socrata_token, test_file)
+    df_pull = pull_nchs_mortality_data(socrata_token, backup_dir,
+                                       custom_run, test_file)
     for metric in METRICS:
         for geo in ["state", "nation"]:
             if metric == 'percent_of_expected_deaths':
diff --git a/nchs_mortality/tests/test_pull.py b/nchs_mortality/tests/test_pull.py
index fa58b04a5..e99311ba1 100644
--- a/nchs_mortality/tests/test_pull.py
+++ b/nchs_mortality/tests/test_pull.py
@@ -34,7 +34,7 @@ def test_standardize_columns(self):
         pd.testing.assert_frame_equal(expected, df)
 
     def test_good_file(self):
-        df = pull_nchs_mortality_data(SOCRATA_TOKEN, "test_data.csv")
+        df = pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="test_data.csv")
 
         # Test columns
         assert (
@@ -90,9 +90,9 @@ def test_good_file(self):
     def test_bad_file_with_inconsistent_time_col(self):
         with pytest.raises(ValueError):
             pull_nchs_mortality_data(
-                SOCRATA_TOKEN, "bad_data_with_inconsistent_time_col.csv"
+                SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_inconsistent_time_col.csv"
             )
 
     def test_bad_file_with_missing_cols(self):
         with pytest.raises(ValueError):
-            pull_nchs_mortality_data(SOCRATA_TOKEN, "bad_data_with_missing_cols.csv")
+            pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_missing_cols.csv")