Skip to content

Commit

Permalink
use helper to save nchs data to disk right after pulling
Browse files Browse the repository at this point in the history
  • Loading branch information
nmdefries committed Oct 10, 2024
1 parent 2a43f9a commit 3c22ff1
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
9 changes: 8 additions & 1 deletion nchs_mortality/delphi_nchs_mortality/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def standardize_columns(df):
return df.rename(columns=dict(rename_pairs))


def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None):
def pull_nchs_mortality_data(socrata_token: str, backup_dir: str, custom_run: bool, test_file: Optional[str] = None):
    """Pull the latest NCHS Mortality data, and conform it into a dataset.
The output dataset has:
Expand All @@ -40,6 +40,10 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
----------
socrata_token: str
My App Token for pulling the NCHS mortality data
backup_dir: str
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
test_file: Optional[str]
When not null, name of file from which to read test data
Expand All @@ -60,6 +64,9 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
client = Socrata("data.cdc.gov", socrata_token)
results = client.get("r8kw-7aab", limit=10**10)
df = pd.DataFrame.from_records(results)

    create_backup_csv(df, backup_dir, custom_run=custom_run)

# drop "By Total" rows
df = df[df["group"].transform(str.lower) == "by week"]

Expand Down
5 changes: 4 additions & 1 deletion nchs_mortality/delphi_nchs_mortality/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def run_module(params: Dict[str, Any]):
days=date.today().weekday() + 2)
export_start_date = export_start_date.strftime('%Y-%m-%d')
daily_export_dir = params["common"]["daily_export_dir"]
backup_dir = params["common"]["backup_dir"]
custom_run = params["common"].get("custom_run", False)
socrata_token = params["indicator"]["socrata_token"]
test_file = params["indicator"].get("test_file", None)

Expand All @@ -70,7 +72,8 @@ def run_module(params: Dict[str, Any]):
daily_arch_diff.update_cache()

stats = []
df_pull = pull_nchs_mortality_data(socrata_token, test_file)
    df_pull = pull_nchs_mortality_data(socrata_token, backup_dir,
                                       custom_run=custom_run, test_file=test_file)
for metric in METRICS:
for geo in ["state", "nation"]:
if metric == 'percent_of_expected_deaths':
Expand Down
6 changes: 3 additions & 3 deletions nchs_mortality/tests/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_standardize_columns(self):
pd.testing.assert_frame_equal(expected, df)

def test_good_file(self):
df = pull_nchs_mortality_data(SOCRATA_TOKEN, "test_data.csv")
        df = pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="test_data.csv")

# Test columns
assert (
Expand Down Expand Up @@ -90,9 +90,9 @@ def test_good_file(self):
def test_bad_file_with_inconsistent_time_col(self):
with pytest.raises(ValueError):
pull_nchs_mortality_data(
SOCRATA_TOKEN, "bad_data_with_inconsistent_time_col.csv"
                SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_inconsistent_time_col.csv"
)

def test_bad_file_with_missing_cols(self):
with pytest.raises(ValueError):
pull_nchs_mortality_data(SOCRATA_TOKEN, "bad_data_with_missing_cols.csv")
            pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_missing_cols.csv")

0 comments on commit 3c22ff1

Please sign in to comment.