Skip to content

Commit

Permalink
use helper to save nchs data to disk right after pulling
Browse files Browse the repository at this point in the history
  • Loading branch information
nmdefries committed Oct 10, 2024
1 parent 2a43f9a commit 3c22ff1
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
9 changes: 8 additions & 1 deletion nchs_mortality/delphi_nchs_mortality/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def standardize_columns(df):
return df.rename(columns=dict(rename_pairs))


def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None):
def pull_nchs_mortality_data(socrata_token: str, backup_dir: str, custom_run: bool, test_file: Optional[str] = None):
    """Pull the latest NCHS Mortality data, and conform it into a dataset.
The output dataset has:
Expand All @@ -40,6 +40,10 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
----------
socrata_token: str
My App Token for pulling the NCHS mortality data
backup_dir: str
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
test_file: Optional[str]
When not null, name of file from which to read test data
Expand All @@ -60,6 +64,9 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
client = Socrata("data.cdc.gov", socrata_token)
results = client.get("r8kw-7aab", limit=10**10)
df = pd.DataFrame.from_records(results)

    create_backup_csv(df, backup_dir, custom_run=custom_run)

# drop "By Total" rows
df = df[df["group"].transform(str.lower) == "by week"]

Expand Down
5 changes: 4 additions & 1 deletion nchs_mortality/delphi_nchs_mortality/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def run_module(params: Dict[str, Any]):
days=date.today().weekday() + 2)
export_start_date = export_start_date.strftime('%Y-%m-%d')
daily_export_dir = params["common"]["daily_export_dir"]
backup_dir = params["common"]["backup_dir"]
custom_run = params["common"].get("custom_run", False)
socrata_token = params["indicator"]["socrata_token"]
test_file = params["indicator"].get("test_file", None)

Expand All @@ -70,7 +72,8 @@ def run_module(params: Dict[str, Any]):
daily_arch_diff.update_cache()

stats = []
df_pull = pull_nchs_mortality_data(socrata_token, test_file)
    df_pull = pull_nchs_mortality_data(socrata_token, backup_dir,
                                       custom_run=custom_run, test_file=test_file)
for metric in METRICS:
for geo in ["state", "nation"]:
if metric == 'percent_of_expected_deaths':
Expand Down
6 changes: 3 additions & 3 deletions nchs_mortality/tests/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_standardize_columns(self):
pd.testing.assert_frame_equal(expected, df)

def test_good_file(self):
df = pull_nchs_mortality_data(SOCRATA_TOKEN, "test_data.csv")
        df = pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="test_data.csv")

# Test columns
assert (
Expand Down Expand Up @@ -90,9 +90,9 @@ def test_good_file(self):
def test_bad_file_with_inconsistent_time_col(self):
with pytest.raises(ValueError):
pull_nchs_mortality_data(
SOCRATA_TOKEN, "bad_data_with_inconsistent_time_col.csv"
                SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_inconsistent_time_col.csv"
)

def test_bad_file_with_missing_cols(self):
with pytest.raises(ValueError):
pull_nchs_mortality_data(SOCRATA_TOKEN, "bad_data_with_missing_cols.csv")
            pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir="", custom_run=True, test_file="bad_data_with_missing_cols.csv")

0 comments on commit 3c22ff1

Please sign in to comment.