diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index f30837c7e..c72b4e124 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
     too-many-locals,
     too-many-arguments,
+    too-many-branches,
+    too-many-statements,
     # Allow pytest functions to be part of a class.
     no-self-use,
     # Allow pytest classes to have one test.
diff --git a/nchs_mortality/README.md b/nchs_mortality/README.md
index fd75e04ad..ccd07142a 100644
--- a/nchs_mortality/README.md
+++ b/nchs_mortality/README.md
@@ -8,7 +8,7 @@ the state-level data as-is. For detailed information see the files
 `MyAppToken` is required when fetching data from SODA Consumer API
 (https://dev.socrata.com/foundry/data.cdc.gov/r8kw-7aab). Follow the
 steps below to create a MyAppToken.
-- Click the `Sign up for an app toekn` buttom in the linked website
+- Click the `Sign up for an app token` button in the linked website
 - Sign In or Sign Up with Socrata ID
 - Clck the `Create New App Token` button
 - Fill in `Application Name` and `Description` (You can just use NCHS_Mortality
diff --git a/nchs_mortality/delphi_nchs_mortality/constants.py b/nchs_mortality/delphi_nchs_mortality/constants.py
index 164b84307..800444e58 100644
--- a/nchs_mortality/delphi_nchs_mortality/constants.py
+++ b/nchs_mortality/delphi_nchs_mortality/constants.py
@@ -25,7 +25,6 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = "state"
 
 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports
diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 45887041e..bb4d1a24d 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -96,8 +96,6 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
 {NEWLINE.join(df.columns)}
 """) from exc
 
-    # Drop rows for locations outside US
-    df = df[df["state"] != "United States"]
     df = df[keep_columns + ["timestamp", "state"]].set_index("timestamp")
 
     # NCHS considers NYC as an individual state, however, we want it included
@@ -124,6 +122,11 @@
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
+    # Map state to geo_id, but set dropna=False as we also have national data
+    df = gmpr.add_population_column(df, "state_name",
+                                    geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id",
+                          from_col="state", new_col="geo_id", dropna=False)
+    # Manually set geo_id for national data
+    df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index b8a7832d4..464fbfdcb 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -13,7 +13,7 @@
 
 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
-                        SENSORS, INCIDENCE_BASE, GEO_RES)
+                        SENSORS, INCIDENCE_BASE)
 from .pull import pull_nchs_mortality_data
 
 
@@ -72,51 +72,54 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            df = add_nancodes(df)
-            # df = df[~df["val"].isnull()]
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res=GEO_RES,
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
-        else:
-            for sensor in SENSORS:
+        for geo in ["state", "nation"]:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric,
-                            sensor = sensor)
+                            metric=metric, geo_level=geo)
                 df = df_pull.copy()
-                if sensor == "num":
-                    df["val"] = df[metric]
+                if geo == "nation":
+                    df = df[df["geo_id"] == "us"]
                 else:
-                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df = df[df["geo_id"] != "us"]
+                df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
                 df = add_nancodes(df)
-                # df = df[~df["val"].isnull()]
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                 dates = create_export_csv(
                     df,
-                    geo_res=GEO_RES,
+                    geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
+                    sensor=SENSOR_NAME_MAP[metric],
                     weekly_dates=True
                 )
-                if len(dates) > 0:
-                    stats.append((max(dates), len(dates)))
+            else:
+                for sensor in SENSORS:
+                    logger.info("Generating signal and exporting to CSV",
+                                metric=metric, sensor=sensor, geo_level=geo)
+                    df = df_pull.copy()
+                    if geo == "nation":
+                        df = df[df["geo_id"] == "us"]
+                    else:
+                        df = df[df["geo_id"] != "us"]
+                    if sensor == "num":
+                        df["val"] = df[metric]
+                    else:
+                        df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df["se"] = np.nan
+                    df["sample_size"] = np.nan
+                    df = add_nancodes(df)
+                    sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                    dates = create_export_csv(
+                        df,
+                        geo_res=geo,
+                        export_dir=daily_export_dir,
+                        start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                        sensor=sensor_name,
+                        weekly_dates=True
+                    )
+            if len(dates) > 0:
+                stats.append((max(dates), len(dates)))
 
     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index d1355afd3..a13d30845 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -19,6 +19,7 @@ def test_output_files_exist(self, run_as_module, date):
 
         for output_folder in folders:
             csv_files = listdir(output_folder)
+            geos = ["nation", "state"]
             dates = [
                 "202030",
                 "202031",
@@ -38,15 +39,14 @@
         sensors = ["num", "prop"]
 
         expected_files = []
-        for d in dates:
-            for metric in metrics:
-                if metric == "deaths_percent_of_expected":
-                    expected_files += ["weekly_" + d + "_state_" \
-                        + metric + ".csv"]
-                else:
-                    for sensor in sensors:
-                        expected_files += ["weekly_" + d + "_state_" \
-                            + metric + "_" + sensor + ".csv"]
+        for geo in geos:
+            for d in dates:
+                for metric in metrics:
+                    if metric == "deaths_percent_of_expected":
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                    else:
+                        for sensor in sensors:
+                            expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]
         assert set(expected_files).issubset(set(csv_files))
 
     # the 14th was a Monday
@@ -58,12 +58,14 @@ def test_output_file_format(self, run_as_module, date):
         if is_mon_or_thurs:
             folders.append("receiving")
 
-        for output_folder in folders:
-            df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
-            )
-            expected_columns = [
-                "geo_id", "val", "se", "sample_size",
-                "missing_val", "missing_se", "missing_sample_size"
-            ]
-            assert (df.columns.values == expected_columns).all()
+        geos = ["nation", "state"]
+        for geo in geos:
+            for output_folder in folders:
+                df = pd.read_csv(
+                    join(output_folder, f"weekly_202026_{geo}_deaths_covid_incidence_prop.csv")
+                )
+                expected_columns = [
+                    "geo_id", "val", "se", "sample_size",
+                    "missing_val", "missing_se", "missing_sample_size"
+                ]
+                assert (df.columns.values == expected_columns).all()