From 34d3d7e285dae335098485c41dbac2da4133e680 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 17:02:41 +0200
Subject: [PATCH 01/11] Synthesize national data for nchs-mortality signals

---
 .../delphi_nchs_mortality/constants.py      |  2 +-
 nchs_mortality/delphi_nchs_mortality/run.py | 81 ++++++++++++-------
 nchs_mortality/tests/test_run.py            | 52 ++++++++----
 3 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/nchs_mortality/delphi_nchs_mortality/constants.py b/nchs_mortality/delphi_nchs_mortality/constants.py
index 164b84307..4e8cdc144 100644
--- a/nchs_mortality/delphi_nchs_mortality/constants.py
+++ b/nchs_mortality/delphi_nchs_mortality/constants.py
@@ -25,7 +25,7 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = "state"
+GEO_RES = ["state", "nation"]
 
 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index b8a7832d4..03cb67ede 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from delphi_utils import S3ArchiveDiffer, get_structured_logger, create_export_csv, Nans
+from delphi_utils.geomap import GeoMapper
 
 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
@@ -29,6 +30,21 @@ def add_nancodes(df):
     df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
     return df
 
+def county_to_nation(df):
+    """Aggregate county data to national data."""
+    gmpr = GeoMapper()
+
+    # Convert from state code (e.g. 'al', 'ca') to nation code
+    df = gmpr.add_geocode(df, "state_code", "nation", from_col="geo_id", new_col="nation")
+
+    # Replace old geo_id column with new nation column
+    df.drop(columns="geo_id", inplace=True)
+    df.rename(columns={"nation": "geo_id"}, inplace=True)
+
+    # Sum up numeric values, like incidence, deaths or population
+    df = df.groupby(["timestamp", "geo_id"]).sum(numeric_only=True).reset_index()
+    return df
+
 def run_module(params: Dict[str, Any]):
     """Run module for processing NCHS mortality data.
@@ -71,45 +87,24 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
-    for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            df = add_nancodes(df)
-            # df = df[~df["val"].isnull()]
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res=GEO_RES,
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
-        else:
-            for sensor in SENSORS:
+    for geo in GEO_RES:
+        for metric in METRICS:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
                             metric = metric,
                             sensor = sensor)
                 df = df_pull.copy()
-                if sensor == "num":
-                    df["val"] = df[metric]
-                else:
-                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                if geo == "nation":
+                    df = county_to_nation(df)
+                df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
-                df = add_nancodes(df)
                 # df = df[~df["val"].isnull()]
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                df = add_nancodes(df)
+                sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
                 dates = create_export_csv(
                     df,
-                    geo_res=GEO_RES,
+                    geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
                     sensor=sensor_name,
@@ -117,6 +112,32 @@ def run_module(params: Dict[str, Any]):
                 )
                 if len(dates) > 0:
                     stats.append((max(dates), len(dates)))
+            else:
+                for sensor in SENSORS:
+                    logger.info("Generating signal and exporting to CSV",
+                                metric = metric)
+                    df = df_pull.copy()
+                    if geo == "nation":
+                        df = county_to_nation(df)
+                    if sensor == "num":
+                        df["val"] = df[metric]
+                    else:
+                        df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df["se"] = np.nan
+                    df["sample_size"] = np.nan
+                    # df = df[~df["val"].isnull()]
+                    df = add_nancodes(df)
+                    sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                    dates = create_export_csv(
+                        df,
+                        geo_res=geo,
+                        export_dir=daily_export_dir,
+                        start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                        sensor=sensor_name,
+                        weekly_dates=True
+                    )
+                    if len(dates) > 0:
+                        stats.append((max(dates), len(dates)))
 
     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index d1355afd3..4780be05f 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -19,6 +19,7 @@ def test_output_files_exist(self, run_as_module, date):
         for output_folder in folders:
             csv_files = listdir(output_folder)
 
+        geos = ["nation", "state"]
         dates = [
             "202030",
             "202031",
@@ -38,15 +39,14 @@ def test_output_files_exist(self, run_as_module, date):
         sensors = ["num", "prop"]
 
         expected_files = []
-        for d in dates:
-            for metric in metrics:
-                if metric == "deaths_percent_of_expected":
-                    expected_files += ["weekly_" + d + "_state_" \
-                                       + metric + ".csv"]
-                else:
-                    for sensor in sensors:
-                        expected_files += ["weekly_" + d + "_state_" \
-                                           + metric + "_" + sensor + ".csv"]
+        for geo in geos:
+            for d in dates:
+                for metric in metrics:
+                    if metric == "deaths_percent_of_expected":
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                    else:
+                        for sensor in sensors:
+                            expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]
 
         assert set(expected_files).issubset(set(csv_files))
 
     # the 14th was a Monday
@@ -54,16 +54,36 @@ def test_output_file_format(self, run_as_module, date):
         is_mon_or_thurs = dt.datetime.strptime(date, "%Y-%m-%d").weekday() == (0 or 3)
 
+        folders = ["daily_cache"]
+        if is_mon_or_thurs:
+            folders.append("receiving")
+
+        geos = ["nation", "state"]
+        for geo in geos:
+            for output_folder in folders:
+                df = pd.read_csv(
+                    join(output_folder, f"weekly_202026_{geo}_deaths_covid_incidence_prop.csv")
+                )
+                expected_columns = [
+                    "geo_id", "val", "se", "sample_size",
+                    "missing_val", "missing_se", "missing_sample_size"
+                ]
+                assert (df.columns.values == expected_columns).all()
+
+    @pytest.mark.parametrize("date", ["2020-09-14", "2020-09-17", "2020-09-18"])
+    def test_nation_state_aggregation(self, run_as_module, date):
+        is_mon_or_thurs = dt.datetime.strptime(date, "%Y-%m-%d").weekday() == (0 or 3)
+
         folders = ["daily_cache"]
         if is_mon_or_thurs:
             folders.append("receiving")
 
         for output_folder in folders:
-            df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
+            state = pd.read_csv(
+                join(output_folder, f"weekly_202026_state_deaths_covid_incidence_num.csv")
             )
-            expected_columns = [
-                "geo_id", "val", "se", "sample_size",
-                "missing_val", "missing_se", "missing_sample_size"
-            ]
-            assert (df.columns.values == expected_columns).all()
+            nation = pd.read_csv(
+                join(output_folder, f"weekly_202026_nation_deaths_covid_incidence_num.csv")
+            )
+            # Assert that the national value is the sum of state values
+            assert (state['val'].sum() == nation['val'].iloc[0])
\ No newline at end of file

From 52caab2b349d63069ee9821f995069bb9195ddc6 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 17:05:48 +0200
Subject: [PATCH 02/11] Switch where sensor is logged

---
 nchs_mortality/delphi_nchs_mortality/run.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 03cb67ede..05d6319a4 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -91,8 +91,7 @@ def run_module(params: Dict[str, Any]):
         for metric in METRICS:
             if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric,
-                            sensor = sensor)
+                            metric = metric)
                 df = df_pull.copy()
                 if geo == "nation":
                     df = county_to_nation(df)
@@ -115,7 +114,8 @@ def run_module(params: Dict[str, Any]):
             else:
                 for sensor in SENSORS:
                     logger.info("Generating signal and exporting to CSV",
-                                metric = metric)
+                                metric = metric,
+                                sensor=sensor)
                     df = df_pull.copy()
                     if geo == "nation":
                         df = county_to_nation(df)

From 572c869f1c0750ae850e3c8430fb702f502f59ea Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 17:19:37 +0200
Subject: [PATCH 03/11] Tone down linting

---
 nchs_mortality/.pylintrc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index f30837c7e..c72b4e124 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
+        too-many-branches,
+        too-many-statements,
         # Allow pytest functions to be part of a class.
         no-self-use,
         # Allow pytest classes to have one test.
From 65eba2d118347785ab148720b5c56728a5254a79 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 18:07:28 +0200
Subject: [PATCH 04/11] Add newline

---
 nchs_mortality/tests/test_run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index 4780be05f..342525a47 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -86,4 +86,4 @@ def test_nation_state_aggregation(self, run_as_module, date):
                 join(output_folder, f"weekly_202026_nation_deaths_covid_incidence_num.csv")
             )
             # Assert that the national value is the sum of state values
-            assert (state['val'].sum() == nation['val'].iloc[0])
\ No newline at end of file
+            assert (state['val'].sum() == nation['val'].iloc[0])

From 59ffab0c4614e238edf496315de161bca8ff0f5c Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 18:26:18 +0200
Subject: [PATCH 05/11] Do not generate national data for deaths_percent_of_expected

---
 .../delphi_nchs_mortality/constants.py      |  1 -
 nchs_mortality/delphi_nchs_mortality/run.py | 50 +++++++++----------
 nchs_mortality/tests/test_run.py            |  3 +-
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/nchs_mortality/delphi_nchs_mortality/constants.py b/nchs_mortality/delphi_nchs_mortality/constants.py
index 4e8cdc144..800444e58 100644
--- a/nchs_mortality/delphi_nchs_mortality/constants.py
+++ b/nchs_mortality/delphi_nchs_mortality/constants.py
@@ -25,7 +25,6 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = ["state", "nation"]
 
 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 05d6319a4..01ea0c8f5 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -14,7 +14,7 @@
 
 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
-                        SENSORS, INCIDENCE_BASE, GEO_RES)
+                        SENSORS, INCIDENCE_BASE)
 from .pull import pull_nchs_mortality_data
 
 
@@ -87,31 +87,29 @@ def run_module(params: Dict[str, Any]):
 
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
-    for geo in GEO_RES:
-        for metric in METRICS:
-            if metric == 'percent_of_expected_deaths':
-                logger.info("Generating signal and exporting to CSV",
-                            metric = metric)
-                df = df_pull.copy()
-                if geo == "nation":
-                    df = county_to_nation(df)
-                df["val"] = df[metric]
-                df["se"] = np.nan
-                df["sample_size"] = np.nan
-                # df = df[~df["val"].isnull()]
-                df = add_nancodes(df)
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-                dates = create_export_csv(
-                    df,
-                    geo_res=geo,
-                    export_dir=daily_export_dir,
-                    start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
-                    weekly_dates=True
-                )
-                if len(dates) > 0:
-                    stats.append((max(dates), len(dates)))
+    for metric in METRICS:
+        if metric == 'percent_of_expected_deaths':
+            logger.info("Generating signal and exporting to CSV",
+                        metric = metric)
+            df = df_pull.copy()
+            df["val"] = df[metric]
+            df["se"] = np.nan
+            df["sample_size"] = np.nan
+            # df = df[~df["val"].isnull()]
+            df = add_nancodes(df)
+            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
+            dates = create_export_csv(
+                df,
+                geo_res="state",
+                export_dir=daily_export_dir,
+                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                sensor=sensor_name,
+                weekly_dates=True
+            )
+            if len(dates) > 0:
+                stats.append((max(dates), len(dates)))
+        else:
+            for geo in ["state", "nation"]:
                 for sensor in SENSORS:
                     logger.info("Generating signal and exporting to CSV",
                                 metric = metric,
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index 342525a47..6fa3ad48d 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -43,7 +43,8 @@ def test_output_files_exist(self, run_as_module, date):
             for d in dates:
                 for metric in metrics:
                     if metric == "deaths_percent_of_expected":
-                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                        # No nation aggregation for this metric
+                        expected_files += [f"weekly_{d}_state_{metric}.csv"]
                     else:
                         for sensor in sensors:
                             expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]

From 58c0c97f9876da66bee95259938b056fd29e306c Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 18:34:27 +0200
Subject: [PATCH 06/11] Remove aggregation method + remove unneeded lint exclusion

---
 nchs_mortality/.pylintrc                    | 1 -
 nchs_mortality/delphi_nchs_mortality/run.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index c72b4e124..6f75905fa 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,7 +4,6 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
-        too-many-branches,
         too-many-statements,
         # Allow pytest functions to be part of a class.
         no-self-use,
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 01ea0c8f5..0b595a48c 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -30,8 +30,8 @@ def add_nancodes(df):
     df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
     return df
 
-def county_to_nation(df):
-    """Aggregate county data to national data."""
+def state_to_nation(df):
+    """Aggregate state data to national data."""
     gmpr = GeoMapper()
 
     # Convert from state code (e.g. 'al', 'ca') to nation code
@@ -116,7 +116,7 @@ def run_module(params: Dict[str, Any]):
                                 sensor=sensor)
                     df = df_pull.copy()
                     if geo == "nation":
-                        df = county_to_nation(df)
+                        df = state_to_nation(df)
                     if sensor == "num":
                         df["val"] = df[metric]
                     else:

From f34243d1ceb7d041b079cb86db9863d36f6aca61 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 22 Nov 2023 18:35:02 +0200
Subject: [PATCH 07/11] Lint fix

---
 nchs_mortality/.pylintrc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index 6f75905fa..ef21dd768 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,7 +4,7 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
-        too-many-statements,
+        too-many-branches,
         # Allow pytest functions to be part of a class.
         no-self-use,
         # Allow pytest classes to have one test.
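
Patch 05 reflects the fact that the two signal families aggregate differently: death counts are additive across states, while percent_of_expected_deaths is a ratio and cannot be summed (two states at 95% and 105% of expected deaths do not make a nation at 200%). A small sketch of how the remaining "num" and "prop" sensors relate, using INCIDENCE_BASE from constants.py; the figures are hypothetical.

```python
INCIDENCE_BASE = 100000  # per-100k scaling factor from constants.py

# Hypothetical weekly figures for one geo.
covid_deaths = 250.0     # exported as-is by the "num" sensor
population = 5_000_000   # summed over states for the "nation" geo

# The "prop" sensor rescales the count to deaths per 100,000 residents.
prop = covid_deaths / population * INCIDENCE_BASE
print(prop)  # 5.0
```

Because both the numerator and the denominator are summed before dividing, the national prop value is a population-weighted rate rather than an average of state rates.
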
From 32bae9c3418803841d488418fdd8c4007dd7d7af Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 10 Jan 2024 15:30:11 +0200
Subject: [PATCH 08/11] Use national data rather than sum of cell data

---
 nchs_mortality/.pylintrc                     |  1 -
 nchs_mortality/delphi_nchs_mortality/pull.py |  9 +++++----
 nchs_mortality/delphi_nchs_mortality/run.py  | 21 ++++----------------
 nchs_mortality/tests/test_run.py             | 18 ------------------
 4 files changed, 9 insertions(+), 40 deletions(-)

diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index ef21dd768..f30837c7e 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,7 +4,6 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
-        too-many-branches,
         # Allow pytest functions to be part of a class.
         no-self-use,
         # Allow pytest classes to have one test.
diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 45887041e..15c653696 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -96,8 +96,6 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
                 {NEWLINE.join(df.columns)}
                 """) from exc
 
-    # Drop rows for locations outside US
-    df = df[df["state"] != "United States"]
     df = df[keep_columns + ["timestamp", "state"]].set_index("timestamp")
 
     # NCHS considers NYC as an individual state, however, we want it included
@@ -124,6 +122,9 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
+    # Map state to geo_id, but set dropna=False as we also have national data
+    df = gmpr.add_population_column(df, "state_name", geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id", dropna=False)
+    # Manually set geo_id for national data
+    df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 0b595a48c..1517fdf0b 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -10,7 +10,6 @@
 import numpy as np
 
 from delphi_utils import S3ArchiveDiffer, get_structured_logger, create_export_csv, Nans
-from delphi_utils.geomap import GeoMapper
 
 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
@@ -30,21 +29,6 @@ def add_nancodes(df):
     df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
     return df
 
-def state_to_nation(df):
-    """Aggregate state data to national data."""
-    gmpr = GeoMapper()
-
-    # Convert from state code (e.g. 'al', 'ca') to nation code
-    df = gmpr.add_geocode(df, "state_code", "nation", from_col="geo_id", new_col="nation")
-
-    # Replace old geo_id column with new nation column
-    df.drop(columns="geo_id", inplace=True)
-    df.rename(columns={"nation": "geo_id"}, inplace=True)
-
-    # Sum up numeric values, like incidence, deaths or population
-    df = df.groupby(["timestamp", "geo_id"]).sum(numeric_only=True).reset_index()
-    return df
-
 def run_module(params: Dict[str, Any]):
     """Run module for processing NCHS mortality data.
@@ -92,6 +76,7 @@ def run_module(params: Dict[str, Any]):
             logger.info("Generating signal and exporting to CSV",
                         metric = metric)
             df = df_pull.copy()
+            df = df[df["geo_id"] != "us"]
             df["val"] = df[metric]
             df["se"] = np.nan
             df["sample_size"] = np.nan
@@ -116,7 +101,9 @@ def run_module(params: Dict[str, Any]):
                                 sensor=sensor)
                     df = df_pull.copy()
                     if geo == "nation":
-                        df = state_to_nation(df)
+                        df = df[df["geo_id"] == "us"]
+                    else:
+                        df = df[df["geo_id"] != "us"]
                     if sensor == "num":
                         df["val"] = df[metric]
                     else:
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index 6fa3ad48d..b5a35d0f8 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -70,21 +70,3 @@ def test_output_file_format(self, run_as_module, date):
                 "missing_val", "missing_se", "missing_sample_size"
             ]
             assert (df.columns.values == expected_columns).all()
-
-    @pytest.mark.parametrize("date", ["2020-09-14", "2020-09-17", "2020-09-18"])
-    def test_nation_state_aggregation(self, run_as_module, date):
-        is_mon_or_thurs = dt.datetime.strptime(date, "%Y-%m-%d").weekday() == (0 or 3)
-
-        folders = ["daily_cache"]
-        if is_mon_or_thurs:
-            folders.append("receiving")
-
-        for output_folder in folders:
-            state = pd.read_csv(
-                join(output_folder, f"weekly_202026_state_deaths_covid_incidence_num.csv")
-            )
-            nation = pd.read_csv(
-                join(output_folder, f"weekly_202026_nation_deaths_covid_incidence_num.csv")
-            )
-            # Assert that the national value is the sum of state values
-            assert (state['val'].sum() == nation['val'].iloc[0])

From 7c6e2d82ae2e85da16c55ee73658e7473e975494 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 10 Jan 2024 15:33:38 +0200
Subject: [PATCH 09/11] Lint

---
 nchs_mortality/.pylintrc                     | 2 ++
 nchs_mortality/delphi_nchs_mortality/pull.py | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/nchs_mortality/.pylintrc b/nchs_mortality/.pylintrc
index f30837c7e..c72b4e124 100644
--- a/nchs_mortality/.pylintrc
+++ b/nchs_mortality/.pylintrc
@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
+        too-many-branches,
+        too-many-statements,
         # Allow pytest functions to be part of a class.
         no-self-use,
         # Allow pytest classes to have one test.
diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 15c653696..bb4d1a24d 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -123,8 +123,10 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
     # Map state to geo_id, but set dropna=False as we also have national data
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state", dropna=False)
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id", dropna=False)
+    df = gmpr.add_population_column(df, "state_name",
+                                    geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id",
+                          from_col="state", new_col="geo_id", dropna=False)
     # Manually set geo_id for national data
     df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]

From 84191a53343ce15167eb56042e16a9578a094d06 Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Wed, 10 Jan 2024 15:52:44 +0200
Subject: [PATCH 10/11] Add national data for deaths_percent_of_expected

---
 nchs_mortality/delphi_nchs_mortality/run.py | 44 +++++++++++----------
 nchs_mortality/tests/test_run.py            |  3 +-
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 1517fdf0b..3e57f9d97 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -73,26 +73,30 @@ def run_module(params: Dict[str, Any]):
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
         if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df = df[df["geo_id"] != "us"]
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            # df = df[~df["val"].isnull()]
-            df = add_nancodes(df)
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res="state",
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
+            for geo in ["state", "nation"]:
+                logger.info("Generating signal and exporting to CSV",
+                            metric = metric)
+                df = df_pull.copy()
+                if geo == "nation":
+                    df = df[df["geo_id"] == "us"]
+                else:
+                    df = df[df["geo_id"] != "us"]
+                df["val"] = df[metric]
+                df["se"] = np.nan
+                df["sample_size"] = np.nan
+                # df = df[~df["val"].isnull()]
+                df = add_nancodes(df)
+                sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
+                dates = create_export_csv(
+                    df,
+                    geo_res=geo,
+                    export_dir=daily_export_dir,
+                    start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                    sensor=sensor_name,
+                    weekly_dates=True
+                )
+                if len(dates) > 0:
+                    stats.append((max(dates), len(dates)))
         else:
             for geo in ["state", "nation"]:
diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py
index b5a35d0f8..a13d30845 100644
--- a/nchs_mortality/tests/test_run.py
+++ b/nchs_mortality/tests/test_run.py
@@ -43,8 +43,7 @@ def test_output_files_exist(self, run_as_module, date):
             for d in dates:
                 for metric in metrics:
                     if metric == "deaths_percent_of_expected":
-                        # No nation aggregation for this metric
-                        expected_files += [f"weekly_{d}_state_{metric}.csv"]
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
                     else:
                         for sensor in sensors:
                             expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]

From 9271264aed46e047611c27e72a50a05e0d2fb49d Mon Sep 17 00:00:00 2001
From: Rostyslav Zatserkovnyi
Date: Thu, 11 Jan 2024 22:02:56 +0200
Subject: [PATCH 11/11] Code review, small readme tweak

---
 nchs_mortality/README.md                    |  2 +-
 nchs_mortality/delphi_nchs_mortality/run.py | 23 +++++++----------------
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/nchs_mortality/README.md b/nchs_mortality/README.md
index fd75e04ad..ccd07142a 100644
--- a/nchs_mortality/README.md
+++ b/nchs_mortality/README.md
@@ -8,7 +8,7 @@ the state-level data as-is. For detailed information see the files
 `MyAppToken` is required when fetching data from SODA Consumer API
 (https://dev.socrata.com/foundry/data.cdc.gov/r8kw-7aab). Follow the
 steps below to create a MyAppToken.
-- Click the `Sign up for an app toekn` buttom in the linked website
+- Click the `Sign up for an app token` button in the linked website
 - Sign In or Sign Up with Socrata ID
 - Clck the `Create New App Token` button
 - Fill in `Application Name` and `Description` (You can just use NCHS_Mortality
diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py
index 3e57f9d97..464fbfdcb 100644
--- a/nchs_mortality/delphi_nchs_mortality/run.py
+++ b/nchs_mortality/delphi_nchs_mortality/run.py
@@ -72,10 +72,10 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            for geo in ["state", "nation"]:
+        for geo in ["state", "nation"]:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric)
+                            metric=metric, geo_level=geo)
                 df = df_pull.copy()
                 if geo == "nation":
                     df = df[df["geo_id"] == "us"]
@@ -84,25 +84,19 @@ def run_module(params: Dict[str, Any]):
                 df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
-                # df = df[~df["val"].isnull()]
                 df = add_nancodes(df)
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
                 dates = create_export_csv(
                     df,
                     geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
+                    sensor=SENSOR_NAME_MAP[metric],
                     weekly_dates=True
                 )
-                if len(dates) > 0:
-                    stats.append((max(dates), len(dates)))
-        else:
-            for geo in ["state", "nation"]:
+            else:
                 for sensor in SENSORS:
                     logger.info("Generating signal and exporting to CSV",
-                                metric = metric,
-                                sensor=sensor)
+                                metric=metric, sensor=sensor, geo_level=geo)
                     df = df_pull.copy()
                     if geo == "nation":
                         df = df[df["geo_id"] == "us"]
@@ -114,7 +108,6 @@ def run_module(params: Dict[str, Any]):
                         df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                     df["se"] = np.nan
                     df["sample_size"] = np.nan
-                    # df = df[~df["val"].isnull()]
                     df = add_nancodes(df)
                     sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                     dates = create_export_csv(
@@ -125,8 +118,8 @@ def run_module(params: Dict[str, Any]):
                         sensor=sensor_name,
                         weekly_dates=True
                     )
-                    if len(dates) > 0:
-                        stats.append((max(dates), len(dates)))
+            if len(dates) > 0:
+                stats.append((max(dates), len(dates)))
 
     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility