
Release covidcast-indicators 0.3.50 #1932

Merged: 29 commits, Jan 16, 2024
Commits
1e1b143
rough revision script and init nwss_wastewater
dsweber2 Oct 17, 2023
8667e6b
wastewater first draft
dsweber2 Dec 6, 2023
9368c76
fix: actually working via `python -m delphi_NWSS`
dsweber2 Dec 7, 2023
8f202e7
style: happy linter
dsweber2 Dec 7, 2023
20aaebe
include setup.py...
dsweber2 Dec 14, 2023
22bb268
need a template
dsweber2 Dec 14, 2023
ba53086
format
dsweber2 Dec 14, 2023
95b4ef2
matching log location elsewhere, add to python ci
dsweber2 Dec 14, 2023
d118aa4
get the name right, include notes update
dsweber2 Dec 14, 2023
414eb69
sum, but if all nan, return nan
dsweber2 Dec 14, 2023
8b62909
addressing review comments
dsweber2 Dec 15, 2023
871a227
nwss: enough tests to pass CI
dsweber2 Dec 15, 2023
8e49de1
nwss: happy linter
dsweber2 Dec 15, 2023
bf6b487
add_default_nancodes, pass lint & test
dsweber2 Dec 16, 2023
1a347c6
lint: no megalines
dsweber2 Dec 16, 2023
cf7fbba
nwss: adding tests for pull functions
dsweber2 Dec 19, 2023
af15a42
Add param template
minhkhul Jan 3, 2024
e9cab35
remove archive
minhkhul Jan 3, 2024
56bddfb
add daily_receiving dir
minhkhul Jan 4, 2024
8d06adf
add gitignore csv files
minhkhul Jan 4, 2024
8465082
Update nwss_wastewater-params-prod.json.j2
dsweber2 Jan 4, 2024
b2c67f2
Delete nwss_wastewater/daily_receiving/.gitignore
minhkhul Jan 4, 2024
6ae4d2d
Merge pull request #1920 from cmu-delphi/nwss-stage
minhkhul Jan 4, 2024
56d2d25
Merge pull request #1923 from cmu-delphi/bot/sync-prod-main
melange396 Jan 8, 2024
833e818
Merge pull request #1913 from cmu-delphi/nwss
dsweber2 Jan 8, 2024
67adb8d
Use national data for nchs-mortality signals (#1912)
rzats Jan 11, 2024
b79525b
chore: bump delphi_utils to 0.3.22
Jan 16, 2024
fb47683
chore: bump covidcast-indicators to 0.3.50
Jan 16, 2024
624b73b
[create-pull-request] automated change
melange396 Jan 16, 2024
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.49
+current_version = 0.3.50
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
56 changes: 34 additions & 22 deletions .github/workflows/python-ci.yml
@@ -5,37 +5,49 @@ name: Python package
 
 on:
   push:
-    branches: [ main, prod ]
+    branches: [main, prod]
   pull_request:
-    types: [ opened, synchronize, reopened, ready_for_review ]
-    branches: [ main, prod ]
+    types: [opened, synchronize, reopened, ready_for_review]
+    branches: [main, prod]
 
 jobs:
   build:
     runs-on: ubuntu-20.04
     if: github.event.pull_request.draft == false
     strategy:
       matrix:
-        packages: [_delphi_utils_python, changehc, claims_hosp, doctor_visits, google_symptoms, hhs_hosp, nchs_mortality, quidel_covidtest, sir_complainsalot]
+        packages:
+          [
+            _delphi_utils_python,
+            changehc,
+            claims_hosp,
+            doctor_visits,
+            google_symptoms,
+            hhs_hosp,
+            nchs_mortality,
+            nwss_wastewater,
+            quidel_covidtest,
+            sir_complainsalot,
+          ]
     defaults:
       run:
         working-directory: ${{ matrix.packages }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.8
-    - name: Install testing dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint pytest pydocstyle wheel
-    - name: Install
-      run: |
-        make install-ci
-    - name: Lint
-      run: |
-        make lint
-    - name: Test
-      run: |
-        make test
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Install testing dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pylint pytest pydocstyle wheel
+      - name: Install
+        run: |
+          make install-ci
+      - name: Lint
+        run: |
+          make lint
+      - name: Test
+        run: |
+          make test
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -10,7 +10,7 @@
 - TODO: #527 Get this list automatically from python-ci.yml at runtime.
 */
 
-def indicator_list = ["backfill_corrections", "changehc", "claims_hosp", "google_symptoms", "hhs_hosp", "nchs_mortality", "quidel_covidtest", "sir_complainsalot", "doctor_visits"]
+def indicator_list = ["backfill_corrections", "changehc", "claims_hosp", "google_symptoms", "hhs_hosp", "nchs_mortality", "quidel_covidtest", "sir_complainsalot", "doctor_visits", "nwss_wastewater"]
 def build_package_main = [:]
 def build_package_prod = [:]
 def deploy_staging = [:]
2 changes: 1 addition & 1 deletion _delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.21
+current_version = 0.3.22
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday
 
-__version__ = "0.3.21"
+__version__ = "0.3.22"
30 changes: 29 additions & 1 deletion _delphi_utils_python/delphi_utils/nancodes.py
@@ -1,13 +1,41 @@
 """Unified not-a-number codes for CMU Delphi codebase."""
 
 from enum import IntEnum
+import pandas as pd
 
 
 class Nans(IntEnum):
-    """An enum of not-a-number codes for the indicators."""
+    """An enum of not-a-number codes for the indicators.
+
+    See the descriptions here: https://cmu-delphi.github.io/delphi-epidata/api/missing_codes.html
+    """
 
     NOT_MISSING = 0
     NOT_APPLICABLE = 1
     REGION_EXCEPTION = 2
     CENSORED = 3
     DELETED = 4
     OTHER = 5
+
+
+def add_default_nancodes(df: pd.DataFrame):
+    """Add some default nancodes to the dataframe.
+
+    This method sets the `"missing_val"` column to NOT_MISSING whenever the
+    `"val"` column has `isnull()` as `False`; if `isnull()` is `True`, then it
+    sets `"missing_val"` to `OTHER`. It also sets both the `"missing_se"` and
+    `"missing_sample_size"` columns to `NOT_APPLICABLE`.
+
+    Returns
+    -------
+    pd.DataFrame
+    """
+    # Default missingness codes
+    df["missing_val"] = Nans.NOT_MISSING
+    df["missing_se"] = Nans.NOT_APPLICABLE
+    df["missing_sample_size"] = Nans.NOT_APPLICABLE
+
+    # Mark any remaining nans with unknown
+    remaining_nans_mask = df["val"].isnull()
+    df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
+    return df
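As a sanity check on the new helper's behavior, the sketch below re-implements the enum and function from the diff above on a toy frame (re-implemented locally for illustration rather than imported from delphi_utils):

```python
from enum import IntEnum

import pandas as pd


class Nans(IntEnum):
    """Not-a-number codes, copied from the diff above."""

    NOT_MISSING = 0
    NOT_APPLICABLE = 1
    REGION_EXCEPTION = 2
    CENSORED = 3
    DELETED = 4
    OTHER = 5


def add_default_nancodes(df: pd.DataFrame) -> pd.DataFrame:
    """Mirror of the helper added in nancodes.py."""
    # Default missingness codes
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE
    # Mark any remaining nans with unknown
    df.loc[df["val"].isnull(), "missing_val"] = Nans.OTHER
    return df


df = add_default_nancodes(pd.DataFrame({"val": [1.2, None, 3.4]}))
print(df["missing_val"].tolist())  # [0, 5, 0]
```

The one NaN row gets `missing_val = OTHER` (5), while every row gets `NOT_APPLICABLE` (1) for the se and sample-size codes.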
2 changes: 1 addition & 1 deletion _delphi_utils_python/setup.py
@@ -27,7 +27,7 @@
 
 setup(
     name="delphi_utils",
-    version="0.3.21",
+    version="0.3.22",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",
13 changes: 13 additions & 0 deletions ansible/templates/nwss_wastewater-params-prod.json.j2
@@ -0,0 +1,13 @@
+{
+  "common": {
+    "export_dir": "./receiving",
+    "log_filename": "./nwss_wastewater.log",
+    "log_exceptions": false
+  },
+  "indicator": {
+    "wip_signal": true,
+    "export_start_date": "2020-02-01",
+    "static_file_dir": "./static",
+    "token": ""
+  }
+}
2 changes: 1 addition & 1 deletion changehc/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50
2 changes: 1 addition & 1 deletion claims_hosp/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50
2 changes: 1 addition & 1 deletion doctor_visits/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50
2 changes: 1 addition & 1 deletion google_symptoms/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50
2 changes: 1 addition & 1 deletion hhs_hosp/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50
2 changes: 2 additions & 0 deletions nchs_mortality/.pylintrc
@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
         too-many-locals,
         too-many-arguments,
+        too-many-branches,
+        too-many-statements,
         # Allow pytest functions to be part of a class.
         no-self-use,
         # Allow pytest classes to have one test.
4 changes: 2 additions & 2 deletions nchs_mortality/README.md
@@ -8,9 +8,9 @@ the state-level data as-is. For detailed information see the files
 `MyAppToken` is required when fetching data from SODA Consumer API
 (https://dev.socrata.com/foundry/data.cdc.gov/r8kw-7aab). Follow the
 steps below to create a MyAppToken.
-- Click the `Sign up for an app toekn` buttom in the linked website
+- Click the `Sign up for an app token` button in the linked website
 - Sign In or Sign Up with Socrata ID
-- Clck the `Create New App Token` button
+- Click the `Create New App Token` button
 - Fill in `Application Name` and `Description` (You can just use NCHS_Mortality
   for both) and click `Save`
 - Copy the `App Token`
1 change: 0 additions & 1 deletion nchs_mortality/delphi_nchs_mortality/constants.py
@@ -25,7 +25,6 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = "state"
 
 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports
11 changes: 7 additions & 4 deletions nchs_mortality/delphi_nchs_mortality/pull.py
@@ -96,8 +96,6 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
         {NEWLINE.join(df.columns)}
         """) from exc
 
-    # Drop rows for locations outside US
-    df = df[df["state"] != "United States"]
     df = df[keep_columns + ["timestamp", "state"]].set_index("timestamp")
 
     # NCHS considers NYC as an individual state, however, we want it included
@@ -124,6 +122,11 @@
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
+    # Map state to geo_id, but set dropna=False as we also have national data
+    df = gmpr.add_population_column(df, "state_name",
+                                    geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id",
+                          from_col="state", new_col="geo_id", dropna=False)
+    # Manually set geo_id for national data
+    df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]
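The reason for `dropna=False` in the change above: "United States" has no state geocode, so a dropping merge would silently lose the national rows before they could be tagged. A minimal pandas-only sketch of the pattern, with toy data and a hypothetical stand-in mapping in place of delphi_utils.GeoMapper:

```python
import pandas as pd

df = pd.DataFrame({
    "state": ["Pennsylvania", "United States"],
    "covid_deaths": [10, 500],
})

# Hypothetical stand-in for GeoMapper's state-name -> state-id lookup;
# "United States" is absent from it, so it maps to NaN instead of being dropped.
df["geo_id"] = df["state"].map({"Pennsylvania": "pa"})

# Manually set geo_id for national data, as in the diff above.
df.loc[df["state"] == "United States", "geo_id"] = "us"
print(df["geo_id"].tolist())  # ['pa', 'us']
```

With the old `dropna` default, the second row would have been discarded during the geocode join and no national signal could be produced downstream.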
71 changes: 37 additions & 34 deletions nchs_mortality/delphi_nchs_mortality/run.py
Expand Up @@ -13,7 +13,7 @@

from .archive_diffs import arch_diffs
from .constants import (METRICS, SENSOR_NAME_MAP,
SENSORS, INCIDENCE_BASE, GEO_RES)
SENSORS, INCIDENCE_BASE)
from .pull import pull_nchs_mortality_data


@@ -72,51 +72,54 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            df = add_nancodes(df)
-            # df = df[~df["val"].isnull()]
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res=GEO_RES,
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
-        else:
-            for sensor in SENSORS:
+        for geo in ["state", "nation"]:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric,
-                            sensor = sensor)
+                            metric=metric, geo_level=geo)
                 df = df_pull.copy()
-                if sensor == "num":
-                    df["val"] = df[metric]
+                if geo == "nation":
+                    df = df[df["geo_id"] == "us"]
                 else:
-                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df = df[df["geo_id"] != "us"]
+                df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
                 df = add_nancodes(df)
-                # df = df[~df["val"].isnull()]
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                 dates = create_export_csv(
                     df,
-                    geo_res=GEO_RES,
+                    geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
+                    sensor=SENSOR_NAME_MAP[metric],
                     weekly_dates=True
                 )
                 if len(dates) > 0:
                     stats.append((max(dates), len(dates)))
+            else:
+                for sensor in SENSORS:
+                    logger.info("Generating signal and exporting to CSV",
+                                metric=metric, sensor=sensor, geo_level=geo)
+                    df = df_pull.copy()
+                    if geo == "nation":
+                        df = df[df["geo_id"] == "us"]
+                    else:
+                        df = df[df["geo_id"] != "us"]
+                    if sensor == "num":
+                        df["val"] = df[metric]
+                    else:
+                        df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df["se"] = np.nan
+                    df["sample_size"] = np.nan
+                    df = add_nancodes(df)
+                    sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                    dates = create_export_csv(
+                        df,
+                        geo_res=geo,
+                        export_dir=daily_export_dir,
+                        start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                        sensor=sensor_name,
+                        weekly_dates=True
+                    )
+                    if len(dates) > 0:
+                        stats.append((max(dates), len(dates)))
 
     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility
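The restructured run loop can be sketched end to end with toy data: each metric is now exported once per geo level, national rows are selected by the `geo_id == "us"` tag set in pull.py, and prop signals are scaled to a per-100k rate via INCIDENCE_BASE. The populations and death counts below are made up for illustration:

```python
import pandas as pd

INCIDENCE_BASE = 100000  # per-100k scaling, as in constants.py

df_pull = pd.DataFrame({
    "geo_id": ["pa", "ny", "us"],  # "us" rows were tagged in pull.py
    "covid_deaths": [20, 30, 500],
    "population": [13_000_000, 19_500_000, 330_000_000],  # toy values
})

exports = {}
for geo in ["state", "nation"]:
    df = df_pull.copy()
    # nation keeps only the national row; state drops it
    df = df[df["geo_id"] == "us"] if geo == "nation" else df[df["geo_id"] != "us"]
    for sensor in ["num", "prop"]:
        out = df.copy()
        if sensor == "num":
            out["val"] = out["covid_deaths"]
        else:
            out["val"] = out["covid_deaths"] / out["population"] * INCIDENCE_BASE
        exports[(geo, sensor)] = out["val"].round(3).tolist()

print(exports[("nation", "num")])   # [500]
print(exports[("state", "prop")])   # [0.154, 0.154]
```

Because state and nation rows are split before export, the same metric never mixes geographies in one CSV, which is what the new `weekly_*_nation_*` test files below check for.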
38 changes: 20 additions & 18 deletions nchs_mortality/tests/test_run.py
@@ -19,6 +19,7 @@ def test_output_files_exist(self, run_as_module, date):
         for output_folder in folders:
             csv_files = listdir(output_folder)
 
+        geos = ["nation", "state"]
         dates = [
             "202030",
             "202031",
@@ -38,15 +39,14 @@
         sensors = ["num", "prop"]
 
         expected_files = []
-        for d in dates:
-            for metric in metrics:
-                if metric == "deaths_percent_of_expected":
-                    expected_files += ["weekly_" + d + "_state_" \
-                        + metric + ".csv"]
-                else:
-                    for sensor in sensors:
-                        expected_files += ["weekly_" + d + "_state_" \
-                            + metric + "_" + sensor + ".csv"]
+        for geo in geos:
+            for d in dates:
+                for metric in metrics:
+                    if metric == "deaths_percent_of_expected":
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                    else:
+                        for sensor in sensors:
+                            expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]
         assert set(expected_files).issubset(set(csv_files))
 
     # the 14th was a Monday
@@ -58,12 +58,14 @@ def test_output_file_format(self, run_as_module, date):
         if is_mon_or_thurs:
             folders.append("receiving")
 
-        for output_folder in folders:
-            df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
-            )
-            expected_columns = [
-                "geo_id", "val", "se", "sample_size",
-                "missing_val", "missing_se", "missing_sample_size"
-            ]
-            assert (df.columns.values == expected_columns).all()
+        geos = ["nation", "state"]
+        for geo in geos:
+            for output_folder in folders:
+                df = pd.read_csv(
+                    join(output_folder, f"weekly_202026_{geo}_deaths_covid_incidence_prop.csv")
+                )
+                expected_columns = [
+                    "geo_id", "val", "se", "sample_size",
+                    "missing_val", "missing_se", "missing_sample_size"
+                ]
+                assert (df.columns.values == expected_columns).all()
2 changes: 1 addition & 1 deletion nchs_mortality/version.cfg
@@ -1 +1 @@
-current_version = 0.3.49
+current_version = 0.3.50