Merge pull request #1349 from cmu-delphi/release/delphi-epidata-4.1.14

Release Delphi Epidata 4.1.14

melange396 authored Nov 15, 2023
2 parents 6da4b20 + 0424631 commit 1488884
Showing 29 changed files with 287 additions and 181 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
- current_version = 4.1.13
+ current_version = 4.1.14
commit = False
tag = False

3 changes: 3 additions & 0 deletions .env.example
@@ -4,3 +4,6 @@ FLASK_SECRET=abc
#API_KEY_REQUIRED_STARTING_AT=2021-07-30
API_KEY_ADMIN_PASSWORD=abc
API_KEY_REGISTER_WEBHOOK_TOKEN=abc
+
+ # Sentry
+ # If setting a Sentry DSN, note that the URL should NOT be quoted!
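For illustration, the resulting entry might look like the following; the variable name SENTRY_DSN and the DSN value are placeholders for this sketch, not taken from the diff:

# hypothetical .env entry -- note the DSN value is NOT quoted
SENTRY_DSN=https://examplePublicKey@o0.ingest.sentry.io/0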
8 changes: 5 additions & 3 deletions .github/workflows/performance-tests-one-time.yml
@@ -1,9 +1,9 @@
- name: One-time performance testing - 26th October 2023
+ name: One-time performance testing - 8th November 2023

- # Run "At every 30th minute on day-of-month 26 in October"
+ # Run "At every 30th minute on day-of-month 8 in November"
on:
schedule:
-   - cron: '*/30 * 26 10 *'
+   - cron: '*/30 * 8 11 *'

# Add some extra perms to comment on a PR
permissions:
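For reference, the five crontab fields in the new expression read:

*/30 * 8 11 *
│    │ │ │  └─ day-of-week: any
│    │ │ └──── month: 11 (November)
│    │ └────── day-of-month: 8
│    └──────── hour: any
└───────────── minute: every 30th minute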
@@ -65,6 +65,8 @@ jobs:
path: delphi-admin
- name: Build & run Locust
continue-on-error: true # sometimes ~2-5 queries fail, we shouldn't end the run if that's the case
+   env:
+     PERFTEST_API_KEY: ${{secrets.PERFTEST_API_KEY}}
run: |
cd delphi-admin/load-testing/locust
docker build -t locust .
2 changes: 2 additions & 0 deletions .github/workflows/performance-tests.yml
@@ -73,6 +73,8 @@ jobs:
path: delphi-admin
- name: Build & run Locust
continue-on-error: true # sometimes ~2-5 queries fail, we shouldn't end the run if that's the case
+   env:
+     PERFTEST_API_KEY: ${{secrets.PERFTEST_API_KEY}}
run: |
cd delphi-admin/load-testing/locust
docker build -t locust .
3 changes: 3 additions & 0 deletions dev/local/Makefile
@@ -77,6 +77,7 @@ LOG_REDIS:=delphi_redis_instance_$(NOW).log
WEB_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_web_epidata')
DATABASE_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_database_epidata')
REDIS_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_redis')
+ ENV_FILE:=repos/delphi/delphi-epidata/.env

M1=
ifeq ($(shell uname -smp), Darwin arm64 arm)
@@ -104,8 +105,10 @@ web:
@# Run the web server
@# MODULE_NAME specifies the location of the `app` variable, the actual WSGI application object to run.
@# see https://github.com/tiangolo/meinheld-gunicorn-docker#module_name
+ @touch $(ENV_FILE)
@docker run --rm -p 127.0.0.1:10080:80 \
$(M1) \
+ --env-file $(ENV_FILE) \
--env "MODULE_NAME=delphi.epidata.server.main" \
--env "SQLALCHEMY_DATABASE_URI=$(sqlalchemy_uri)" \
--env "FLASK_SECRET=abc" --env "FLASK_PREFIX=/epidata" --env "LOG_DEBUG" \
2 changes: 1 addition & 1 deletion dev/local/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = Delphi Development
- version = 4.1.13
+ version = 4.1.14

[options]
packages =
4 changes: 1 addition & 3 deletions devops/Dockerfile
@@ -7,7 +7,6 @@ FROM tiangolo/meinheld-gunicorn:python3.8
LABEL org.opencontainers.image.source=https://github.com/cmu-delphi/delphi-epidata

COPY ./devops/gunicorn_conf.py /app
- COPY ./devops/start_wrapper.sh /
RUN mkdir -p /app/delphi/epidata
COPY ./src/server /app/delphi/epidata/server
COPY ./src/common /app/delphi/epidata/common
@@ -18,7 +17,6 @@ COPY requirements.api.txt /app/requirements_also.txt
RUN ln -s -f /usr/share/zoneinfo/America/New_York /etc/localtime \
&& rm -rf /app/delphi/epidata/__pycache__ \
&& chmod -R o+r /app/delphi/epidata \
&& chmod 755 /start_wrapper.sh \
&& pip install --no-cache-dir -r /tmp/requirements.txt -r requirements_also.txt
# the file /tmp/requirements.txt is created in the parent docker definition. (see:
# https://github.com/tiangolo/meinheld-gunicorn-docker/blob/master/docker-images/python3.8.dockerfile#L5 )
@@ -28,4 +26,4 @@ RUN ln -s -f /usr/share/zoneinfo/America/New_York /etc/localtime \
ENV PYTHONUNBUFFERED 1

ENTRYPOINT [ "/entrypoint.sh" ]
CMD [ "/start_wrapper.sh" ]
CMD [ "/start.sh" ]
10 changes: 0 additions & 10 deletions devops/start_wrapper.sh

This file was deleted.

10 changes: 10 additions & 0 deletions docs/epidata_development.md
@@ -388,3 +388,13 @@ The command above maps two local directories into the container:
- `/repos/delphi/delphi-epidata/src`: Just the source code, which forms the
container's `delphi.epidata` python package.

+ ## instrumentation with Sentry
+
+ Delphi uses [Sentry](https://sentry.io/welcome/) in production for debugging, APM, and other observability purposes. You can instrument your local environment if you want to take advantage of Sentry's features during the development process. In most cases this option is available to internal Delphi team members only.
+
+ The bare minimum to set up instrumentation is to supply the DSN for the [epidata-api](https://cmu-delphi.sentry.io/projects/epidata-api/?project=4506123377442816) Sentry project to the application environment.
+
+ - You can get the DSN from the Sentry [project's keys config](https://cmu-delphi.sentry.io/settings/projects/epidata-api/keys/), or by asking someone in the prodsys, DevOps, or sysadmin space.
+ - Once you have the DSN, add it to your local `.env` file and rebuild your containers to start sending telemetry to Sentry.
+
+ Additional internal documentation for Sentry can be found [here](https://bookstack.delphi.cmu.edu/books/systems-handbook/page/sentry).
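As a rough guide, `sentry-sdk[flask]` (added to requirements.api.txt in this release) is conventionally initialized as below. This is a minimal sketch under stated assumptions, not the server's actual wiring; the SENTRY_DSN variable name is an assumption:

import os

import sentry_sdk
from flask import Flask
from sentry_sdk.integrations.flask import FlaskIntegration

# only instrument when a DSN is supplied via the environment
dsn = os.environ.get("SENTRY_DSN")
if dsn:
    sentry_sdk.init(dsn=dsn, integrations=[FlaskIntegration()])

app = Flask(__name__)  # requests to this app now report errors/traces when dsn is set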
5 changes: 4 additions & 1 deletion docs/symptom-survey/publications.md
@@ -26,14 +26,17 @@ Pandemic"](https://www.pnas.org/topic/548) in *PNAS*:

Research publications using the survey data include:

+ - W. Dempsey (2023). [Addressing selection bias and measurement error in
+   COVID-19 case count data using auxiliary information](https://doi.org/10.1214/23-AOAS1744).
+   *Annals of Applied Statistics* 17 (4), 2903-2923.
- Ma, M.Z., Chen, S.X. (2023). [Beyond the surface: accounting for confounders
in understanding the link between collectivism and COVID-19 pandemic in the
United States](https://doi.org/10.1186/s12889-023-16384-2). *BMC Public
Health* 23, 1513.
- C.K. Ettman, E. Badillo Goicoechea, and E.A. Stuart (2023). [Evolution of
depression and anxiety over the COVID-19 pandemic and across demographic
groups in a large sample of U.S. adults](https://doi.org/10.1016/j.focus.2023.100140).
-   *AJPM Focus*.
+   *AJPM Focus* 2 (4), 100140.
- M. Rubinstein, Z. Branson, and E.H. Kennedy (2023). [Heterogeneous
interventional effects with multiple mediators: Semiparametric and
nonparametric approaches](https://doi.org/10.1515/jci-2022-0070). *Journal of
168 changes: 114 additions & 54 deletions integrations/acquisition/covid_hosp/state_daily/test_scenarios.py
@@ -47,62 +47,122 @@ def setUp(self):
cur.execute('delete from api_user')
cur.execute('insert into api_user(api_key, email) values("key", "email")')

- @freeze_time("2021-03-16")
- def test_acquire_dataset(self):
-   """Acquire a new dataset."""
-
-   # make sure the data does not yet exist
-   with self.subTest(name='no data yet'):
-     response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
-     self.assertEqual(response['result'], -2, response)
-
-   # acquire sample data into local database
-   # mock out network calls to external hosts
-   with self.subTest(name='first acquisition'), \
-       patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-       patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv"),  # dataset for 3/13
-                                                           self.test_utils.load_sample_dataset("dataset0.csv"),  # first dataset for 3/15
-                                                           self.test_utils.load_sample_dataset()]  # second dataset for 3/15
-       ) as mock_fetch:
-     acquired = Update.run()
-     self.assertTrue(acquired)
-     self.assertEqual(mock_fetch_meta.call_count, 1)
-
-   # make sure the data now exists
-   with self.subTest(name='initial data checks'):
-     response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
-     self.assertEqual(response['result'], 1)
-     self.assertEqual(len(response['epidata']), 1)
-     row = response['epidata'][0]
-     self.assertEqual(row['state'], 'WY')
-     self.assertEqual(row['date'], 20201209)
-     self.assertEqual(row['issue'], 20210315)
-     self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
-     self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
-     actual = row['inpatient_bed_covid_utilization']
-     expected = 0.11729857819905214
-     self.assertAlmostEqual(actual, expected)
-     self.assertIsNone(row['critical_staffing_shortage_today_no'])
-
-     # expect 61 fields per row (63 database columns, except `id` and `record_type`)
-     self.assertEqual(len(row), 118)
-
-   with self.subTest(name='all date batches acquired'):
-     response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
-     self.assertEqual(response['result'], 1)
-
-   # re-acquisition of the same dataset should be a no-op
-   with self.subTest(name='second acquisition'), \
-       patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-       patch.object(Network, 'fetch_dataset', return_value=self.test_utils.load_sample_dataset()) as mock_fetch:
-     acquired = Update.run()
-     self.assertFalse(acquired)
-
-   # make sure the data still exists
-   with self.subTest(name='final data checks'):
-     response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
-     self.assertEqual(response['result'], 1)
-     self.assertEqual(len(response['epidata']), 1)
+ def get_modified_dataset(self, critical_staffing_shortage_today_yes, reporting_cutoff_start):
+   """Get a simplified version of a test dataset.
+
+   Only WY data is modified. The issue date is specified in the metadata file.
+   """
+   df = self.test_utils.load_sample_dataset()
+   df_new = pd.DataFrame(df[df["state"] == "WY"], columns=df.columns).reset_index(drop=True)
+   df_new["critical_staffing_shortage_today_yes"] = critical_staffing_shortage_today_yes
+   df_new["reporting_cutoff_start"] = reporting_cutoff_start
+   return df_new
+
+ def test_acquire_dataset(self):
+   """Acquire a new dataset."""
+
+   with freeze_time("2021-03-15"):
+     # make sure the data does not yet exist
+     with self.subTest(name='no data yet'):
+       response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
+       self.assertEqual(response['result'], -2, response)
+
+     # acquire sample data into local database
+     # mock out network calls to external hosts
+     # issues: 3/13, 3/15
+     with self.subTest(name='first acquisition'), \
+         patch.object(Network, 'fetch_metadata',
+                      return_value=self.test_utils.load_sample_metadata("metadata.csv")) as mock_fetch_meta, \
+         patch.object(Network, 'fetch_dataset', side_effect=[
+           self.test_utils.load_sample_dataset(),
+           self.test_utils.load_sample_dataset()
+         ]) as mock_fetch:
+       acquired = Update.run()
+       self.assertTrue(acquired)
+       self.assertEqual(mock_fetch_meta.call_count, 1)
+
+     # make sure the data now exists
+     with self.subTest(name='initial data checks'):
+       response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+       self.assertEqual(response['result'], 1)
+       self.assertEqual(len(response['epidata']), 1)
+       row = response['epidata'][0]
+       self.assertEqual(row['state'], 'WY')
+       self.assertEqual(row['date'], 20201209)
+       self.assertEqual(row['issue'], 20210315)  # include today's data by default
+       self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
+       self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+       self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+       # expect 61 fields per row (63 database columns, except `id` and `record_type`)
+       self.assertEqual(len(row), 118)
+
+     with self.subTest(name='all date batches acquired'):
+       response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
+       self.assertEqual(response['result'], 1)
+
+     # re-acquisition of the same dataset should be a no-op
+     # issues: 3/13, 3/15
+     with self.subTest(name='second acquisition'), \
+         patch.object(Network, 'fetch_metadata',
+                      return_value=self.test_utils.load_sample_metadata("metadata.csv")) as mock_fetch_meta, \
+         patch.object(Network, 'fetch_dataset', side_effect=[
+           self.test_utils.load_sample_dataset(),
+           self.test_utils.load_sample_dataset()
+         ]) as mock_fetch:
+       acquired = Update.run()
+       self.assertFalse(acquired)
+
+       # make sure the data still exists
+       response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+       self.assertEqual(response['result'], 1)
+       self.assertEqual(len(response['epidata']), 1)
+
+   with freeze_time("2021-03-16"):
+     # simulate issue posted after yesterday's run
+     with self.subTest(name='late issue posted'), \
+         patch.object(Network, 'fetch_metadata',
+                      return_value=self.test_utils.load_sample_metadata("metadata2.csv")) as mock_fetch_meta, \
+         patch.object(Network, 'fetch_dataset', side_effect=[
+           self.get_modified_dataset(critical_staffing_shortage_today_yes = 9, reporting_cutoff_start="2020-12-09"),
+           self.get_modified_dataset(critical_staffing_shortage_today_yes = 10, reporting_cutoff_start="2020-12-09"),
+           self.get_modified_dataset(critical_staffing_shortage_today_yes = 11, reporting_cutoff_start="2020-12-10"),
+           self.get_modified_dataset(critical_staffing_shortage_today_yes = 12, reporting_cutoff_start="2020-12-10"),
+         ]) as mock_fetch:
+       acquired = Update.run()
+       self.assertTrue(acquired)
+       self.assertEqual(mock_fetch_meta.call_count, 1)
+
+     # make sure everything was filed correctly
+     with self.subTest(name='late issue data checks'):
+       response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+       self.assertEqual(response['result'], 1)
+       self.assertEqual(len(response['epidata']), 2)
+
+       # should have data from 03-15 00:00:01AM
+       row = response['epidata'][0]
+       self.assertEqual(row['state'], 'WY')
+       self.assertEqual(row['date'], 20201209)
+       self.assertEqual(row['issue'], 20210315)  # include today's data by default
+       self.assertEqual(row['critical_staffing_shortage_today_yes'], 10)
+       self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+       self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+       # should have data from 03-16 00:00:01AM
+       row = response['epidata'][1]
+       self.assertEqual(row['state'], 'WY')
+       self.assertEqual(row['date'], 20201210)
+       self.assertEqual(row['issue'], 20210316)  # include today's data by default
+       self.assertEqual(row['critical_staffing_shortage_today_yes'], 12)
+       self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+       self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+       # expect 61 fields per row (63 database columns, except `id` and `record_type`)
+       self.assertEqual(len(row), 118)
+
+     with self.subTest(name='all date batches acquired'):
+       response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210316)
+       self.assertEqual(response['result'], 1)


@freeze_time("2021-03-16")
@@ -121,7 +181,7 @@ def test_acquire_specific_issue(self):
self.assertEqual(pre_max_issue, pd.Timestamp('1900-01-01 00:00:00'))
with self.subTest(name='first acquisition'), \
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-     patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv")]
+     patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset()]
) as mock_fetch:
acquired = Utils.update_dataset(Database,
Network,
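The restructured test above leans on freezegun's context-manager form to play through two acquisition "days" inside one test. A minimal sketch of the pattern, independent of this codebase:

from datetime import date

from freezegun import freeze_time

with freeze_time("2021-03-15"):
    assert date.today() == date(2021, 3, 15)  # first day: initial acquisition
with freeze_time("2021-03-16"):
    assert date.today() == date(2021, 3, 16)  # next day: late-arriving issue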
4 changes: 2 additions & 2 deletions requirements.api.txt
@@ -5,16 +5,16 @@ Flask-Limiter==3.3.0
jinja2==3.0.3
more_itertools==8.4.0
mysqlclient==2.1.1
- newrelic
orjson==3.4.7
pandas==1.2.3
python-dotenv==0.15.0
pyyaml
redis==3.5.3
requests==2.31.0
scipy==1.10.0
+ sentry-sdk[flask]
SQLAlchemy==1.4.40
structlog==22.1.0
tenacity==7.0.0
typing-extensions
- werkzeug==2.2.3
+ werkzeug==2.3.8
2 changes: 1 addition & 1 deletion requirements.dev.txt
@@ -1,4 +1,4 @@
- aiohttp==3.8.5
+ aiohttp==3.8.6
black>=20.8b1
bump2version==1.0.1
covidcast==0.1.5
8 changes: 6 additions & 2 deletions src/acquisition/covid_hosp/common/database.py
@@ -186,9 +186,13 @@ def nan_safe_dtype(dtype, value):

num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields)
value_placeholders = ', '.join(['%s'] * num_columns)
- columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields)
+ col_names = [f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields]
+ columns = ', '.join(col_names)
+ updates = ', '.join(f'{c}=new_values.{c}' for c in col_names)
+ # NOTE: list in `updates` presumes `publication_col_name` is part of the unique key and thus not needed in UPDATE
  sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \
-       f'VALUES ({value_placeholders})'
+       f'VALUES ({value_placeholders}) AS new_values ' \
+       f'ON DUPLICATE KEY UPDATE {updates}'
id_and_publication_date = (0, publication_date)
if logger:
logger.info('updating values', count=len(dataframe.index))
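The rewritten statement turns the plain INSERT into a MySQL upsert using the row-alias form (`VALUES (...) AS new_values`), which requires MySQL 8.0.19 or newer. A hypothetical rendering of the generated SQL, with invented table and column names:

-- sketch only: real names come from self.table_name and each field's sql_name
INSERT INTO `covid_hosp_demo` (`id`, `issue`, `state`, `beds_used`)
VALUES (%s, %s, %s, %s) AS new_values
ON DUPLICATE KEY UPDATE `state`=new_values.`state`, `beds_used`=new_values.`beds_used`

Note that `id` and the publication-date column are absent from the UPDATE list, matching the NOTE in the diff: they are assumed to be covered by the table's unique key.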