Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rvdss indicator/data source #1542

Draft
wants to merge 33 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
62b9070
Create rvdss_historic.py
cchuong Sep 13, 2024
073aac9
Create rvdss_update.py
cchuong Sep 13, 2024
01af95f
create utils.py for common functions
cchuong Sep 14, 2024
6a002e0
create constants.py and update utils
cchuong Sep 16, 2024
714455c
Update rvdss_historic.py
cchuong Sep 16, 2024
6ee8bb7
Update rvdss_update.py
cchuong Sep 16, 2024
8814554
fix typo and add missing abbreviation to constants
cchuong Sep 17, 2024
d7905c8
fix typo
cchuong Sep 18, 2024
08f908a
add missing geo
cchuong Sep 18, 2024
07ed998
Update constants.py
cchuong Sep 18, 2024
fd5bf15
Revert "Update constants.py"
cchuong Sep 18, 2024
678b468
Revert "add missing geo"
cchuong Sep 18, 2024
4bfc933
fix geo and virus abbreviation
cchuong Sep 20, 2024
e8957c3
remove "province of" from geo_values
cchuong Sep 20, 2024
7720a24
construct urls automatically
nmdefries Sep 23, 2024
59f79bf
comment constants
nmdefries Sep 23, 2024
e70b0e9
note historic urls don't need to be updated
nmdefries Sep 23, 2024
72d1906
be stricter about importing local fns
nmdefries Sep 23, 2024
bf51bd3
move dashboard file names to constants
nmdefries Sep 23, 2024
ee3cadf
move run-the-whole-pipeline code into main()
nmdefries Sep 23, 2024
180e67f
add code to calculate number of positive tests back in
cchuong Sep 24, 2024
6bd6e24
update abbreviate_geo to remove periods and other spelling
cchuong Sep 24, 2024
a7666b8
fix lab name missing province
cchuong Sep 24, 2024
503165e
comment historic script
nmdefries Sep 24, 2024
256e697
move output file names to constants
nmdefries Sep 24, 2024
cd83087
replace boolean comparisons with pythonic "not"
nmdefries Sep 24, 2024
969295b
actually put csv names in constants
nmdefries Sep 25, 2024
00f3f9a
break more helper functions and add doctsrings
cchuong Oct 2, 2024
ecca542
add more comments
cchuong Oct 4, 2024
31ec961
calculate update dates in a new function
cchuong Oct 10, 2024
0be5f08
combine different spellings of labs
cchuong Oct 13, 2024
5696636
change slash to underscore in constants and move more processing code…
cchuong Oct 13, 2024
30f3df6
rvdss interface and new fn layout so current/historical data can be e…
nmdefries Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions src/acquisition/rvdss/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from datetime import datetime

# The source data refers to the same virus, province, region (group of
# provinces), or country under several different names. The lookup tables
# below collapse each spelling onto one common abbreviation.
#
# Keys are lower-case spellings seen in the data; values are the abbreviation
# used downstream. Insertion order is kept as originally written in case any
# consumer iterates the mapping in order.
VIRUSES = {
    # human parainfluenza virus
    "parainfluenza": "hpiv",
    "piv": "hpiv",
    "para": "hpiv",
    # adenovirus
    "adenovirus": "adv",
    "adeno": "adv",
    # human metapneumovirus
    "human metapneumovirus": "hmpv",
    # enterovirus / rhinovirus
    "enterovirus_rhinovirus": "evrv",
    "rhinovirus": "evrv",
    "rhv": "evrv",
    "entero_rhino": "evrv",
    "rhino": "evrv",
    "ev_rv": "evrv",
    # human coronavirus
    "coronavirus": "hcov",
    "coron": "hcov",
    "coro": "hcov",
    # respiratory syncytial virus
    "respiratory syncytial virus": "rsv",
    # influenza
    "influenza": "flu",
    # SARS-CoV-2
    "sars-cov-2": "sarscov2",
}

# Maps the geography spellings found in the data onto one canonical value.
# Provinces, territories, and the country map to two-letter codes; short
# region codes map to the full region name; the final entry normalizes an
# alternate spelling of a lab name.
GEOS = {
    # provinces and territories
    "newfoundland": "nl",
    "newfoundland and labrador": "nl",
    "prince edward island": "pe",
    "nova scotia": "ns",
    "new brunswick": "nb",
    "québec": "qc",
    "quebec": "qc",
    "ontario": "on",
    "manitoba": "mb",
    "saskatchewan": "sk",
    "alberta": "ab",
    "british columbia": "bc",
    "yukon": "yt",
    "northwest territories": "nt",
    "nunavut": "nu",
    # country
    "canada": "ca",
    "can": "ca",
    # regions (short code -> full region name)
    "at": "atlantic",
    "atl": "atlantic",
    "pr": "prairies",
    "terr": "territories",
    # lab name spelling
    "uhn sinai hospital": "uhn mount sinai hospital",
}

# Regions are groups of provinces that are geographically close together.
# Some single provinces are reported as their own region (e.g. Québec,
# Ontario). Each row below lists the spellings of one region.
REGIONS = [
    "atlantic", "atl", "at",
    "province of québec", "québec", "qc",
    "province of ontario", "ontario", "on",
    "prairies", "pr",
    "british columbia", "bc",
    "territories", "terr",
]

# Spellings used for the national level.
NATION = ["canada", "can", "ca"]

# Construct dashboard and data report URLs.
DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"

# May not need this since we write a function for this in pull_historic
#
# Stored as a tuple, not a generator expression: a module-level generator is
# exhausted after a single pass, so any second iteration would silently
# produce no URLs.
DASHBOARD_BASE_URLS_2023_2024_SEASON = tuple(
    DASHBOARD_W_DATE_URL.format(date=date)
    for date in (
        "2024-06-20",
        "2024-06-27",
        "2024-07-04",
        "2024-07-11",
        "2024-07-18",
        "2024-08-01",
        "2024-08-08",
        "2024-08-15",
        "2024-08-22",
        "2024-08-29",
        "2024-09-05",
    )
)

SEASON_BASE_URL = "https://www.canada.ca"
ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"
DASHBOARD_ARCHIVED_DATES_URL = "https://health-infobase.canada.ca/src/js/respiratory-virus-detections/ArchiveData.json"

# Each URL created here points to a list of all data reports made during that
# season, e.g.
# https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html.
# The Public Health Agency of Canada site switched in 2024 to reporting
# disease data in a dashboard with a static URL. Therefore, this collection
# of URLs does _NOT_ need to be updated. It is used for fetching historical
# data (for dates on or before June 8, 2024) only.
#
# Materialized as a tuple so the collection can be iterated more than once.
HISTORIC_SEASON_URLS = tuple(
    HISTORIC_SEASON_REPORTS_URL.format(year_range=year_range)
    for year_range in (
        "2013-2014",
        "2014-2015",
        "2015-2016",
        "2016-2017",
        "2017-2018",
        "2018-2019",
        "2019-2020",
        "2020-2021",
        "2021-2022",
        "2022-2023",
        "2023-2024",
    )
)

# Names of the CSV files served by the dashboard.
DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"

# Names of the CSV files this pipeline writes.
RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv"
POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
COUNTS_OUTPUT_FILE = "number_of_detections.csv"

# Name of the text file that records update dates.
UPDATE_DATES_FILE = "update_dates.txt"

# NOTE(review): presumably the week number at which a reporting season
# begins -- confirm against the code that consumes it.
FIRST_WEEK_OF_YEAR = 35

# Timestamp captured once, when this module is first imported (naive local time).
NOW = datetime.now()
121 changes: 121 additions & 0 deletions src/acquisition/rvdss/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
===============
=== Purpose ===
===============

Stores RVDSS data published by the Public Health Agency of Canada, which contains respiratory virus lab test results.
See: rvdss.py


=======================
=== Data Dictionary ===
=======================

`rvdss` is the table where rvdss data is stored.
+----------+-------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+----------+-------------+------+-----+---------+----------------+
| id | int(11) | NO | PRI | NULL | auto_increment |
| location | varchar(8) | NO | MUL | NULL | |
| epiweek | int(11) | NO | MUL | NULL | |
| value | float | NO | | NULL | |
+----------+-------------+------+-----+---------+----------------+
id: unique identifier for each record
location: hhs1-10
epiweek: the epiweek during which the queries were executed
value: number of total test records per facility, within each epiweek

=================
=== Changelog ===
=================
2017-12-14:
* add "need update" check

2017-12-02:
* original version
"""

# standard library
import argparse

# third party
import mysql.connector

# first party
from delphi.epidata.acquisition.rvdss import rvdss
import delphi.operations.secrets as secrets
from delphi.utils.epidate import EpiDate
import delphi.utils.epiweek as flu
from delphi.utils.geo.locations import Locations

LOCATIONS = Locations.hhs_list
DATAPATH = "/home/automation/rvdss_data"


def update(locations, first=None, last=None, force_update=False, load_email=True):
    """Load prepared rvdss measurements and upsert them into the `rvdss` table.

    Args:
        locations: iterable of location keys to store. Locations that are not
            present in the prepared time series are skipped.
        first: optional first epiweek. Passed to `rvdss.measurement_to_ts` as
            `startweek`, and reported instead of the computed lower bound.
        last: optional last epiweek. Passed to `rvdss.measurement_to_ts` as
            `endweek`, and reported instead of the computed upper bound.
        force_update: if true, run even when the data object reports that no
            update is needed.
        load_email: forwarded to `rvdss.rvdssData`.

    Returns:
        None. Progress is reported with `print`.

    The database connection is always closed, including when a query or
    insert raises. Changes are committed only when every statement succeeded.
    """
    # download and prepare data first
    qd = rvdss.rvdssData(DATAPATH, load_email)
    if not qd.need_update and not force_update:
        print("Data not updated, nothing needs change.")
        return

    qd_data = qd.load_csv()
    qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4)
    qd_ts = rvdss.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last)

    # connect to the database
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database="epidata")
    try:
        cur = cnx.cursor()

        def get_num_rows():
            cur.execute("SELECT count(1) `num` FROM `rvdss`")
            # Drain the cursor; the aggregate query yields a single row.
            for (num,) in cur:
                pass
            return num

        # check from 4 weeks preceding the last week with data through this week
        cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `rvdss`")
        for (ew0, ew1) in cur:
            # An empty table gives max(epiweek) = NULL; fall back to 200401.
            ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
        ew0 = ew0 if first is None else first
        ew1 = ew1 if last is None else last
        print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...")

        # keep track of how many rows were added
        rows_before = get_num_rows()

        # check rvdss for new and/or revised data
        sql = """
            INSERT INTO
                `rvdss` (`location`, `epiweek`, `value`)
            VALUES
                (%s, %s, %s)
            ON DUPLICATE KEY UPDATE
                `value` = %s
        """

        total_rows = 0

        for location in locations:
            if location not in qd_ts:
                continue
            ews = sorted(qd_ts[location].keys())
            num_missing = 0
            for ew in ews:
                v = qd_ts[location][ew]
                # The value appears twice: once for INSERT, once for UPDATE.
                sql_data = (location, ew, v, v)
                cur.execute(sql, sql_data)
                total_rows += 1
                # A value of zero is counted as missing in the report below.
                if v == 0:
                    num_missing += 1
            if num_missing > 0:
                print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)")

        # keep track of how many rows were added
        rows_after = get_num_rows()
        print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)")

        # cleanup: commit only after every statement succeeded
        cur.close()
        cnx.commit()
    finally:
        # Always release the connection; on failure nothing has been committed.
        cnx.close()
Loading
Loading