From 62b9070bd815244b09aaa74ce89a4c96ff1db0ef Mon Sep 17 00:00:00 2001 From: cchuong Date: Thu, 12 Sep 2024 21:21:20 -0700 Subject: [PATCH 01/33] Create rvdss_historic.py --- src/acquisition/rvdss/rvdss_historic.py | 652 ++++++++++++++++++++++++ 1 file changed, 652 insertions(+) create mode 100644 src/acquisition/rvdss/rvdss_historic.py diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py new file mode 100644 index 000000000..b50385f04 --- /dev/null +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -0,0 +1,652 @@ +from bs4 import BeautifulSoup +import requests +import regex as re +import pandas as pd +from epiweeks import Week +from datetime import datetime,timedelta +import math +import io + + #%% Functions + + # Report Functions +def get_report_season(soup): + # Find the url in the page html and get the season + canonical_url = str(soup.find_all('link',rel="canonical")) + matches = re.search("20[0-9]{2}-20[0-9]{2}",canonical_url) + + if matches: + season = matches.group(0) + years=season.split("-") + return(years) + +def append_urls(urls): + # Add https to the urls + for i in range(len(urls)): + temp_url = urls[i] + + http_present = re.search("http",temp_url) + if not http_present: + urls[i]="https://www.canada.ca"+temp_url + return(urls) + +def report_urls(soup): + # Get links for individual weeks + year= "-".join(get_report_season(soup)) + links=soup.find_all('a') + alternative_url = "http://www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year + + urls = [link.get("href") for link in links if "ending" in str(link) or + alternative_url in str(link)] + + report_links = append_urls(urls) + return(report_links) + +def report_weeks(soup): + links=soup.find_all('a') + full_weeks = [link.text for link in links if "Week" in str(link)] + weeks= [int(re.search('Week (.+?) 
', week).group(1)) for week in full_weeks] + return(weeks) + +def get_report_date(week,start_year,epi=False): + if week < 35: + year=int(start_year)+1 + else: + year=int(start_year) + + epi_week = Week(year, week) + + if epi==False: + report_date = str(epi_week.enddate()) + else: + report_date = str(epi_week) + + return(report_date) + +def abbreviate_virus(full_name): + lowercase=full_name.lower() + + if any(name in lowercase for name in ["parainfluenza","para","piv"]): + if "hpiv" not in lowercase: + abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) + else: + abbrev = lowercase + elif any(name in lowercase for name in ["adenovirus","adeno"]): + abbrev = re.sub("adenovirus|adeno","adv",lowercase) + elif "human metapneumovirus" in lowercase: + abbrev = re.sub("human metapneumovirus","hmpv",lowercase) + elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): + abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) + elif any(name in lowercase for name in ["coronavirus","coron","coro"]): + abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) + elif "respiratory syncytial virus" in lowercase: + abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) + elif "influenza" in lowercase: + abbrev = re.sub("influenza","flu",lowercase) + elif "sarscov2" in lowercase: + abbrev = re.sub("sarscov2","sars-cov-2",lowercase) + else: + abbrev=lowercase + return(abbrev) + +def abbreviate_geo(full_name): + lowercase=full_name.lower() + + if "newfoundland" in lowercase: + abbrev = "nl" + elif "prince edward island" in lowercase: + abbrev = "pe" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif "new brunswick" in lowercase: + abbrev = "nb" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): + abbrev = "qc" + elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): + abbrev = "on" + elif "manitoba" in lowercase: + abbrev = "mb" + elif "saskatchewan" in lowercase: + abbrev = "sk" + elif "alberta" in lowercase: + abbrev = "ab" + elif "british columbia" in lowercase: + abbrev = "bc" + elif "yukon" in lowercase: + abbrev = "yk" + elif "northwest territories" in lowercase: + abbrev = "nt" + elif "nunavut" in lowercase: + abbrev = "nu" + elif re.match("canada|can",lowercase): + abbrev = "ca" + elif re.match(r"^at\b",lowercase): + abbrev = "atlantic" + elif "pr" in lowercase: + abbrev = "prairies" + elif "terr" in lowercase: + abbrev = "territories" + else: + abbrev=lowercase + return(abbrev) + + +def get_table_captions(soup): + captions = soup.findAll('summary') + + table_identifiers = ["respiratory","number","positive","abbreviation"] + if sum([all(name not in cap.text.lower() for name in table_identifiers) for cap in captions]) != 0: + figcaptions = soup.findAll('figcaption') + captions = captions + figcaptions + + remove_list=[] + for i in range(len(captions)): + caption = captions[i] + + matches = ["period","abbreviation","cumulative", "compared"] #skip historic comparisons and cumulative tables + if any(x in caption.text.lower() for x in matches): + remove_list.append(caption) + + elif caption.has_attr('class'): + remove_list.append(caption) + + elif all(name not in caption.text.lower() for name in table_identifiers): + remove_list.append(caption) + + new_captions = [cap for cap in captions if cap not in remove_list] + new_captions = list(set(new_captions)) + + 
return(new_captions)
+
+def get_modified_dates(soup,week_end_date):
+    # Get the date the report page was modified
+    meta_tags=soup.find_all("meta",title="W3CDTF")
+    for tag in meta_tags:
+        if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
+            modified_date = tag.get("content", None)
+
+    mod_date = datetime.strptime(modified_date, "%Y-%m-%d")
+    week_date = datetime.strptime(week_end_date, "%Y-%m-%d")
+
+    diff_days = (mod_date-week_date).days
+
+    # Use the page's modified date only if it falls within two weeks after the
+    # week end date; otherwise assume a fixed 5-day publication lag
+    if diff_days > 0 and diff_days < 14:
+        new_modified_date = mod_date
+    else:
+        new_lag = timedelta(days=5)
+        new_modified_date = week_date + new_lag
+
+    new_modified_date_string = new_modified_date.strftime("%Y-%m-%d")
+
+    return(new_modified_date_string)
+
+def check_date_format(date_string):
+    # Coerce d/m/Y and d-m-Y dates into ISO Y-m-d
+    if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}",date_string):
+        if re.search(r"/",date_string):
+            new_date = re.sub(r"/","-",date_string)
+            new_date = datetime.strptime(new_date,"%d-%m-%Y").strftime("%Y-%m-%d")
+        elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}",date_string):
+            new_date = datetime.strptime(date_string,"%d-%m-%Y").strftime("%Y-%m-%d")
+        else:
+            raise AssertionError("Unrecognised date format")
+    else:
+        new_date=date_string
+
+    return(new_date)
+
+def check_duplicate_rows(table):
+    # If a week appears in more than one row, keep the row with more Canada tests
+    if table['week'].duplicated().any():
+        table.columns = [re.sub("canada","can",t) for t in table.columns]
+        duplicated_rows = table[table.duplicated('week',keep=False)]
+        grouped = duplicated_rows.groupby("week")
+        duplicates_drop = []
+
+        for name, group in grouped:
+            duplicates_drop.append(group['can tests'].idxmin())
+
+        new_table = table.drop(duplicates_drop).reset_index(drop=True)
+
+    else:
+        new_table=table
+    return(new_table)
+
+def create_geo_types(geo,default_geo):
+    regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on',
+               'prairies', 'pr', "british columbia", 'bc',"territories",'terr']
+    nation = ["canada","can",'ca']
+
+    if geo in nation:
+        geo_type="nation"
+    elif geo in regions:
+        geo_type="region"
+    else:
+        geo_type = default_geo
+    return(geo_type)
+
+def create_detections_table(table,modified_date,week_number,week_end_date,start_year):
+    lab_columns =[col for col in table.columns if 'reporting' in col][0]
+    table=table.rename(columns={lab_columns:"geo_value"})
+    table['geo_value']=table['geo_value'].str.lower()
+
+    pat1 = "positive"
+    pat2 = 'pos'
+    combined_pat = '|'.join((pat1, pat2))
+
+    pat3 = r"test\b"
+    pat4 = 'tested'
+    combined_pat2 = '|'.join((pat3, pat4))
+
+    pat5 = r"^ah3"
+    pat6 = r"^auns"
+    pat7 = r"^ah1pdm09"
+    pat8 = r"^ah1n1pdm09"
+    combined_pat3 = '|'.join((pat5, pat6,pat7,pat8))
+
+    table.columns=[re.sub(combined_pat, "positive_tests",col) for col in table.columns] # make naming consistent
+    table.columns=[re.sub(combined_pat2, "tests",col) for col in table.columns]
+    table.columns=[re.sub(combined_pat3, r"flu_\g<0>",col) for col in table.columns] # add flu as a prefix
+    table.columns=[re.sub("total ", "",col) for col in table.columns]
+    matches=['test','geo_value']
+    new_names = []
+    for i in range(len(table.columns)):
+        if not any(x in table.columns[i] for x in matches):
+            new_names.append(table.columns[i]+ " positive_tests")
+        else:
+            new_names.append(table.columns[i])
+
+    table.columns=new_names
+    table.columns=[re.sub("other hpiv", "hpiv other",col) for col in table.columns]
+    table['geo_value'] = [re.sub("^québec$","province of québec",name) for name in table['geo_value']]
+    table['geo_value'] = [re.sub("^ontario$","province of ontario",name) for name in table['geo_value']]
+
+    table['geo_value'] = [abbreviate_geo(g) for g in table['geo_value']]
+    geo_types = [create_geo_types(g,"lab") for g in table['geo_value']]
+
+    table = table.assign(**{'epiweek': get_report_date(week_number, start_year,epi=True),
+                            'time_value': week_end_date,
+                            'issue': modified_date,
+                            'geo_type':geo_types})
+
+    table.columns =[re.sub(" ","_",col) for col in table.columns]
+    return(table)
+
+def create_number_detections_table(table,modified_date,start_year):
+    week_columns = table.columns.get_indexer(table.columns[~table.columns.str.contains('week')])
+
+    for index in week_columns:
+        new_name = abbreviate_virus(table.columns[index]) + " positive_tests"
+        table.rename(columns={table.columns[index]: new_name}, inplace=True)
+
+    if "week end" not in table.columns:
+        week_ends = [get_report_date(week,start_year) for week in table["week"]]
+        table.insert(1,"week end",week_ends)
+
+    table = table.assign(**{'issue': modified_date,
+                            'geo_type': "nation",
+                            'geo_value': "ca"})
+
+    table=table.rename(columns={'week end':"time_value"})
+    table.columns =[re.sub(" ","_",col) for col in table.columns]
+    table['time_value'] = [check_date_format(d) for d in table['time_value']]
+
+    table=table.rename(columns={'week':"epiweek"})
+    table['epiweek'] = [get_report_date(week, start_year,epi=True) for week in table['epiweek']]
+    return(table)
+
+def create_percent_positive_detection_table(table,modified_date,start_year, flu=False,overwrite_weeks=False):
+    table = check_duplicate_rows(table)
+    table.columns=[re.sub(" *%", "_pct_positive",col) for col in table.columns]
+    table.columns = [re.sub(' +', ' ',col) for col in table.columns]
+    table.insert(2,"issue",modified_date)
+    table=table.rename(columns={'week end':"time_value"})
+    table['time_value'] = [check_date_format(d) for d in table['time_value']]
+
+    # Get the name of the virus in the table, to append to column names
+    virus_prefix=[]
+    if flu:
+        virus_prefix=['flu_a_pct_positive','flu_b_pct_positive']
+        virus="flu"
+        table.columns=[re.sub("a_pct","flu_a_pct",c) for c in table.columns]
+        table.columns=[re.sub("b_pct","flu_b_pct",c) for c in table.columns]
+    else:
+        names=[]
+        for j in range(len(table.columns)):
+            old_name = table.columns[j]
+            if "pct_positive" in table.columns[j]:
+                virus_prefix=[table.columns[j]]
+                virus=re.match("(.*?)_pct_positive",old_name).group(1)
+                geo = table.columns[j-1].split(" ")[0]
+                new_name = geo + " " + old_name
+            else:
+                new_name=old_name
+            names.append(new_name)
+        table.columns=names
+
+    # Remake the weeks column from dates
+    if overwrite_weeks==True:
+        week_ends = [datetime.strptime(date_string, "%Y-%m-%d") for date_string in table['time_value']]
+        table["week"] = [Week.fromdate(d).week for d in week_ends]
+
+    # Change the order of the words in the column names so they start with the
+    # stub names that pd.wide_to_long expects
+    table = table.rename(columns=lambda x: ' '.join(x.split(' ')[::-1]))
+    stubnames= virus_prefix+['tests']
+    table= pd.wide_to_long(table, stubnames, i=['week','time_value','issue'],
+                           j='geo_value', sep=" ", suffix=r'\w+').reset_index()
+
+    table.columns=[re.sub("tests",virus+"_tests",c) for c in table.columns]
+    table.columns =[re.sub(" ","_",col) for col in table.columns]
+
+    table=table.rename(columns={'week':"epiweek"})
+    table['epiweek'] = [get_report_date(week, start_year,epi=True) for week in table['epiweek']]
+
+    table['geo_value']= [abbreviate_geo(g) for g in table['geo_value']]
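+    # Classify each geo_value: create_geo_types() returns "nation" or "region"
+    # for known aggregates and falls back to "lab" otherwise
+    geo_types = 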
[create_geo_types(g,"lab") for g in table['geo_value']] + table.insert(3,"geo_type",geo_types) + + table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + + return(table) + +def get_season_reports(url): + page=requests.get(url) + soup=BeautifulSoup(page.text,'html.parser') + + # get season, weeks, urls and week ends + season = get_report_season(soup) + urls=report_urls(soup) + weeks= report_weeks(soup) + end_dates = [get_report_date(week, season[0]) for week in weeks] + + # create tables to hold all the data for the season + all_positive_tables=pd.DataFrame() + all_number_tables=pd.DataFrame() + all_respiratory_detection_table=pd.DataFrame() + + for week_num in range(len(urls)): + current_week = weeks[week_num] + current_week_end = end_dates[week_num] + + # Skip empty pages + if season[0] == '2019': + if current_week == 5: + continue + elif current_week == 47: + continue + + # Get page for the current week + temp_url=urls[week_num] + temp_page=requests.get(temp_url) + new_soup = BeautifulSoup(temp_page.text, 'html.parser') + captions = get_table_captions(new_soup) + modified_date = get_modified_dates(new_soup,current_week_end) + + positive_tables=[] + number_table_exists = False + for i in range(len(captions)): + caption=captions[i] + tab = caption.find_next('table') + + # Remove footers from tables + if tab.find('tfoot'): + tab.tfoot.decompose() + + # Delete duplicate entry from week 35 of the 2019-2020 season + if season[0] == '2019' and current_week == 35: + if "Positive Adenovirus" in caption.text: + tab.select_one('td').decompose() + + # Replace commas with periods + tab2 = re.sub(",",r".",str(tab)) + + # Read table + na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"] + table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all") + + # Check for multiline headers + if isinstance(table.columns, pd.MultiIndex): + table.columns = [c[0] + " " + c[1] if c[0] != c[1] else c[0] for c in table.columns] + + # Make column names lowercase + table.columns=table.columns.str.lower() + + if season[0] == '2017': + if current_week == 35 and "entero" in caption.text.lower(): + # Remove french from headers in week 35 for the entero table + table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests', + 'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests', + 'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests', + 'entero/rhino%.5'] + elif current_week == 35 and "adeno" in caption.text.lower(): + # Remove > from column name + table = table.rename(columns={'>week end':"week end"}) + elif current_week == 47 and "rsv" in caption.text.lower(): + # fix date written as 201-11-25 + table.loc[table['week'] == 47, 'week end'] = "2017-11-25" + elif season[0] == '2015' and current_week == 41: + # Fix date written m-d-y not d-m-y + table=table.replace("10-17-2015","17-10-2015",regex=True) + elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower(): + # fix date written as 022-09-03 + table.loc[table['week'] == 35, 'week end'] = "2022-09-03" + + # Rename columns + table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space + table.columns = [re.sub("flutest","flu test", col) for col in table.columns] + table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns + table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods + table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all) 
+            table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] # remove parentheses and any space before them
+            table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns] # normalize H1N1 2009 naming
+            table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses
+            table.columns = [re.sub(' +', ' ', col) for col in table.columns] # make any multiple spaces into one space
+            table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # remove any remaining parentheses
+            table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
+            table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns] # expand "at" to "atl" for the Atlantic region
+
+            if "reporting laboratory" in str(table.columns):
+                respiratory_detection_table = create_detections_table(table,modified_date,current_week,current_week_end,season[0])
+                respiratory_detection_table = respiratory_detection_table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+            elif "number" in caption.text.lower():
+                number_table_exists = True
+                number_detections_table = create_number_detections_table(table,modified_date,season[0])
+                number_detections_table = number_detections_table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+            elif "positive" in caption.text.lower():
+                flu = " influenza" in caption.text.lower() # leading space excludes "parainfluenza"
+
+                # tables are missing week 53
+                if season[0]=="2014" and current_week==2:
+                    overwrite_weeks=True
+                elif season[0]=="2014" and current_week==3:
+                    overwrite_weeks=True
+                else:
+                    overwrite_weeks=False
+
+                pos_table = create_percent_positive_detection_table(table,modified_date,season[0],flu,overwrite_weeks)
+
+                # Check for percentages >100. The 2014-2015 season has a known
+                # value >100 in week 39 that is deliberately left in, so that
+                # season and week-39 reports are excluded from the check
+                if season[0] != '2014' and current_week != 39:
+                    for k in range(len(pos_table.columns)):
+                        if "pct_positive" in pos_table.columns[k]:
+                            assert all([0 <= val <= 100 or math.isnan(val) for val in pos_table[pos_table.columns[k]]]), "Percentage not from 0-100"
+
+                positive_tables.append(pos_table)
+
+        # create the path to save files (the directory is assumed to exist)
+        path = "season_" + season[0]+"_"+season[1]
+
+        # combine all the positive tables
+        combined_positive_tables=pd.concat(positive_tables,axis=1)
+
+        # Check if the indices are already in the season table
+        # If not, add the week's tables into the season table
+        if respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any() == False:
+            all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table])
+
+        if combined_positive_tables.index.isin(all_positive_tables.index).any() == False:
+            all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
+
+        if number_table_exists == True:
+            if number_detections_table.index.isin(all_number_tables.index).any() == False:
+                all_number_tables=pd.concat([all_number_tables,number_detections_table])
+
+    # write files to csvs
+    all_respiratory_detection_table.to_csv(path+"/"+path+"_respiratory_detections.csv", index=True)
+    all_positive_tables.to_csv(path+"/"+path+"_positive_tests.csv", index=True)
+
+    # Write the number of detections table to csv if it exists (i.e. it has rows)
+    if len(all_number_tables) != 0:
+        all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True)
+
+# Dashboard functions
+def get_revised_data(base_url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
+    }
+
+    # Get update date
+    update_date_url = base_url + "RVD_UpdateDate.csv"
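+    # RVD_UpdateDate.csv holds a single timestamp in m/d/Y H:M:S format; it is
+    # parsed below and reused as the issue date for every row of this update
+    update_date_url_response = 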
requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get update data + url = base_url+"RVD_WeeklyData.csv" + + url_response = requests.get(url, headers=headers) + df = pd.read_csv(io.StringIO(url_response.text)) + + df['virus'] = [abbreviate_virus(v) for v in df['virus']] + epiw = df.apply(lambda x: Week(x['year'],x['week']),axis=1) + df.insert(0,"epiweek",[int(str(w)) for w in epiw]) + df['epiweek'] = [int(str(w)) for w in df['epiweek']] + df['province'] = [abbreviate_geo(g) for g in df['province']] + df=df.rename(columns={'province':"geo_value",'date':'time_value',"detections":"positivetests"}) + df['time_value'] = [check_date_format(d) for d in df['time_value']] + df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] + df.insert(1,"issue",update_date) + + df=df.drop(["weekorder","region","year","week"],axis=1) + + df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], + columns="virus",values=['tests','percentpositive','positivetests']) + df.columns = ['_'.join(col).strip() for col in df.columns.values] + df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns] + df.columns=[re.sub("percentpositive", "pct_positive",col) for col in df.columns] + df.columns=[re.sub(r' ','_',c) for c in df.columns] + + for k in range(len(df.columns)): + if "pct_positive" in df.columns[k]: + assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100" + + return(df) + +def get_weekly_data(base_url,start_year): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + # Get update date + update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url_response = requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get current week and year + summary_url = base_url + "RVD_SummaryText.csv" + summary_url_response = requests.get(summary_url, headers=headers) + summary_df = pd.read_csv(io.StringIO(summary_url_response.text)) + + week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")] + week_string = week_df.iloc[0]['Text'].lower() + current_week = int(re.search("week (.+?) 
", week_string).group(1)) + + if current_week < 34: + current_year = start_year+1 + else: + current_year = start_year + + current_epiweek= Week(current_year,current_week) + + # Get weekly data + weekly_url = base_url + "RVD_CurrentWeekTable.csv" + weekly_url_response = requests.get(weekly_url, headers=headers) + weekly_url_response.encoding='UTF-8' + df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) + + df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df_weekly.insert(0,"epiweek",int(str(current_epiweek))) + df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) + df_weekly.insert(2,"issue",update_date) + df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] + df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) + df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] + df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] + + #df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) + + return(df_weekly) + + + #%% Scrape each season + +urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"] + +[get_season_reports(url) for url in urls] + + + #%% Update the end of the 2023-2024 season with the dashboard data + +base_urls=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/", 
+"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"] + +# Load old csvs +old_detection_data = pd.read_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) +old_positive_data = pd.read_csv('season_2023_2024/season_2023_2024_positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + +for base_url in base_urls: + # Get weekly dashboard data + weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + positive_data = get_revised_data(base_url) + + # Check if indices are already present in the old data + # If not, add the new data + if weekly_data.index.isin(old_detection_data.index).any() == False: + old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0) + + if positive_data.index.isin(old_positive_data.index).any() == False: + old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) + +# Overwrite/update csvs +old_detection_data.to_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv',index=True) +old_positive_data.to_csv('season_2023_2024/season_2023_2024_positive_tests.csv',index=True) + From 073aac9a9f1c0fa2d661bdd8f9ff92d06c36942d Mon Sep 17 00:00:00 2001 From: cchuong Date: Thu, 12 Sep 2024 21:22:44 -0700 Subject: [PATCH 02/33] Create rvdss_update.py --- src/acquisition/rvdss/rvdss_update.py | 225 ++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/acquisition/rvdss/rvdss_update.py diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py new file mode 100644 index 000000000..e8c9bd46c --- /dev/null +++ b/src/acquisition/rvdss/rvdss_update.py @@ -0,0 +1,225 @@ +import requests +import pandas as pd +import io +import regex as re +from epiweeks import Week +from datetime import datetime +import math +import os + +def abbreviate_virus(full_name): + lowercase=full_name.lower() + + if any(name in lowercase for name in ["parainfluenza","para","piv"]): + if "hpiv" not in lowercase: + abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) + else: + abbrev = lowercase + elif any(name in lowercase for name in ["adenovirus","adeno"]): + abbrev = re.sub("adenovirus|adeno","adv",lowercase) + elif "human metapneumovirus" in lowercase: + abbrev = re.sub("human metapneumovirus","hmpv",lowercase) + elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): + abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) + elif any(name in lowercase for name in ["coronavirus","coron","coro"]): + abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) + elif "respiratory syncytial virus" in lowercase: + abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) + elif "influenza" in lowercase: + 
abbrev = re.sub("influenza","flu",lowercase) + elif "sarscov2" in lowercase: + abbrev = re.sub("sarscov2","sars-cov-2",lowercase) + else: + abbrev=lowercase + return(abbrev) + +def abbreviate_geo(full_name): + lowercase=full_name.lower() + + if "newfoundland" in lowercase: + abbrev = "nl" + elif "prince edward island" in lowercase: + abbrev = "pe" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif "new brunswick" in lowercase: + abbrev = "nb" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): + abbrev = "qc" + elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): + abbrev = "on" + elif "manitoba" in lowercase: + abbrev = "mb" + elif "saskatchewan" in lowercase: + abbrev = "sk" + elif "alberta" in lowercase: + abbrev = "ab" + elif "british columbia" in lowercase: + abbrev = "bc" + elif "yukon" in lowercase: + abbrev = "yk" + elif "northwest territories" in lowercase: + abbrev = "nt" + elif "nunavut" in lowercase: + abbrev = "nu" + elif re.match("canada|can",lowercase): + abbrev = "ca" + elif re.match(r"^at\b",lowercase): + abbrev = "atlantic" + elif "pr" in lowercase: + abbrev = "prairies" + elif "terr" in lowercase: + abbrev = "territories" + else: + abbrev=lowercase + return(abbrev) + +def create_geo_types(geo,default_geo): + regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', + 'prairies', 'pr', "british columbia", 'bc',"territories",'terr'] + nation = ["canada","can",'ca'] + + if geo in nation: + geo_type="nation" + elif geo in regions: + geo_type="region" + else: + geo_type = default_geo + return(geo_type) + +def check_date_format(date_string): + if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}",date_string): + if re.search(r"/",date_string): + new_date = re.sub(r"/","-",date_string) + new_date = datetime.strptime(new_date,"%d-%m-%Y").strftime("%Y-%m-%d") + elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}",date_string): + new_date = datetime.strptime(date_string,"%d-%m-%Y").strftime("%Y-%m-%d") + else: + raise AssertionError("Unrecognised date format") + else: + new_date=date_string + + return(new_date) + +def get_revised_data(base_url): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + # Get update date + update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url_response = requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get update data + url = base_url+"RVD_WeeklyData.csv" + + url_response = requests.get(url, headers=headers) + df = pd.read_csv(io.StringIO(url_response.text)) + + df['virus'] = [abbreviate_virus(v) for v in df['virus']] + epiw = df.apply(lambda x: Week(x['year'],x['week']),axis=1) + df.insert(0,"epiweek",[int(str(w)) for w in epiw]) + df['epiweek'] = [int(str(w)) for w in df['epiweek']] + df['province'] = [abbreviate_geo(g) for g in df['province']] + df=df.rename(columns={'province':"geo_value",'date':'time_value',"detections":"positivetests"}) + df['time_value'] = [check_date_format(d) for d in df['time_value']] + df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] + df.insert(1,"issue",update_date) + + df=df.drop(["weekorder","region","year","week"],axis=1) + + df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], + 
columns="virus",values=['tests','percentpositive','positivetests']) + df.columns = ['_'.join(col).strip() for col in df.columns.values] + df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns] + df.columns=[re.sub("percentpositive", "pct_positive",col) for col in df.columns] + df.columns=[re.sub(r' ','_',c) for c in df.columns] + + for k in range(len(df.columns)): + if "pct_positive" in df.columns[k]: + assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100" + + return(df) + +def get_weekly_data(base_url,start_year): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + # Get update date + update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url_response = requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get current week and year + summary_url = base_url + "RVD_SummaryText.csv" + summary_url_response = requests.get(summary_url, headers=headers) + summary_df = pd.read_csv(io.StringIO(summary_url_response.text)) + + week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")] + week_string = week_df.iloc[0]['Text'].lower() + current_week = int(re.search("week (.+?) ", week_string).group(1)) + + if current_week < 34: + current_year = start_year+1 + else: + current_year = start_year + + current_epiweek= Week(current_year,current_week) + + # Get weekly data + weekly_url = base_url + "RVD_CurrentWeekTable.csv" + weekly_url_response = requests.get(weekly_url, headers=headers) + weekly_url_response.encoding='UTF-8' + df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) + + df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df_weekly.insert(0,"epiweek",int(str(current_epiweek))) + df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) + df_weekly.insert(2,"issue",update_date) + df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] + df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) + df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] + df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] + + df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) + + return(df_weekly) + +base_url = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/" + +weekly_data = get_weekly_data(base_url,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) +positive_data = get_revised_data(base_url) + +path1 = './season_2024_2025_respiratory_detections.csv' +path2 = 
'./season_2024_2025_positive_tests.csv' + +if os.path.exists(path1)==False: + weekly_data.to_csv(path1,index=True) +else: + old_detection_data = pd.read_csv(path1).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + if weekly_data.index.isin(old_detection_data.index).any() == False: + old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0) + old_detection_data.to_csv(path1,index=True) + +if os.path.exists(path2)==False: + positive_data.to_csv(path2,index=True) +else: + old_positive_data = pd.read_csv(path2).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + if positive_data.index.isin(old_positive_data.index).any() == False: + old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) + old_positive_data.to_csv(path2,index=True) + + \ No newline at end of file From 01af95f809bd9e677760237368f7b72ecb30a6e7 Mon Sep 17 00:00:00 2001 From: cchuong Date: Sat, 14 Sep 2024 12:45:26 -0700 Subject: [PATCH 03/33] create utils.py for common functions --- src/acquisition/rvdss/rvdss_historic.py | 200 +---------------------- src/acquisition/rvdss/rvdss_update.py | 202 +----------------------- src/acquisition/rvdss/utils.py | 199 +++++++++++++++++++++++ 3 files changed, 209 insertions(+), 392 deletions(-) create mode 100644 src/acquisition/rvdss/utils.py diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index b50385f04..f332cffe2 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -5,8 +5,8 @@ from epiweeks import Week from datetime import datetime,timedelta import math -import io +from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data #%% Functions # Report Functions @@ -25,16 +25,18 @@ def append_urls(urls): for i in range(len(urls)): temp_url = urls[i] - http_present = re.search("http",temp_url) + http_present = re.search("http:",temp_url) if not http_present: urls[i]="https://www.canada.ca"+temp_url + else: + urls[i]=re.sub("http:","https:",temp_url) return(urls) def report_urls(soup): # Get links for individual weeks year= "-".join(get_report_season(soup)) links=soup.find_all('a') - alternative_url = "http://www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year + alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year urls = [link.get("href") for link in links if "ending" in str(link) or alternative_url in str(link)] @@ -63,74 +65,7 @@ def get_report_date(week,start_year,epi=False): return(report_date) -def abbreviate_virus(full_name): - lowercase=full_name.lower() - - if any(name in lowercase for name in ["parainfluenza","para","piv"]): - if "hpiv" not in lowercase: - abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) - else: - abbrev = lowercase - elif any(name in lowercase for name in ["adenovirus","adeno"]): - abbrev = re.sub("adenovirus|adeno","adv",lowercase) - elif "human metapneumovirus" in lowercase: - abbrev = re.sub("human metapneumovirus","hmpv",lowercase) - elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): - abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) - elif any(name in lowercase for name in ["coronavirus","coron","coro"]): - abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) - elif "respiratory syncytial virus" in lowercase: - abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) - elif 
"influenza" in lowercase: - abbrev = re.sub("influenza","flu",lowercase) - elif "sarscov2" in lowercase: - abbrev = re.sub("sarscov2","sars-cov-2",lowercase) - else: - abbrev=lowercase - return(abbrev) - -def abbreviate_geo(full_name): - lowercase=full_name.lower() - - if "newfoundland" in lowercase: - abbrev = "nl" - elif "prince edward island" in lowercase: - abbrev = "pe" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif "new brunswick" in lowercase: - abbrev = "nb" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): - abbrev = "qc" - elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): - abbrev = "on" - elif "manitoba" in lowercase: - abbrev = "mb" - elif "saskatchewan" in lowercase: - abbrev = "sk" - elif "alberta" in lowercase: - abbrev = "ab" - elif "british columbia" in lowercase: - abbrev = "bc" - elif "yukon" in lowercase: - abbrev = "yk" - elif "northwest territories" in lowercase: - abbrev = "nt" - elif "nunavut" in lowercase: - abbrev = "nu" - elif re.match("canada|can",lowercase): - abbrev = "ca" - elif re.match(r"^at\b",lowercase): - abbrev = "atlantic" - elif "pr" in lowercase: - abbrev = "prairies" - elif "terr" in lowercase: - abbrev = "territories" - else: - abbrev=lowercase - return(abbrev) + def get_table_captions(soup): @@ -183,19 +118,6 @@ def get_modified_dates(soup,week_end_date): return(new_modified_date_string) -def check_date_format(date_string): - if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}",date_string): - if re.search(r"/",date_string): - new_date = re.sub(r"/","-",date_string) - new_date = datetime.strptime(new_date,"%d-%m-%Y").strftime("%Y-%m-%d") - elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}",date_string): - new_date = datetime.strptime(date_string,"%d-%m-%Y").strftime("%Y-%m-%d") - else: - raise AssertionError("Unrecognised date format") - else: - new_date=date_string - - return(new_date) def check_duplicate_rows(table): if table['week'].duplicated().any(): @@ -213,18 +135,7 @@ def check_duplicate_rows(table): new_table=table return(new_table) -def create_geo_types(geo,default_geo): - regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', - 'prairies', 'pr', "british columbia", 'bc',"territories",'terr'] - nation = ["canada","can",'ca'] - - if geo in nation: - geo_type="nation" - elif geo in regions: - geo_type="region" - else: - geo_type = default_geo - return(geo_type) + def create_detections_table(table,modified_date,week_number,week_end_date,start_year): lab_columns =[col for col in table.columns if 'reporting' in col][0] @@ -501,103 +412,6 @@ def get_season_reports(url): if len(all_number_tables) != 0: all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True) -# Dashboard functions -def get_revised_data(base_url): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" - update_date_url_response = requests.get(update_date_url, headers=headers) - update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") - - # Get update data - url = base_url+"RVD_WeeklyData.csv" - - url_response = requests.get(url, headers=headers) - df = pd.read_csv(io.StringIO(url_response.text)) - - df['virus'] = [abbreviate_virus(v) for v in df['virus']] - epiw = df.apply(lambda x: 
Week(x['year'],x['week']),axis=1) - df.insert(0,"epiweek",[int(str(w)) for w in epiw]) - df['epiweek'] = [int(str(w)) for w in df['epiweek']] - df['province'] = [abbreviate_geo(g) for g in df['province']] - df=df.rename(columns={'province':"geo_value",'date':'time_value',"detections":"positivetests"}) - df['time_value'] = [check_date_format(d) for d in df['time_value']] - df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] - df.insert(1,"issue",update_date) - - df=df.drop(["weekorder","region","year","week"],axis=1) - - df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], - columns="virus",values=['tests','percentpositive','positivetests']) - df.columns = ['_'.join(col).strip() for col in df.columns.values] - df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) - df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns] - df.columns=[re.sub("percentpositive", "pct_positive",col) for col in df.columns] - df.columns=[re.sub(r' ','_',c) for c in df.columns] - - for k in range(len(df.columns)): - if "pct_positive" in df.columns[k]: - assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100" - - return(df) - -def get_weekly_data(base_url,start_year): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" - update_date_url_response = requests.get(update_date_url, headers=headers) - update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") - - # Get current week and year - summary_url = base_url + "RVD_SummaryText.csv" - summary_url_response = requests.get(summary_url, headers=headers) - summary_df = pd.read_csv(io.StringIO(summary_url_response.text)) - - week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")] - week_string = week_df.iloc[0]['Text'].lower() - current_week = int(re.search("week (.+?) 
", week_string).group(1)) - - if current_week < 34: - current_year = start_year+1 - else: - current_year = start_year - - current_epiweek= Week(current_year,current_week) - - # Get weekly data - weekly_url = base_url + "RVD_CurrentWeekTable.csv" - weekly_url_response = requests.get(weekly_url, headers=headers) - weekly_url_response.encoding='UTF-8' - df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) - - df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) - df_weekly.insert(0,"epiweek",int(str(current_epiweek))) - df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) - df_weekly.insert(2,"issue",update_date) - df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] - df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) - df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] - df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] - - #df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) - - return(df_weekly) - - #%% Scrape each season urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html", diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py index e8c9bd46c..9afc36de3 100644 --- a/src/acquisition/rvdss/rvdss_update.py +++ b/src/acquisition/rvdss/rvdss_update.py @@ -1,207 +1,11 @@ -import requests import pandas as pd -import io -import regex as re -from epiweeks import Week -from datetime import datetime -import math import os - -def abbreviate_virus(full_name): - lowercase=full_name.lower() - - if any(name in lowercase for name in ["parainfluenza","para","piv"]): - if "hpiv" not in lowercase: - abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) - else: - abbrev = lowercase - elif any(name in lowercase for name in ["adenovirus","adeno"]): - abbrev = re.sub("adenovirus|adeno","adv",lowercase) - elif "human metapneumovirus" in lowercase: - abbrev = re.sub("human metapneumovirus","hmpv",lowercase) - elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): - abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) - elif any(name in lowercase for name in ["coronavirus","coron","coro"]): - abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) - elif "respiratory syncytial virus" in lowercase: - abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) - elif "influenza" in lowercase: - abbrev = re.sub("influenza","flu",lowercase) - elif "sarscov2" in lowercase: - abbrev = re.sub("sarscov2","sars-cov-2",lowercase) - else: - abbrev=lowercase - return(abbrev) - -def abbreviate_geo(full_name): - lowercase=full_name.lower() - - if "newfoundland" in lowercase: - abbrev = "nl" - elif "prince 
edward island" in lowercase: - abbrev = "pe" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif "new brunswick" in lowercase: - abbrev = "nb" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): - abbrev = "qc" - elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): - abbrev = "on" - elif "manitoba" in lowercase: - abbrev = "mb" - elif "saskatchewan" in lowercase: - abbrev = "sk" - elif "alberta" in lowercase: - abbrev = "ab" - elif "british columbia" in lowercase: - abbrev = "bc" - elif "yukon" in lowercase: - abbrev = "yk" - elif "northwest territories" in lowercase: - abbrev = "nt" - elif "nunavut" in lowercase: - abbrev = "nu" - elif re.match("canada|can",lowercase): - abbrev = "ca" - elif re.match(r"^at\b",lowercase): - abbrev = "atlantic" - elif "pr" in lowercase: - abbrev = "prairies" - elif "terr" in lowercase: - abbrev = "territories" - else: - abbrev=lowercase - return(abbrev) - -def create_geo_types(geo,default_geo): - regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', - 'prairies', 'pr', "british columbia", 'bc',"territories",'terr'] - nation = ["canada","can",'ca'] - - if geo in nation: - geo_type="nation" - elif geo in regions: - geo_type="region" - else: - geo_type = default_geo - return(geo_type) - -def check_date_format(date_string): - if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}",date_string): - if re.search(r"/",date_string): - new_date = re.sub(r"/","-",date_string) - new_date = datetime.strptime(new_date,"%d-%m-%Y").strftime("%Y-%m-%d") - elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}",date_string): - new_date = datetime.strptime(date_string,"%d-%m-%Y").strftime("%Y-%m-%d") - else: - raise AssertionError("Unrecognised date format") - else: - new_date=date_string - - return(new_date) - -def get_revised_data(base_url): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" - update_date_url_response = requests.get(update_date_url, headers=headers) - update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") - - # Get update data - url = base_url+"RVD_WeeklyData.csv" - - url_response = requests.get(url, headers=headers) - df = pd.read_csv(io.StringIO(url_response.text)) - - df['virus'] = [abbreviate_virus(v) for v in df['virus']] - epiw = df.apply(lambda x: Week(x['year'],x['week']),axis=1) - df.insert(0,"epiweek",[int(str(w)) for w in epiw]) - df['epiweek'] = [int(str(w)) for w in df['epiweek']] - df['province'] = [abbreviate_geo(g) for g in df['province']] - df=df.rename(columns={'province':"geo_value",'date':'time_value',"detections":"positivetests"}) - df['time_value'] = [check_date_format(d) for d in df['time_value']] - df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] - df.insert(1,"issue",update_date) - - df=df.drop(["weekorder","region","year","week"],axis=1) - - df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], - columns="virus",values=['tests','percentpositive','positivetests']) - df.columns = ['_'.join(col).strip() for col in df.columns.values] - df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) - df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns] - df.columns=[re.sub("percentpositive", 
"pct_positive",col) for col in df.columns] - df.columns=[re.sub(r' ','_',c) for c in df.columns] - - for k in range(len(df.columns)): - if "pct_positive" in df.columns[k]: - assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100" - - return(df) - -def get_weekly_data(base_url,start_year): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" - update_date_url_response = requests.get(update_date_url, headers=headers) - update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") - - # Get current week and year - summary_url = base_url + "RVD_SummaryText.csv" - summary_url_response = requests.get(summary_url, headers=headers) - summary_df = pd.read_csv(io.StringIO(summary_url_response.text)) - - week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")] - week_string = week_df.iloc[0]['Text'].lower() - current_week = int(re.search("week (.+?) ", week_string).group(1)) - - if current_week < 34: - current_year = start_year+1 - else: - current_year = start_year - - current_epiweek= Week(current_year,current_week) - - # Get weekly data - weekly_url = base_url + "RVD_CurrentWeekTable.csv" - weekly_url_response = requests.get(weekly_url, headers=headers) - weekly_url_response.encoding='UTF-8' - df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) - - df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) - df_weekly.insert(0,"epiweek",int(str(current_epiweek))) - df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) - df_weekly.insert(2,"issue",update_date) - df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] - df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) - df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] - df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] - - df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) - - return(df_weekly) +import utils base_url = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/" -weekly_data = get_weekly_data(base_url,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) -positive_data = get_revised_data(base_url) +weekly_data = utils.get_weekly_data(base_url,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) +positive_data = utils.get_revised_data(base_url) path1 = './season_2024_2025_respiratory_detections.csv' path2 = './season_2024_2025_positive_tests.csv' diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py new file mode 100644 index 000000000..940c4389d --- /dev/null +++ 
b/src/acquisition/rvdss/utils.py @@ -0,0 +1,199 @@ +import requests +import pandas as pd +import io +import regex as re +from epiweeks import Week +from datetime import datetime +import math + +def abbreviate_virus(full_name): + lowercase=full_name.lower() + + if any(name in lowercase for name in ["parainfluenza","para","piv"]): + if "hpiv" not in lowercase: + abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) + else: + abbrev = lowercase + elif any(name in lowercase for name in ["adenovirus","adeno"]): + abbrev = re.sub("adenovirus|adeno","adv",lowercase) + elif "human metapneumovirus" in lowercase: + abbrev = re.sub("human metapneumovirus","hmpv",lowercase) + elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): + abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) + elif any(name in lowercase for name in ["coronavirus","coron","coro"]): + abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) + elif "respiratory syncytial virus" in lowercase: + abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) + elif "influenza" in lowercase: + abbrev = re.sub("influenza","flu",lowercase) + elif "sarscov2" in lowercase: + abbrev = re.sub("sarscov2","sars-cov-2",lowercase) + else: + abbrev=lowercase + return(abbrev) + +def abbreviate_geo(full_name): + lowercase=full_name.lower() + + if "newfoundland" in lowercase: + abbrev = "nl" + elif "prince edward island" in lowercase: + abbrev = "pe" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif "new brunswick" in lowercase: + abbrev = "nb" + elif "nova scotia" in lowercase: + abbrev = "ns" + elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): + abbrev = "qc" + elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): + abbrev = "on" + elif "manitoba" in lowercase: + abbrev = "mb" + elif "saskatchewan" in lowercase: + abbrev = "sk" + elif "alberta" in lowercase: + abbrev = "ab" + elif "british columbia" in lowercase: + abbrev = "bc" + elif "yukon" in lowercase: + abbrev = "yk" + elif "northwest territories" in lowercase: + abbrev = "nt" + elif "nunavut" in lowercase: + abbrev = "nu" + elif re.match("canada|can",lowercase): + abbrev = "ca" + elif re.match(r"^at\b",lowercase): + abbrev = "atlantic" + elif "pr" in lowercase: + abbrev = "prairies" + elif "terr" in lowercase: + abbrev = "territories" + else: + abbrev=lowercase + return(abbrev) + +def create_geo_types(geo,default_geo): + regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', + 'prairies', 'pr', "british columbia", 'bc',"territories",'terr'] + nation = ["canada","can",'ca'] + + if geo in nation: + geo_type="nation" + elif geo in regions: + geo_type="region" + else: + geo_type = default_geo + return(geo_type) + +def check_date_format(date_string): + if not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}",date_string): + if re.search(r"/",date_string): + new_date = re.sub(r"/","-",date_string) + new_date = datetime.strptime(new_date,"%d-%m-%Y").strftime("%Y-%m-%d") + elif re.search("[0-9]{2}-[0-9]{2}-[0-9]{4}",date_string): + new_date = datetime.strptime(date_string,"%d-%m-%Y").strftime("%Y-%m-%d") + else: + raise AssertionError("Unrecognised date format") + else: + new_date=date_string + + return(new_date) + +def get_revised_data(base_url): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/126.0.0.0 Safari/537.36' + } + + # Get update date + update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url_response = requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get update data + url = base_url+"RVD_WeeklyData.csv" + + url_response = requests.get(url, headers=headers) + df = pd.read_csv(io.StringIO(url_response.text)) + + df['virus'] = [abbreviate_virus(v) for v in df['virus']] + epiw = df.apply(lambda x: Week(x['year'],x['week']),axis=1) + df.insert(0,"epiweek",[int(str(w)) for w in epiw]) + df['epiweek'] = [int(str(w)) for w in df['epiweek']] + df['province'] = [abbreviate_geo(g) for g in df['province']] + df=df.rename(columns={'province':"geo_value",'date':'time_value',"detections":"positivetests"}) + df['time_value'] = [check_date_format(d) for d in df['time_value']] + df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] + df.insert(1,"issue",update_date) + + df=df.drop(["weekorder","region","year","week"],axis=1) + + df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], + columns="virus",values=['tests','percentpositive','positivetests']) + df.columns = ['_'.join(col).strip() for col in df.columns.values] + df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns] + df.columns=[re.sub("percentpositive", "pct_positive",col) for col in df.columns] + df.columns=[re.sub(r' ','_',c) for c in df.columns] + + for k in range(len(df.columns)): + if "pct_positive" in df.columns[k]: + assert all([0 <= val <= 100 or math.isnan(val) for val in df[df.columns[k]]]), "Percentage not from 0-100" + + return(df) + +def get_weekly_data(base_url,start_year): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + # Get update date + update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url_response = requests.get(update_date_url, headers=headers) + update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + + # Get current week and year + summary_url = base_url + "RVD_SummaryText.csv" + summary_url_response = requests.get(summary_url, headers=headers) + summary_df = pd.read_csv(io.StringIO(summary_url_response.text)) + + week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")] + week_string = week_df.iloc[0]['Text'].lower() + current_week = int(re.search("week (.+?) 
", week_string).group(1)) + + if current_week < 34: + current_year = start_year+1 + else: + current_year = start_year + + current_epiweek= Week(current_year,current_week) + + # Get weekly data + weekly_url = base_url + "RVD_CurrentWeekTable.csv" + weekly_url_response = requests.get(weekly_url, headers=headers) + weekly_url_response.encoding='UTF-8' + df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) + + df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df_weekly.insert(0,"epiweek",int(str(current_epiweek))) + df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) + df_weekly.insert(2,"issue",update_date) + df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] + df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] + df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) + df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] + df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] + + if df_weekly.columns.isin(["weekorder","date","week"]).all(): + df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) + + return(df_weekly) \ No newline at end of file From 6a002e04b62c98f44898f3f9a17d839c97ff7bee Mon Sep 17 00:00:00 2001 From: cchuong Date: Mon, 16 Sep 2024 11:39:30 -0700 Subject: [PATCH 04/33] create constants.py and update utils --- src/acquisition/rvdss/constants.py | 80 +++++++++++++++++++++++ src/acquisition/rvdss/rvdss_historic.py | 30 ++++++--- src/acquisition/rvdss/utils.py | 84 +++++-------------------- 3 files changed, 115 insertions(+), 79 deletions(-) create mode 100644 src/acquisition/rvdss/constants.py diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py new file mode 100644 index 000000000..ddd062e3a --- /dev/null +++ b/src/acquisition/rvdss/constants.py @@ -0,0 +1,80 @@ +VIRUSES = { + "parainfluenza": "hpiv", + "piv": "hpiv", + "para": "hpiv", + "adenovirus": "adv", + "adeno": "adv", + "human metapneumovirus": "hmpv", + "enterovirus/rhinovirus": "ev_rv", + "rhinovirus": "ev_rv", + "rhv": "ev_rv", + "entero/rhino": "ev_rv", + "rhino":"ev_rv", + "ev/rv":"ev_rv", + "evrv":"ev_rv", + "coronavirus":"hcov", + "coron":"hcov", + "coro":"hcov", + "respiratory syncytial virus":"rsv", + "influenza":"flu", + "sarscov2":"sars-cov-2" +} + +GEOS = { + "newfoundland": "nl", + "prince edward island":"pe", + "nova scotia":"ns", + "new brunswick":"nb", + "québec":"qc", + "province of québec":"qc", + "quebec":"qc", + "ontario":"on", + "province of ontario":"on", + "manitoba" : "mb", + "saskatchewan":"sk", + "alberta": "ab", + "british columbia" :"bc", + "yukon" : "yk", + "northwest territories" : "nt", + "nunavut" : "nu", + "canada":"ca", + "can":"ca" , + "at":"atlantic", + "pr" :"prairies" , + "terr" :"territories" + } + +REGIONS = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', + 'prairies', 
'pr', "british columbia", 'bc',"territories",'terr'] +NATION = ["canada","can",'ca'] + +BASHBOARD_BASE_URLS_2023=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/", +"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"] + +HISTORIC_SEASON_URL = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html", +"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"] + +ALTENRATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" + +SEASON_BASE_URL = "https://www.canada.ca" + +LAST_WEEK_OF_YEAR = 35 + diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index f332cffe2..e7bbd25ff 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -64,8 +64,6 @@ def get_report_date(week,start_year,epi=False): report_date = str(epi_week) return(report_date) - - def get_table_captions(soup): @@ -257,6 +255,17 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu= geo_types = [create_geo_types(g,"lab") for g in table['geo_value']] table.insert(3,"geo_type",geo_types) + # Calculate number of positive tests based on pct_positive and total tests + if flu: + table["flu_a_positive_tests"] = (table["flu_a_pct_positive"]/100)*table["flu_tests"] + table["flu_b_positive_tests"] = (table["flu_b_pct_positive"]/100)*table["flu_tests"] + + table["flu_positive_tests"] = table["flu_a_positive_tests"] + 
table["flu_b_positive_tests"] + table["flu_pct_positive"] = (table["flu_positive_tests"]/table["flu_tests"])*100 + else: + table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"] + + table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) return(table) @@ -309,12 +318,15 @@ def get_season_reports(url): if "Positive Adenovirus" in caption.text: tab.select_one('td').decompose() - # Replace commas with periods - tab2 = re.sub(",",r".",str(tab)) - + if not "number" in caption.text.lower(): + # Replace commas with periods + tab = re.sub(",",r".",str(tab)) + else: + tab = re.sub(",",r"",str(tab)) + # Read table na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"] - table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all") + table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all") # Check for multiline headers if isinstance(table.columns, pd.MultiIndex): @@ -405,12 +417,12 @@ def get_season_reports(url): all_number_tables=pd.concat([all_number_tables,number_detections_table]) # write files to csvs - all_respiratory_detection_table.to_csv(path+"/"+path+"_respiratory_detections.csv", index=True) - all_positive_tables.to_csv(path+"/"+path+"_positive_tests.csv", index=True) + all_respiratory_detection_table.to_csv(path+"/respiratory_detections.csv", index=True) + all_positive_tables.to_csv(path+"/positive_tests.csv", index=True) # Write the number of detections table to csv if it exists (i.e has rows) if len(all_number_tables) != 0: - all_number_tables.to_csv(path+"/"+path+"_number_of_detections.csv", index=True) + all_number_tables.to_csv(path+"/number_of_detections.csv", index=True) #%% Scrape each season diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 940c4389d..7b8c3dc52 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -6,83 +6,27 @@ from datetime import datetime import math +from constants import VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR + def abbreviate_virus(full_name): lowercase=full_name.lower() - - if any(name in lowercase for name in ["parainfluenza","para","piv"]): - if "hpiv" not in lowercase: - abbrev = re.sub("parainfluenza|para|piv","hpiv",lowercase) - else: - abbrev = lowercase - elif any(name in lowercase for name in ["adenovirus","adeno"]): - abbrev = re.sub("adenovirus|adeno","adv",lowercase) - elif "human metapneumovirus" in lowercase: - abbrev = re.sub("human metapneumovirus","hmpv",lowercase) - elif any(name in lowercase for name in ["enterovirus/rhinovirus","rhinovirus","rhv","entero/rhino","rhino","ev/rv","evrv"]): - abbrev = re.sub("enterovirus/rhinovirus|rhinovirus|rhv|entero/rhino|rhino|ev/rv|evrv","ev_rv",lowercase) - elif any(name in lowercase for name in ["coronavirus","coron","coro"]): - abbrev = re.sub("coronavirus|coron|coro","hcov",lowercase) - elif "respiratory syncytial virus" in lowercase: - abbrev = re.sub("respiratory syncytial virus","rsv",lowercase) - elif "influenza" in lowercase: - abbrev = re.sub("influenza","flu",lowercase) - elif "sarscov2" in lowercase: - abbrev = re.sub("sarscov2","sars-cov-2",lowercase) - else: - abbrev=lowercase - return(abbrev) + keys = (re.escape(k) for k in VIRUSES.keys()) + pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b') + result = pattern.sub(lambda x: VIRUSES[x.group()], lowercase) + return(result) def abbreviate_geo(full_name): lowercase=full_name.lower() - - if "newfoundland" in lowercase: - abbrev = "nl" - elif "prince edward 
island" in lowercase: - abbrev = "pe" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif "new brunswick" in lowercase: - abbrev = "nb" - elif "nova scotia" in lowercase: - abbrev = "ns" - elif re.match('|'.join(("^québec$", "province of québec","quebec")),lowercase): - abbrev = "qc" - elif re.match('|'.join(("^ontario$", "province of ontario")),lowercase): - abbrev = "on" - elif "manitoba" in lowercase: - abbrev = "mb" - elif "saskatchewan" in lowercase: - abbrev = "sk" - elif "alberta" in lowercase: - abbrev = "ab" - elif "british columbia" in lowercase: - abbrev = "bc" - elif "yukon" in lowercase: - abbrev = "yk" - elif "northwest territories" in lowercase: - abbrev = "nt" - elif "nunavut" in lowercase: - abbrev = "nu" - elif re.match("canada|can",lowercase): - abbrev = "ca" - elif re.match(r"^at\b",lowercase): - abbrev = "atlantic" - elif "pr" in lowercase: - abbrev = "prairies" - elif "terr" in lowercase: - abbrev = "territories" - else: - abbrev=lowercase - return(abbrev) + keys = (re.escape(k) for k in GEOS.keys()) + pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b') + + result = pattern.sub(lambda x: GEOS[x.group()], lowercase) + return(result) def create_geo_types(geo,default_geo): - regions = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on', - 'prairies', 'pr', "british columbia", 'bc',"territories",'terr'] - nation = ["canada","can",'ca'] - - if geo in nation: + if geo in NATION: geo_type="nation" - elif geo in regions: + elif geo in REGIONS: geo_type="region" else: geo_type = default_geo @@ -163,7 +107,7 @@ def get_weekly_data(base_url,start_year): week_string = week_df.iloc[0]['Text'].lower() current_week = int(re.search("week (.+?) ", week_string).group(1)) - if current_week < 34: + if current_week < LAST_WEEK_OF_YEAR: current_year = start_year+1 else: current_year = start_year From 714455cbdc895977e1700323e7d954d323706755 Mon Sep 17 00:00:00 2001 From: cchuong Date: Mon, 16 Sep 2024 11:45:34 -0700 Subject: [PATCH 05/33] Update rvdss_historic.py --- src/acquisition/rvdss/rvdss_historic.py | 77 ++++++------------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index e7bbd25ff..c130ed332 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -6,6 +6,7 @@ from datetime import datetime,timedelta import math +from constants import BASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data #%% Functions @@ -27,7 +28,7 @@ def append_urls(urls): http_present = re.search("http:",temp_url) if not http_present: - urls[i]="https://www.canada.ca"+temp_url + urls[i]=SEASON_BASE_URL+temp_url else: urls[i]=re.sub("http:","https:",temp_url) return(urls) @@ -36,7 +37,7 @@ def report_urls(soup): # Get links for individual weeks year= "-".join(get_report_season(soup)) links=soup.find_all('a') - alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year + alternative_url = ALTENRATIVE_SEASON_BASE_URL+year urls = [link.get("href") for link in links if "ending" in str(link) or alternative_url in str(link)] @@ -51,7 +52,7 @@ def report_weeks(soup): return(weeks) def get_report_date(week,start_year,epi=False): - if week < 35: + if week < LAST_WEEK_OF_YEAR: year=int(start_year)+1 else: year=int(start_year) @@ -79,14 +80,16 

From 714455cbdc895977e1700323e7d954d323706755 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Mon, 16 Sep 2024 11:45:34 -0700
Subject: [PATCH 05/33] Update rvdss_historic.py

---
 src/acquisition/rvdss/rvdss_historic.py | 77 ++++++------------------
 1 file changed, 19 insertions(+), 58 deletions(-)

diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index e7bbd25ff..c130ed332 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -6,6 +6,7 @@
 from datetime import datetime,timedelta
 import math

+from constants import BASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
 from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data

 #%% Functions
@@ -27,7 +28,7 @@ def append_urls(urls):

         http_present = re.search("http:",temp_url)
         if not http_present:
-            urls[i]="https://www.canada.ca"+temp_url
+            urls[i]=SEASON_BASE_URL+temp_url
         else:
             urls[i]=re.sub("http:","https:",temp_url)
     return(urls)
@@ -36,7 +37,7 @@ def report_urls(soup):
     # Get links for individual weeks
     year= "-".join(get_report_season(soup))
     links=soup.find_all('a')
-    alternative_url = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"+year
+    alternative_url = ALTENRATIVE_SEASON_BASE_URL+year

     urls = [link.get("href") for link in links if "ending" in str(link) or
     alternative_url in str(link)]
@@ -51,7 +52,7 @@ def report_weeks(soup):
     return(weeks)

 def get_report_date(week,start_year,epi=False):
-    if week < 35:
+    if week < LAST_WEEK_OF_YEAR:
         year=int(start_year)+1
     else:
         year=int(start_year)
@@ -79,14 +80,16 @@ def get_table_captions(soup):
         caption = captions[i]

         matches = ["period","abbreviation","cumulative", "compared"] #skip historic comparisons and cumulative tables
-        if any(x in caption.text.lower() for x in matches):
+        if any(x in caption.text.lower() for x in matches) or caption.has_attr('class') or all(name not in caption.text.lower() for name in table_identifiers):
             remove_list.append(caption)

+        '''
         elif caption.has_attr('class'):
             remove_list.append(caption)

         elif all(name not in caption.text.lower() for name in table_identifiers):
             remove_list.append(caption)
+        '''

     new_captions = [cap for cap in captions if cap not in remove_list]
     new_captions = list(set(new_captions))
@@ -255,17 +258,6 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
     geo_types = [create_geo_types(g,"lab") for g in table['geo_value']]
     table.insert(3,"geo_type",geo_types)

-    # Calculate number of positive tests based on pct_positive and total tests
-    if flu:
-        table["flu_a_positive_tests"] = (table["flu_a_pct_positive"]/100)*table["flu_tests"]
-        table["flu_b_positive_tests"] = (table["flu_b_pct_positive"]/100)*table["flu_tests"]
-
-        table["flu_positive_tests"] = table["flu_a_positive_tests"] + table["flu_b_positive_tests"]
-        table["flu_pct_positive"] = (table["flu_positive_tests"]/table["flu_tests"])*100
-    else:
-        table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"]
-
-
     table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

     return(table)
@@ -291,11 +283,9 @@ def get_season_reports(url):

         # Skip empty pages
         if season[0] == '2019':
-            if current_week == 5:
-                continue
-            elif current_week == 47:
+            if current_week == 5 or current_week == 47:
                 continue
-
+
         # Get page for the current week
         temp_url=urls[week_num]
         temp_page=requests.get(temp_url)
@@ -318,15 +308,12 @@ def get_season_reports(url):
                 if "Positive Adenovirus" in caption.text:
                     tab.select_one('td').decompose()

-                if not "number" in caption.text.lower():
-                    # Replace commas with periods
-                    tab = re.sub(",",r".",str(tab))
-                else:
-                    tab = re.sub(",",r"",str(tab))
-
+                # Replace commas with periods
+                tab2 = re.sub(",",r".",str(tab))
+
                 # Read table
                 na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
-                table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
+                table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all")

                 # Check for multiline headers
                 if isinstance(table.columns, pd.MultiIndex):
@@ -425,41 +412,15 @@ def get_season_reports(url):
         all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)

 #%% Scrape each season
-
-urls = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"]
-
-[get_season_reports(url) for url in urls]
-
+[get_season_reports(url) for url in HISTORIC_SEASON_URL]

 #%% Update the end of the 2023-2024 season with the dashboard data
-
-base_urls=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"]

 # Load old csvs
-old_detection_data = pd.read_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-old_positive_data = pd.read_csv('season_2023_2024/season_2023_2024_positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

-for base_url in base_urls:
+for base_url in BASHBOARD_BASE_URLS_2023:
     # Get weekly dashboard data
     weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
     positive_data = get_revised_data(base_url)
@@ -473,6 +434,6 @@ def get_season_reports(url):
         old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)

 # Overwrite/update csvs
-old_detection_data.to_csv('season_2023_2024/season_2023_2024_respiratory_detections.csv',index=True)
-old_positive_data.to_csv('season_2023_2024/season_2023_2024_positive_tests.csv',index=True)
+old_detection_data.to_csv('season_2023_2024/respiratory_detections.csv',index=True)
+old_positive_data.to_csv('season_2023_2024/positive_tests.csv',index=True)

From 6ee8bb7f8a06f9a125b5de9980bd3effe268384d Mon Sep 17 00:00:00 2001
From: cchuong
Date: Mon, 16 Sep 2024 11:46:39 -0700
Subject: [PATCH 06/33] Update rvdss_update.py

---
 src/acquisition/rvdss/rvdss_update.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py
index 9afc36de3..ee061b9b0 100644
--- a/src/acquisition/rvdss/rvdss_update.py
+++ b/src/acquisition/rvdss/rvdss_update.py
@@ -7,8 +7,8 @@
 weekly_data = utils.get_weekly_data(base_url,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
 positive_data = utils.get_revised_data(base_url)

-path1 = './season_2024_2025_respiratory_detections.csv'
-path2 = './season_2024_2025_positive_tests.csv'
+path1 = './respiratory_detections.csv'
+path2 = './positive_tests.csv'

 if os.path.exists(path1)==False:
     weekly_data.to_csv(path1,index=True)
 else:

From 881455420cba2b1ba9c27f51f6af59a1c1f6855e Mon Sep 17 00:00:00 2001
From: cchuong
Date: Mon, 16 Sep 2024 22:30:32 -0700
Subject: [PATCH 07/33] fix typo and add missing abbreviation to constants

---
 src/acquisition/rvdss/constants.py      | 5 +++--
 src/acquisition/rvdss/rvdss_historic.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index ddd062e3a..f06f70f05 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -40,15 +40,16 @@
     "canada":"ca",
     "can":"ca" ,
     "at":"atlantic",
+    "atl":"atlantic",
     "pr" :"prairies" ,
     "terr" :"territories"
     }

-REGIONS = ['atlantic','atl','province of québec','québec','qc','province of ontario','ontario','on',
+REGIONS = ['atlantic','atl','at','province of québec','québec','qc','province of ontario','ontario','on',
     'prairies', 'pr', "british columbia", 'bc',"territories",'terr']
 NATION = ["canada","can",'ca']

-BASHBOARD_BASE_URLS_2023=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
+DASHBOARD_BASE_URLS_2023=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
 "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
 "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
 "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index c130ed332..f253fd0fe 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -6,7 +6,7 @@
 from datetime import datetime,timedelta
 import math

-from constants import BASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
+from constants import DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
 from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data

 #%% Functions
@@ -420,7 +420,7 @@ def get_season_reports(url):
 old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
 old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

-for base_url in BASHBOARD_BASE_URLS_2023:
+for base_url in DASHBOARD_BASE_URLS_2023:
     # Get weekly dashboard data
     weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
     positive_data = get_revised_data(base_url)

From d7905c8735b4de563270658bc0158c62ad2a2b1c Mon Sep 17 00:00:00 2001
From: cchuong
Date: Tue, 17 Sep 2024 23:49:17 -0700
Subject: [PATCH 08/33] fix typo

---
 src/acquisition/rvdss/constants.py      | 2 +-
 src/acquisition/rvdss/rvdss_historic.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index f06f70f05..c8e9daf72 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -73,7 +73,7 @@
 "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html",
 "https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"]

-ALTENRATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
+ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"

 SEASON_BASE_URL = "https://www.canada.ca"

diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index f253fd0fe..4400c9ad8 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -6,7 +6,7 @@
 from datetime import datetime,timedelta
 import math

-from constants import DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTENRATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
+from constants import DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
 from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data

 #%% Functions
@@ -37,7 +37,7 @@ def report_urls(soup):
     # Get links for individual weeks
     year= "-".join(get_report_season(soup))
     links=soup.find_all('a')
-    alternative_url = ALTENRATIVE_SEASON_BASE_URL+year
+    alternative_url = ALTERNATIVE_SEASON_BASE_URL+year

     urls = [link.get("href") for link in links if "ending" in str(link) or
     alternative_url in str(link)]

From 08f908afd5c3a4d2581bdde8b8a476fa7de4e9e2 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Wed, 18 Sep 2024 00:07:11 -0700
Subject: [PATCH 09/33] add missing geo

---
 src/acquisition/rvdss/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index c8e9daf72..ef35abe7a 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -22,6 +22,7 @@

 GEOS = {
     "newfoundland": "nl",
+    "newfoundland and labrador": "nl",
     "prince edward island":"pe",
     "nova scotia":"ns",
     "new brunswick":"nb",

From 07ed9988b172f69e2b86d11c917c6251bf570b20 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Wed, 18 Sep 2024 00:45:57 -0700
Subject: [PATCH 10/33] Update constants.py

---
 src/acquisition/rvdss/constants.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index ef35abe7a..a32b5d8e0 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -21,8 +21,7 @@
 }

 GEOS = {
-    "newfoundland": "nl",
-    "newfoundland and labrador": "nl",
+    "newfoundland|newfoundland and labrador": "nl",
     "prince edward island":"pe",
     "nova scotia":"ns",
     "new brunswick":"nb",
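Editor's note: the two reverts that follow are easier to read with the failure mode in mind: abbreviate_geo escapes every GEOS key with re.escape, so a '|' embedded in a key is matched as a literal pipe character rather than as alternation, and the combined key can never match a real place name. Presumably that is why the experiment was rolled back. A small demonstration:

```python
import re

GEOS = {"newfoundland|newfoundland and labrador": "nl"}
keys = (re.escape(k) for k in GEOS.keys())  # re.escape turns '|' into '\|'
pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b')

# No substitution happens: the pattern only matches the literal string
# that contains the pipe character.
print(pattern.sub(lambda m: GEOS[m.group()], "newfoundland and labrador"))
# -> 'newfoundland and labrador' (unchanged)
```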

From fd5bf159a366a55da432357e6f8e718ec9efe822 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Wed, 18 Sep 2024 00:56:24 -0700
Subject: [PATCH 11/33] Revert "Update constants.py"

This reverts commit 07ed9988b172f69e2b86d11c917c6251bf570b20.

---
 src/acquisition/rvdss/constants.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index a32b5d8e0..ef35abe7a 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -21,7 +21,8 @@
 }

 GEOS = {
-    "newfoundland|newfoundland and labrador": "nl",
+    "newfoundland": "nl",
+    "newfoundland and labrador": "nl",
     "prince edward island":"pe",
     "nova scotia":"ns",
     "new brunswick":"nb",

From 678b468e7e8527bd89b913a799870fb92dcea38b Mon Sep 17 00:00:00 2001
From: cchuong
Date: Wed, 18 Sep 2024 00:56:57 -0700
Subject: [PATCH 12/33] Revert "add missing geo"

This reverts commit 08f908afd5c3a4d2581bdde8b8a476fa7de4e9e2.

---
 src/acquisition/rvdss/constants.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index ef35abe7a..c8e9daf72 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -22,7 +22,6 @@

 GEOS = {
     "newfoundland": "nl",
-    "newfoundland and labrador": "nl",
     "prince edward island":"pe",
     "nova scotia":"ns",
     "new brunswick":"nb",

From 4bfc93300de9ae823d7029addd1b38d917b92a97 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Thu, 19 Sep 2024 21:21:27 -0700
Subject: [PATCH 13/33] fix geo and virus abbreviation

---
 src/acquisition/rvdss/constants.py      | 16 +++++++-------
 src/acquisition/rvdss/rvdss_historic.py | 28 ++++++++++++++++++-------
 src/acquisition/rvdss/utils.py          |  2 +-
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index c8e9daf72..53bdfbbae 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -5,23 +5,23 @@
     "adenovirus": "adv",
     "adeno": "adv",
     "human metapneumovirus": "hmpv",
-    "enterovirus/rhinovirus": "ev_rv",
-    "rhinovirus": "ev_rv",
-    "rhv": "ev_rv",
-    "entero/rhino": "ev_rv",
-    "rhino":"ev_rv",
-    "ev/rv":"ev_rv",
-    "evrv":"ev_rv",
+    "enterovirus/rhinovirus": "evrv",
+    "rhinovirus": "evrv",
+    "rhv": "evrv",
+    "entero/rhino": "evrv",
+    "rhino":"evrv",
+    "ev/rv":"evrv",
     "coronavirus":"hcov",
     "coron":"hcov",
     "coro":"hcov",
     "respiratory syncytial virus":"rsv",
     "influenza":"flu",
-    "sarscov2":"sars-cov-2"
+    "sars-cov-2":"sarscov2"
 }

 GEOS = {
     "newfoundland": "nl",
+    "newfoundland and labrador": "nl",
     "prince edward island":"pe",
     "nova scotia":"ns",
     "new brunswick":"nb",
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index 4400c9ad8..34ef052c2 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -137,7 +137,6 @@ def check_duplicate_rows(table):
             return(new_table)

-
 def create_detections_table(table,modified_date,week_number,week_end_date,start_year):
     lab_columns =[col for col in table.columns if 'reporting' in col][0]
     table=table.rename(columns={lab_columns:"geo_value"})
@@ -159,7 +158,7 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_
     table.columns=[re.sub(combined_pat, "positive_tests",col) for col in table.columns] #making naming consistent
     table.columns=[re.sub(combined_pat2, "tests",col) for col in table.columns]
-    table.columns=[re.sub(combined_pat3, r"flu_\g<0>",col) for col in table.columns] # add flu as a prefix
+    table.columns=[re.sub(combined_pat3, r"flu\g<0>",col) for col in table.columns] # add flu as a prefix
     table.columns=[re.sub("total ", "",col) for col in table.columns]
     matches=['test','geo_value']

     new_names = []
@@ -170,7 +169,13 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_
             new_names.append(table.columns[i])
     table.columns=new_names
-    table.columns=[re.sub("other hpiv", "hpiv other",col) for col in table.columns]
+
+    # remove any underscores or spaces from virus names
+    table.columns=[re.sub(" positive","_positive",t) for t in table.columns]
+    table.columns=[re.sub(" tests","_tests",t) for t in table.columns]
+    table.columns=[re.sub(" ","",t) for t in table.columns]
+
+
     table['geo_value'] = [re.sub("^québec$","province of québec",name) for name in table['geo_value']]
     table['geo_value'] = [re.sub("^ontario$","province of ontario",name) for name in table['geo_value']]
@@ -219,10 +224,10 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
     # get the name of the virus for the table to append to column names
     virus_prefix=[]
     if flu:
-        virus_prefix=['flu_a_pct_positive','flu_b_pct_positive']
+        virus_prefix=['flua_pct_positive','flub_pct_positive']
         virus="flu"
-        table.columns=[re.sub("a_pct","flu_a_pct",c) for c in table.columns]
-        table.columns=[re.sub("b_pct","flu_b_pct",c) for c in table.columns]
+        table.columns=[re.sub("a_pct","flua_pct",c) for c in table.columns]
+        table.columns=[re.sub("b_pct","flub_pct",c) for c in table.columns]
     else:
         names=[]
         for j in range(len(table.columns)):
@@ -309,11 +314,14 @@ def get_season_reports(url):
                     tab.select_one('td').decompose()

                 # Replace commas with periods
-                tab2 = re.sub(",",r".",str(tab))
+                if "number" not in caption.text.lower():
+                    tab = re.sub(",",r".",str(tab))
+                else:
+                    tab = re.sub(",","",str(tab))

                 # Read table
                 na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
-                table = pd.read_html(tab2,na_values=na_values)[0].dropna(how="all")
+                table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")

                 # Check for multiline headers
                 if isinstance(table.columns, pd.MultiIndex):
@@ -356,6 +364,10 @@ def get_season_reports(url):
                     table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
                     table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]

+                    table.columns = [re.sub(r"flu a","flua",t) for t in table.columns]
+                    table.columns = [re.sub(r"flu b","flub",t) for t in table.columns]
+                    table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns]
+
                 if "reporting laboratory" in str(table.columns):
                     respiratory_detection_table = create_detections_table(table,modified_date,current_week,current_week_end,season[0])
                     respiratory_detection_table = respiratory_detection_table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
index 7b8c3dc52..7d5bad4fa 100644
--- a/src/acquisition/rvdss/utils.py
+++ b/src/acquisition/rvdss/utils.py
@@ -18,7 +18,7 @@ def abbreviate_virus(full_name):
 def abbreviate_geo(full_name):
     lowercase=full_name.lower()
     keys = (re.escape(k) for k in GEOS.keys())
-    pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b')
+    pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$')

     result = pattern.sub(lambda x: GEOS[x.group()], lowercase)
     return(result)
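Editor's note: the `^...$` anchoring added in the commit above means abbreviate_geo now only rewrites a value that is, in its entirety, one of the GEOS keys; without anchors, short keys like "at" or "pr" would clobber fragments of longer strings. A quick before/after sketch with a toy mapping:

```python
import re

GEOS = {"at": "atlantic", "ontario": "on"}
alternation = '|'.join(re.escape(k) for k in GEOS)

unanchored = re.compile(r'\b(' + alternation + r')\b')
anchored = re.compile(r'^\b(' + alternation + r')\b$')

sub = lambda p, s: p.sub(lambda m: GEOS[m.group()], s)
print(sub(unanchored, "at the ontario lab"))  # 'atlantic the on lab' -- too eager
print(sub(anchored, "at the ontario lab"))    # unchanged: not a whole-string match
print(sub(anchored, "ontario"))               # 'on'
```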

From e8957c368487a22b7d24c4afdfd5ecbf9a35ce3f Mon Sep 17 00:00:00 2001
From: cchuong
Date: Fri, 20 Sep 2024 01:19:42 -0700
Subject: [PATCH 14/33] remove "province of" from geo_values

---
 src/acquisition/rvdss/constants.py | 2 --
 src/acquisition/rvdss/utils.py     | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index 53bdfbbae..d14375dbb 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -26,10 +26,8 @@
     "nova scotia":"ns",
     "new brunswick":"nb",
     "québec":"qc",
-    "province of québec":"qc",
     "quebec":"qc",
     "ontario":"on",
-    "province of ontario":"on",
     "manitoba" : "mb",
     "saskatchewan":"sk",
     "alberta": "ab",
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
index 7d5bad4fa..015a3e47b 100644
--- a/src/acquisition/rvdss/utils.py
+++ b/src/acquisition/rvdss/utils.py
@@ -17,6 +17,8 @@ def abbreviate_virus(full_name):

 def abbreviate_geo(full_name):
     lowercase=full_name.lower()
+    lowercase = re.sub("province of ","",lowercase)
+
     keys = (re.escape(k) for k in GEOS.keys())
     pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$')

From 7720a24bec074ec743b45651e966b836779dbf5a Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:16:48 -0400
Subject: [PATCH 15/33] construct urls automatically

---
 src/acquisition/rvdss/constants.py | 61 +++++++++++++++++-------------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index d14375dbb..d25bb0b8e 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -44,36 +44,45 @@
 }

 REGIONS = ['atlantic','atl','at','province of québec','québec','qc','province of ontario','ontario','on',
-    'prairies', 'pr', "british columbia", 'bc',"territories",'terr']
+    'prairies', 'pr', "british columbia",'bc',"territories",'terr']
 NATION = ["canada","can",'ca']

-DASHBOARD_BASE_URLS_2023=["https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-20/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-06-27/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-04/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-11/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-07-18/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-01/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-08/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-15/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-22/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-08-29/",
-"https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/2024-09-05/"]
-
-HISTORIC_SEASON_URL = ["https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2013-2014.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2015-2016.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2016-2017.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2017-2018.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2019-2020.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2020-2021.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2021-2022.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2022-2023.html",
-"https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2023-2024.html"]
+DASHBOARD_BASE_URL_2023 = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/{date}/"
+DASHBOARD_BASE_URLS_2023 = (
+    DASHBOARD_BASE_URL_2023.format(date = date) for date in
+    (
+        "2024-06-20",
+        "2024-06-27",
+        "2024-07-04",
+        "2024-07-11",
+        "2024-07-18",
+        "2024-08-01",
+        "2024-08-08",
+        "2024-08-15",
+        "2024-08-22",
+        "2024-08-29",
+        "2024-09-05"
+    )
+)

+SEASON_BASE_URL = "https://www.canada.ca"
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
+HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

-SEASON_BASE_URL = "https://www.canada.ca"
+HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
+    (
+        "2013-2014",
+        "2014-2015",
+        "2015-2016",
+        "2016-2017",
+        "2017-2018",
+        "2018-2019",
+        "2019-2020",
+        "2020-2021",
+        "2021-2022",
+        "2022-2023",
+        "2023-2024"
+    )
+)

 LAST_WEEK_OF_YEAR = 35
-

From 59f79bfc3353f83b0c667b00b6f60bc0b62d7e03 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:23:08 -0400
Subject: [PATCH 16/33] comment constants

---
 src/acquisition/rvdss/constants.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index d25bb0b8e..769083394 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -1,3 +1,5 @@
+# The dataset calls the same viruses, provinces, regions (province groups),
+# and country by multiple names. Map each of those to a common abbreviation.
 VIRUSES = {
     "parainfluenza": "hpiv",
     "piv": "hpiv",
@@ -16,7 +18,7 @@
     "coro":"hcov",
     "respiratory syncytial virus":"rsv",
     "influenza":"flu",
-    "sars-cov-2":"sarscov2"
+    "sars-cov-2":"sarscov2",
 }

 GEOS = {
@@ -40,13 +42,15 @@
     "at":"atlantic",
     "atl":"atlantic",
     "pr" :"prairies" ,
-    "terr" :"territories"
+    "terr" :"territories",
     }

+# Regions are groups of provinces that are geographically close together. Some single provinces are reported as their own region (e.g. Québec, Ontario).
 REGIONS = ['atlantic','atl','at','province of québec','québec','qc','province of ontario','ontario','on',
-    'prairies', 'pr', "british columbia",'bc',"territories",'terr']
+    'prairies', 'pr', "british columbia",'bc',"territories",'terr',]
-NATION = ["canada","can",'ca']
+NATION = ["canada","can",'ca',]

+# Construct dashboard and data report URLS.
 DASHBOARD_BASE_URL_2023 = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/{date}/"
 DASHBOARD_BASE_URLS_2023 = (
     DASHBOARD_BASE_URL_2023.format(date = date) for date in
@@ -69,6 +73,8 @@
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
 HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

+# Each URL created here points to a list of all data reports made during that season, e.g.
+# https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html.
 HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
     (
         "2013-2014",

From e70b0e9f478c21bc0dc9706c51edc22b0d95733b Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:33:38 -0400
Subject: [PATCH 17/33] note historic urls don't need to be updated

---
 src/acquisition/rvdss/constants.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index 769083394..7bbb03746 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -73,8 +73,13 @@
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
 HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

-# Each URL created here points to a list of all data reports made during that season, e.g.
+# Each URL created here points to a list of all data reports made during that
+# season, e.g.
 # https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2014-2015.html.
+# The Public Health Agency of Canada site switched in 2024 to reporting
+# disease data in a dashboard with a static URL. Therefore, this collection
+# of URLs does _NOT_ need to be updated. It is used for fetching historical
+# data (for dates on or before June 8, 2024) only.
 HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
     (
         "2013-2014",

From 72d190634b83b23700607e6dc5032258312102aa Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:55:07 -0400
Subject: [PATCH 18/33] be stricter about importing local fns

---
 src/acquisition/rvdss/constants.py      |  5 +++--
 src/acquisition/rvdss/rvdss_historic.py | 12 +++++++++---
 src/acquisition/rvdss/rvdss_update.py   |  9 +++++----
 src/acquisition/rvdss/utils.py          |  4 +++-
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index 7bbb03746..f4a88e0b4 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -51,9 +51,10 @@
 NATION = ["canada","can",'ca',]

 # Construct dashboard and data report URLS.
-DASHBOARD_BASE_URL_2023 = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/archive/{date}/"
+DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
+DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"
 DASHBOARD_BASE_URLS_2023 = (
-    DASHBOARD_BASE_URL_2023.format(date = date) for date in
+    DASHBOARD_W_DATE_URL.format(date = date) for date in
     (
         "2024-06-20",
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index 34ef052c2..98748317c 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -3,11 +3,17 @@
 import regex as re
 import pandas as pd
 from epiweeks import Week
-from datetime import datetime,timedelta
+from datetime import datetime, timedelta
 import math

-from constants import DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
-from utils import abbreviate_virus,abbreviate_geo,create_geo_types,check_date_format,get_revised_data,get_weekly_data
+from delphi.epidata.acquisition.rvdss.constants import (
+    DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL,
+    ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR
+    )
+from delphi.epidata.acquisition.rvdss.utils import (
+    abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
+    get_revised_data, get_weekly_data
+    )

 #%% Functions

 # Report Functions
diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py
index ee061b9b0..6aa62f77e 100644
--- a/src/acquisition/rvdss/rvdss_update.py
+++ b/src/acquisition/rvdss/rvdss_update.py
@@ -1,11 +1,12 @@
 import pandas as pd
 import os

-import utils

-base_url = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
+from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data
+from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL

-weekly_data = utils.get_weekly_data(base_url,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-positive_data = utils.get_revised_data(base_url)
+weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+positive_data = get_revised_data(DASHBOARD_BASE_URL)

 path1 = './respiratory_detections.csv'
 path2 = './positive_tests.csv'
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
index 015a3e47b..7df4012e2 100644
--- a/src/acquisition/rvdss/utils.py
+++ b/src/acquisition/rvdss/utils.py
@@ -6,7 +6,9 @@
 from datetime import datetime
 import math

-from constants import VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR
+from delphi.epidata.acquisition.rvdss.constants import (
+    VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR
+    )
"RVD_UpdateDate.csv" +DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv" + LAST_WEEK_OF_YEAR = 35 diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 6e3905d4b..7df4012e2 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -7,7 +7,8 @@ import math from delphi.epidata.acquisition.rvdss.constants import ( - VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR + VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR, + DASHBOARD_UPDATE_DATE_FILE, DASHBOARD_DATA_FILE ) def abbreviate_virus(full_name): @@ -56,12 +57,12 @@ def get_revised_data(base_url): } # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" + update_date_url = base_url + DASHBOARD_UPDATE_DATE_FILE update_date_url_response = requests.get(update_date_url, headers=headers) update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") # Get update data - url = base_url+"RVD_WeeklyData.csv" + url = base_url+DASHBOARD_DATA_FILE url_response = requests.get(url, headers=headers) df = pd.read_csv(io.StringIO(url_response.text)) From ee3cadfdb896cbdb217f1feaefe067a18ec1fa10 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 23 Sep 2024 18:31:01 -0400 Subject: [PATCH 20/33] move run-the-whole-pipeline code into main() --- src/acquisition/rvdss/rvdss_historic.py | 47 +++++++++++++------------ src/acquisition/rvdss/rvdss_update.py | 40 +++++++++++---------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index 98748317c..1efe54c7a 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -429,29 +429,32 @@ def get_season_reports(url): if len(all_number_tables) != 0: all_number_tables.to_csv(path+"/number_of_detections.csv", index=True) - #%% Scrape each season -[get_season_reports(url) for url in HISTORIC_SEASON_URL] +def main(): + #%% Scrape each season + [get_season_reports(url) for url in HISTORIC_SEASON_URL] - #%% Update the end of the 2023-2024 season with the dashboard data + #%% Update the end of the 2023-2024 season with the dashboard data -# Load old csvs -old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) -old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + # Load old csvs + old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) -for base_url in DASHBOARD_BASE_URLS_2023: - # Get weekly dashboard data - weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - positive_data = get_revised_data(base_url) - - # Check if indices are already present in the old data - # If not, add the new data - if weekly_data.index.isin(old_detection_data.index).any() == False: - old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0) - - if positive_data.index.isin(old_positive_data.index).any() == False: - old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) - -# Overwrite/update csvs 

From ee3cadfdb896cbdb217f1feaefe067a18ec1fa10 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Mon, 23 Sep 2024 18:31:01 -0400
Subject: [PATCH 20/33] move run-the-whole-pipeline code into main()

---
 src/acquisition/rvdss/rvdss_historic.py | 47 +++++++++++++------------
 src/acquisition/rvdss/rvdss_update.py   | 40 +++++++++++----------
 2 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py
index 98748317c..1efe54c7a 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/rvdss_historic.py
@@ -429,29 +429,32 @@ def get_season_reports(url):
     if len(all_number_tables) != 0:
         all_number_tables.to_csv(path+"/number_of_detections.csv", index=True)

-
 #%% Scrape each season
-[get_season_reports(url) for url in HISTORIC_SEASON_URL]
+def main():
+    #%% Scrape each season
+    [get_season_reports(url) for url in HISTORIC_SEASON_URL]

-
 #%% Update the end of the 2023-2024 season with the dashboard data
+    #%% Update the end of the 2023-2024 season with the dashboard data

-# Load old csvs
-old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+    # Load old csvs
+    old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+    old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

-for base_url in DASHBOARD_BASE_URLS_2023:
-    # Get weekly dashboard data
-    weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    positive_data = get_revised_data(base_url)
-
-    # Check if indices are already present in the old data
-    # If not, add the new data
-    if weekly_data.index.isin(old_detection_data.index).any() == False:
-        old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
-
-    if positive_data.index.isin(old_positive_data.index).any() == False:
-        old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
-
-# Overwrite/update csvs
-old_detection_data.to_csv('season_2023_2024/respiratory_detections.csv',index=True)
-old_positive_data.to_csv('season_2023_2024/positive_tests.csv',index=True)
+    for base_url in DASHBOARD_BASE_URLS_2023:
+        # Get weekly dashboard data
+        weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        positive_data = get_revised_data(base_url)
+
+        # Check if indices are already present in the old data
+        # If not, add the new data
+        if weekly_data.index.isin(old_detection_data.index).any() == False:
+            old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
+
+        if positive_data.index.isin(old_positive_data.index).any() == False:
+            old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
+
+    # Overwrite/update csvs
+    old_detection_data.to_csv('season_2023_2024/respiratory_detections.csv',index=True)
+    old_positive_data.to_csv('season_2023_2024/positive_tests.csv',index=True)
+
+if __name__ == '__main__':
+    main()
diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py
index 6aa62f77e..502b867ba 100644
--- a/src/acquisition/rvdss/rvdss_update.py
+++ b/src/acquisition/rvdss/rvdss_update.py
@@ -5,26 +5,28 @@
 from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL

-weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-positive_data = get_revised_data(DASHBOARD_BASE_URL)
+def main():
+    weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+    positive_data = get_revised_data(DASHBOARD_BASE_URL)

-path1 = './respiratory_detections.csv'
-path2 = './positive_tests.csv'
+    path1 = './respiratory_detections.csv'
+    path2 = './positive_tests.csv'

-if os.path.exists(path1)==False:
-    weekly_data.to_csv(path1,index=True)
-else:
-    old_detection_data = pd.read_csv(path1).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    if weekly_data.index.isin(old_detection_data.index).any() == False:
-        old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
-        old_detection_data.to_csv(path1,index=True)
+    if os.path.exists(path1)==False:
+        weekly_data.to_csv(path1,index=True)
+    else:
+        old_detection_data = pd.read_csv(path1).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        if weekly_data.index.isin(old_detection_data.index).any() == False:
+            old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
+            old_detection_data.to_csv(path1,index=True)

-if os.path.exists(path2)==False:
-    positive_data.to_csv(path2,index=True)
-else:
-    old_positive_data = pd.read_csv(path2).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-    if positive_data.index.isin(old_positive_data.index).any() == False:
-        old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
-        old_positive_data.to_csv(path2,index=True)
+    if os.path.exists(path2)==False:
+        positive_data.to_csv(path2,index=True)
+    else:
+        old_positive_data = pd.read_csv(path2).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
+        if positive_data.index.isin(old_positive_data.index).any() == False:
+            old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
+            old_positive_data.to_csv(path2,index=True)

-
\ No newline at end of file
+if __name__ == '__main__':
+    main()
\ No newline at end of file
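Editor's note: both main() functions above guard against re-appending rows by checking the incoming index against what is already on disk. The guard is all-or-nothing: a batch that overlaps the stored index in even one row is skipped entirely, which matters if a week is ever partially ingested. A toy version of the pattern:

```python
import pandas as pd

old = pd.DataFrame({"epiweek": [202401], "value": [1.0]}).set_index("epiweek")
new = pd.DataFrame({"epiweek": [202402], "value": [2.0]}).set_index("epiweek")

# Append only when none of the incoming index values already exist,
# mirroring the `.isin(...).any() == False` check above.
if not new.index.isin(old.index).any():
    old = pd.concat([old, new], axis=0)

print(old.index.tolist())  # [202401, 202402]
```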
Mon, 23 Sep 2024 17:49:46 -0700 Subject: [PATCH 21/33] add code to calculate number of positive tests back in --- src/acquisition/rvdss/rvdss_historic.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index 1efe54c7a..de5ef47a5 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -269,6 +269,16 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu= geo_types = [create_geo_types(g,"lab") for g in table['geo_value']] table.insert(3,"geo_type",geo_types) + # Calculate number of positive tests based on pct_positive and total tests + if flu: + table["flua_positive_tests"] = (table["flua_pct_positive"]/100)*table["flu_tests"] + table["flub_positive_tests"] = (table["flub_pct_positive"]/100)*table["flu_tests"] + + table["flu_positive_tests"] = table["flua_positive_tests"] + table["flub_positive_tests"] + table["flu_pct_positive"] = (table["flu_positive_tests"]/table["flu_tests"])*100 + else: + table[virus+"_positive_tests"] = (table[virus+"_pct_positive"]/100) *table[virus+"_tests"] + table = table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) return(table) From 6bd6e24427ff4c67cf764bd1ee4eb1922770fd81 Mon Sep 17 00:00:00 2001 From: cchuong Date: Mon, 23 Sep 2024 18:04:27 -0700 Subject: [PATCH 22/33] update abbreviate_geo to remove periods and other spelling --- src/acquisition/rvdss/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 7df4012e2..f6902ac35 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -21,6 +21,9 @@ def abbreviate_virus(full_name): def abbreviate_geo(full_name): lowercase=full_name.lower() lowercase = re.sub("province of ","",lowercase) + lowercase=[re.sub("\.|\*","",l) for l in lowercase] + lowercase=[re.sub("/territoires","",l) for l in lowercase] + lowercase=[re.sub("cana","can",l) for l in lowercase] keys = (re.escape(k) for k in GEOS.keys()) pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$') From a7666b81102469a00d9cb6ea3cc4f9efe4dfcb01 Mon Sep 17 00:00:00 2001 From: cchuong Date: Tue, 24 Sep 2024 12:43:26 -0700 Subject: [PATCH 23/33] fix lab name missing province --- src/acquisition/rvdss/rvdss_historic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index de5ef47a5..c5ff59b72 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -148,6 +148,9 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_ table=table.rename(columns={lab_columns:"geo_value"}) table['geo_value']=table['geo_value'].str.lower() + if start_year==2016 and week_number==3: + table["geo_value"]=[re.sub("^province of$","alberta",c) for c in table["geo_value"]] + pat1 = "positive" pat2 = 'pos' combined_pat = '|'.join((pat1, pat2)) From 503165e8cc4e3c0c03168f40bab860fb46ade13a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:09:25 -0400 Subject: [PATCH 24/33] comment historic script --- src/acquisition/rvdss/rvdss_historic.py | 67 ++++++++++++++++--------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index c5ff59b72..cc69ef7ad 100644 --- 
a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -1,3 +1,11 @@ +""" +Script to fetch historical data, before data reporting moved to the dashboard +format. This covers dates from the 2014-2015 season to tne 2023-2024 season. + +This script should not be run in production; it will not fetch newly-posted +data. +""" + from bs4 import BeautifulSoup import requests import regex as re @@ -17,9 +25,10 @@ #%% Functions # Report Functions -def get_report_season(soup): - # Find the url in the page html and get the season +def get_report_season_years(soup): + # Find the url in the page html and get the years included in the season canonical_url = str(soup.find_all('link',rel="canonical")) + # The season range is in YYYY-YYYY format matches = re.search("20[0-9]{2}-20[0-9]{2}",canonical_url) if matches: @@ -27,7 +36,7 @@ def get_report_season(soup): years=season.split("-") return(years) -def append_urls(urls): +def add_https_prefix(urls): # Add https to the urls for i in range(len(urls)): temp_url = urls[i] @@ -39,16 +48,16 @@ def append_urls(urls): urls[i]=re.sub("http:","https:",temp_url) return(urls) -def report_urls(soup): +def construct_weekly_report_urls(soup): # Get links for individual weeks - year= "-".join(get_report_season(soup)) + year= "-".join(get_report_season_years(soup)) links=soup.find_all('a') alternative_url = ALTERNATIVE_SEASON_BASE_URL+year urls = [link.get("href") for link in links if "ending" in str(link) or alternative_url in str(link)] - report_links = append_urls(urls) + report_links = add_https_prefix(urls) return(report_links) def report_weeks(soup): @@ -88,14 +97,6 @@ def get_table_captions(soup): matches = ["period","abbreviation","cumulative", "compared"] #skip historic comparisons and cumulative tables if any(x in caption.text.lower() for x in matches) or caption.has_attr('class') or all(name not in caption.text.lower() for name in table_identifiers): remove_list.append(caption) - - ''' - elif caption.has_attr('class'): - remove_list.append(caption) - - elif all(name not in caption.text.lower() for name in table_identifiers): - remove_list.append(caption) - ''' new_captions = [cap for cap in captions if cap not in remove_list] new_captions = list(set(new_captions)) @@ -103,7 +104,15 @@ def get_table_captions(soup): return(new_captions) def get_modified_dates(soup,week_end_date): - # get the date the report page was modfified + """ + Get the date the report page was modfified + + Reports include both posted dates and modified dates. Fairly often on + historical data reports, posted date falls before the end of the week + being reported on. Then the page is modified later, presumably with + updated full-week data. Therefore, we use the modified date as the issue + date for a given report. + """ meta_tags=soup.find_all("meta",title="W3CDTF") for tag in meta_tags: if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified": @@ -114,7 +123,14 @@ def get_modified_dates(soup,week_end_date): diff_days = (mod_date-week_date).days - # manually create a new modified date if the existing one is too long after the week + # Manually create a new modified date if the existing one is too long after the week. + # Historically, we commonly see data reports being modified ~5 days after + # the end of the week being reported on. In some cases, though, the + # modified date falls a long time (up to a year) after the end of the + # week being reported on. 
We expect that any changes made to the report + # at that point were primarily wording, and not data, changes. So if the + # modified date is NOT within 0-14 days after the end of the week, set + # the issue date to be 5 days after the end of the week. if diff_days > 0 and diff_days < 14: new_modified_date = mod_date else: @@ -126,7 +142,7 @@ def get_modified_dates(soup,week_end_date): return(new_modified_date_string) -def check_duplicate_rows(table): +def deduplicate_rows(table): if table['week'].duplicated().any(): table.columns = [re.sub("canada","can",t) for t in table.columns] duplicated_rows = table[table.duplicated('week',keep=False)] @@ -165,11 +181,14 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_ pat8= r"^ah1n1pdm09" combined_pat3 = '|'.join((pat5, pat6,pat7,pat8)) - table.columns=[re.sub(combined_pat, "positive_tests",col) for col in table.columns] #making naming consistent + # make naming consistent + table.columns=[re.sub(combined_pat, "positive_tests",col) for col in table.columns] table.columns=[re.sub(combined_pat2, "tests",col) for col in table.columns] - table.columns=[re.sub(combined_pat3, r"flu\g<0>",col) for col in table.columns] # add flu as a prefix + # add flu as a prefix + table.columns=[re.sub(combined_pat3, r"flu\g<0>",col) for col in table.columns] table.columns=[re.sub("total ", "",col) for col in table.columns] matches=['test','geo_value'] + new_names = [] for i in range(len(table.columns)): if not any(x in table.columns[i] for x in matches): @@ -223,7 +242,7 @@ def create_number_detections_table(table,modified_date,start_year): return(table) def create_percent_positive_detection_table(table,modified_date,start_year, flu=False,overwrite_weeks=False): - table = check_duplicate_rows(table) + table = deduplicate_rows(table) table.columns=[re.sub(" *%", "_pct_positive",col) for col in table.columns] table.columns = [re.sub(' +', ' ',col) for col in table.columns] table.insert(2,"issue",modified_date) @@ -291,8 +310,8 @@ def get_season_reports(url): soup=BeautifulSoup(page.text,'html.parser') # get season, weeks, urls and week ends - season = get_report_season(soup) - urls=report_urls(soup) + season = get_report_season_years(soup) + urls=construct_weekly_report_urls(soup) weeks= report_weeks(soup) end_dates = [get_report_date(week, season[0]) for week in weeks] @@ -443,10 +462,10 @@ def get_season_reports(url): all_number_tables.to_csv(path+"/number_of_detections.csv", index=True) def main(): - #%% Scrape each season + # Scrape each season. Saves data to CSVs as a side effect. 
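The issue-date heuristic that the comments above describe for get_modified_dates can be summarized in a standalone sketch (the function and variable names here are illustrative, not part of the module):

    from datetime import datetime, timedelta

    def pick_issue_date(date_modified: str, week_end: str) -> str:
        # Use the page's modified date as the issue date only when it falls
        # within two weeks after the end of the reported week; otherwise
        # assume the late edit was wording-only and fall back to five days
        # after the week end
        mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
        week_date = datetime.strptime(week_end, "%Y-%m-%d")
        diff_days = (mod_date - week_date).days
        if 0 < diff_days < 14:
            return str(mod_date.date())
        return str((week_date + timedelta(days=5)).date())

    # pick_issue_date("2018-11-09", "2018-11-03") -> "2018-11-09"
    # pick_issue_date("2019-06-01", "2018-11-03") -> "2018-11-08"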
[get_season_reports(url) for url in HISTORIC_SEASON_URL] - #%% Update the end of the 2023-2024 season with the dashboard data + # Update the end of the 2023-2024 season with the dashboard data # Load old csvs old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) From 256e697717a7d4181ed34d60a9ecd6299bf68907 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:16:50 -0400 Subject: [PATCH 25/33] move output file names to constants --- src/acquisition/rvdss/rvdss_historic.py | 17 +++++++++-------- src/acquisition/rvdss/rvdss_update.py | 11 ++++++++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index cc69ef7ad..ee15825af 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -1,6 +1,6 @@ """ Script to fetch historical data, before data reporting moved to the dashboard -format. This covers dates from the 2014-2015 season to tne 2023-2024 season. +format. This covers dates from the 2014-2015 season to the 2023-2024 season. This script should not be run in production; it will not fetch newly-posted data. @@ -16,7 +16,8 @@ from delphi.epidata.acquisition.rvdss.constants import ( DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, - ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR + ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR, + RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE ) from delphi.epidata.acquisition.rvdss.utils import ( abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format, @@ -454,8 +455,8 @@ def get_season_reports(url): all_number_tables=pd.concat([all_number_tables,number_detections_table]) # write files to csvs - all_respiratory_detection_table.to_csv(path+"/respiratory_detections.csv", index=True) - all_positive_tables.to_csv(path+"/positive_tests.csv", index=True) + all_respiratory_detection_table.to_csv(path+"/" + RESP_COUNTS_OUTPUT_FILE, index=True) + all_positive_tables.to_csv(path+"/" + POSITIVE_TESTS_OUTPUT_FILE, index=True) # Write the number of detections table to csv if it exists (i.e has rows) if len(all_number_tables) != 0: @@ -468,8 +469,8 @@ def main(): # Update the end of the 2023-2024 season with the dashboard data # Load old csvs - old_detection_data = pd.read_csv('season_2023_2024/respiratory_detections.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - old_positive_data = pd.read_csv('season_2023_2024/positive_tests.csv').set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + old_detection_data = pd.read_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + old_positive_data = pd.read_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) for base_url in DASHBOARD_BASE_URLS_2023: # Get weekly dashboard data @@ -485,8 +486,8 @@ def main(): old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) # Overwrite/update csvs - old_detection_data.to_csv('season_2023_2024/respiratory_detections.csv',index=True) - old_positive_data.to_csv('season_2023_2024/positive_tests.csv',index=True) + old_detection_data.to_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE,index=True) + old_positive_data.to_csv('season_2023_2024/' + 
POSITIVE_TESTS_OUTPUT_FILE,index=True) if __name__ == '__main__': main() diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py index 502b867ba..1894dd905 100644 --- a/src/acquisition/rvdss/rvdss_update.py +++ b/src/acquisition/rvdss/rvdss_update.py @@ -1,16 +1,21 @@ +""" +Script to fetch new data, after data reporting moved to the dashboard +format. This covers dates following the 2023-2024 season (exclusive). +""" + import pandas as pd import os from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data -from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL +from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE def main(): weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) positive_data = get_revised_data(DASHBOARD_BASE_URL) - path1 = './respiratory_detections.csv' - path2 = './positive_tests.csv' + path1 = './' + RESP_COUNTS_OUTPUT_FILE + path2 = './' + POSITIVE_TESTS_OUTPUT_FILE if os.path.exists(path1)==False: weekly_data.to_csv(path1,index=True) From cd8308781effbf5fda3b60365b8a71f4f9e3823c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:18:32 -0400 Subject: [PATCH 26/33] replace boolean comparisons with pythonic "not" --- src/acquisition/rvdss/rvdss_historic.py | 16 ++++++++-------- src/acquisition/rvdss/rvdss_update.py | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index ee15825af..918ddbb78 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -75,7 +75,7 @@ def get_report_date(week,start_year,epi=False): epi_week = Week(year, week) - if epi==False: + if not epi: report_date = str(epi_week.enddate()) else: report_date = str(epi_week) @@ -272,7 +272,7 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu= table.columns=names # Remake the weeks column from dates - if overwrite_weeks==True: + if overwrite_weeks: week_ends = [datetime.strptime(date_string, "%Y-%m-%d") for date_string in table['time_value']] table["week"] = [Week.fromdate(d).week for d in week_ends] @@ -444,14 +444,14 @@ def get_season_reports(url): # Check if the indices are already in the season table # If not, add the weeks tables into the season table - if respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any() == False: + if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any(): all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table]) - if combined_positive_tables.index.isin(all_positive_tables.index).any() == False: + if not combined_positive_tables.index.isin(all_positive_tables.index).any(): all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables]) - if number_table_exists == True: - if number_detections_table.index.isin(all_number_tables.index).any() == False: + if number_table_exists: + if not number_detections_table.index.isin(all_number_tables.index).any(): all_number_tables=pd.concat([all_number_tables,number_detections_table]) # write files to csvs @@ -479,10 +479,10 @@ def main(): # Check if indices are already present in the old data # If not, add the new data - if 
weekly_data.index.isin(old_detection_data.index).any() == False:
+        if not weekly_data.index.isin(old_detection_data.index).any():
            old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)

-        if positive_data.index.isin(old_positive_data.index).any() == False:
+        if not positive_data.index.isin(old_positive_data.index).any():
            old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)

        # Overwrite/update csvs
diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py
index 1894dd905..cab8d68bc 100644
--- a/src/acquisition/rvdss/rvdss_update.py
+++ b/src/acquisition/rvdss/rvdss_update.py
@@ -17,19 +17,19 @@ def main():
    path1 = './' + RESP_COUNTS_OUTPUT_FILE
    path2 = './' + POSITIVE_TESTS_OUTPUT_FILE

-    if os.path.exists(path1)==False:
+    if not os.path.exists(path1):
        weekly_data.to_csv(path1,index=True)
    else:
        old_detection_data = pd.read_csv(path1).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-        if weekly_data.index.isin(old_detection_data.index).any() == False:
+        if not weekly_data.index.isin(old_detection_data.index).any():
            old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0)
        old_detection_data.to_csv(path1,index=True)

-    if os.path.exists(path2)==False:
+    if not os.path.exists(path2):
        positive_data.to_csv(path2,index=True)
    else:
        old_positive_data = pd.read_csv(path2).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
-        if positive_data.index.isin(old_positive_data.index).any() == False:
+        if not positive_data.index.isin(old_positive_data.index).any():
            old_positive_data= pd.concat([old_positive_data,positive_data],axis=0)
        old_positive_data.to_csv(path2,index=True)

From 969295bf7fb1b4d97c691567de640097c199f889 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:47:06 -0400
Subject: [PATCH 27/33] actually put csv names in constants

---
 src/acquisition/rvdss/constants.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index d118c7dc6..d14509015 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -100,4 +100,7 @@
 DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
 DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"

+RESP_COUNTS_OUTPUT_FILE = "respiratory_detections.csv"
+POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
+
 LAST_WEEK_OF_YEAR = 35

From 00f3f9a697b08170f04f0356ef5767ad20229d71 Mon Sep 17 00:00:00 2001
From: cchuong
Date: Wed, 2 Oct 2024 07:30:46 -0700
Subject: [PATCH 28/33] break more helper functions and add docstrings

---
 src/acquisition/rvdss/constants.py      |   2 +-
 src/acquisition/rvdss/rvdss_historic.py | 135 ++++++++++++++--------
 src/acquisition/rvdss/utils.py          |   6 +-
 3 files changed, 95 insertions(+), 48 deletions(-)

diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py
index d14509015..b1167bfb8 100644
--- a/src/acquisition/rvdss/constants.py
+++ b/src/acquisition/rvdss/constants.py
@@ -72,7 +72,7 @@
 SEASON_BASE_URL = "https://www.canada.ca"
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"

-HISTORIC_SEASON_REPORTS_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"
+HISTORIC_SEASON_REPORTS_URL = "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

 # Each URL created here points to a list of all data reports made during that
 # season, e.g.
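Since the template above is stored as a relative path at this point in the series (a later patch in this set prepends SEASON_BASE_URL), a short sketch of how the per-season landing-page URLs get filled in, using a year range that appears in this dataset:

    SEASON_BASE_URL = "https://www.canada.ca"
    HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + (
        "/en/public-health/services/surveillance/"
        "respiratory-virus-detections-canada/{year_range}.html"
    )

    # The landing page listing every weekly report of the 2015-2016 season
    url = HISTORIC_SEASON_REPORTS_URL.format(year_range="2015-2016")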
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index 918ddbb78..1ecc67e98 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -27,6 +27,7 @@ # Report Functions def get_report_season_years(soup): + """Get the start year of the season and the year the season ends """ # Find the url in the page html and get the years included in the season canonical_url = str(soup.find_all('link',rel="canonical")) # The season range is in YYYY-YYYY format @@ -38,7 +39,7 @@ def get_report_season_years(soup): return(years) def add_https_prefix(urls): - # Add https to the urls + """ Add https to urls, and changes any http to https""" for i in range(len(urls)): temp_url = urls[i] @@ -50,7 +51,7 @@ def add_https_prefix(urls): return(urls) def construct_weekly_report_urls(soup): - # Get links for individual weeks + """ Construct links for each week in a season""" year= "-".join(get_report_season_years(soup)) links=soup.find_all('a') alternative_url = ALTERNATIVE_SEASON_BASE_URL+year @@ -62,12 +63,21 @@ def construct_weekly_report_urls(soup): return(report_links) def report_weeks(soup): + """ Get a list of all the weeks in a season""" links=soup.find_all('a') full_weeks = [link.text for link in links if "Week" in str(link)] weeks= [int(re.search('Week (.+?) ', week).group(1)) for week in full_weeks] return(weeks) def get_report_date(week,start_year,epi=False): + """ + Get the end date of the current reporting/epiweek + + week - the epidemiological week number + start_year - the year the season starts in + epi - if True, return the date in cdc format (yearweek) + + """ if week < LAST_WEEK_OF_YEAR: year=int(start_year)+1 else: @@ -83,19 +93,30 @@ def get_report_date(week,start_year,epi=False): return(report_date) -def get_table_captions(soup): +def parse_table_captions(soup): + """ + finds all the table captions for the current week so tables can be identified + + The captions from the 'summary' tag require less parsing, but sometimes they + are missing. In that case, use the figure captions + """ captions = soup.findAll('summary') table_identifiers = ["respiratory","number","positive","abbreviation"] + + # For every caption, check if all of the table identifiers are missing. If they are, + # this means the caption is noninformative (i.e just says Figure 1). 
If any of the captions are + # noninformative, use the figure captions as captions if sum([all(name not in cap.text.lower() for name in table_identifiers) for cap in captions]) != 0: figcaptions = soup.findAll('figcaption') captions = captions + figcaptions - + remove_list=[] for i in range(len(captions)): caption = captions[i] matches = ["period","abbreviation","cumulative", "compared"] #skip historic comparisons and cumulative tables + # remove any captions with a class or that are uninformative if any(x in caption.text.lower() for x in matches) or caption.has_attr('class') or all(name not in caption.text.lower() for name in table_identifiers): remove_list.append(caption) @@ -144,8 +165,12 @@ def get_modified_dates(soup,week_end_date): def deduplicate_rows(table): + """ + Sometimes tables have more than one row for the same week + In that case, keep the row that has the highest canada tests + (i.e drop the rows with the lower counts) + """ if table['week'].duplicated().any(): - table.columns = [re.sub("canada","can",t) for t in table.columns] duplicated_rows = table[table.duplicated('week',keep=False)] grouped = duplicated_rows.groupby("week") duplicates_drop = [] @@ -159,14 +184,23 @@ def deduplicate_rows(table): new_table=table return(new_table) - -def create_detections_table(table,modified_date,week_number,week_end_date,start_year): - lab_columns =[col for col in table.columns if 'reporting' in col][0] - table=table.rename(columns={lab_columns:"geo_value"}) - table['geo_value']=table['geo_value'].str.lower() +def add_flu_prefix(flu_subtype): + """ Add the prefix `flu` when only the subtype is reported """ - if start_year==2016 and week_number==3: - table["geo_value"]=[re.sub("^province of$","alberta",c) for c in table["geo_value"]] + pat1 =r"^ah3" + pat2= r"^auns" + pat3= r"^ah1pdm09" + pat4= r"^ah1n1pdm09" + combined_pat = '|'.join((pat1, pat2,pat3,pat4)) + + full_fluname = re.sub(combined_pat, r"flu\g<0>",flu_subtype) + return(full_fluname) + +def make_signal_type_spelling_consistent(signal): + """ + Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling + Also remove total from signal names + """ pat1 = "positive" pat2 = 'pos' @@ -176,18 +210,50 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_ pat4 = 'tested' combined_pat2 = '|'.join((pat3, pat4)) - pat5 =r"^ah3" - pat6= r"^auns" - pat7= r"^ah1pdm09" - pat8= r"^ah1n1pdm09" - combined_pat3 = '|'.join((pat5, pat6,pat7,pat8)) + new_signal = re.sub(combined_pat, "positive_tests",signal) + new_signal = re.sub(combined_pat2, "positive_tests",signal) + new_signal = re.sub("total ", "",signal) + return(new_signal) + +def preprocess_table_columns(table): + """ + Remove characters like . 
or * from columns + Abbreviate the viruses in columns + Change some naming of signals in columns (i.e order of hpiv and other) + Change some naming of locations in columns (i.e at instead of atl) + """ + table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space + table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns + table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods + table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all) + table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] + table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space + table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _ + table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _ + + table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns] + table.columns = [re.sub("canada","can",t) for t in table.columns] + + table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns] + table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses + table.columns = [re.sub(r"flu a","flua",t) for t in table.columns] + table.columns = [re.sub(r"flu b","flub",t) for t in table.columns] + table.columns = [re.sub("flutest","flu test", col) for col in table.columns] + table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns] + + return(table) + +def create_detections_table(table,modified_date,week_number,week_end_date,start_year): + lab_columns =[col for col in table.columns if 'reporting' in col][0] + table=table.rename(columns={lab_columns:"geo_value"}) + table['geo_value']=table['geo_value'].str.lower() + + if start_year==2016 and week_number==3: + table["geo_value"]=[re.sub("^province of$","alberta",c) for c in table["geo_value"]] # make naming consistent - table.columns=[re.sub(combined_pat, "positive_tests",col) for col in table.columns] - table.columns=[re.sub(combined_pat2, "tests",col) for col in table.columns] - # add flu as a prefix - table.columns=[re.sub(combined_pat3, r"flu\g<0>",col) for col in table.columns] - table.columns=[re.sub("total ", "",col) for col in table.columns] + table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns] + table.columns=[add_flu_prefix(col) for col in table.columns] matches=['test','geo_value'] new_names = [] @@ -204,10 +270,6 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_ table.columns=[re.sub(" tests","_tests",t) for t in table.columns] table.columns=[re.sub(" ","",t) for t in table.columns] - - table['geo_value'] = [re.sub("^québec$","province of québec",name) for name in table['geo_value']] - table['geo_value'] = [re.sub("^ontario$","province of ontario",name) for name in table['geo_value']] - table['geo_value'] = [abbreviate_geo(g) for g in table['geo_value']] geo_types = [create_geo_types(g,"lab") for g in table['geo_value']] @@ -310,7 +372,7 @@ def get_season_reports(url): page=requests.get(url) soup=BeautifulSoup(page.text,'html.parser') - # get season, weeks, urls and week ends + # get season, week numbers, urls and week ends season = get_report_season_years(soup) urls=construct_weekly_report_urls(soup) weeks= report_weeks(soup) @@ -334,7 +396,7 @@ def get_season_reports(url): temp_url=urls[week_num] temp_page=requests.get(temp_url) new_soup = BeautifulSoup(temp_page.text, 'html.parser') - captions 
= get_table_captions(new_soup) + captions = parse_table_captions(new_soup) modified_date = get_modified_dates(new_soup,current_week_end) positive_tables=[] @@ -390,22 +452,7 @@ def get_season_reports(url): table.loc[table['week'] == 35, 'week end'] = "2022-09-03" # Rename columns - table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space - table.columns = [re.sub("flutest","flu test", col) for col in table.columns] - table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns - table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods - table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all) - table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] # remove (all) - table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns] # remove (all) - table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses - table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space - table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _ - table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _ - table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns] - - table.columns = [re.sub(r"flu a","flua",t) for t in table.columns] - table.columns = [re.sub(r"flu b","flub",t) for t in table.columns] - table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns] + table= preprocess_table_columns(table) if "reporting laboratory" in str(table.columns): respiratory_detection_table = create_detections_table(table,modified_date,current_week,current_week_end,season[0]) diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index f6902ac35..481c47804 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -21,9 +21,9 @@ def abbreviate_virus(full_name): def abbreviate_geo(full_name): lowercase=full_name.lower() lowercase = re.sub("province of ","",lowercase) - lowercase=[re.sub("\.|\*","",l) for l in lowercase] - lowercase=[re.sub("/territoires","",l) for l in lowercase] - lowercase=[re.sub("cana","can",l) for l in lowercase] + lowercase=re.sub("\.|\*","",lowercase) + lowercase=re.sub("/territoires","",lowercase) + lowercase=re.sub("^cana$","can",lowercase) keys = (re.escape(k) for k in GEOS.keys()) pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$') From ecca542e9225bccbae85253da467a63d06e08897 Mon Sep 17 00:00:00 2001 From: cchuong Date: Fri, 4 Oct 2024 15:08:41 -0700 Subject: [PATCH 29/33] add more comments --- src/acquisition/rvdss/constants.py | 4 +- src/acquisition/rvdss/rvdss_historic.py | 75 +++++++++++++++++++------ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py index b1167bfb8..47bc6f9f9 100644 --- a/src/acquisition/rvdss/constants.py +++ b/src/acquisition/rvdss/constants.py @@ -53,7 +53,7 @@ # Construct dashboard and data report URLS. 
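The abbreviate_geo cleanup in the utils.py hunk above feeds the cleaned string into a single alternation pattern built from the GEOS mapping. A minimal sketch of that dictionary-driven substitution (toy two-entry mapping; the full GEOS table lives in constants.py):

    import re

    GEOS = {"british columbia": "bc", "canada": "ca"}

    # Build one regex that matches any key as the whole string, then swap in
    # the abbreviation; strings with no match pass through unchanged
    keys = (re.escape(k) for k in GEOS.keys())
    pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$')

    result = pattern.sub(lambda m: GEOS[m.group()], "british columbia")
    # result == "bc"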
DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/" DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/" -DASHBOARD_BASE_URLS_2023 = ( +DASHBOARD_BASE_URLS_2023_2024_SEASON = ( DASHBOARD_W_DATE_URL.format(date = date) for date in ( "2024-06-20", @@ -72,7 +72,7 @@ SEASON_BASE_URL = "https://www.canada.ca" ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" -HISTORIC_SEASON_REPORTS_URL = "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html" +HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL+"/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html" # Each URL created here points to a list of all data reports made during that # season, e.g. diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index 1ecc67e98..090da7b50 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -15,7 +15,7 @@ import math from delphi.epidata.acquisition.rvdss.constants import ( - DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL, + DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL, ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR, RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE ) @@ -91,9 +91,8 @@ def get_report_date(week,start_year,epi=False): report_date = str(epi_week) return(report_date) - -def parse_table_captions(soup): +def extract_captions_of_interest(soup): """ finds all the table captions for the current week so tables can be identified @@ -369,6 +368,8 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu= return(table) def get_season_reports(url): + # From the url, go to the main landing page for a season + # which contains all the links to each week in the season page=requests.get(url) soup=BeautifulSoup(page.text,'html.parser') @@ -387,7 +388,9 @@ def get_season_reports(url): current_week = weeks[week_num] current_week_end = end_dates[week_num] - # Skip empty pages + # In the 2019=2020 season, the webpages for weeks 5 and 47 only have + # the abbreviations table and the headers for the respiratory detections + # table, so they are effectively empty, and skipped if season[0] == '2019': if current_week == 5 or current_week == 47: continue @@ -396,7 +399,7 @@ def get_season_reports(url): temp_url=urls[week_num] temp_page=requests.get(temp_url) new_soup = BeautifulSoup(temp_page.text, 'html.parser') - captions = parse_table_captions(new_soup) + captions = extract_captions_of_interest(new_soup) modified_date = get_modified_dates(new_soup,current_week_end) positive_tables=[] @@ -405,55 +408,87 @@ def get_season_reports(url): caption=captions[i] tab = caption.find_next('table') - # Remove footers from tables + # Remove footers from tables so the text isn't read in as a table row if tab.find('tfoot'): tab.tfoot.decompose() - # Delete duplicate entry from week 35 of the 2019-2020 season + # In the positive adenovirus table in week 35 of the 2019-2020 season + # The week number has been duplicated, which makes all the entries in the table + # are one column to the right of where they should be. 
To fix this, the
+            # entry in the table (which is the first "td" element in the html) is deleted
             if season[0] == '2019' and current_week == 35:
                 if "Positive Adenovirus" in caption.text:
                     tab.select_one('td').decompose()

             # Replace commas with periods
+            # Some "number of detections" tables have numbers with commas (e.g. 1,000);
+            # in that case the commas must be deleted. Otherwise commas are turned into
+            # periods, because some tables use commas as decimal points
             if "number" not in caption.text.lower():
                 tab = re.sub(",",r".",str(tab))
             else:
                 tab = re.sub(",","",str(tab))

-            # Read table
+            # Read table, coding all the abbreviations for missing data into NA
+            # Also use dropna because removing footers causes the html to have an empty row
             na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
             table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")

             # Check for multiline headers
+            # If there are any, combine them into a single line header
             if isinstance(table.columns, pd.MultiIndex):
                 table.columns = [c[0] + " " + c[1] if c[0] != c[1] else c[0] for c in table.columns]

             # Make column names lowercase
             table.columns=table.columns.str.lower()

+            # One-off edge cases where tables need to be manually adjusted because
+            # they will cause errors otherwise
             if season[0] == '2017':
                 if current_week == 35 and "entero" in caption.text.lower():
-                    # Remove french from headers in week 35 for the entero table
+                    # The positive enterovirus table in week 35 of the 2017-2018 season has French
+                    # in the headers, so the French needs to be removed
                     table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests', 'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests', 'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests', 'entero/rhino%.5']
                 elif current_week == 35 and "adeno" in caption.text.lower():
-                    # Remove > from column name
+                    # In week 35 of the 2017-2018 season, the positive adenovirus table has ">week end"
+                    # instead of "week end", so remove > from the column
                     table = table.rename(columns={'>week end':"week end"})
                 elif current_week == 47 and "rsv" in caption.text.lower():
-                    # fix date written as 201-11-25
+                    # In week 47 of the 2017-2018 season, a date is written as 201-11-25,
+                    # instead of 2017-11-25
                     table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
             elif season[0] == '2015' and current_week == 41:
-                # Fix date written m-d-y not d-m-y
+                # In week 41 of the 2015-2016 season, a date is written in m-d-y format instead of d-m-y
                 table=table.replace("10-17-2015","17-10-2015",regex=True)
             elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
-                # fix date written as 022-09-03
+                # In week 11 of the 2022-2023 season, in the positive hmpv table,
+                # a date is written as 022-09-03, instead of 2022-09-03
                 table.loc[table['week'] == 35, 'week end'] = "2022-09-03"

             # Rename columns
             table= preprocess_table_columns(table)

+            # If "reporting laboratory" is one of the columns of the table, the table must be
+            # the "Respiratory virus detections" table for a given week
+            # this is the lab-level table that has weekly positive tests for each virus, with no revisions,
+            # and each row represents a lab
+
+            # If "number" is in the table caption, the table must be the
+            # "Number of positive respiratory detections" table, for a given week
+            # this is a national-level table, reporting the number of detections for each virus;
+            # this table has revisions, so each row is a week in the season, with weeks going from the
+            # start of the season up to and including the current week
+
+            # If "positive" is in the table caption, the table must be one of the
+            # "Positive [virus] Tests (%)" tables, for a given week
+            # This is a region-level table, reporting the total tests and percent positive tests for each virus;
+            # this table has revisions, so each row is a week in the season, with weeks going from the
+            # start of the season up to and including the current week
+            # The columns carry the region information (e.g. "pr tests" holds the tests for the prairies)
+
             if "reporting laboratory" in str(table.columns):
                 respiratory_detection_table = create_detections_table(table,modified_date,current_week,current_week_end,season[0])
                 respiratory_detection_table = respiratory_detection_table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
@@ -465,9 +500,13 @@
                 flu = " influenza" in caption.text.lower()

                 # tables are missing week 53
-                if season[0]=="2014" and current_week==2:
-                    overwrite_weeks=True
-                elif season[0]=="2014" and current_week==3:
+                # In the 2014-2015 season the year ends at week 53 before starting at week 1 again.
+                # The reports for weeks 53, 2 and 3 skip week 53 in the positive detection tables,
+                # going from 52 to 1, so the week numbers following 52 are 1 larger than they should be;
+                # fix this by overwriting the week number columns
+
+                missing_week_53 = [53,2,3]
+                if season[0]=="2014" and current_week in missing_week_53:
                     overwrite_weeks=True
                 else:
                     overwrite_weeks=False
@@ -491,6 +530,8 @@
         # Check if the indices are already in the season table
         # If not, add the weeks tables into the season table
+
+        # check for deduplication pandas
         if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any():
             all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table])

         if not combined_positive_tables.index.isin(all_positive_tables.index).any():
             all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables])
@@ -519,7 +560,7 @@
     old_detection_data = pd.read_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
     old_positive_data = pd.read_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

-    for base_url in DASHBOARD_BASE_URLS_2023:
+    for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON:
         # Get weekly dashboard data
         weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
         positive_data = get_revised_data(base_url)

From 31ec961282db07f4a39068055496e58badc6932b Mon Sep 17 00:00:00 2001
From: cchuong
Date: Thu, 10 Oct 2024 12:32:59 -0700
Subject: [PATCH 30/33] calculate update dates in a new function

---
 src/acquisition/rvdss/rvdss_update.py | 11 ++++++++---
 src/acquisition/rvdss/utils.py        | 25 +++++++------------------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py
index cab8d68bc..7aed18974 100644
--- a/src/acquisition/rvdss/rvdss_update.py
+++ b/src/acquisition/rvdss/rvdss_update.py
@@ -6,13 +6,18 @@
 import pandas as pd
 import os

-from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data
+from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date
 from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE

 def main():
-    weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024).set_index(['epiweek', 'time_value', 'issue',
'geo_type', 'geo_value']) - positive_data = get_revised_data(DASHBOARD_BASE_URL) + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + update_date = get_dashboard_update_date(DASHBOARD_BASE_URL,headers) + weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024,headers,update_date).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + positive_data = get_revised_data(DASHBOARD_BASE_URL,headers,update_date) path1 = './' + RESP_COUNTS_OUTPUT_FILE path2 = './' + POSITIVE_TESTS_OUTPUT_FILE diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 481c47804..4d98ce198 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -54,16 +54,14 @@ def check_date_format(date_string): return(new_date) -def get_revised_data(base_url): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - +def get_dashboard_update_date(base_url,headers): # Get update date update_date_url = base_url + DASHBOARD_UPDATE_DATE_FILE update_date_url_response = requests.get(update_date_url, headers=headers) update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") + return(update_date) +def get_revised_data(base_url,headers,update_date): # Get update data url = base_url+DASHBOARD_DATA_FILE @@ -80,7 +78,7 @@ def get_revised_data(base_url): df['geo_type'] = [create_geo_types(g,"province") for g in df['geo_value']] df.insert(1,"issue",update_date) - df=df.drop(["weekorder","region","year","week"],axis=1) + #df=df.drop(["weekorder","region","year","week"],axis=1) df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'], columns="virus",values=['tests','percentpositive','positivetests']) @@ -96,16 +94,7 @@ def get_revised_data(base_url): return(df) -def get_weekly_data(base_url,start_year): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - # Get update date - update_date_url = base_url + "RVD_UpdateDate.csv" - update_date_url_response = requests.get(update_date_url, headers=headers) - update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") - +def get_weekly_data(base_url,start_year,headers,update_date): # Get current week and year summary_url = base_url + "RVD_SummaryText.csv" summary_url_response = requests.get(summary_url, headers=headers) @@ -145,7 +134,7 @@ def get_weekly_data(base_url,start_year): df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] - if df_weekly.columns.isin(["weekorder","date","week"]).all(): - df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) + # if df_weekly.columns.isin(["weekorder","date","week"]).all(): + # df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) return(df_weekly) \ No newline at end of file From 0be5f08528bd3532cb567feed0b6ac57875955ff Mon Sep 17 00:00:00 2001 From: cchuong Date: Sat, 12 Oct 2024 17:18:03 -0700 Subject: [PATCH 31/33] combine different spellings of labs --- src/acquisition/rvdss/constants.py | 1 + src/acquisition/rvdss/utils.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/acquisition/rvdss/constants.py 
b/src/acquisition/rvdss/constants.py index 47bc6f9f9..6c864db46 100644 --- a/src/acquisition/rvdss/constants.py +++ b/src/acquisition/rvdss/constants.py @@ -43,6 +43,7 @@ "atl":"atlantic", "pr" :"prairies" , "terr" :"territories", + "uhn sinai hospital":"uhn mount sinai hospital" } # Regions are groups of provinces that are geographically close together. Some single provinces are reported as their own region (e.g. Québec, Ontario). diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 4d98ce198..24a2b8337 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -5,6 +5,8 @@ from epiweeks import Week from datetime import datetime import math +from unidecode import unidecode +import string from delphi.epidata.acquisition.rvdss.constants import ( VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR, @@ -24,11 +26,19 @@ def abbreviate_geo(full_name): lowercase=re.sub("\.|\*","",lowercase) lowercase=re.sub("/territoires","",lowercase) lowercase=re.sub("^cana$","can",lowercase) + lowercase =lowercase.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation),'.'+"'")) + lowercase=re.sub(' +', ' ', lowercase) + + new_name=unidecode(lowercase) + new_name=re.sub(' +', ' ', new_name) keys = (re.escape(k) for k in GEOS.keys()) pattern = re.compile(r'^\b(' + '|'.join(keys) + r')\b$') - result = pattern.sub(lambda x: GEOS[x.group()], lowercase) + result = pattern.sub(lambda x: GEOS[x.group()], new_name) + + if result == new_name: + result = lowercase return(result) def create_geo_types(geo,default_geo): From 56966360511b9d7a24cf8825e6aa4f7608a01324 Mon Sep 17 00:00:00 2001 From: cchuong Date: Sat, 12 Oct 2024 18:27:42 -0700 Subject: [PATCH 32/33] change slash to underscore in constants and move more processing code into seperate function --- src/acquisition/rvdss/constants.py | 6 +++--- src/acquisition/rvdss/rvdss_historic.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py index 6c864db46..94a30bf04 100644 --- a/src/acquisition/rvdss/constants.py +++ b/src/acquisition/rvdss/constants.py @@ -7,12 +7,12 @@ "adenovirus": "adv", "adeno": "adv", "human metapneumovirus": "hmpv", - "enterovirus/rhinovirus": "evrv", + "enterovirus_rhinovirus": "evrv", "rhinovirus": "evrv", "rhv": "evrv", - "entero/rhino": "evrv", + "entero_rhino": "evrv", "rhino":"evrv", - "ev/rv":"evrv", + "ev_rv":"evrv", "coronavirus":"hcov", "coron":"hcov", "coro":"hcov", diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/rvdss_historic.py index 090da7b50..ee22f2eed 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/rvdss_historic.py @@ -210,8 +210,9 @@ def make_signal_type_spelling_consistent(signal): combined_pat2 = '|'.join((pat3, pat4)) new_signal = re.sub(combined_pat, "positive_tests",signal) - new_signal = re.sub(combined_pat2, "positive_tests",signal) - new_signal = re.sub("total ", "",signal) + new_signal = re.sub(combined_pat2, "tests",new_signal) + new_signal =re.sub(" *%", "_pct_positive",new_signal) + new_signal = re.sub("total ", "",new_signal) return(new_signal) def preprocess_table_columns(table): @@ -240,6 +241,7 @@ def preprocess_table_columns(table): table.columns = [re.sub("flutest","flu test", col) for col in table.columns] table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns] + table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns] return(table) def 
create_detections_table(table,modified_date,week_number,week_end_date,start_year): @@ -251,9 +253,8 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_ table["geo_value"]=[re.sub("^province of$","alberta",c) for c in table["geo_value"]] # make naming consistent - table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns] table.columns=[add_flu_prefix(col) for col in table.columns] - matches=['test','geo_value'] + matches=['test','geo_value','positive'] new_names = [] for i in range(len(table.columns)): @@ -305,7 +306,6 @@ def create_number_detections_table(table,modified_date,start_year): def create_percent_positive_detection_table(table,modified_date,start_year, flu=False,overwrite_weeks=False): table = deduplicate_rows(table) - table.columns=[re.sub(" *%", "_pct_positive",col) for col in table.columns] table.columns = [re.sub(' +', ' ',col) for col in table.columns] table.insert(2,"issue",modified_date) table=table.rename(columns={'week end':"time_value"}) From 30f3df65b0e168757833a62929a30dd219f5ae1c Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:35:10 -0500 Subject: [PATCH 33/33] rvdss interface and new fn layout so current/historical data can be easily fetched (#1551) * add basic sql tables -- needs update with real col names * rename files * add main fn with CLI; remove date range params in package frontend fn stubs * start filling out historical fn stubs * rest of new fn layout. adds CLI * dashboard results can be stored directly in list in fetch_historical_dashboard_data * Add in archived dashboards, and calculate start year from data * address todos and fix historical fetching * Change misspelled CB to BC * Update imports --------- Co-authored-by: cchuong --- src/acquisition/rvdss/constants.py | 18 +- src/acquisition/rvdss/database.py | 121 +++++++++++++ .../{rvdss_historic.py => pull_historic.py} | 158 ++++++----------- src/acquisition/rvdss/run.py | 128 ++++++++++++++ src/acquisition/rvdss/rvdss_update.py | 42 ----- src/acquisition/rvdss/utils.py | 160 ++++++++++++++---- src/ddl/rvdss.sql | 49 ++++++ 7 files changed, 488 insertions(+), 188 deletions(-) create mode 100644 src/acquisition/rvdss/database.py rename src/acquisition/rvdss/{rvdss_historic.py => pull_historic.py} (79%) create mode 100644 src/acquisition/rvdss/run.py delete mode 100644 src/acquisition/rvdss/rvdss_update.py create mode 100644 src/ddl/rvdss.sql diff --git a/src/acquisition/rvdss/constants.py b/src/acquisition/rvdss/constants.py index 94a30bf04..f06f1d5e2 100644 --- a/src/acquisition/rvdss/constants.py +++ b/src/acquisition/rvdss/constants.py @@ -1,3 +1,5 @@ +from datetime import datetime + # The dataset calls the same viruses, provinces, regions (province groups), # and country by multiple names. Map each of those to a common abbreviation. VIRUSES = { @@ -34,7 +36,7 @@ "saskatchewan":"sk", "alberta": "ab", "british columbia" :"bc", - "yukon" : "yk", + "yukon" : "yt", "northwest territories" : "nt", "nunavut" : "nu", "canada":"ca", @@ -54,6 +56,8 @@ # Construct dashboard and data report URLS. 
DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/" DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/" + +# May not need this since we write a function for this in pull_historic DASHBOARD_BASE_URLS_2023_2024_SEASON = ( DASHBOARD_W_DATE_URL.format(date = date) for date in ( @@ -74,6 +78,7 @@ SEASON_BASE_URL = "https://www.canada.ca" ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/" HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL+"/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html" +DASHBOARD_ARCHIVED_DATES_URL= "https://health-infobase.canada.ca/src/js/respiratory-virus-detections/ArchiveData.json" # Each URL created here points to a list of all data reports made during that # season, e.g. @@ -82,7 +87,7 @@ # disease data in a dashboard with a static URL. Therefore, this collection # of URLs does _NOT_ need to be updated. It is used for fetching historical # data (for dates on or before June 8, 2024) only. -HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in +HISTORIC_SEASON_URLS = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in ( "2013-2014", "2014-2015", @@ -101,7 +106,12 @@ DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv" DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv" -RESP_COUNTS_OUTPUT_FILE = "respiratory_detections.csv" + +RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv" POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv" +COUNTS_OUTPUT_FILE = "number_of_detections.csv" + +FIRST_WEEK_OF_YEAR = 35 -LAST_WEEK_OF_YEAR = 35 +UPDATE_DATES_FILE = "update_dates.txt" +NOW = datetime.now() diff --git a/src/acquisition/rvdss/database.py b/src/acquisition/rvdss/database.py new file mode 100644 index 000000000..4e1ea1c87 --- /dev/null +++ b/src/acquisition/rvdss/database.py @@ -0,0 +1,121 @@ +""" +=============== +=== Purpose === +=============== + +Stores data provided by rvdss Corp., which contains flu lab test results. +See: rvdss.py + + +======================= +=== Data Dictionary === +======================= + +`rvdss` is the table where rvdss data is stored. 
++----------+-------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+-------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| location | varchar(8) | NO | MUL | NULL | | +| epiweek | int(11) | NO | MUL | NULL | | +| value | float | NO | | NULL | | ++----------+-------------+------+-----+---------+----------------+ +id: unique identifier for each record +location: hhs1-10 +epiweek: the epiweek during which the queries were executed +value: number of total test records per facility, within each epiweek + +================= +=== Changelog === +================= +2017-12-14: + * add "need update" check + +2017-12-02: + * original version +""" + +# standard library +import argparse + +# third party +import mysql.connector + +# first party +from delphi.epidata.acquisition.rvdss import rvdss +import delphi.operations.secrets as secrets +from delphi.utils.epidate import EpiDate +import delphi.utils.epiweek as flu +from delphi.utils.geo.locations import Locations + +LOCATIONS = Locations.hhs_list +DATAPATH = "/home/automation/rvdss_data" + + +def update(locations, first=None, last=None, force_update=False, load_email=True): + # download and prepare data first + qd = rvdss.rvdssData(DATAPATH, load_email) + if not qd.need_update and not force_update: + print("Data not updated, nothing needs change.") + return + + qd_data = qd.load_csv() + qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4) + qd_ts = rvdss.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last) + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `rvdss`") + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `rvdss`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...") + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check rvdss for new and/or revised data + sql = """ + INSERT INTO + `rvdss` (`location`, `epiweek`, `value`) + VALUES + (%s, %s, %s) + ON DUPLICATE KEY UPDATE + `value` = %s + """ + + total_rows = 0 + + for location in locations: + if location not in qd_ts: + continue + ews = sorted(qd_ts[location].keys()) + num_missing = 0 + for ew in ews: + v = qd_ts[location][ew] + sql_data = (location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + if num_missing > 0: + print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)") + + # keep track of how many rows were added + rows_after = get_num_rows() + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") + + # cleanup + cur.close() + cnx.commit() + cnx.close() diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/pull_historic.py similarity index 79% rename from src/acquisition/rvdss/rvdss_historic.py rename to src/acquisition/rvdss/pull_historic.py index ee22f2eed..82ff48910 100644 --- a/src/acquisition/rvdss/rvdss_historic.py +++ b/src/acquisition/rvdss/pull_historic.py @@ -14,14 +14,15 @@ from datetime import datetime, timedelta import 
diff --git a/src/acquisition/rvdss/rvdss_historic.py b/src/acquisition/rvdss/pull_historic.py
similarity index 79%
rename from src/acquisition/rvdss/rvdss_historic.py
rename to src/acquisition/rvdss/pull_historic.py
index ee22f2eed..82ff48910 100644
--- a/src/acquisition/rvdss/rvdss_historic.py
+++ b/src/acquisition/rvdss/pull_historic.py
@@ -14,14 +14,15 @@
 from datetime import datetime, timedelta
 import math
-from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL,
-    ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
-    RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
+from constants import (
+    HISTORIC_SEASON_URLS,
+    ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, FIRST_WEEK_OF_YEAR,
+    DASHBOARD_ARCHIVED_DATES_URL,
+    DASHBOARD_BASE_URL
 )
-from delphi.epidata.acquisition.rvdss.utils import (
+from utils import (
     abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
-    get_revised_data, get_weekly_data
+    fetch_dashboard_data, preprocess_table_columns, add_flu_prefix
 )

 #%% Functions
@@ -78,7 +79,7 @@ def get_report_date(week,start_year,epi=False):
        epi - if True, return the date in cdc format (yearweek)
     """
-    if week < LAST_WEEK_OF_YEAR:
+    if week < FIRST_WEEK_OF_YEAR:
         year=int(start_year)+1
     else:
         year=int(start_year)
@@ -137,9 +138,9 @@ def get_modified_dates(soup,week_end_date):
     meta_tags=soup.find_all("meta",title="W3CDTF")
     for tag in meta_tags:
         if tag.get("name", None) == "dcterms.modified" or tag.get("property", None) == "dcterms.modified":
-            modified_date = tag.get("content", None)
+            date_modified = tag.get("content", None)

-    mod_date = datetime.strptime(modified_date, "%Y-%m-%d")
+    mod_date = datetime.strptime(date_modified, "%Y-%m-%d")
     week_date = datetime.strptime(week_end_date, "%Y-%m-%d")

     diff_days = (mod_date-week_date).days
@@ -183,65 +184,13 @@ def deduplicate_rows(table):
         new_table=table
     return(new_table)

-def add_flu_prefix(flu_subtype):
-    """ Add the prefix `flu` when only the subtype is reported """
+def drop_ah1_columns(table):
+    h1n1_column_exists = any([re.search("h1n1",c) for c in table.columns])
+    ah1_column_exists = any([re.search(r"ah1\b",c) for c in table.columns])
-    pat1 =r"^ah3"
-    pat2= r"^auns"
-    pat3= r"^ah1pdm09"
-    pat4= r"^ah1n1pdm09"
-    combined_pat = '|'.join((pat1, pat2,pat3,pat4))
-
-    full_fluname = re.sub(combined_pat, r"flu\g<0>",flu_subtype)
-    return(full_fluname)
-
-def make_signal_type_spelling_consistent(signal):
-    """
-    Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling
-    Also remove total from signal names
-    """
-
-    pat1 = "positive"
-    pat2 = 'pos'
-    combined_pat = '|'.join((pat1, pat2))
-
-    pat3 = r"test\b"
-    pat4 = 'tested'
-    combined_pat2 = '|'.join((pat3, pat4))
-
-    new_signal = re.sub(combined_pat, "positive_tests",signal)
-    new_signal = re.sub(combined_pat2, "tests",new_signal)
-    new_signal =re.sub(" *%", "_pct_positive",new_signal)
-    new_signal = re.sub("total ", "",new_signal)
-    return(new_signal)
-
-def preprocess_table_columns(table):
-    """
-    Remove characters like .
or * from columns - Abbreviate the viruses in columns - Change some naming of signals in columns (i.e order of hpiv and other) - Change some naming of locations in columns (i.e at instead of atl) - """ - table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space - table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns - table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods - table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all) - table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] - table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space - table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _ - table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _ - - table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns] - table.columns = [re.sub("canada","can",t) for t in table.columns] - - table.columns =[re.sub(r"h1n1 2009 |h1n12009", "ah1n1pdm09", s)for s in table.columns] - table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses - table.columns = [re.sub(r"flu a","flua",t) for t in table.columns] - table.columns = [re.sub(r"flu b","flub",t) for t in table.columns] - table.columns = [re.sub("flutest","flu test", col) for col in table.columns] - table.columns = [re.sub(r"other hpiv","hpivother",t) for t in table.columns] - - table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns] + if ah1_column_exists and h1n1_column_exists: + column_name_to_drop = list(table.filter(regex=r'ah1\b')) + table.drop(columns = column_name_to_drop,inplace=True) return(table) def create_detections_table(table,modified_date,week_number,week_end_date,start_year): @@ -367,7 +316,7 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu= return(table) -def get_season_reports(url): +def fetch_one_season_from_report(url): # From the url, go to the main landing page for a season # which contains all the links to each week in the season page=requests.get(url) @@ -382,13 +331,13 @@ def get_season_reports(url): # create tables to hold all the data for the season all_positive_tables=pd.DataFrame() all_number_tables=pd.DataFrame() - all_respiratory_detection_table=pd.DataFrame() + all_respiratory_detection_tables=pd.DataFrame() for week_num in range(len(urls)): current_week = weeks[week_num] current_week_end = end_dates[week_num] - # In the 2019=2020 season, the webpages for weeks 5 and 47 only have + # In the 2019-2020 season, the webpages for weeks 5 and 47 only have # the abbreviations table and the headers for the respiratory detections # table, so they are effectively empty, and skipped if season[0] == '2019': @@ -399,6 +348,7 @@ def get_season_reports(url): temp_url=urls[week_num] temp_page=requests.get(temp_url) new_soup = BeautifulSoup(temp_page.text, 'html.parser') + captions = extract_captions_of_interest(new_soup) modified_date = get_modified_dates(new_soup,current_week_end) @@ -431,7 +381,7 @@ def get_season_reports(url): # Read table, coding all the abbreviations for missing data into NA # Also use dropna because removing footers causes the html to have an empty row - na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"] + na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"] table = 
pd.read_html(tab,na_values=na_values)[0].dropna(how="all") # Check for multiline headers @@ -468,6 +418,9 @@ def get_season_reports(url): # a date is written as 022-09-03, instead of 2022-09-03 table.loc[table['week'] == 35, 'week end'] = "2022-09-03" + # check if both ah1 and h1n1 are given. If so drop one since they are the same virus and ah1 is always empty + table = drop_ah1_columns(table) + # Rename columns table= preprocess_table_columns(table) @@ -523,17 +476,17 @@ def get_season_reports(url): positive_tables.append(pos_table) # create path to save files - path = "season_" + season[0]+"_"+season[1] + #path = "season_" + season[0]+"_"+season[1] # combine all the positive tables - combined_positive_tables=pd.concat(positive_tables,axis=1) + combined_positive_tables =pd.concat(positive_tables,axis=1) # Check if the indices are already in the season table # If not, add the weeks tables into the season table # check for deduplication pandas - if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any(): - all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table]) + if not respiratory_detection_table.index.isin(all_respiratory_detection_tables.index).any(): + all_respiratory_detection_tables= pd.concat([all_respiratory_detection_tables,respiratory_detection_table]) if not combined_positive_tables.index.isin(all_positive_tables.index).any(): all_positive_tables=pd.concat([all_positive_tables,combined_positive_tables]) @@ -542,40 +495,33 @@ def get_season_reports(url): if not number_detections_table.index.isin(all_number_tables.index).any(): all_number_tables=pd.concat([all_number_tables,number_detections_table]) - # write files to csvs - all_respiratory_detection_table.to_csv(path+"/" + RESP_COUNTS_OUTPUT_FILE, index=True) - all_positive_tables.to_csv(path+"/" + POSITIVE_TESTS_OUTPUT_FILE, index=True) - - # Write the number of detections table to csv if it exists (i.e has rows) - if len(all_number_tables) != 0: - all_number_tables.to_csv(path+"/number_of_detections.csv", index=True) - -def main(): - # Scrape each season. Saves data to CSVs as a side effect. - [get_season_reports(url) for url in HISTORIC_SEASON_URL] - - # Update the end of the 2023-2024 season with the dashboard data + return { + "respiratory_detection": all_respiratory_detection_tables, + "positive": all_positive_tables, + "count": all_number_tables, + } - # Load old csvs - old_detection_data = pd.read_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - old_positive_data = pd.read_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) +def fetch_archived_dashboard_dates(archive_url): + r=requests.get(archive_url) + values=r.json() + data=pd.json_normalize(values) + english_data = data[data["lang"]=="en"] + + archived_dates=english_data['date'].to_list() + return(archived_dates) - for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON: - # Get weekly dashboard data - weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - positive_data = get_revised_data(base_url) - # Check if indices are already present in the old data - # If not, add the new data - if not weekly_data.index.isin(old_detection_data.index).any(): - old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0) +def fetch_report_data(): + # Scrape each season. 
+ dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS] - if not positive_data.index.isin(old_positive_data.index).any(): - old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) + return dict_list - # Overwrite/update csvs - old_detection_data.to_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE,index=True) - old_positive_data.to_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE,index=True) +def fetch_historical_dashboard_data(): + # Update the end of the 2023-2024 season with the dashboard data + archived_dates = fetch_archived_dashboard_dates(DASHBOARD_ARCHIVED_DATES_URL) + + archived_urls= [DASHBOARD_BASE_URL + "archive/"+ date+"/" for date in archived_dates] + dict_list = [fetch_dashboard_data(url) for url in archived_urls] -if __name__ == '__main__': - main() + return dict_list diff --git a/src/acquisition/rvdss/run.py b/src/acquisition/rvdss/run.py new file mode 100644 index 000000000..599fc89de --- /dev/null +++ b/src/acquisition/rvdss/run.py @@ -0,0 +1,128 @@ +""" +Defines command line interface for the rvdss indicator. Current data (covering the most recent epiweek) and historical data (covering all data before the most recent epiweek) can be generated together or separately. + +Defines top-level functions to fetch data and save to disk or DB. +""" + +import pandas as pd +import os +import argparse + +from utils import fetch_dashboard_data, check_most_recent_update_date,get_dashboard_update_date +from constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE,UPDATE_DATES_FILE +from pull_historic import fetch_report_data,fetch_historical_dashboard_data + +def update_current_data(): + + ## Check if data for current update date has already been fetched + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + update_date = get_dashboard_update_date(DASHBOARD_BASE_URL, headers) + already_updated = check_most_recent_update_date(update_date,UPDATE_DATES_FILE) + + if not already_updated: + with open(UPDATE_DATES_FILE, 'a') as testfile: + testfile.write(update_date+ "\n") + + ## TODO: what is the base path for these files? + base_path = "." + + data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL) + + table_types = { + "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE, + "positive": POSITIVE_TESTS_OUTPUT_FILE, + # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data. + } + for tt in table_types.keys(): + data = data_dict[tt] + + # Write the tables to separate csvs + path = base_path + "/" + table_types[tt] + + # Since this function generates new data weekly, we need to combine it with the existing data, if it exists. + if not os.path.exists(path): + data.to_csv(path,index=True) + else: + old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) + + # If index already exists in the data on disk, don't add the new data -- we may have already run the weekly data fetch. + ## TODO: The check on index maybe should be stricter? Although we do deduplication upstream, so this probably won't find true duplicates + if not data.index.isin(old_data.index).any(): + old_data= pd.concat([old_data,data],axis=0) + old_data.to_csv(path,index=True) + + # ## TODO + # update_database(data) + else: + print("Data is already up to date") + +def update_historical_data(): + ## TODO: what is the base path for these files? + base_path = "." 
+ + report_dict_list = fetch_report_data() # a dict for every season, and every seasonal dict has 2/3 tables inside + + # a dict with an entry for every week that has an archival dashboard, and each entry has 2/3 tables + dashboard_dict_list = fetch_historical_dashboard_data() + + table_types = { + "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE, + "positive": POSITIVE_TESTS_OUTPUT_FILE, + "count": COUNTS_OUTPUT_FILE, + } + for tt in table_types.keys(): + # Merge tables together from dashboards and reports for each table type. + dashboard_data = [elem.get(tt, pd.DataFrame()) for elem in dashboard_dict_list] # a list of all the dashboard tables + report_data = [elem.get(tt, None) for elem in report_dict_list] # a list of the report table + + all_report_tables = pd.concat(report_data) + all_dashboard_tables = pd.concat(dashboard_data) + + data = pd.concat([all_report_tables, all_dashboard_tables]) + + # Write the tables to separate csvs + if not data.empty: + data.to_csv(base_path +"/" + table_types[tt], index=True) + + # ## TODO + # update_database(data) + + +def main(): + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "--current", + "-c", + action="store_true", + help="fetch current data, that is, data for the latest epiweek" + ) + parser.add_argument( + "--historical", + "-hist", + action="store_true", + help="fetch historical data, that is, data for all available time periods other than the latest epiweek" + ) + # fmt: on + args = parser.parse_args() + + current_flag, historical_flag = ( + args.current, + args.historical, + ) + if not current_flag and not historical_flag: + raise Exception("no data was requested") + + # Decide what to update + if current_flag: + update_current_data() + if historical_flag: + update_historical_data() + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/rvdss/rvdss_update.py b/src/acquisition/rvdss/rvdss_update.py deleted file mode 100644 index 7aed18974..000000000 --- a/src/acquisition/rvdss/rvdss_update.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Script to fetch new data, after data reporting moved to the dashboard -format. This covers dates following the 2023-2024 season (exclusive). 
-""" - -import pandas as pd -import os - -from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date -from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE - - -def main(): - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' - } - - update_date = get_dashboard_update_date(DASHBOARD_BASE_URL,headers) - weekly_data = get_weekly_data(DASHBOARD_BASE_URL,2024,headers,update_date).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - positive_data = get_revised_data(DASHBOARD_BASE_URL,headers,update_date) - - path1 = './' + RESP_COUNTS_OUTPUT_FILE - path2 = './' + POSITIVE_TESTS_OUTPUT_FILE - - if not os.path.exists(path1): - weekly_data.to_csv(path1,index=True) - else: - old_detection_data = pd.read_csv(path1).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - if not weekly_data.index.isin(old_detection_data.index).any(): - old_detection_data= pd.concat([old_detection_data,weekly_data],axis=0) - old_detection_data.to_csv(path1,index=True) - - if not os.path.exists(path2): - positive_data.to_csv(path2,index=True) - else: - old_positive_data = pd.read_csv(path2).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']) - if not positive_data.index.isin(old_positive_data.index).any(): - old_positive_data= pd.concat([old_positive_data,positive_data],axis=0) - old_positive_data.to_csv(path2,index=True) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py index 24a2b8337..28c3fcdb1 100644 --- a/src/acquisition/rvdss/utils.py +++ b/src/acquisition/rvdss/utils.py @@ -8,8 +8,8 @@ from unidecode import unidecode import string -from delphi.epidata.acquisition.rvdss.constants import ( - VIRUSES, GEOS, REGIONS, NATION, LAST_WEEK_OF_YEAR, +from constants import ( + VIRUSES, GEOS, REGIONS, NATION, DASHBOARD_UPDATE_DATE_FILE, DASHBOARD_DATA_FILE ) @@ -27,6 +27,7 @@ def abbreviate_geo(full_name): lowercase=re.sub("/territoires","",lowercase) lowercase=re.sub("^cana$","can",lowercase) lowercase =lowercase.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation),'.'+"'")) + lowercase=re.sub("kidshospital","kids hospital",lowercase) lowercase=re.sub(' +', ' ', lowercase) new_name=unidecode(lowercase) @@ -70,8 +71,88 @@ def get_dashboard_update_date(base_url,headers): update_date_url_response = requests.get(update_date_url, headers=headers) update_date = datetime.strptime(update_date_url_response.text,"%m/%d/%Y %H:%M:%S").strftime("%Y-%m-%d") return(update_date) + +def check_most_recent_update_date(date,date_file): + with open(date_file) as file: + current_date = date + contents = file.read() + + already_updated = current_date in contents + return(already_updated) + +def preprocess_table_columns(table): + """ + Remove characters like . 
or * from columns
+    Abbreviate the viruses in columns
+    Change some naming of signals in columns (i.e. order of hpiv and other)
+    Change some naming of locations in columns (i.e. at instead of atl)
+    """
+    table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space
+    table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
+    table.columns =[re.sub("\.", "", s)for s in table.columns] # remove periods
+    table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all)
+    table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns]
+    table.columns = [re.sub(' +', ' ', col) for col in table.columns] # collapse multiple spaces into one
+    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # remove any remaining parentheses
+    table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
+
+    table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]
+    table.columns = [re.sub("canada","can",t) for t in table.columns]
+    table.columns = [re.sub(r"\bcb\b","bc",t) for t in table.columns]
+
+    table.columns =[re.sub(r"h1n1 2009 |h1n12009|a_h1|ah1\b", "ah1n1pdm09", s)for s in table.columns]
+    table.columns =[re.sub(r"a_uns", "auns", s)for s in table.columns]
+    table.columns =[re.sub(r"a_h3", "ah3", s)for s in table.columns]
+
+    table.columns =[abbreviate_virus(col) for col in table.columns] # abbreviate viruses
+    table.columns = [re.sub(r"flu a","flua",t) for t in table.columns]
+    table.columns = [re.sub(r"flu b","flub",t) for t in table.columns]
+    table.columns = [re.sub(r"flutest\b","flu test", col) for col in table.columns]
+    table.columns = [re.sub(r"other hpiv|other_hpiv","hpivother",t) for t in table.columns]
+
+    table.columns=[re.sub(r'bpositive','b_positive',c) for c in table.columns]
+    table.columns=[re.sub(r'apositive','a_positive',c) for c in table.columns]
+    table.columns=[re.sub(r'hpiv_1','hpiv1',c) for c in table.columns]
+    table.columns=[re.sub(r'hpiv_2','hpiv2',c) for c in table.columns]
+    table.columns=[re.sub(r'hpiv_3','hpiv3',c) for c in table.columns]
+    table.columns=[re.sub(r'hpiv_4','hpiv4',c) for c in table.columns]
+
+    table.columns=[make_signal_type_spelling_consistent(col) for col in table.columns]
+    return(table)
+
+def add_flu_prefix(flu_subtype):
+    """ Add the prefix `flu` when only the subtype is reported """
+
+    pat1 =r"^ah3"
+    pat2= r"^auns"
+    pat3= r"^ah1pdm09"
+    pat4= r"^ah1n1pdm09"
+    combined_pat = '|'.join((pat1, pat2,pat3,pat4))
+
+    full_fluname = re.sub(combined_pat, r"flu\g<0>",flu_subtype)
+    return(full_fluname)
+
+def make_signal_type_spelling_consistent(signal):
+    """
+    Make the signal type (i.e. percent positive, number tests, total tests) have consistent spelling
+    Also remove total from signal names
+    """
+
+    pat1 = r"positive\b"
+    pat2 = r'pos\b'
+    combined_pat = '|'.join((pat1, pat2))
-def get_revised_data(base_url,headers,update_date):
+    pat3 = r"test\b"
+    pat4 = 'tested'
+    combined_pat2 = '|'.join((pat3, pat4))
+
+    new_signal = re.sub(combined_pat, "positive_tests",signal)
+    new_signal = re.sub(combined_pat2, "tests",new_signal)
+    new_signal =re.sub(" *%", "_pct_positive",new_signal)
+    new_signal = re.sub("total ", "",new_signal)
+    return(new_signal)
+
+def get_positive_data(base_url,headers,update_date):
     # Get update data
     url = base_url+DASHBOARD_DATA_FILE
@@ -90,10 +171,14 @@
     #df=df.drop(["weekorder","region","year","week"],axis=1)

-    df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value'],
+    df = df.pivot(index=['epiweek','time_value','issue','geo_type','geo_value','region','week','weekorder','year'],
                   columns="virus",values=['tests','percentpositive','positivetests'])
+    df.columns = ['_'.join(col).strip() for col in df.columns.values]
     df = df.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1]))
+    df.columns = [re.sub(r'/', '', col) for col in df.columns] # remove /
+    df.columns = [re.sub(r"flu a","flua",t) for t in df.columns]
+    df.columns = [re.sub(r"flu b","flub",t) for t in df.columns]
     df.columns=[re.sub("positivetests", "positive_tests",col) for col in df.columns]
     df.columns=[re.sub("percentpositive", "pct_positive",col) for col in df.columns]
     df.columns=[re.sub(r' ','_',c) for c in df.columns]
@@ -104,7 +189,7 @@
     return(df)

-def get_weekly_data(base_url,start_year,headers,update_date):
+def get_detections_data(base_url,headers,update_date):
     # Get current week and year
     summary_url = base_url + "RVD_SummaryText.csv"
     summary_url_response = requests.get(summary_url, headers=headers)
@@ -113,38 +198,41 @@
     week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
     week_string = week_df.iloc[0]['Text'].lower()
     current_week = int(re.search("week (.+?) ", week_string).group(1))
", week_string).group(1)) - - if current_week < LAST_WEEK_OF_YEAR: - current_year = start_year+1 - else: - current_year = start_year + current_year= int(re.search("20\d{2}", week_string).group(0)) current_epiweek= Week(current_year,current_week) # Get weekly data - weekly_url = base_url + "RVD_CurrentWeekTable.csv" - weekly_url_response = requests.get(weekly_url, headers=headers) - weekly_url_response.encoding='UTF-8' - df_weekly = pd.read_csv(io.StringIO(weekly_url_response.text)) - - df_weekly = df_weekly.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) - df_weekly.insert(0,"epiweek",int(str(current_epiweek))) - df_weekly.insert(1,"time_value",str(current_epiweek.enddate())) - df_weekly.insert(2,"issue",update_date) - df_weekly.columns=[abbreviate_virus(c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'test\b','tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'pos\b','positive_tests',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flua_','flu_a',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flub_','flu_b',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'bpositive','b_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'apositive','a_positive',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r'flu_ah1_','flu_ah1pdm09_',c) for c in df_weekly.columns] - df_weekly.columns=[re.sub(r' ','_',c) for c in df_weekly.columns] - df_weekly=df_weekly.rename(columns={'reportinglaboratory':"geo_value"}) - df_weekly['geo_value'] = [abbreviate_geo(g) for g in df_weekly['geo_value']] - df_weekly['geo_type'] = [create_geo_types(g,"lab") for g in df_weekly['geo_value']] - - # if df_weekly.columns.isin(["weekorder","date","week"]).all(): - # df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1) - - return(df_weekly) \ No newline at end of file + detections_url = base_url + "RVD_CurrentWeekTable.csv" + detections_url_response = requests.get(detections_url, headers=headers) + detections_url_response.encoding='UTF-8' + df_detections = pd.read_csv(io.StringIO(detections_url_response.text)) + + df_detections = df_detections.rename(columns=lambda x: '_'.join(x.split('_')[1:]+x.split('_')[:1])) + df_detections.insert(0,"epiweek",int(str(current_epiweek))) + df_detections.insert(1,"time_value",str(current_epiweek.enddate())) + df_detections.insert(2,"issue",update_date) + df_detections=preprocess_table_columns(df_detections) + + df_detections.columns=[re.sub(r' ','_',c) for c in df_detections.columns] + df_detections=df_detections.rename(columns={'reportinglaboratory':"geo_value"}) + df_detections['geo_value'] = [abbreviate_geo(g) for g in df_detections['geo_value']] + df_detections['geo_type'] = [create_geo_types(g,"lab") for g in df_detections['geo_value']] + + return(df_detections.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])) + +def fetch_dashboard_data(url): + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' + } + + update_date = get_dashboard_update_date(url, headers) + + detections_data = get_detections_data(url,headers,update_date) + positive_data = get_positive_data(url,headers,update_date) + + return { + "respiratory_detection": detections_data, + "positive": positive_data, + # "count": None, # Dashboards don't contain this data. 
diff --git a/src/ddl/rvdss.sql b/src/ddl/rvdss.sql
new file mode 100644
index 000000000..d3a17a5b5
--- /dev/null
+++ b/src/ddl/rvdss.sql
@@ -0,0 +1,49 @@
+USE epidata;
+/*
+TODO: briefly describe data source and define all columns.
+*/
+
+CREATE TABLE `rvdss_respiratory_detections` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `date` date NOT NULL,
+  `geo_type` char(20) NOT NULL,
+  `geo_value` char(20) NOT NULL,
+  `epiweek` int(11) NOT NULL,
+  `flua_positive_tests` int(11) NOT NULL,
+  `flua_percent_positive_tests` double NOT NULL,
+  `flu_total_tests` int(11) NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `date` (`date`,`geo_value`),
+  KEY `geo_value` (`geo_value`),
+  KEY `epiweek` (`epiweek`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+CREATE TABLE `rvdss_testing` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `date` date NOT NULL,
+  `geo_type` char(20) NOT NULL,
+  `geo_value` char(20) NOT NULL,
+  `epiweek` int(11) NOT NULL,
+  `flua_positive_tests` int(11) NOT NULL,
+  `flua_percent_positive_tests` double NOT NULL,
+  `flu_total_tests` int(11) NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `date` (`date`,`geo_value`),
+  KEY `geo_value` (`geo_value`),
+  KEY `epiweek` (`epiweek`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+CREATE TABLE `rvdss_detections_counts` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `date` date NOT NULL,
+  `geo_type` char(20) NOT NULL,
+  `geo_value` char(20) NOT NULL,
+  `epiweek` int(11) NOT NULL,
+  `flua_positive_tests` int(11) NOT NULL,
+  `flua_percent_positive_tests` double NOT NULL,
+  `flu_total_tests` int(11) NOT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `date` (`date`,`geo_value`),
+  KEY `geo_value` (`geo_value`),
+  KEY `epiweek` (`epiweek`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
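A sketch (not part of the patch) of the idempotent weekly load that the UNIQUE KEY on (`date`,`geo_value`) is set up to support, following the same mysql.connector pattern as database.py; the column list mirrors the placeholder schema above and will change once the real columns are defined (see the TODO), and the connection credentials are placeholders:

import mysql.connector

sql = """
    INSERT INTO `rvdss_respiratory_detections`
        (`date`, `geo_type`, `geo_value`, `epiweek`, `flua_positive_tests`,
         `flua_percent_positive_tests`, `flu_total_tests`)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        `flua_positive_tests` = VALUES(`flua_positive_tests`),
        `flua_percent_positive_tests` = VALUES(`flua_percent_positive_tests`),
        `flu_total_tests` = VALUES(`flu_total_tests`)
"""
# Re-running the same row updates it in place instead of duplicating it.
row = ("2024-06-08", "province", "on", 202423, 12, 3.4, 350)

cnx = mysql.connector.connect(user="user", password="pass", database="epidata")
cur = cnx.cursor()
cur.execute(sql, row)
cnx.commit()
cur.close()
cnx.close()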