Merge pull request #110 from DynamicGenetics/develop
WP4 latest, including:
- two new live data sources: vaccination data and Twitter data.
- updates to the PHW scraper so that the same functions can be used for both vaccination and case data, and updates to the scheduler, which no longer sends the traceback to Slack; it just alerts that there has been an error.

Moreover, the Twitter data now includes measures computed from the tweet collection acquired using the v2 APIs.
All functions used to work with the Twitter database are part of a separate suite, available here: https://github.com/ninadicara/tweet-suite.
The Twitter data will only update if the Tweet-Suite collector is set to update daily.
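A rough sketch of the scheduler's new alerting behaviour described above; the run_job wrapper and webhook URL here are illustrative assumptions, not the actual scheduler code:

import logging
import traceback

import requests

logger = logging.getLogger(__name__)

# Hypothetical webhook URL; the real Slack wiring is not shown in this commit.
SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/<your-webhook>"


def run_job(job):
    """Run a scheduled job; on failure, alert Slack without the traceback."""
    try:
        job()
    except Exception:
        # Keep the full traceback in the local log...
        logger.error("Scheduled job failed:\n%s", traceback.format_exc())
        # ...but send Slack only a short alert that an error occurred.
        requests.post(
            SLACK_WEBHOOK_URL,
            json={"text": "Error running scheduled job '{}'.".format(job.__name__)},
        )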
leriomaggio authored Sep 2, 2021
2 parents 5071ac2 + 907c523 commit c2e7304
Showing 29 changed files with 302 additions and 426 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -1,10 +1,12 @@
#deprec (chris' temporary bin folders)
data/deprec/

# leriomaggio: ignored as this is a folder for local dev exps.
data/local
# Nina local development data folder
backend/datasets/data/static/source/local

# Ignore any database files
*.db

# LA Keys
data/la_keys.geojson

81 changes: 81 additions & 0 deletions backend/data_collection/phw_data.py
@@ -0,0 +1,81 @@
"""Module containing functions required for attaining and cleaning data
from the Public Health Wales public datasets.
"""

import requests
import os
import logging
import pandas as pd
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Dataset names to be used with run_phw_scraper
COVID_CASES = "Rapid COVID-19 surveillance data"
VAX_RATES = "COVID19 vaccination downloadable data "


class PHWDownload:
"""Get latest data from PHW, where dataset defines what is being scraped.
This function will get the latest data, write it to the raw folder as a .xlsx.
It will then clean it it, and write the cleaned data to the clean folder.
Parameters
----------
dataset : str
The name of the file to be downloaded from the PHW website.
folder : str
File path to write the scraped data to.
"""

def __init__(self, dataset, folder):
"""Setup the filenames and paths based on the folders provided."""

self.dataset = dataset
self.folder = folder
self.filename = dataset.replace(" ", "-")
self.path = os.path.join(folder, self.filename + ".xlsx")

def get_url(self):
"""Get the URL of the latest dataset."""

# Get the content from the webpage
response = requests.get(
"http://www2.nphs.wales.nhs.uk:8080/CommunitySurveillanceDocs.nsf/PublicPage?OpenPage&AutoFramed"
)
# Parse the content using Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")

try:
# Find the element with the title that we need, and get the parent row (<tr>)
element = soup.find(title=self.dataset).find_parent("tr")
# Extract the URL from the row element (it is the first href in the row.)
url = "http://www2.nphs.wales.nhs.uk:8080" + element.find("a").get("href")
except AttributeError:
# If the element can't be found then return url as None.
url = None

return url

def save_data(self):
"""
Download the PHW dashboard data Excel file and save it as .xlsx to the configured output path.
Calls `get_url` to locate the latest file.
"""

# Get the data URL
url = self.get_url()

if url is not None:
    r = requests.get(url, allow_redirects=True)
    # Save in native xlsx format
    with open(self.path, "wb") as output:
        output.write(r.content)
    logger.info("Successfully saved {}".format(self.dataset))
else:
    logger.error("Could not get URL to download {}".format(self.dataset))
46 changes: 23 additions & 23 deletions backend/datasets/data/live/LA_live_master.csv
@@ -1,23 +1,23 @@
area_code,area_name,covidIncidence_100k,groups_count,total_vol_count,new_vol_count,tweets_percent,has_someone_close_count,has_someone_close_pct,vol_increase_pct
W06000001,Isle of Anglesey,3212.3,4.0,443.0,247.0,2.2842639593908634,2246.0,56.9,126.0204081632653
W06000002,Gwynedd,2622.8,12.0,854.0,436.0,1.1553273427471118,3534.0,58.2,104.30622009569377
W06000003,Conwy,3267.0,8.0,1035.0,645.0,2.2332506203473943,3806.0,58.1,165.3846153846154
W06000004,Denbighshire,4492.4000000000015,4.0,1245.0,843.0,1.5544041450777202,2928.0,57.4,209.7014925373134
W06000005,Flintshire,6097.4000000000015,8.0,1356.0,966.0,1.3544018058690743,5173.0,57.3,247.6923076923077
W06000006,Wrexham,8365.9,7.0,874.0,584.0,2.3661270236612704,4248.0,58.7,201.3793103448276
W06000008,Ceredigion,2408.7000000000007,6.0,517.0,285.0,2.5787965616045847,2441.0,58.9,122.84482758620689
W06000009,Pembrokeshire,2710.3,15.0,1038.0,567.0,1.859504132231405,3785.0,58.0,120.38216560509554
W06000010,Carmarthenshire,5658.700000000002,11.0,1362.0,745.0,1.829268292682927,5501.0,58.7,120.74554294975688
W06000011,Swansea,7005.1,12.0,2491.0,1388.0,2.3779724655819776,8236.0,59.2,125.8386219401632
W06000012,Neath Port Talbot,8043.1,11.0,792.0,352.0,1.818181818181818,3717.0,57.9,80.0
W06000013,Bridgend,8523.7,5.0,1553.0,1114.0,3.096330275229358,4392.0,58.9,253.75854214123007
W06000014,Vale of Glamorgan,5526.0,10.0,1239.0,805.0,2.769461077844311,5766.0,60.6,185.48387096774192
W06000015,Cardiff,7027.200000000002,21.0,5039.0,3106.0,2.288534888938748,13684.0,60.6,160.68287635799277
W06000016,Rhondda Cynon Taf,8789.1,11.0,1628.0,845.0,2.5950512975256492,7190.0,57.1,107.91826309067689
W06000018,Caerphilly,7373.700000000002,6.0,1549.0,813.0,2.434928631402183,4570.0,58.6,110.46195652173914
W06000019,Blaenau Gwent,8783.0,2.0,614.0,293.0,2.0887728459530033,1522.0,56.7,91.27725856697819
W06000020,Torfaen,6806.0,10.0,837.0,397.0,3.492647058823529,2938.0,57.3,90.22727272727272
W06000021,Monmouthshire,4394.8,13.0,894.0,584.0,1.44,3913.0,61.6,188.38709677419354
W06000022,Newport,7213.1,3.0,1788.0,958.0,2.1331058020477816,3956.0,59.0,115.42168674698796
W06000023,Powys,3101.9,21.0,2078.0,856.0,1.9784172661870505,4243.0,59.5,70.04909983633388
W06000024,Merthyr Tydfil,11099.7,2.0,448.0,243.0,3.0660377358490565,1627.0,56.5,118.53658536585365
area_code,area_name,covidIncidence_100k,groups_count,total_vol_count,new_vol_count,tweets_percent,has_someone_close_count,has_someone_close_pct,vax1_pct,vax2_pct,vader_comp,vol_increase_pct
W06000001,Isle of Anglesey,3082.4,4.0,443.0,247.0,2.2842639593908634,2246.0,56.9,88.8,80.7,0.2250427228923505,126.0204081632653
W06000002,Gwynedd,2555.4,12.0,854.0,436.0,1.1553273427471118,3534.0,58.2,85.5,74.6,0.24743269429239656,104.30622009569377
W06000003,Conwy,3217.5,8.0,1035.0,645.0,2.2332506203473943,3806.0,58.1,87.5,78.4,0.3161522153541668,165.3846153846154
W06000004,Denbighshire,4445.3,4.0,1245.0,843.0,1.5544041450777202,2928.0,57.4,85.6,73.4,0.2975364184455571,209.7014925373134
W06000005,Flintshire,6048.0,8.0,1356.0,966.0,1.3544018058690743,5173.0,57.3,87.0,72.1,0.35379189468402683,247.6923076923077
W06000006,Wrexham,8333.5,7.0,874.0,584.0,2.3661270236612704,4248.0,58.7,85.6,72.7,0.28499944262448723,201.3793103448276
W06000008,Ceredigion,2400.4,6.0,517.0,285.0,2.5787965616045847,2441.0,58.9,81.8,69.8,0.31153695710532325,122.84482758620689
W06000009,Pembrokeshire,2680.9,15.0,1038.0,567.0,1.859504132231405,3785.0,58.0,87.0,75.0,0.25005898592490006,120.38216560509554
W06000010,Carmarthenshire,5631.200000000001,11.0,1362.0,745.0,1.829268292682927,5501.0,58.7,86.5,74.5,0.261068814866648,120.74554294975688
W06000011,Swansea,6920.400000000001,12.0,2491.0,1388.0,2.3779724655819776,8236.0,59.2,85.5,75.8,0.28807985715014695,125.8386219401632
W06000012,Neath Port Talbot,7967.8,11.0,792.0,352.0,1.818181818181818,3717.0,57.9,86.5,76.2,0.23843450812076922,80.0
W06000013,Bridgend,8504.0,5.0,1553.0,1114.0,3.096330275229358,4392.0,58.9,88.5,78.3,0.26253533695997,253.75854214123007
W06000014,Vale of Glamorgan,5498.3,10.0,1239.0,805.0,2.769461077844311,5766.0,60.6,88.3,83.2,0.26283085731134676,185.48387096774192
W06000015,Cardiff,6983.6,21.0,5039.0,3106.0,2.288534888938748,13684.0,60.6,80.6,72.9,0.2620267040456928,160.68287635799277
W06000016,Rhondda Cynon Taf,8764.300000000001,11.0,1628.0,845.0,2.5950512975256492,7190.0,57.1,87.8,78.1,0.26596758084337674,107.91826309067689
W06000018,Caerphilly,7338.900000000001,6.0,1549.0,813.0,2.434928631402183,4570.0,58.6,89.5,78.8,0.29034804899268235,110.46195652173914
W06000019,Blaenau Gwent,8737.2,2.0,614.0,293.0,2.0887728459530033,1522.0,56.7,89.2,79.3,0.3460484923290237,91.27725856697819
W06000020,Torfaen,6773.0,10.0,837.0,397.0,3.492647058823529,2938.0,57.3,88.6,78.7,0.25536565895329094,90.22727272727272
W06000021,Monmouthshire,4367.3,13.0,894.0,584.0,1.44,3913.0,61.6,91.5,82.5,0.2461608846192556,188.38709677419354
W06000022,Newport,7164.0,3.0,1788.0,958.0,2.1331058020477816,3956.0,59.0,84.0,71.2,0.2910521950066565,115.42168674698796
W06000023,Powys,3079.2000000000003,21.0,2078.0,856.0,1.9784172661870505,4243.0,59.5,90.4,82.7,0.26935401117046875,70.04909983633388
W06000024,Merthyr Tydfil,10960.400000000001,2.0,448.0,243.0,3.0660377358490565,1627.0,56.5,86.0,76.4,0.2033578983183109,118.53658536585365
23 changes: 0 additions & 23 deletions backend/datasets/data/live/cleaned/phwCovidStatement.csv

This file was deleted.

3 changes: 1 addition & 2 deletions backend/datasets/dataset.py
@@ -653,8 +653,7 @@ def _merge_datasets(self):

@staticmethod
def _create_over_65_col(data):
"""Create a new over_65 column in the master and drop the redundant columns.
"""
"""Create a new over_65 column in the master and drop the redundant columns."""
# Create new 'over 65' variable for the population data (adding up all
# singular age cols from 65:90+)
age_cols = data.columns[
98 changes: 93 additions & 5 deletions backend/datasets/live.py
@@ -19,14 +19,39 @@
import pandas as pd
import os
from functools import partial
import sqlite3

from datasets import LIVE_DATA_FOLDER, LIVE_RAW_DATA_FOLDER

from datasets.dataset import DataResolution, DataFrequency, Dataset, MasterDataset

p_live = partial(os.path.join, LIVE_DATA_FOLDER)

p_raw = partial(os.path.join, LIVE_RAW_DATA_FOLDER)

# Load the PHW data and do some tidying
SOURCE_COVID_COUNT_LA = pd.read_excel(
p_raw("Rapid-COVID-19-surveillance-data.xlsx"),
sheet_name="Tests by specimen date",
usecols="A, B, E",
) # LA, date, cumulative cases per 100,000
SOURCE_COVID_COUNT_LA["Specimen date"] = pd.to_datetime(
SOURCE_COVID_COUNT_LA["Specimen date"]
)
latest_date = SOURCE_COVID_COUNT_LA["Specimen date"].max()
# Filter data by the latest date
SOURCE_COVID_COUNT_LA = SOURCE_COVID_COUNT_LA[
SOURCE_COVID_COUNT_LA["Specimen date"] == latest_date
]

SOURCE_VAX_PCT_LA = pd.read_excel(
p_raw("COVID19-vaccination-downloadable-data-.xlsx"),
sheet_name="HealthBoard_LocalAuthority",
usecols="A, B, E, G",
skiprows=1,
)
SOURCE_VAX_PCT_LA = SOURCE_VAX_PCT_LA[
SOURCE_VAX_PCT_LA["Risk group"] == "Wales residents aged 18 years and older"
]

SOURCE_GROUP_COUNTS_LA = pd.read_csv(p_live("groupCount_LA.csv"))

@@ -37,15 +62,69 @@
SOURCE_ZOE_SUPPORT_LA = pd.read_csv(p_live("help_need20200531.csv"), nrows=3).T
SOURCE_ZOE_SUPPORT_LA.reset_index(level=0, inplace=True)

# This query gets the average tweet sentiment for the past seven days, linked to
# local authority areas. The inner SELECT first averages each author's compound
# VADER score per LSOA (so prolific accounts do not dominate), and the outer
# SELECT then averages those per-author scores across each area. It runs against
# the Twitter data-collection database.
VADER_QUERY = """SELECT AVG(vader_comp_avg),lsoa,lsoa_name from
(
SELECT AVG(tweets.vader_comp) as vader_comp_avg, tweets.author_id, matchedplaces.lsoa, matchedplaces.lsoa_name
FROM tweets
JOIN places ON tweets.place_id = places.id
JOIN matchedplaces ON matchedplaces.place_id = places.id
WHERE strftime('%Y-%m-%d %H:%M:%S', tweets.created_at) > date('now','start of day','-7 days')
GROUP BY tweets.author_id,matchedplaces.lsoa
) as lastweek_tweets
GROUP BY lsoa;
"""

SOURCE_TWEET_SENTIMENT_LA = pd.read_sql(
VADER_QUERY,
con=sqlite3.connect(os.path.join(LIVE_RAW_DATA_FOLDER, "phw_tweets.db")),
)

# Labelling this as a pct: it is not really one, but doing so ensures it does not get changed.
LA_VADER = Dataset(
data=SOURCE_TWEET_SENTIMENT_LA,
res=DataResolution.LA,
key_col="lsoa",
key_is_code=True,
csv_name="vader_pct",
rename={"AVG(vader_comp_avg)": "vader_comp", "lsoa": "lad19cd"},
keep_cols=["lad19cd", "vader_comp"],
)

LA_COVID = Dataset(
data=SOURCE_COVID_COUNT_LA,
res=DataResolution.LA,
key_col="la_name",
key_col="Local Authority",
key_is_code=False,
csv_name="covid_count",
rename={
"Cumulative incidence per 100,000 population": "covidIncidence_100k",
"Local Authority": "lad19nm",
},
keep_cols=["lad19nm", "covidIncidence_100k"],
)

LA_VAX_1 = Dataset(
data=SOURCE_VAX_PCT_LA,
res=DataResolution.LA,
key_col="Area of residence",
key_is_code=False,
csv_name="vax1_pct",
rename={"Area of residence": "lad19nm", "Uptake(%) - Dose1": "vax1_pct"},
keep_cols=["lad19nm", "vax1_pct"],
)

LA_VAX_2 = Dataset(
data=SOURCE_VAX_PCT_LA,
res=DataResolution.LA,
key_col="Area of residence",
key_is_code=False,
csv_name="vax2_pct",
rename={"Area of residence": "lad19nm", "Uptake(%) - Dose2": "vax2_pct"},
keep_cols=["lad19nm", "vax2_pct"],
)

LA_GROUP_COUNTS = Dataset(
data=SOURCE_GROUP_COUNTS_LA,
res=DataResolution.LA,
@@ -83,7 +162,16 @@
)

LA_LIVE = MasterDataset(
datasets=[
LA_COVID,
LA_GROUP_COUNTS,
LA_WCVA,
LA_TWEETS,
LA_ZOE_SUPPORT,
LA_VAX_1,
LA_VAX_2,
LA_VADER,
],
res=DataResolution.LA,
freq=DataFrequency.LIVE,
from_csv=False,
46 changes: 32 additions & 14 deletions backend/generate_json.py
Expand Up @@ -305,13 +305,7 @@ def write(self, filepath: str = None):
"""
if not filepath:
filepath = os.path.join(
BASE_FOLDER, "..", "..", "frontend", "map", "data", "data.json",
)

with open(filepath, "w") as outfile:
@@ -408,6 +402,24 @@ def write(self, filepath: str = None):
data_type="per100k",
)

VAX_DOSE1 = Variable(
data=LA_LIVE_MASTER["vax1_pct"],
label="Vaccine Uptake Dose 1 (per 100 pop)",
data_class="support",
la_and_lsoa=False,
invert=False,
data_type="percentage",
)

VAX_DOSE2 = Variable(
data=LA_LIVE_MASTER["vax2_pct"],
label="Vaccine Update Dose 2 (per 100 pop)",
data_class="support",
la_and_lsoa=False,
invert=False,
data_type="percentage",
)

GROUPS = Variable(
data=LA_LIVE_MASTER["groups_count"],
label="Community Support Groups (per 100 pop)",
@@ -473,11 +485,22 @@
data_type="percentage",
)

VADER_SENTIMENT = Variable(
data=LA_LIVE_MASTER["vader_comp"],
label="Avg Twitter Sentiment Past 7 Days",
data_class="support",
la_and_lsoa=False,
invert=False,
data_type="percentage",
)


LA_VARBS = Variables(
(
VOLS_TOTAL,
VOLS_INCREASE,
VAX_DOSE1,
VAX_DOSE2,
GROUPS,
TWEETS,
BELONGING,
Expand All @@ -490,16 +513,11 @@ def write(self, filepath: str = None):
GP_DIGITAL,
HAS_INTERNET,
ZOE_SUPPORT,
VADER_SENTIMENT,
)
)

LSOA_VARBS = Variables((LSOA_WIMD, LSOA_OVER_65, LSOA_POPDENSITY,))

# Finally, create the data with the json function!
DATA = DataDashboard(la_data=LA_VARBS, lsoa_data=LSOA_VARBS)