Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor doctor_visits: Load source file only once #1978

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c639049
replace modify_claims_drops with direct modification in update_sensor
minhkhul Jun 24, 2024
1dd80be
cleanup Config
minhkhul Jun 24, 2024
749ed2d
cleanup Config
minhkhul Jun 24, 2024
9d8d521
change test
minhkhul Jun 24, 2024
ca38bb7
lint
minhkhul Jun 24, 2024
17259d0
fix test geomap
minhkhul Jun 24, 2024
6d841da
lint
minhkhul Jun 24, 2024
4ec46df
lint
minhkhul Jun 24, 2024
9740899
adding logging for comparing processing time
aysim319 Jun 28, 2024
aacc545
using dask for read/write large files
aysim319 Jun 28, 2024
dbde5c7
undo testing change and also using datetime instead of str for date p…
aysim319 Jun 28, 2024
1394d3d
refactored reading into separate function
aysim319 Jun 28, 2024
dfc3be2
organizing code
aysim319 Jun 28, 2024
e07c697
only processing once and passing along the dataframe
aysim319 Jul 1, 2024
d1ee4ce
added/updated tests
aysim319 Jul 1, 2024
fc2c58d
Merge pull request #1981 from cmu-delphi/optimize_with_dask
aysim319 Jul 1, 2024
b52d80a
in progress cleaning up writing csv
aysim319 Jul 1, 2024
58b51a6
Merge branch 'main' into doctor_visits_refactor_for_speed
minhkhul Jul 2, 2024
81381d6
optimized write_csv
aysim319 Jul 3, 2024
bfa853a
lint
aysim319 Jul 3, 2024
073651f
reverting to assert
aysim319 Jul 9, 2024
dd06a91
cleaning more stuff
aysim319 Jul 9, 2024
4ddd5a0
version locking at 2024.6 due to pandas
aysim319 Jul 9, 2024
593279b
aligned preprocessing to match current & rollback write for consisten…
aysim319 Jul 11, 2024
9920821
pip versioning
aysim319 Jul 11, 2024
cd83691
rewording variable and also ensure that column order is the same
aysim319 Jul 11, 2024
79c34d3
Update doctor_visits/setup.py
aysim319 Jul 12, 2024
7896042
latest version supported for 3.8 is 2023.5.*
aysim319 Jul 12, 2024
e2f7953
fix param
aysim319 Jul 12, 2024
a4f67c0
added notes for when we upgrade to 3.9+
aysim319 Jul 12, 2024
9408c81
reverting unneeded change
aysim319 Sep 9, 2024
b2f8b0e
merge with main
aysim319 Sep 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 26 additions & 7 deletions doctor_visits/delphi_doctor_visits/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,37 @@ class Config:
# data columns
CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
FLU1_COL = ["Flu1"]
COUNT_COLS = CLI_COLS + FLU1_COL + ["Denominator"]
COUNT_COLS = ["Denominator"] + FLU1_COL + CLI_COLS
DATE_COL = "ServiceDate"
GEO_COL = "PatCountyFIPS"
AGE_COL = "PatAgeGroup"
HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
ID_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + HRR_COLS
ID_COLS = [DATE_COL] + [GEO_COL] + HRR_COLS + [AGE_COL]
FILT_COLS = ID_COLS + COUNT_COLS
DTYPES = {"ServiceDate": str, "PatCountyFIPS": str,
"Denominator": int, "Flu1": int,
"Covid_like": int, "Flu_like": int,
"Mixed": int, "PatAgeGroup": str,
"Pat HRR Name": str, "Pat HRR ID": float}
DTYPES = {
"ServiceDate": str,
"PatCountyFIPS": str,
"Denominator": int,
"Flu1": int,
"Covid_like": int,
"Flu_like": int,
"Mixed": int,
"PatAgeGroup": str,
"Pat HRR Name": str,
"Pat HRR ID": float,
"servicedate": str,
"patCountyFIPS": str,
"patAgeGroup": str,
"patHRRname": str,
"patHRRid": float,
}
DEVIANT_COLS_MAP = {
"servicedate": "ServiceDate",
"patCountyFIPS": "PatCountyFIPS",
"patHRRname": "Pat HRR Name",
"patAgeGroup": "PatAgeGroup",
"patHRRid": "Pat HRR ID",
}

SMOOTHER_BANDWIDTH = 100 # bandwidth for the linear left Gaussian filter
MAX_BACKFILL_WINDOW = 7 # maximum number of days used to average a backfill correction
Expand Down
52 changes: 0 additions & 52 deletions doctor_visits/delphi_doctor_visits/modify_claims_drops.py

This file was deleted.

4 changes: 0 additions & 4 deletions doctor_visits/delphi_doctor_visits/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# first party
from .update_sensor import update_sensor, write_to_csv
from .download_claims_ftp_files import download
from .modify_claims_drops import modify_and_write
from .get_latest_claims_name import get_latest_filename


Expand Down Expand Up @@ -55,9 +54,6 @@ def run_module(params): # pylint: disable=too-many-statements
# find the latest files (these have timestamps)
claims_file = get_latest_filename(params["indicator"]["input_dir"], logger)

# modify data
modify_and_write(claims_file, logger)

## get end date from input file
# the filename is expected to be in the format:
# "EDI_AGG_OUTPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz"
Expand Down
9 changes: 7 additions & 2 deletions doctor_visits/delphi_doctor_visits/update_sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# standard packages
from datetime import timedelta
from multiprocessing import Pool, cpu_count
from pathlib import Path

# third party
import numpy as np
Expand Down Expand Up @@ -86,12 +87,16 @@ def update_sensor(
# as of 2020-05-11, input file expected to have 10 columns
# id cols: ServiceDate, PatCountyFIPS, PatAgeGroup, Pat HRR ID/Pat HRR Name
# value cols: Denominator, Covid_like, Flu_like, Flu1, Mixed
filename = Path(filepath).name
data = pd.read_csv(
filepath,
usecols=Config.FILT_COLS,
dtype=Config.DTYPES,
parse_dates=[Config.DATE_COL],
)
logger.info(f"Starting processing {filename} ")
data.rename(columns=Config.DEVIANT_COLS_MAP, inplace=True)
aysim319 marked this conversation as resolved.
Show resolved Hide resolved
data = data[Config.FILT_COLS]
data[Config.DATE_COL] = data[Config.DATE_COL].apply(pd.to_datetime)
logger.info(f"finished processing {filename} ")
assert (
np.sum(data.duplicated(subset=Config.ID_COLS)) == 0
), "Duplicated data! Check the input file"
Expand Down
Binary file not shown.
8 changes: 4 additions & 4 deletions doctor_visits/tests/test_geomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
from delphi_doctor_visits.geo_maps import GeoMaps
from delphi_doctor_visits.config import Config

CONFIG = Config()
DATA = pd.read_csv(
"test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz",
usecols=CONFIG.FILT_COLS,
dtype=CONFIG.DTYPES,
parse_dates=[CONFIG.DATE_COL],
dtype=Config.DTYPES,
nrows=9,
)
DATA.rename(columns=Config.DEVIANT_COLS_MAP, inplace=True)
DATA = DATA[Config.FILT_COLS]
DATA[Config.DATE_COL] = DATA[Config.DATE_COL].apply(pd.to_datetime)

GM = GeoMaps()

Expand Down
25 changes: 0 additions & 25 deletions doctor_visits/tests/test_modify_claims_drops.py

This file was deleted.

Loading