Skip to content

Commit

Permalink
StudyConfig infrastructure (#233)
Browse files Browse the repository at this point in the history
* StudyConfig infrastructure

* light docs tweaks

* cleanup dangling cursor

* renaming StudyConfig params, cleanup

* move db_type to DatabaseBackend

* update setting db_config
  • Loading branch information
dogversioning authored May 9, 2024
1 parent 0aacf20 commit 528f3c0
Show file tree
Hide file tree
Showing 21 changed files with 402 additions and 220 deletions.
24 changes: 17 additions & 7 deletions cumulus_library/apis/umls.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,22 @@ def get_vsac_valuesets(
all_responses.append(valueset[0])
return all_responses

def get_latest_umls_file_release(self, target: str):
    """Return the metadata record for the current release of a UMLS download.

    :param target: a download type; must be a member of VALID_UMLS_DOWNLOADS
    :returns: the first element of the UMLS releases API response, i.e. the
        record for the current release of the requested type
    :raises errors.ApiError: if target is not a recognized download type
    """
    # Validate up front so we fail fast before making a network request
    if target not in VALID_UMLS_DOWNLOADS:
        raise errors.ApiError(
            f"'{target}' is not a valid umls download type.\n\n"
            f"Expected values: {','.join(VALID_UMLS_DOWNLOADS)}"
        )
    response = self.session.get(
        "https://uts-ws.nlm.nih.gov/releases",
        params={"releaseType": target, "current": "true"},
    )
    return response.json()[0]

def download_umls_files(
self,
target: str = "umls-metathesaurus-mrconso-file",
target: str = "umls-metathesaurus-full-subset",
path: pathlib.Path | None = None,
unzip: bool = True,
):
"""Downloads an available file from the UMLS Download API and unzips it
target: the UMLS resource to download (default: the full UMLS metathesaurus subset)
Expand All @@ -113,10 +125,7 @@ def download_umls_files(
)
if path is None:
path = pathlib.Path.cwd()
release_payload = {"releaseType": target, "current": "true"}
file_meta = self.session.get(
"https://uts-ws.nlm.nih.gov/releases", params=release_payload
).json()[0]
file_meta = self.get_latest_umls_file_release(target)

# This particular endpoint requires the API key as a param rather than a
# basic auth header ¯\_(ツ)_/¯.
Expand Down Expand Up @@ -144,5 +153,6 @@ def download_umls_files(
f"{chunks_read/1000} MB"
),
)
base_utils.unzip_file(path / file_meta["fileName"], path)
(path / file_meta["fileName"]).unlink()
if unzip:
base_utils.unzip_file(path / file_meta["fileName"], path)
(path / file_meta["fileName"]).unlink()
41 changes: 27 additions & 14 deletions cumulus_library/base_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,42 @@
""" Collection of small commonly used utility functions """

import dataclasses
import datetime
import json
import os
import pathlib
import shutil
import zipfile
from contextlib import contextmanager

from rich import progress

from cumulus_library import databases

def filepath(filename: str) -> str:
    """Return the path to *filename* inside the directory containing this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, filename)

@dataclasses.dataclass
class StudyConfig:
    """Class for containing study-centric parameters

    The intent of this class is that if you want something passed through to the
    prepare_queries section of a study, this is the place it should go. If you're
    doing something above that level, consider explicit arguments instead. This should
    be an interface aimed at a study author.

    :param db: a DatabaseBackend object for a specific target database
    :keyword force_upload: If the study downloads data from an external resource,
        force it to skip any cached data when running
    :keyword stats_build: If the study runs a stats builder, force regeneration of
        any sampling or other stochastic techniques
    :keyword umls_key: A UMLS API key
    """

    db: databases.DatabaseBackend
    force_upload: bool = False
    stats_build: bool = False
    umls_key: str | None = None


def load_text(path: str) -> str:
Expand Down Expand Up @@ -41,17 +65,6 @@ def filter_strip(commands) -> list[str]:
return list(filter(None, [c.strip() for c in commands]))


def list_coding(code_display: dict, system=None) -> list[dict]:
    """Convert a mapping of codes to display strings into a list of coding dicts.

    :param code_display: a mapping of code -> display text
    :param system: if truthy, included as the "system" key of every entry
    :returns: one {"code", "display"[, "system"]} dict per mapping entry,
        in the mapping's iteration order
    """
    # `system` is constant across entries, so the truthiness check is hoisted
    # out of the per-item loop
    if system:
        return [
            {"code": code, "display": display, "system": system}
            for code, display in code_display.items()
        ]
    return [
        {"code": code, "display": display}
        for code, display in code_display.items()
    ]


@contextmanager
def query_console_output(
verbose: bool, query: str, progress_bar: progress.Progress, task: progress.Task
Expand Down
84 changes: 54 additions & 30 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,20 +103,20 @@ def clean_and_build_study(
self,
target: pathlib.Path,
*,
stats_build: bool,
config: base_utils.StudyConfig,
continue_from: str | None = None,
) -> None:
"""Recreates study views/tables
:param target: A path to the study directory
:param stats_build: if True, forces creation of new stats tables
:param config: A StudyConfig object containing optional params
:keyword continue_from: Restart a run from a specific sql file (for dev only)
"""
studyparser = study_parser.StudyManifestParser(target, self.data_path)
try:
if not continue_from:
studyparser.run_protected_table_builder(
self.cursor, self.schema_name, verbose=self.verbose
self.cursor, self.schema_name, verbose=self.verbose, config=config
)
self.update_transactions(studyparser.get_study_prefix(), "started")
cleaned_tables = studyparser.clean_study(
Expand All @@ -127,25 +127,31 @@ def clean_and_build_study(
)
# If the study hasn't been created before, force stats table generation
if len(cleaned_tables) == 0:
stats_build = True
config.stats_build = True
studyparser.run_table_builder(
self.cursor,
self.schema_name,
verbose=self.verbose,
parser=self.db.parser(),
config=config,
)
else:
self.update_transactions(studyparser.get_study_prefix(), "resumed")

studyparser.build_study(self.cursor, self.verbose, continue_from)
studyparser.build_study(
self.cursor,
verbose=self.verbose,
continue_from=continue_from,
config=config,
)
studyparser.run_counts_builders(
self.cursor, self.schema_name, verbose=self.verbose
self.cursor, self.schema_name, verbose=self.verbose, config=config
)
studyparser.run_statistics_builders(
self.cursor,
self.schema_name,
verbose=self.verbose,
stats_build=stats_build,
config=config,
)
self.update_transactions(studyparser.get_study_prefix(), "finished")

Expand All @@ -158,12 +164,16 @@ def clean_and_build_study(
raise e

def run_matching_table_builder(
self, target: pathlib.Path, table_builder_name: str
self,
target: pathlib.Path,
table_builder_name: str,
config: base_utils.StudyConfig,
) -> None:
"""Runs a single table builder
:param target: A path to the study directory
:param table_builder_name: a builder file referenced in the study's manifest
:param config: A StudyConfig object containing optional params
"""
studyparser = study_parser.StudyManifestParser(target)
studyparser.run_matching_table_builder(
Expand All @@ -172,26 +182,27 @@ def run_matching_table_builder(
table_builder_name,
self.verbose,
parser=self.db.parser(),
config=config,
)

def clean_and_build_all(
    self, study_dict: dict, config: base_utils.StudyConfig
) -> None:
    """Builds tables for all studies.

    NOTE: By design, this method will always exclude the `template` study dir,
    since 99% of the time you don't need a live copy in the database.

    :param study_dict: A dict of study prefix -> study directory path
    :param config: A StudyConfig object containing optional params
    """
    # Work on a copy so the pops below never mutate the caller's dict
    study_dict = dict(study_dict)
    study_dict.pop("template")
    # vocab and core are built first; other studies may rely on their tables
    for precursor_study in ["vocab", "core"]:
        self.clean_and_build_study(study_dict.pop(precursor_study), config=config)
    for target in study_dict.values():
        self.clean_and_build_study(target, config=config)

### Data exporters
def export_study(
Expand All @@ -212,11 +223,16 @@ def export_all(self, study_dict: dict, data_path: pathlib.Path, archive: bool):
self.export_study(study_dict[key], data_path, archive)

def generate_study_sql(
self, target: pathlib.Path, builder: str | None = None
self,
target: pathlib.Path,
*,
config: base_utils.StudyConfig,
builder: str | None = None,
) -> None:
"""Materializes study sql from templates
:param target: A path to the study directory
:param config: A StudyConfig object containing optional params
:param builder: Specify a single builder to generate sql from
"""
studyparser = study_parser.StudyManifestParser(target)
Expand All @@ -226,6 +242,7 @@ def generate_study_sql(
builder=builder,
verbose=self.verbose,
parser=self.db.parser(),
config=config,
)

def generate_study_markdown(
Expand Down Expand Up @@ -327,9 +344,14 @@ def run_cli(args: dict):

# all other actions require connecting to the database
else:
db_backend = databases.create_db_backend(args)
config = base_utils.StudyConfig(
db=databases.create_db_backend(args),
force_upload=args.get("replace_existing", False),
stats_build=args.get("stats_build", False),
umls_key=args.get("umls"),
)
try:
runner = StudyRunner(db_backend, data_path=args.get("data_path"))
runner = StudyRunner(config.db, data_path=args.get("data_path"))
if args.get("verbose"):
runner.verbose = True
console.print("[italic] Connecting to database...")
Expand All @@ -354,18 +376,17 @@ def run_cli(args: dict):
)
elif args["action"] == "build":
if "all" in args["target"]:
runner.clean_and_build_all(study_dict, args["stats_build"])
runner.clean_and_build_all(study_dict, config=config)
else:
for target in args["target"]:
if args["builder"]:
runner.run_matching_table_builder(
study_dict[target], args["builder"]
study_dict[target], config=config
)
else:
runner.clean_and_build_study(
study_dict[target],
stats_build=args["stats_build"],
continue_from=args["continue_from"],
config=config,
)

elif args["action"] == "export":
Expand Down Expand Up @@ -394,13 +415,15 @@ def run_cli(args: dict):

elif args["action"] == "generate-sql":
for target in args["target"]:
runner.generate_study_sql(study_dict[target], args["builder"])
runner.generate_study_sql(
study_dict[target], builder=args["builder"], config=config
)

elif args["action"] == "generate-md":
for target in args["target"]:
runner.generate_study_markdown(study_dict[target])
finally:
db_backend.close()
config.db.close()


def main(cli_args=None):
Expand All @@ -421,17 +444,18 @@ def main(cli_args=None):
break

arg_env_pairs = (
("data_path", "CUMULUS_LIBRARY_DATA_PATH"),
("db_type", "CUMULUS_LIBRARY_DB_TYPE"),
("id", "CUMULUS_AGGREGATOR_ID"),
("load_ndjson_dir", "CUMULUS_LIBRARY_LOAD_NDJSON_DIR"),
("profile", "CUMULUS_LIBRARY_PROFILE"),
("schema_name", "CUMULUS_LIBRARY_DATABASE"),
("workgroup", "CUMULUS_LIBRARY_WORKGROUP"),
("region", "CUMULUS_LIBRARY_REGION"),
("schema_name", "CUMULUS_LIBRARY_DATABASE"),
("study_dir", "CUMULUS_LIBRARY_STUDY_DIR"),
("data_path", "CUMULUS_LIBRARY_DATA_PATH"),
("load_ndjson_dir", "CUMULUS_LIBRARY_LOAD_NDJSON_DIR"),
("user", "CUMULUS_AGGREGATOR_USER"),
("id", "CUMULUS_AGGREGATOR_ID"),
("umls", "UMLS_API_KEY"),
("url", "CUMULUS_AGGREGATOR_URL"),
("user", "CUMULUS_AGGREGATOR_USER"),
("workgroup", "CUMULUS_LIBRARY_WORKGROUP"),
)
read_env_vars = []
for pair in arg_env_pairs:
Expand Down
9 changes: 9 additions & 0 deletions cumulus_library/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,15 @@ def get_parser() -> argparse.ArgumentParser:
),
dest="stats_build",
)
build.add_argument(
"--umls-key",
help="An API Key for the UMLS API",
)
build.add_argument(
"--force-upload",
action="store_true",
help="Forces file downloads/uploads to occur, even if they already exist",
)
build.add_argument(
"--continue",
dest="continue_from",
Expand Down
Loading

0 comments on commit 528f3c0

Please sign in to comment.