Valueset builder #298

Merged 7 commits on Sep 24, 2024

1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ MRCONSO.RRF
*.zip
coverage.xml
*.parquet
valueset_data/

# Byte-compiled / optimized / DLL files
__pycache__/
6 changes: 6 additions & 0 deletions cumulus_library/.sqlfluff
@@ -94,6 +94,7 @@ join_cols_by_table =
join_id = subject_ref
join_tables = ['table_a','table_b']
join_table_aliases = ['a','b']
keywords = ['key','words']
local_location = /var/study/data/
neg_source_table = neg_source_table
output_table_name = 'created_table'
@@ -102,6 +103,7 @@ prefix = Test
primary_ref = encounter_ref
pos_source_table = pos_source_table
remote_location = s3://bucket/study/data/
sab = 'MED-RT'
schema_name = test_schema
schema =
{
@@ -236,14 +238,18 @@ schema =
}
source_table = source_table
source_id = source_id
steward = steward
study_prefix = study
table_cols = ["a","b"]
table_cols_types = ["varchar", "varchar"]
table_name = test_table
table_names = ["test_table"]
table_prefix = 'foo'
tables = ["test_a", "test_b"]
table_suffix = 2024_01_01_11_11_11
target_col_prefix = prefix
target_table = target_table
tier = 2
type_casts={"b": "VARCHAR"}
unnests =
[
10 changes: 5 additions & 5 deletions cumulus_library/__init__.py
@@ -1,9 +1,9 @@
"""Package metadata"""

from .base_utils import StudyConfig
from .builders.base_table_builder import BaseTableBuilder
from .builders.counts import CountsBuilder
from .study_manifest import StudyManifest
from cumulus_library.base_utils import StudyConfig
from cumulus_library.builders.base_table_builder import BaseTableBuilder
from cumulus_library.builders.counts import CountsBuilder
from cumulus_library.study_manifest import StudyManifest

__all__ = ["BaseTableBuilder", "CountsBuilder", "StudyConfig", "StudyManifest"]
__version__ = "3.1.0"
__version__ = "4.0.0"
13 changes: 9 additions & 4 deletions cumulus_library/actions/builder.py
@@ -19,8 +19,7 @@
log_utils,
study_manifest,
)
from cumulus_library.builders import protected_table_builder
from cumulus_library.statistics import psm
from cumulus_library.builders import protected_table_builder, psm_builder, valueset_builder

Contributor:

nit: your own life might be better if you re-exported all the individual builder classes in builders/__init__.py, so that you could do the following:

from cumulus_library import builders
builder = builders.PsmBuilder()

This also avoids some churn as you add/remove/rename builders.

(I've wound up on this pattern in the ETL anyway - all the cumulus_etl.* modules are toplevel concepts like "loaders" or "nlp" and then those modules are little fiefdoms that export their internals out to the top level. Helps me mentally map the code structure too, to keep semantic meaning one level deep. I don't always hit that mark, but I try 😄)

Contributor Author:

i'm slowly coming around to this view of things - i may do it in a separate PR for my own sanity.
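
For illustration, a minimal sketch of the re-export pattern suggested above, using the builder module and class names that appear in this PR (the exact contents of builders/__init__.py are an assumption, not part of this change):

# cumulus_library/builders/__init__.py -- hypothetical sketch, not part of this PR
from cumulus_library.builders.base_table_builder import BaseTableBuilder
from cumulus_library.builders.counts import CountsBuilder
from cumulus_library.builders.psm_builder import PsmBuilder
from cumulus_library.builders.valueset_builder import ValuesetBuilder

__all__ = ["BaseTableBuilder", "CountsBuilder", "PsmBuilder", "ValuesetBuilder"]

With re-exports like these in place, callers could write builders.PsmBuilder() as in the snippet above instead of importing each submodule directly.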



@contextlib.contextmanager
@@ -209,16 +208,22 @@ def run_statistics_builders(
with open(toml_path, "rb") as file:
stats_config = tomllib.load(file)
config_type = stats_config["config_type"]
target_table = stats_config["target_table"]
target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))

if (target_table,) in existing_stats and not config.stats_build:
continue
if config_type == "psm":
builder = psm.PsmBuilder(
builder = psm_builder.PsmBuilder(
toml_config_path=toml_path,
config=stats_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
)
elif config_type == "valueset":
builder = valueset_builder.ValuesetBuilder(
toml_config_path=toml_path,
config=stats_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
)
else:
raise errors.StudyManifestParsingError( # pragma: no cover
f"{toml_path} references an invalid statistics type {config_type}."
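
As context for the dispatch above, here is a minimal, hypothetical sketch of a statistics config being read and routed; the TOML content is invented and uses only the keys the new code actually reads (config_type, target_table, table_prefix):

import tomllib

# Hypothetical valueset statistics config -- real configs will carry more keys.
toml_text = """
config_type = "valueset"
table_prefix = "example"
"""

stats_config = tomllib.loads(toml_text)
config_type = stats_config["config_type"]
# target_table now falls back to table_prefix when absent, matching the diff above.
target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))
assert (config_type, target_table) == ("valueset", "example")
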
2 changes: 1 addition & 1 deletion cumulus_library/actions/importer.py
@@ -27,7 +27,7 @@ def _create_table_from_parquet(archive, file, study_name, config):
query = base_templates.get_ctas_from_parquet_query(
schema_name=config.schema,
table_name=parquet_path.stem.replace(".", "_"),
local_location=parquet_path.parent,
local_location=f"{parquet_path.parent}/*.parquet",
remote_location=s3_path,
table_cols=list(table_types.index),
remote_table_cols_types=remote_types,
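
For clarity on the one-line change above: local_location now points at a glob over the extracted parquet files rather than at the directory itself. A quick hypothetical illustration (the path is made up):

import pathlib

parquet_path = pathlib.Path("/tmp/extracted/study__table.parquet")
old_location = parquet_path.parent                  # /tmp/extracted
new_location = f"{parquet_path.parent}/*.parquet"   # /tmp/extracted/*.parquet
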
2 changes: 1 addition & 1 deletion cumulus_library/builders/counts.py
@@ -4,7 +4,7 @@
from pathlib import Path

from cumulus_library import BaseTableBuilder, errors, study_manifest
from cumulus_library.statistics.statistics_templates import counts_templates
from cumulus_library.builders.statistics_templates import counts_templates

# Defined here for easy overriding by tests
DEFAULT_MIN_SUBJECT = 10
@@ -17,7 +17,7 @@
from psmpy.functions import cohenD

from cumulus_library import BaseTableBuilder, base_utils, databases
from cumulus_library.statistics.statistics_templates import psm_templates
from cumulus_library.builders.statistics_templates import psm_templates
from cumulus_library.template_sql import base_templates


133 changes: 133 additions & 0 deletions cumulus_library/builders/valueset/additional_rules_builder.py
@@ -0,0 +1,133 @@
"""Builder for generating subsets of RxNorm data from a given valueset"""

import pathlib

from cumulus_library import BaseTableBuilder, base_utils, study_manifest
from cumulus_library.builders.valueset import valueset_utils
from cumulus_library.template_sql import base_templates


class AdditionalRulesBuilder(BaseTableBuilder):
display_text = "Generating rulesets..."
base_path = pathlib.Path(__file__).resolve().parent

def prepare_queries(
self,
*args,
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
valueset_config: valueset_utils.ValuesetConfig,
**kwargs,
):
study_prefix = manifest.get_prefix_with_seperator()
table_prefix = ""
if valueset_config.table_prefix:
table_prefix = f"{valueset_config.table_prefix}_"
self.queries.append(
base_templates.get_base_template(
"create_search_rules_descriptions",
self.base_path / "template_sql",
study_prefix=study_prefix,
table_prefix=table_prefix,
)
)
self.queries.append(
base_templates.get_create_table_from_tables(
table_name=f"{study_prefix}{table_prefix}potential_rules",
# From a domain logic perspective, the _rela table is
# the leftmost table and we're annotating with the
# data from rxnconso. Since rxnconso is much, much
# larger, we're moving it to the left in the actual
# constructed join for athena performance reasons
tables=[
f"{study_prefix}{table_prefix}all_rxnconso_keywords",
f"{study_prefix}{table_prefix}rela",
],
table_aliases=["r", "s"],
columns=[
"s.rxcui",
"r.rxcui",
"s.tty",
"r.tty",
"s.rui",
"s.rel",
"s.rela",
"s.str",
"r.str",
"r.keyword",
],
column_aliases={
"s.rxcui": "rxcui1",
"s.tty": "tty1",
"s.str": "str1",
"r.rxcui": "rxcui2",
"r.tty": "tty2",
"r.str": "str2",
},
join_clauses=[
"s.rxcui2 = r.rxcui",
(
"s.rxcui2 NOT IN (SELECT DISTINCT RXCUI FROM " # noqa: S608
f"{study_prefix}{table_prefix}rxnconso_keywords)"
),
],
)
)
self.queries.append(
base_templates.get_create_table_from_tables(
table_name=f"{study_prefix}{table_prefix}included_rels",
tables=[
f"{study_prefix}{table_prefix}potential_rules",
f"{study_prefix}{table_prefix}search_rules",
],
table_aliases=["r", "e"],
columns=[
"r.rxcui1",
"r.rxcui2",
"r.tty1",
"r.tty2",
"r.rui",
"r.rel",
"r.rela",
"r.str1",
"r.str2",
"r.keyword",
],
join_clauses=[
"r.REL NOT IN ('RB', 'PAR')",
"e.include = TRUE",
"r.TTY1 = e.TTY1",
"r.TTY2 = e.TTY2",
"r.RELA = e.RELA",
],
)
)
self.queries.append(
base_templates.get_base_template(
"create_included_keywords",
self.base_path / "template_sql",
study_prefix=study_prefix,
table_prefix=table_prefix,
)
)
self.queries.append(
base_templates.get_create_table_from_union(
table_name=f"{study_prefix}{table_prefix}combined_ruleset",
tables=[
f"{study_prefix}{table_prefix}included_keywords",
f"{study_prefix}{table_prefix}included_rels",
],
columns=[
"rxcui1",
"rxcui2",
"tty1",
"tty2",
"rui",
"rel",
"rela",
"str1",
"str2",
"keyword",
],
)
)
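
To make the first get_create_table_from_tables call above more concrete, here is a rough, hypothetical rendering of the kind of statement it describes. This is not the library's actual template output, and it assumes a study prefix of study__ with an empty table prefix:

# Rough illustration only -- the real SQL comes from the library's templates.
potential_rules_sql = """
CREATE TABLE study__potential_rules AS
SELECT
    s.rxcui AS rxcui1,
    r.rxcui AS rxcui2,
    s.tty AS tty1,
    r.tty AS tty2,
    s.rui,
    s.rel,
    s.rela,
    s.str AS str1,
    r.str AS str2,
    r.keyword
FROM study__all_rxnconso_keywords AS r
JOIN study__rela AS s
    ON s.rxcui2 = r.rxcui
    AND s.rxcui2 NOT IN (SELECT DISTINCT rxcui FROM study__rxnconso_keywords)
"""
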
@@ -0,0 +1,98 @@
TTY1 RELA TTY2 INCLUDE
BN reformulated_to BN Yes
BN reformulation_of BN Yes
BN tradename_of IN Keyword
BN has_precise_ingredient PIN Keyword
BN ingredient_of SBD Yes
BN ingredient_of SBDC Yes
BN ingredient_of SBDF Yes
BN ingredient_of SBDG Yes
BPCK has_dose_form DF No
BPCK tradename_of GPCK Yes
BPCK contains SBD Yes
BPCK contains SCD Yes
DF No
DFG No
GPCK has_tradename BPCK Yes
GPCK has_dose_form DF No
GPCK contains SCD Keyword
IN has_tradename BN Keyword
IN part_of MIN Keyword
IN has_form PIN Keyword
IN ingredient_of SCDC Keyword
IN ingredient_of SCDF Keyword
IN ingredient_of SCDG Keyword
IN boss_of SCDFP Keyword
MIN has_part IN Keyword
MIN has_part PIN Keyword
MIN ingredients_of SCD Yes
PIN precise_ingredient_of BN Yes
PIN form_of IN Keyword
PIN part_of MIN Yes
PIN precise_ingredient_of SCDC Yes
PIN boss_of SCDFP Yes
SBD has_ingredient BN Yes
SBD contained_in BPCK Yes
SBD has_dose_form DF No
SBD quantified_form_of SBD Yes
SBD has_quantified_form SBD Yes
SBD consists_of SBDC Yes
SBD isa SBDF Keyword
SBD isa SBDFP Keyword
SBD isa SBDG Keyword
SBD tradename_of SCD Yes
SBD consists_of SCDC Yes
SBDC has_ingredient BN Yes
SBDC constitutes SBD Yes
SBDC tradename_of SCDC Yes
SBDF has_ingredient BN Yes
SBDF has_dose_form DF No
SBDF inverse_isa SBD Yes
SBDF isa SBDG Yes
SBDF tradename_of SCDF Yes
SBDF has_form SBDFP Yes
SBDFP form_of SBDF Yes
SBDFP tradename_of SCDFP Yes
SBDFP inverse_isa SBD Yes
SBDG has_ingredient BN Yes
SBDG has_doseformgroup DFG No
SBDG inverse_isa SBD Yes
SBDG inverse_isa SBDF Yes
SBDG tradename_of SCDG Keyword
SBDG tradename_of SCDGP Yes
SCD contained_in BPCK Yes
SCD has_dose_form DF No
SCD contained_in GPCK Yes
SCD has_ingredients MIN Yes
SCD has_tradename SBD Yes
SCD quantified_form_of SCD Yes
SCD has_quantified_form SCD Yes
SCD consists_of SCDC Yes
SCD isa SCDF Keyword
SCD isa SCDG Keyword
SCD isa SCDFP Yes
SCDC has_ingredient IN Keyword
SCDC has_precise_ingredient PIN Keyword
SCDC constitutes SBD Yes
SCDC has_tradename SBDC Yes
SCDC constitutes SCD Yes
SCDF has_dose_form DF No
SCDF has_ingredient IN Keyword
SCDF has_tradename SBDF Keyword
SCDF inverse_isa SCD Yes
SCDF isa SCDG Keyword
SCDF has_form SCDFP Yes
SCDFP inverse_isa SCD Yes
SCDFP has_tradename SBDFP Yes
SCDFP form_of SCDF Yes
SCDFP isa SCDGP Yes
SCDFP has_boss IN Keyword
SCDFP has_boss PIN Keyword
SCDG has_doseformgroup DFG No
SCDG has_ingredient IN Keyword
SCDG has_tradename SBDG Keyword
SCDG inverse_isa SCD Yes
SCDG inverse_isa SCDF Yes
SCDGP has_tradename SBDG Keyword
SCDGP inverse_isa SCDFP Yes
SCDGP form_of SCDG Keyword
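
As a side note, a rules file with this TTY1 / RELA / TTY2 / INCLUDE layout can be loaded with the standard library; a minimal sketch (the file name here is hypothetical):

import csv
import pathlib

rules_path = pathlib.Path("valueset_rules.tsv")  # hypothetical name for a file like the one above
with rules_path.open(newline="") as f:
    rows = list(csv.DictReader(f, delimiter="\t"))

# Each row maps a source term type (TTY1) to a related term type (TTY2) via an
# RxNorm relationship (RELA); INCLUDE is one of Yes, No, or Keyword.
always_included = [r for r in rows if r["INCLUDE"] == "Yes"]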