smart-on-fhir · dogversioning · Sep 24, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 23, 2024
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ MRCONSO.RRF
 *.zip
 coverage.xml
 *.parquet
+valueset_data/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff
@@ -94,6 +94,7 @@ join_cols_by_table =
 join_id = subject_ref
 join_tables = ['table_a','table_b']
 join_table_aliases = ['a','b']
+keywords = ['key','words']
 local_location = /var/study/data/
 neg_source_table = neg_source_table
 output_table_name = 'created_table'
@@ -102,6 +103,7 @@ prefix = Test
 primary_ref = encounter_ref
 pos_source_table = pos_source_table
 remote_location = s3://bucket/study/data/
+sab = 'MED-RT'
 schema_name = test_schema
 schema = 
     {
@@ -236,14 +238,18 @@ schema =
     }
 source_table = source_table
 source_id = source_id
+steward = steward
+study_prefix = study
 table_cols = ["a","b"]
 table_cols_types = ["varchar", "varchar"]
 table_name = test_table
 table_names = ["test_table"]
+table_prefix = 'foo'
 tables = ["test_a", "test_b"]
 table_suffix = 2024_01_01_11_11_11
 target_col_prefix = prefix
 target_table = target_table
+tier = 2
 type_casts={"b": "VARCHAR"}
 unnests = 
     [

diff --git a/cumulus_library/__init__.py b/cumulus_library/__init__.py
@@ -1,9 +1,9 @@
 """Package metadata"""
 
-from .base_utils import StudyConfig
-from .builders.base_table_builder import BaseTableBuilder
-from .builders.counts import CountsBuilder
-from .study_manifest import StudyManifest
+from cumulus_library.base_utils import StudyConfig
+from cumulus_library.builders.base_table_builder import BaseTableBuilder
+from cumulus_library.builders.counts import CountsBuilder
+from cumulus_library.study_manifest import StudyManifest
 
 __all__ = ["BaseTableBuilder", "CountsBuilder", "StudyConfig", "StudyManifest"]
-__version__ = "3.1.0"
+__version__ = "4.0.0"
diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
@@ -19,8 +19,7 @@
     log_utils,
     study_manifest,
 )
-from cumulus_library.builders import protected_table_builder
-from cumulus_library.statistics import psm
+from cumulus_library.builders import protected_table_builder, psm_builder, valueset_builder
 
 
 @contextlib.contextmanager
@@ -214,11 +213,17 @@ def run_statistics_builders(
         if (target_table,) in existing_stats and not config.stats_build:
             continue
         if config_type == "psm":
-            builder = psm.PsmBuilder(
+            builder = psm_builder.PsmBuilder(
                 toml_config_path=toml_path,
                 config=stats_config,
                 data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
             )
+        elif config_type == "valueset":
+            builder = valueset_builder.ValuesetBuilder(
+                toml_config_path=toml_path,
+                config=stats_config,
+                data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
+            )
         else:
             raise errors.StudyManifestParsingError(  # pragma: no cover
                 f"{toml_path} references an invalid statistics type {config_type}."

diff --git a/cumulus_library/actions/importer.py b/cumulus_library/actions/importer.py
@@ -27,7 +27,7 @@ def _create_table_from_parquet(archive, file, study_name, config):
         query = base_templates.get_ctas_from_parquet_query(
             schema_name=config.schema,
             table_name=parquet_path.stem.replace(".", "_"),
-            local_location=parquet_path.parent,
+            local_location=f"{parquet_path.parent}/*.parquet",
             remote_location=s3_path,
             table_cols=list(table_types.index),
             remote_table_cols_types=remote_types,

diff --git a/cumulus_library/builders/counts.py b/cumulus_library/builders/counts.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 
 from cumulus_library import BaseTableBuilder, errors, study_manifest
-from cumulus_library.statistics.statistics_templates import counts_templates
+from cumulus_library.builders.statistics_templates import counts_templates
 
 # Defined here for easy overriding by tests
 DEFAULT_MIN_SUBJECT = 10

diff --git a/cumulus_library/builders/psm.py → cumulus_library/builders/psm_builder.py b/cumulus_library/builders/psm.py → cumulus_library/builders/psm_builder.py
@@ -17,7 +17,7 @@
 from psmpy.functions import cohenD
 
 from cumulus_library import BaseTableBuilder, base_utils, databases
-from cumulus_library.statistics.statistics_templates import psm_templates
+from cumulus_library.builders.statistics_templates import psm_templates
 from cumulus_library.template_sql import base_templates
 
 

diff --git a/cumulus_library/builders/valueset/additional_rules_builder.py b/cumulus_library/builders/valueset/additional_rules_builder.py
@@ -0,0 +1,133 @@
+"""Builder for generating subsets of RxNorm data from a given valueset"""
+
+import pathlib
+
+from cumulus_library import BaseTableBuilder, base_utils, study_manifest
+from cumulus_library.builders.valueset import valueset_utils
+from cumulus_library.template_sql import base_templates
+
+
+class AdditionalRulesBuilder(BaseTableBuilder):
+    """Queues the queries that build the potential_rules, included_rels and combined_ruleset tables."""
+
+    display_text = "Generating rulesets..."
+    base_path = pathlib.Path(__file__).resolve().parent
+
+    def prepare_queries(
+        self,
+        *args,
+        config: base_utils.StudyConfig,
+        manifest: study_manifest.StudyManifest,
+        valueset_config: valueset_utils.ValuesetConfig,
+        **kwargs,
+    ):
+        """Appends five queries to self.queries, in the order they appear below.
+
+        Two queries are rendered from files in this package's template_sql
+        directory; the other three are assembled by base_templates helpers.
+
+        :keyword config: accepted for interface compatibility; not read here
+        :keyword manifest: provides the study prefix applied to every table name
+        :keyword valueset_config: provides an optional table prefix; when set, it
+            is followed by an underscore and placed after the study prefix
+        """
+        study_prefix = manifest.get_prefix_with_seperator()
+        table_prefix = f"{valueset_config.table_prefix}_" if valueset_config.table_prefix else ""
+        full_prefix = f"{study_prefix}{table_prefix}"
+        template_dir = self.base_path / "template_sql"
+        # Output columns shared by the included_rels and combined_ruleset tables
+        ruleset_cols = (
+            "rxcui1",
+            "rxcui2",
+            "tty1",
+            "tty2",
+            "rui",
+            "rel",
+            "rela",
+            "str1",
+            "str2",
+            "keyword",
+        )
+
+        search_rules_query = base_templates.get_base_template(
+            "create_search_rules_descriptions",
+            template_dir,
+            study_prefix=study_prefix,
+            table_prefix=table_prefix,
+        )
+        self.queries.append(search_rules_query)
+
+        # In domain terms the _rela table is the leftmost table, annotated with
+        # data from rxnconso. Because rxnconso is much, much larger, it is listed
+        # first in the constructed join for athena performance reasons.
+        potential_rules_query = base_templates.get_create_table_from_tables(
+            table_name=f"{full_prefix}potential_rules",
+            tables=[f"{full_prefix}all_rxnconso_keywords", f"{full_prefix}rela"],
+            table_aliases=["r", "s"],
+            columns=[
+                "s.rxcui",
+                "r.rxcui",
+                "s.tty",
+                "r.tty",
+                "s.rui",
+                "s.rel",
+                "s.rela",
+                "s.str",
+                "r.str",
+                "r.keyword",
+            ],
+            # Columns present on both sides get a 1 (from s) or 2 (from r) suffix
+            column_aliases={
+                f"{alias}.{col}": f"{col}{side}"
+                for alias, side in (("s", 1), ("r", 2))
+                for col in ("rxcui", "tty", "str")
+            },
+            join_clauses=[
+                "s.rxcui2 = r.rxcui",
+                (
+                    f"s.rxcui2 NOT IN (SELECT DISTINCT RXCUI FROM {full_prefix}"  # noqa: S608
+                    "rxnconso_keywords)"
+                ),
+            ],
+        )
+        self.queries.append(potential_rules_query)
+
+        included_rels_query = base_templates.get_create_table_from_tables(
+            table_name=f"{full_prefix}included_rels",
+            tables=[f"{full_prefix}potential_rules", f"{full_prefix}search_rules"],
+            table_aliases=["r", "e"],
+            columns=[f"r.{col}" for col in ruleset_cols],
+            join_clauses=[
+                "r.REL NOT IN ('RB', 'PAR')",
+                "e.include = TRUE",
+                "r.TTY1 = e.TTY1",
+                "r.TTY2 = e.TTY2",
+                "r.RELA = e.RELA",
+            ],
+        )
+        self.queries.append(included_rels_query)
+
+        included_keywords_query = base_templates.get_base_template(
+            "create_included_keywords",
+            template_dir,
+            study_prefix=study_prefix,
+            table_prefix=table_prefix,
+        )
+        self.queries.append(included_keywords_query)
+
+        combined_ruleset_query = base_templates.get_create_table_from_union(
+            table_name=f"{full_prefix}combined_ruleset",
+            tables=[f"{full_prefix}included_keywords", f"{full_prefix}included_rels"],
+            columns=list(ruleset_cols),
+        )
+        self.queries.append(combined_ruleset_query)
diff --git a/cumulus_library/builders/valueset/expansion_rules.tsv b/cumulus_library/builders/valueset/expansion_rules.tsv
@@ -0,0 +1,98 @@
+TTY1	RELA	TTY2	INCLUDE
+BN	reformulated_to	BN	Yes
+BN	reformulation_of	BN	Yes
+BN	tradename_of	IN	Keyword
+BN	has_precise_ingredient	PIN	Keyword
+BN	ingredient_of	SBD	Yes
+BN	ingredient_of	SBDC	Yes
+BN	ingredient_of	SBDF	Yes
+BN	ingredient_of	SBDG	Yes
+BPCK	has_dose_form	DF	No
+BPCK	tradename_of	GPCK	Yes
+BPCK	contains	SBD	Yes
+BPCK	contains	SCD	Yes
+DF			No
+DFG			No
+GPCK	has_tradename	BPCK	Yes
+GPCK	has_dose_form	DF	No
+GPCK	contains	SCD	Keyword
+IN	has_tradename	BN	Keyword
+IN	part_of	MIN	Keyword
+IN	has_form	PIN	Keyword
+IN	ingredient_of	SCDC	Keyword
+IN	ingredient_of	SCDF	Keyword
+IN	ingredient_of	SCDG	Keyword
+IN	boss_of	SCDFP	Keyword
+MIN	has_part	IN	Keyword
+MIN	has_part	PIN	Keyword
+MIN	ingredients_of	SCD	Yes
+PIN	precise_ingredient_of	BN	Yes
+PIN	form_of	IN	Keyword
+PIN	part_of	MIN	Yes
+PIN	precise_ingredient_of	SCDC	Yes
+PIN	boss_of	SCDFP	Yes
+SBD	has_ingredient	BN	Yes
+SBD	contained_in	BPCK	Yes
+SBD	has_dose_form	DF	No
+SBD	quantified_form_of	SBD	Yes
+SBD	has_quantified_form	SBD	Yes
+SBD	consists_of	SBDC	Yes
+SBD	isa	SBDF	Keyword
+SBD	isa	SBDFP	Keyword
+SBD	isa	SBDG	Keyword
+SBD	tradename_of	SCD	Yes
+SBD	consists_of	SCDC	Yes
+SBDC	has_ingredient	BN	Yes
+SBDC	constitutes	SBD	Yes
+SBDC	tradename_of	SCDC	Yes
+SBDF	has_ingredient	BN	Yes
+SBDF	has_dose_form	DF	No
+SBDF	inverse_isa	SBD	Yes
+SBDF	isa	SBDG	Yes
+SBDF	tradename_of	SCDF	Yes
+SBDF	has_form	SBDFP	Yes
+SBDFP	form_of	SBDF	Yes
+SBDFP	tradename_of	SCDFP	Yes
+SBDFP	inverse_isa	SBD	Yes
+SBDG	has_ingredient	BN	Yes
+SBDG	has_doseformgroup	DFG	No
+SBDG	inverse_isa	SBD	Yes
+SBDG	inverse_isa	SBDF	Yes
+SBDG	tradename_of	SCDG	Keyword
+SBDG	tradename_of	SCDGP	Yes
+SCD	contained_in	BPCK	Yes
+SCD	has_dose_form	DF	No
+SCD	contained_in	GPCK	Yes
+SCD	has_ingredients	MIN	Yes
+SCD	has_tradename	SBD	Yes
+SCD	quantified_form_of	SCD	Yes
+SCD	has_quantified_form	SCD	Yes
+SCD	consists_of	SCDC	Yes
+SCD	isa	SCDF	Keyword
+SCD	isa	SCDG	Keyword
+SCD	isa	SCDFP	Yes
+SCDC	has_ingredient	IN	Keyword
+SCDC	has_precise_ingredient	PIN	Keyword
+SCDC	constitutes	SBD	Yes
+SCDC	has_tradename	SBDC	Yes
+SCDC	constitutes	SCD	Yes
+SCDF	has_dose_form	DF	No
+SCDF	has_ingredient	IN	Keyword
+SCDF	has_tradename	SBDF	Keyword
+SCDF	inverse_isa	SCD	Yes
+SCDF	isa	SCDG	Keyword
+SCDF	has_form	SCDFP	Yes
+SCDFP	inverse_isa	SCD	Yes
+SCDFP	has_tradename	SBDFP	Yes
+SCDFP	form_of	SCDF	Yes
+SCDFP	isa	SCDGP	Yes
+SCDFP	has_boss	IN	Keyword
+SCDFP	has_boss	PIN	Keyword
+SCDG	has_doseformgroup	DFG	No
+SCDG	has_ingredient	IN	Keyword
+SCDG	has_tradename	SBDG	Keyword
+SCDG	inverse_isa	SCD	Yes
+SCDG	inverse_isa	SCDF	Yes
+SCDGP	has_tradename	SBDG	Keyword
+SCDGP	inverse_isa	SCDFP	Yes
+SCDGP	form_of	SCDG	Keyword