diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index c8a451b1..56acd4c8 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,5 +1,5 @@ ### Checklist -- [ ] Consider if documentation (like in `docs/`) needs to be updated +- [ ] Consider if documentation (like in `docs/`, or if you've changed the structure of a table) needs to be updated - [ ] Consider if tests should be added - [ ] Update template repo if there are changes to study configuration \ No newline at end of file diff --git a/cumulus_library/studies/core/reference_sql/builder_condition.sql b/cumulus_library/studies/core/reference_sql/builder_condition.sql index 03648b18..492481fb 100644 --- a/cumulus_library/studies/core/reference_sql/builder_condition.sql +++ b/cumulus_library/studies/core/reference_sql/builder_condition.sql @@ -25,7 +25,7 @@ CREATE TABLE core__condition_dn_category AS ( s.row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -66,7 +66,7 @@ CREATE TABLE core__condition_dn_clinical_status AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -131,7 +131,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -147,7 +147,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '1' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -163,7 +163,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '2' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -179,7 +179,7 @@ CREATE TABLE 
core__condition_codable_concepts_display AS ( '3' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -195,7 +195,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '4' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -211,7 +211,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '5' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -227,7 +227,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( '6' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -351,7 +351,7 @@ CREATE TABLE core__condition_codable_concepts_all AS ( 0 AS row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, @@ -391,7 +391,7 @@ CREATE TABLE core__condition_dn_verification_status AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM condition AS s, diff --git a/cumulus_library/studies/core/reference_sql/builder_documentreference.sql b/cumulus_library/studies/core/reference_sql/builder_documentreference.sql index ffffe5de..74ed6738 100644 --- a/cumulus_library/studies/core/reference_sql/builder_documentreference.sql +++ b/cumulus_library/studies/core/reference_sql/builder_documentreference.sql @@ -15,7 +15,7 @@ CREATE TABLE core__documentreference_dn_type AS ( 0 AS row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM documentreference AS s, @@ -65,7 +65,7 @@ CREATE TABLE core__documentreference_dn_category AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + 
u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -121,38 +121,16 @@ CREATE TABLE core__documentreference_dn_category AS ( -- ########################################################### -CREATE TABLE core__documentreference_dn_format AS ( - WITH - - system_format_0 AS ( - SELECT DISTINCT - s.id AS id, - u.parent_col.format.code, - u.parent_col.format.display, - u.parent_col.format.system AS system - FROM - documentreference AS s, - UNNEST(s.content) AS u (parent_col) - ), --noqa: LT07 - - union_table AS ( - SELECT - id, - system, - code, - display - FROM system_format_0 - +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__documentreference_dn_format" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar)) ) - SELECT - id, - code, - system, - display - FROM union_table + AS t ("id","code","system","display") + WHERE 1 = 0 -- ensure empty table ); - -- ########################################################### diff --git a/cumulus_library/studies/core/reference_sql/builder_encounter.sql b/cumulus_library/studies/core/reference_sql/builder_encounter.sql index 2b75cc4d..339b6468 100644 --- a/cumulus_library/studies/core/reference_sql/builder_encounter.sql +++ b/cumulus_library/studies/core/reference_sql/builder_encounter.sql @@ -26,7 +26,7 @@ CREATE TABLE core__encounter_dn_type AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -42,7 +42,7 @@ CREATE TABLE core__encounter_dn_type AS ( '1' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -58,7 +58,7 @@ CREATE TABLE core__encounter_dn_type AS ( '2' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -74,7 +74,7 @@ CREATE TABLE 
core__encounter_dn_type AS ( '3' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -90,7 +90,7 @@ CREATE TABLE core__encounter_dn_type AS ( '4' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -106,7 +106,7 @@ CREATE TABLE core__encounter_dn_type AS ( '5' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -122,7 +122,7 @@ CREATE TABLE core__encounter_dn_type AS ( '6' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -138,7 +138,7 @@ CREATE TABLE core__encounter_dn_type AS ( '7' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -154,7 +154,7 @@ CREATE TABLE core__encounter_dn_type AS ( '8' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -290,7 +290,7 @@ CREATE TABLE core__encounter_dn_type AS ( -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__encounter_dn_servicetype" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__encounter_dn_servicetype" AS ( SELECT * FROM ( VALUES @@ -302,7 +302,7 @@ AS ( -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__encounter_dn_priority" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__encounter_dn_priority" AS ( SELECT * FROM ( VALUES @@ -334,7 +334,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '0' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -350,7 
+350,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '1' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -366,7 +366,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '2' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -382,7 +382,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '3' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -398,7 +398,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '4' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -414,7 +414,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( '5' AS priority, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -520,91 +520,23 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( -- ########################################################### -CREATE TABLE core__encounter_dn_dischargedisposition AS ( - WITH - - system_dischargedisposition_0 AS ( - SELECT DISTINCT - s.id AS id, - 0 AS row, - u.coding.code, - u.coding.display, - u.coding.system AS system, - u.coding.userSelected - FROM - encounter AS s, - UNNEST(s.hospitalization.dischargedisposition.coding) AS u (coding) - ), --noqa: LT07 - - union_table AS ( - SELECT - id, - row, - system, - code, - display, - userSelected - FROM system_dischargedisposition_0 - +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__encounter_dn_dischargedisposition" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS bigint),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS boolean)) ) - SELECT - id, - code, - system, - display, - userSelected - FROM 
union_table + AS t ("id","row","code","system","display","userSelected") + WHERE 1 = 0 -- ensure empty table ); - -- ########################################################### CREATE TABLE core__encounter AS WITH -temp_encounter_completion AS ( - WITH - -- Start by grabbing group names and exports times for each Encounter. - temp_completion_times AS ( - SELECT - ece.encounter_id, - -- note that we don't chop the export time down to a DATE, - -- as is typical in the core study - min(from_iso8601_timestamp(ece.export_time)) AS earliest_export - FROM etl__completion_encounters AS ece - GROUP BY ece.encounter_id - ), - - -- Then examine all tables that are at least as recently loaded as the - -- Encounter. (This is meant to detect Conditions that maybe aren't - -- loaded into Athena yet for the Encounter.) - -- Make sure that we have all the tables we care about. - temp_completed_tables AS ( - SELECT - ece.encounter_id, - ( - BOOL_OR(ec.table_name = 'condition') - AND BOOL_OR(ec.table_name = 'documentreference') - AND BOOL_OR(ec.table_name = 'medicationrequest') - AND BOOL_OR(ec.table_name = 'observation') - ) AS is_complete - FROM etl__completion_encounters AS ece - INNER JOIN temp_completion_times AS tct ON tct.encounter_id = ece.encounter_id - INNER JOIN etl__completion AS ec ON ec.group_name = ece.group_name - WHERE tct.earliest_export <= from_iso8601_timestamp(ec.export_time) - GROUP BY ece.encounter_id - ) - - -- Left join back with main completion_encounters table, - -- to catch rows that are completion-tracked but not in - -- temp_completed_tables. 
- SELECT - ece.encounter_id AS id, - (is_complete IS NOT NULL AND is_complete) AS is_complete - FROM etl__completion_encounters AS ece - LEFT JOIN temp_completed_tables AS tct ON tct.encounter_id = ece.encounter_id -), +temp_encounter_completion AS (SELECT cast('' AS varchar) AS id, FALSE AS is_complete WHERE 1=0), temp_encounter_nullable AS ( SELECT DISTINCT @@ -715,48 +647,7 @@ WHERE CREATE TABLE core__incomplete_encounter AS WITH -temp_encounter_completion AS ( - WITH - -- Start by grabbing group names and exports times for each Encounter. - temp_completion_times AS ( - SELECT - ece.encounter_id, - -- note that we don't chop the export time down to a DATE, - -- as is typical in the core study - min(from_iso8601_timestamp(ece.export_time)) AS earliest_export - FROM etl__completion_encounters AS ece - GROUP BY ece.encounter_id - ), - - -- Then examine all tables that are at least as recently loaded as the - -- Encounter. (This is meant to detect Conditions that maybe aren't - -- loaded into Athena yet for the Encounter.) - -- Make sure that we have all the tables we care about. - temp_completed_tables AS ( - SELECT - ece.encounter_id, - ( - BOOL_OR(ec.table_name = 'condition') - AND BOOL_OR(ec.table_name = 'documentreference') - AND BOOL_OR(ec.table_name = 'medicationrequest') - AND BOOL_OR(ec.table_name = 'observation') - ) AS is_complete - FROM etl__completion_encounters AS ece - INNER JOIN temp_completion_times AS tct ON tct.encounter_id = ece.encounter_id - INNER JOIN etl__completion AS ec ON ec.group_name = ece.group_name - WHERE tct.earliest_export <= from_iso8601_timestamp(ec.export_time) - GROUP BY ece.encounter_id - ) - - -- Left join back with main completion_encounters table, - -- to catch rows that are completion-tracked but not in - -- temp_completed_tables. 
- SELECT - ece.encounter_id AS id, - (is_complete IS NOT NULL AND is_complete) AS is_complete - FROM etl__completion_encounters AS ece - LEFT JOIN temp_completed_tables AS tct ON tct.encounter_id = ece.encounter_id -) +temp_encounter_completion AS (SELECT cast('' AS varchar) AS id, FALSE AS is_complete WHERE 1=0) SELECT DISTINCT tec.id FROM temp_encounter_completion AS tec diff --git a/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql b/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql index 5afa591f..401d68e3 100644 --- a/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql +++ b/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql @@ -6,7 +6,7 @@ -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__medication_dn_code" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__medication_dn_code" AS ( SELECT * FROM ( VALUES @@ -27,7 +27,7 @@ CREATE TABLE core__medicationrequest_dn_inline_code AS ( 0 AS row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM medicationrequest AS s, @@ -57,7 +57,7 @@ CREATE TABLE core__medicationrequest_dn_inline_code AS ( -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__medicationrequest_dn_contained_code" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__medicationrequest_dn_contained_code" AS ( SELECT * FROM ( VALUES @@ -88,7 +88,7 @@ CREATE TABLE core__medicationrequest_dn_category AS ( s.row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, diff --git a/cumulus_library/studies/core/reference_sql/builder_observation.sql b/cumulus_library/studies/core/reference_sql/builder_observation.sql index d3325914..0883c588 100644 --- a/cumulus_library/studies/core/reference_sql/builder_observation.sql +++ 
b/cumulus_library/studies/core/reference_sql/builder_observation.sql @@ -25,7 +25,7 @@ CREATE TABLE core__observation_dn_category AS ( s.row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM flattened_rows AS s, @@ -65,7 +65,7 @@ CREATE TABLE core__observation_dn_code AS ( 0 AS row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM observation AS s, @@ -95,167 +95,43 @@ CREATE TABLE core__observation_dn_code AS ( -- ########################################################### -CREATE TABLE core__observation_component_code AS ( - WITH - - flattened_rows AS ( - SELECT DISTINCT - t.id AS id, - ROW_NUMBER() OVER (PARTITION BY id) AS row, - r."code" - FROM - observation AS t, - UNNEST(t."component") AS parent (r) - ), - - system_code_0 AS ( - SELECT DISTINCT - s.id AS id, - s.row, - u.coding.code, - u.coding.display, - u.coding.system AS system, - u.coding.userSelected - FROM - flattened_rows AS s, - UNNEST(s.code.coding) AS u (coding) - ), --noqa: LT07 - - union_table AS ( - SELECT - id, - row, - system, - code, - display, - userSelected - FROM system_code_0 - +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_component_code" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS bigint),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS boolean)) ) - SELECT - id, - row, - code, - system, - display, - userSelected - FROM union_table + AS t ("id","row","code","system","display","userSelected") + WHERE 1 = 0 -- ensure empty table ); - -- ########################################################### -CREATE TABLE core__observation_component_dataabsentreason AS ( - WITH - - flattened_rows AS ( - SELECT DISTINCT - t.id AS id, - ROW_NUMBER() OVER (PARTITION BY id) AS row, - r."dataabsentreason" - FROM - observation AS t, - UNNEST(t."component") AS parent (r) - ), - - system_dataabsentreason_0 AS ( - 
SELECT DISTINCT - s.id AS id, - s.row, - u.coding.code, - u.coding.display, - u.coding.system AS system, - u.coding.userSelected - FROM - flattened_rows AS s, - UNNEST(s.dataabsentreason.coding) AS u (coding) - ), --noqa: LT07 - - union_table AS ( - SELECT - id, - row, - system, - code, - display, - userSelected - FROM system_dataabsentreason_0 - +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_component_dataabsentreason" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS bigint),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS boolean)) ) - SELECT - id, - row, - code, - system, - display, - userSelected - FROM union_table + AS t ("id","row","code","system","display","userSelected") + WHERE 1 = 0 -- ensure empty table ); - -- ########################################################### -CREATE TABLE core__observation_component_interpretation AS ( - WITH - - flattened_rows AS ( - SELECT DISTINCT - t.id AS id, - ROW_NUMBER() OVER (PARTITION BY id) AS row, - r."interpretation" - FROM - observation AS t, - UNNEST(t."component") AS parent (r) - ), - - child_flattened_rows AS ( - SELECT DISTINCT - s.id, - s.row, -- keep the parent row number - u."interpretation" - FROM - flattened_rows AS s, - UNNEST(s.interpretation) AS u ("interpretation") - ), - - system_interpretation_0 AS ( - SELECT DISTINCT - s.id AS id, - s.row, - u.coding.code, - u.coding.display, - u.coding.system AS system, - u.coding.userSelected - FROM - child_flattened_rows AS s, - UNNEST(s.interpretation.coding) AS u (coding) - ), --noqa: LT07 - - union_table AS ( - SELECT - id, - row, - system, - code, - display, - userSelected - FROM system_interpretation_0 - +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_component_interpretation" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS bigint),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS boolean)) ) - SELECT - id, - 
row, - code, - system, - display, - userSelected - FROM union_table + AS t ("id","row","code","system","display","userSelected") + WHERE 1 = 0 -- ensure empty table ); - -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__observation_component_valuecodeableconcept" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_component_valuecodeableconcept" AS ( SELECT * FROM ( VALUES @@ -267,7 +143,7 @@ AS ( -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__observation_dn_interpretation" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_dn_interpretation" AS ( SELECT * FROM ( VALUES @@ -288,7 +164,7 @@ CREATE TABLE core__observation_dn_valuecodeableconcept AS ( 0 AS row, u.coding.code, u.coding.display, - u.coding.system AS system, + u.coding.system, u.coding.userSelected FROM observation AS s, @@ -318,7 +194,7 @@ CREATE TABLE core__observation_dn_valuecodeableconcept AS ( -- ########################################################### -CREATE TABLE IF NOT EXISTS "main"."core__observation_dn_dataabsentreason" +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__observation_dn_dataabsentreason" AS ( SELECT * FROM ( VALUES @@ -415,13 +291,40 @@ WHERE CREATE TABLE core__observation_component_valuequantity AS ( + WITH + + flattened_rows AS ( + SELECT DISTINCT + t.id AS id, + ROW_NUMBER() OVER (PARTITION BY id) AS row, + r."component" + FROM + observation AS t, + UNNEST(t."component") AS r ("component") + ), + + flattened_quantities AS ( + SELECT + f.id, + f.row, + f.component.valueQuantity.value AS value, + cast(NULL as varchar) AS comparator, + f.component.valueQuantity.unit AS unit, + f.component.valueQuantity.system AS system, + f.component.valueQuantity.code AS code + FROM flattened_rows AS f + WHERE f.component.valueQuantity IS NOT NULL + ) + SELECT - 'x' AS id, - CAST(NULL AS BIGINT) AS row, - CAST(NULL AS DOUBLE) AS value, -- 
noqa: disable=L029 - 'x' AS comparator, - 'x' AS unit, - 'x' AS system, - 'x' AS code - WHERE 1=0 -- empty table + f.id, + f.row, + -- We ensure value is a double, because nullable_cols() above will cast + -- as varchar if value isn't in the schema. + CAST(f.value AS DOUBLE) AS value, -- noqa: disable=L029 + f.comparator, + f.unit, + f.system, + f.code + FROM flattened_quantities AS f ); diff --git a/cumulus_library/studies/core/reference_sql/builder_patient.sql b/cumulus_library/studies/core/reference_sql/builder_patient.sql index 4e8bcbc8..1afbeed1 100644 --- a/cumulus_library/studies/core/reference_sql/builder_patient.sql +++ b/cumulus_library/studies/core/reference_sql/builder_patient.sql @@ -6,198 +6,26 @@ -- ########################################################### -CREATE TABLE core__patient_ext_race AS ( - WITH - - system_ombCategory AS ( - SELECT DISTINCT - s.id AS id, - '0' AS priority, - 'ombCategory' AS system, -- noqa: RF04 - ext_child.ext.valuecoding.code AS race_code, - ext_child.ext.valuecoding.display AS race_display - FROM - patient AS s, - UNNEST(s.extension) AS ext_parent (ext), - UNNEST(ext_parent.ext.extension) AS ext_child (ext) - WHERE - ext_parent.ext.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race' - AND ext_child.ext.url = 'ombCategory' - AND ext_child.ext.valuecoding.display != '' - ), - - system_detailed AS ( - SELECT DISTINCT - s.id AS id, - '1' AS priority, - 'detailed' AS system, -- noqa: RF04 - ext_child.ext.valuecoding.code AS race_code, - ext_child.ext.valuecoding.display AS race_display - FROM - patient AS s, - UNNEST(s.extension) AS ext_parent (ext), - UNNEST(ext_parent.ext.extension) AS ext_child (ext) - WHERE - ext_parent.ext.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race' - AND ext_child.ext.url = 'detailed' - AND ext_child.ext.valuecoding.display != '' - ), - - union_table AS ( - SELECT - id, - priority, - system, - race_code, - race_display - FROM system_ombCategory - UNION - 
SELECT - id, - priority, - system, - race_code, - race_display - FROM system_detailed - - ORDER BY id, priority - ) - - SELECT - id, - system, - race_code, - race_display - FROM ( - SELECT - id, - system, - LOWER( - ARRAY_JOIN( - ARRAY_SORT( - ARRAY_AGG( - race_code - ) - ), '; ' - ) - ) - AS race_code, - LOWER( - ARRAY_JOIN( - ARRAY_SORT( - ARRAY_AGG( - race_display - ) - ), '; ' - ) - ) AS race_display, - ROW_NUMBER() - OVER ( - PARTITION BY id - ORDER BY priority ASC - ) AS available_priority - FROM union_table - GROUP BY id, system, priority +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__patient_ext_race" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar)) ) - WHERE available_priority = 1 + AS t ("id","system","race_code","race_display") + WHERE 1 = 0 -- ensure empty table ); -- ########################################################### -CREATE TABLE core__patient_ext_ethnicity AS ( - WITH - - system_ombCategory AS ( - SELECT DISTINCT - s.id AS id, - '0' AS priority, - 'ombCategory' AS system, -- noqa: RF04 - ext_child.ext.valuecoding.code AS ethnicity_code, - ext_child.ext.valuecoding.display AS ethnicity_display - FROM - patient AS s, - UNNEST(s.extension) AS ext_parent (ext), - UNNEST(ext_parent.ext.extension) AS ext_child (ext) - WHERE - ext_parent.ext.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity' - AND ext_child.ext.url = 'ombCategory' - AND ext_child.ext.valuecoding.display != '' - ), - - system_detailed AS ( - SELECT DISTINCT - s.id AS id, - '1' AS priority, - 'detailed' AS system, -- noqa: RF04 - ext_child.ext.valuecoding.code AS ethnicity_code, - ext_child.ext.valuecoding.display AS ethnicity_display - FROM - patient AS s, - UNNEST(s.extension) AS ext_parent (ext), - UNNEST(ext_parent.ext.extension) AS ext_child (ext) - WHERE - ext_parent.ext.url = 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity' - AND ext_child.ext.url = 
'detailed' - AND ext_child.ext.valuecoding.display != '' - ), - - union_table AS ( - SELECT - id, - priority, - system, - ethnicity_code, - ethnicity_display - FROM system_ombCategory - UNION - SELECT - id, - priority, - system, - ethnicity_code, - ethnicity_display - FROM system_detailed - - ORDER BY id, priority - ) - - SELECT - id, - system, - ethnicity_code, - ethnicity_display - FROM ( - SELECT - id, - system, - LOWER( - ARRAY_JOIN( - ARRAY_SORT( - ARRAY_AGG( - ethnicity_code - ) - ), '; ' - ) - ) - AS ethnicity_code, - LOWER( - ARRAY_JOIN( - ARRAY_SORT( - ARRAY_AGG( - ethnicity_display - ) - ), '; ' - ) - ) AS ethnicity_display, - ROW_NUMBER() - OVER ( - PARTITION BY id - ORDER BY priority ASC - ) AS available_priority - FROM union_table - GROUP BY id, system, priority +CREATE TABLE IF NOT EXISTS "cumulus_mhg_dev_db"."core__patient_ext_ethnicity" +AS ( + SELECT * FROM ( + VALUES + (cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar),cast(NULL AS varchar)) ) - WHERE available_priority = 1 + AS t ("id","system","ethnicity_code","ethnicity_display") + WHERE 1 = 0 -- ensure empty table ); -- ########################################################### diff --git a/docs/core-study-details.md b/docs/core-study-details.md index f209fc5f..9bf79279 100644 --- a/docs/core-study-details.md +++ b/docs/core-study-details.md @@ -1,7 +1,7 @@ --- title: Core Study Details parent: Library -nav_order: 5 +nav_order: 6 # audience: clinical researchers, IRB reviewers # type: reference --- @@ -59,6 +59,38 @@ You can see which encounters were ignored as incomplete by examining the Usually, you can resolve this by running the ETL process again for the encounters, making sure to include all associated resources. +## Optional fields + +The core study includes several fields that are considered optional by FHIR/US core. +These are included due to their general utility in clinical informatics studies. 
+In practice, we have found that this data is usually present in FHIR exports from +EHR systems, but note that it is not guaranteed that a study relying on these +fields will work across multiple institutions without some additional work. + +Per resource, the optional fields are as follows: +- Condition + - recordedDate + - encounter_ref +- Encounter + - serviceType + - priority + - reasonCode + - dischargeDisposition +- Observation + - dataAbsentReason +- Observation - vital signs + - valueCodeableConcept + - interpretation +- Patient + - US Core race extension + - US Core ethnicity extension + +## Deprecation Notice + +The `core__observation` table is currently deprecated, and will be removed in a +future version. When possible, use one of the targeted profile tables (labs, +vital signs) instead. + ## core count tables ### core__count_condition_month @@ -190,7 +222,7 @@ making sure to include all associated resources. |category_system |varchar| | |category_display |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |code_display |varchar| | |subject_ref |varchar| | |encounter_ref |varchar| | @@ -209,7 +241,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -220,7 +252,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -232,7 +264,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -243,7 +275,7 @@ making sure to include all associated resources. 
|------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -254,7 +286,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -288,19 +320,19 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | ### core__documentreference_dn_format -| Column | Type |Description| -|-----------|-------|-----------| -|id |varchar| | -|code |varchar| | -|code_system|varchar| | -|display |varchar| | +|Column | Type |Description| +|-------|-------|-----------| +|id |varchar| | +|code |varchar| | +|system |varchar| | +|display|varchar| | ### core__documentreference_dn_type @@ -309,7 +341,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -368,8 +400,9 @@ making sure to include all associated resources. | Column | Type |Description| |------------|-------|-----------| |id |varchar| | +|row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -379,8 +412,9 @@ making sure to include all associated resources. | Column | Type |Description| |------------|-------|-----------| |id |varchar| | +|row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -392,7 +426,7 @@ making sure to include all associated resources. 
|id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -402,8 +436,9 @@ making sure to include all associated resources. | Column | Type |Description| |------------|-------|-----------| |id |varchar| | +|row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -415,7 +450,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -477,30 +512,30 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | ### core__medicationrequest -| Column | Type |Description| -|----------------------|-------|-----------| -|id |varchar| | -|status |varchar| | -|intent |varchar| | -|category_code |varchar| | -|category_code_system |varchar| | -|category_display |varchar| | -|reportedboolean |boolean| | -|reported_ref |varchar| | -|subject_ref |varchar| | -|encounter_ref |varchar| | -|authoredon |date | | -|authoredon_month |date | | -|medication_code |varchar| | -|medication_code_system|varchar| | -|medication_display |varchar| | +| Column | Type |Description| +|------------------|-------|-----------| +|id |varchar| | +|status |varchar| | +|intent |varchar| | +|category_code |varchar| | +|category_system |varchar| | +|category_display |varchar| | +|reportedboolean |boolean| | +|reported_ref |varchar| | +|subject_ref |varchar| | +|encounter_ref |varchar| | +|authoredon |date | | +|authoredon_month |date | | +|medication_code |varchar| | +|medication_system |varchar| | +|medication_display|varchar| | ### core__medicationrequest_dn_category @@ -510,7 +545,7 @@ making sure to include all associated resources. 
|id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -522,7 +557,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected |boolean| | |contained_id |varchar| | @@ -535,7 +570,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -596,7 +631,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -608,7 +643,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -620,7 +655,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -632,22 +667,22 @@ making sure to include all associated resources. 
|id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | ### core__observation_component_valuequantity -| Column | Type |Description| -|-----------|-------|-----------| -|id |varchar| | -|row |bigint | | -|value |double | | -|comparator |varchar| | -|unit |varchar| | -|code_system|varchar| | -|code |varchar| | +| Column | Type |Description| +|----------|-------|-----------| +|id |varchar| | +|row |bigint | | +|value |double | | +|comparator|varchar| | +|unit |varchar| | +|system |varchar| | +|code |varchar| | ### core__observation_dn_category @@ -657,7 +692,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -668,7 +703,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -678,8 +713,9 @@ making sure to include all associated resources. | Column | Type |Description| |------------|-------|-----------| |id |varchar| | +|row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -691,7 +727,7 @@ making sure to include all associated resources. |id |varchar| | |row |bigint | | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -702,7 +738,7 @@ making sure to include all associated resources. |------------|-------|-----------| |id |varchar| | |code |varchar| | -|code_system |varchar| | +|system |varchar| | |display |varchar| | |userselected|boolean| | @@ -774,22 +810,22 @@ making sure to include all associated resources. 
### core__patient_ext_ethnicity -| Column | Type |Description| -|-----------------|-----------|-----------| -|id |varchar | | -|system |varchar(11)| | -|ethnicity_code |varchar | | -|ethnicity_display|varchar | | +| Column | Type |Description| +|-----------------|-------|-----------| +|id |varchar| | +|system |varchar| | +|ethnicity_code |varchar| | +|ethnicity_display|varchar| | ### core__patient_ext_race -| Column | Type |Description| -|------------|-----------|-----------| -|id |varchar | | -|system |varchar(11)| | -|race_code |varchar | | -|race_display|varchar | | +| Column | Type |Description| +|------------|-------|-----------| +|id |varchar| | +|system |varchar| | +|race_code |varchar| | +|race_display|varchar| | ### core__study_period @@ -818,3 +854,4 @@ making sure to include all associated resources. |doc_type_code |varchar | | |doc_type_display |varchar | | |ed_note |boolean | | + diff --git a/docs/sharing-data.md b/docs/sharing-data.md index 5632d05d..3ae1d584 100644 --- a/docs/sharing-data.md +++ b/docs/sharing-data.md @@ -1,7 +1,7 @@ --- title: Data Sharing parent: Library -nav_order: 6 +nav_order: 7 # audience: IT security or clinical researcher with low to medium familiarity with project # type: explanation --- diff --git a/docs/statistics.md b/docs/statistics.md index a9a51d56..ae4ab75d 100644 --- a/docs/statistics.md +++ b/docs/statistics.md @@ -1,7 +1,7 @@ --- title: Statistics parent: Library -nav_order: 7 +nav_order: 8 has_children: true # audience: Clinical researchers interested in publications # type: reference diff --git a/docs/study-list.md b/docs/study-list.md index 4b29ad23..63d60884 100644 --- a/docs/study-list.md +++ b/docs/study-list.md @@ -1,7 +1,7 @@ --- title: Cumulus studies parent: Library -nav_order: 8 +nav_order: 9 # audience: Clinical researchers interested in publications # type: reference --- diff --git a/docs/third-party-software-citations.md b/docs/third-party-software-citations.md index 269702b2..34fa0bc4 100644 --- 
a/docs/third-party-software-citations.md +++ b/docs/third-party-software-citations.md @@ -1,7 +1,7 @@ --- title: Third party software citations parent: Library -nav_order: 9 +nav_order: 10 # audience: Clinical researchers interested in publications # type: reference --- diff --git a/docs/writing-sql.md b/docs/writing-sql.md new file mode 100644 index 00000000..468abecb --- /dev/null +++ b/docs/writing-sql.md @@ -0,0 +1,158 @@ +--- +title: SQL best practices +parent: Library +nav_order: 5 +# audience: clinical researcher or engineer familiar with project +# type: reference +--- + +This doc contains low-level advice on SQL patterns that will help +avoid errors and make queries more performant. + +# General Syntax + +Our primary target is +[Amazon Athena](https://aws.amazon.com/athena/), +which is based off of +[Trino](https://trino.io/). +Searching for topics on either of these DBs is a good way to debug syntax issues +not mentioned in the scope of this document. The +[Trino functions reference](https://trino.io/docs/current/functions.html) +is particularly useful. + +We use +[DuckDB](https://duckdb.org/) +for our unit tests, and it could be used as a datastore locally if desired. If there +is a difference between DuckDB and Athena/Trino, we patch DuckDB with a +[user defined function](https://duckdb.org/docs/api/python/function.html). +You can see some examples of this in the +[DuckDB DatabaseBackend](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/databases/duckdb.py#L37). +Hopefully you will not have to do this. If you do, we are probably interested +in supporting it, so please reach out and let us know. + +## Dates + +FHIR dates/times come across the wires as ISO 8601-formatted strings. +To get them into actual timestamps, use the `from_iso8601_timestamp` function as +part of your query, and `CAST` to either `date` or `timestamp`. If you want to get a +specific portion of a date, like a month or year, use the `date_trunc` function.
+An example use case: + +```sql + SELECT + date(from_iso8601_timestamp(dr.date)) AS doc_date, + date_trunc('month', date(from_iso8601_timestamp(dr."context"."period"."start"))) + AS author_month + FROM documentreference AS dr +``` + +## Arrays + +For 90% of use cases, avoid array handling entirely! The `core` study exists to shield +you from this. This is only relevant if you're doing something that is fussy, or if you +have a reason to work from raw FHIR objects in the database. + +FHIR objects are deeply nested, and usually we want to unpack these values to get them +into 2D tables. The ETL has done a lot of work already to make this safe for you, but it's +not perfect - we need the database to have a complete schema for a nested object, and the +ETL will fill this out for two levels of depth, but there are some cases where we just can't +guess how deep of a schema we need to prepopulate (extension can recurse indefinitely). In +cases where we do go deep, the ETL will try to infer the schema from the data. This can cause +headaches if someone else tries to run your study, since they may have different data. + +So - if you're going deep into the data model, you may want to check `information_schema.columns` +for the field in question, and verify that the field actually exists, before doing any +additional query. This also means that generally for raw FHIR traversal, you should use the +[TableBuilder pattern](https://docs.smarthealthit.org/cumulus/library/creating-sql-with-python.html#working-with-tablebuilders) +so you have access to boolean logic.
+ +There is good +[documentation](https://docs.aws.amazon.com/athena/latest/ug/querying-arrays.html) +for querying arrays in Athena, but the following functions are particularly useful: + +- [unnest](https://docs.aws.amazon.com/athena/latest/ug/filtering-with-unnest.html) +will break out an array into a table, where keys are the column names and the rows +are the values in arrays +- [array_sort](https://docs.aws.amazon.com/athena/latest/ug/sorting-arrays.html) +does exactly what you'd think - provides the contents of a list in alpha sorted order +- [array_join](https://docs.aws.amazon.com/athena/latest/ug/converting-arrays-to-strings.html) +lets you convert arrays into a string, joined by the delimiter of your choice +- [array_agg](https://docs.aws.amazon.com/athena/latest/ug/arrays-and-aggregation.html) +lets you aggregate values across rows into an array. This can be very slow, so use with caution + +## Cubes + +Cubing tables before export is central to our strategy for masking PII. We provide a +[count builder](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/statistics/counts.py) +specifically to manage the details of this for you, but you can implement this yourself +if you prefer using the +[cube operator](https://trino.io/docs/current/sql/select.html#cube). + +Importantly - make sure the table(s) you're using for your cube do not contain explicit +null values, since these are indistinguishable from the nulls created when cubing outputs. +Instead, +[coalesce](https://trino.io/docs/current/functions/conditional.html#coalesce) +fields to some static value, like 'None', if required. Note that we use the +string `cumulus__none` as a way to say 'this is a null value distinct from +a powerset null', and the dashboard will handle this gracefully during rendering.
+ +# Performance + +There are several best practices outlined in the +[Athena performance docs](https://docs.aws.amazon.com/athena/latest/ug/performance-tuning.html#performance-tuning-query-optimization-techniques) +that you can use to try and improve query performance, but we'll call out several patterns +here that we have found useful for improving performance in our specific use case. + +## Split complex queries out into simpler chained subqueries + +Athena (and most other large database systems) uses a hive-style map-reduce algorithm to +run queries across several nodes to improve throughput. If the query analysis can't identify +a good way to farm out a query, it will run that query on a single node, which can take +significantly longer. + +It is significantly faster to chain together simpler subqueries. In one example, using a +NOT IN clause to compare the contents of a column to a list of ~300 words, the complex +query took over an hour, while chaining 300 subqueries together with a UNION clause +took around 40 seconds, so it's worth really leaning in to atomization (or writing +templates that will create more atomic queries, more on that below) if you start seeing +long run times. Try to limit the number of joins, array operations, and aggregation inside +a single query. + +## Limit group by/order by usage + +The group by/order by operation tends to be confined to a single node for execution, and +may interfere with map/reduce spreading, so it's best to avoid it unless it's needed. Consider +using these once, at the end of a chain of subqueries, and having it be the only operation +performed at this step. + +It is generally faster to sort data outside of Athena if possible. This is the approach +we use with exporting powersets - we do the sorting as we generate the csv/parquet artifacts. + +## Join order is important + +Due to the mechanism by which Athena computes joins, it is generally faster to join, from +left to right, in order of largest to smallest table.
+ +## Use views sparingly + +Especially when writing queries designed for use at multiple institutions, be judicious +with your use of views - as the underlying tables get large, views can become a bottleneck +for institutions with large populations (especially state-level health information +exchanges). Good candidates for usage of views include static data sets (like coding +definition tables), and study tables intended for very small populations by design. + +# Jinja templating + +If you've worked with data tools like DBT before, you may already be familiar with +[Jinja templates](https://pypi.org/project/Jinja2/) +as a way of making SQL more dynamic at runtime. If not, learning just +[a few commands](https://docs.hyperquery.ai/docs/basic-syntax-overview) +will enable most of the basic functionality you'll need to conditionally render SQL. +We have a collection of +[generic templates](https://github.com/smart-on-fhir/cumulus-library/tree/main/cumulus_library/template_sql) +that cover some basic use cases, and several studies have custom Jinja templates as well. The +[base template function](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/template_sql/base_templates.py#L20) +handles the loading and populating of these templates at run time (and base_templates.py +contains a bunch of convenience functions that demonstrate the proper way to invoke it), +so you shouldn't need to worry about the lower level Jinja infrastructure if you reuse +this entry point. \ No newline at end of file