Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replaced wildcard match with regex #237

Merged
merged 2 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cumulus_library/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import os
import pathlib
import re
import sys
from pathlib import Path
from typing import Any, Protocol
Expand Down Expand Up @@ -341,6 +342,13 @@ def __init__(self, db_file: str):
None,
duckdb.typing.VARCHAR,
)
self.connection.create_function(
# DuckDB's version is regexp_matches.
mikix marked this conversation as resolved.
Show resolved Hide resolved
"regexp_like",
self._compat_regexp_like,
None,
duckdb.typing.BOOLEAN,
)
self.connection.create_function(
# We frequently use Athena's date() function because it's easier than
# the more widely-supported way of CAST(x AS DATE).
Expand Down Expand Up @@ -393,6 +401,13 @@ def _compat_array_join(
return None
return delimiter.join(v for v in value if v is not None)

@staticmethod
def _compat_regexp_like(string: str | None, pattern: str | None) -> bool:
if string is None or pattern is None:
return None
match = re.search(pattern, string)
return match is not None

@staticmethod
def _compat_date(
value: str | datetime.datetime | datetime.date | None,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{% import 'core_utils.jinja' as utils -%}
{% import 'syntax.sql.jinja' as syntax -%}

CREATE TABLE core__medication AS (
WITH
Expand Down Expand Up @@ -32,15 +33,15 @@ CREATE TABLE core__medication AS (
mr.id,
substring(mr.med_ref, 2) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%'
WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', '#%') }}
),

external_refs AS (
SELECT DISTINCT
mr.id,
substring(mr.med_ref, 12) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%'
WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', 'Medication/%') }}
)

{# Internal: medication data from inline ETL extraction.
Expand Down
18 changes: 9 additions & 9 deletions cumulus_library/studies/core/reference_sql/builder_condition.sql
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ CREATE TABLE core__condition_dn_clinical_status AS (
condition AS s,
UNNEST(s.clinicalStatus.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-clinical'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-clinical')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -137,7 +137,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_code_1 AS (
Expand All @@ -153,7 +153,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm')
), --noqa: LT07

system_code_2 AS (
Expand All @@ -169,7 +169,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm')
), --noqa: LT07

system_code_3 AS (
Expand All @@ -185,7 +185,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm/diagnosis'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm/diagnosis')
), --noqa: LT07

system_code_4 AS (
Expand All @@ -201,7 +201,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286')
), --noqa: LT07

system_code_5 AS (
Expand All @@ -217,7 +217,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.4.698084.10375'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.4\.698084\.10375')
), --noqa: LT07

system_code_6 AS (
Expand All @@ -233,7 +233,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/data-absent-reason'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/data-absent-reason')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -397,7 +397,7 @@ CREATE TABLE core__condition_dn_verification_status AS (
condition AS s,
UNNEST(s.verificationStatus.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-ver-status'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-ver-status')
), --noqa: LT07

union_table AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ CREATE TABLE core__documentreference_dn_category AS (
flattened_rows AS s,
UNNEST(s.category.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/us/core/ValueSet/us-core-documentreference-category'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/us/core/ValueSet/us-core-documentreference-category')
), --noqa: LT07

union_table AS (
Expand Down
30 changes: 15 additions & 15 deletions cumulus_library/studies/core/reference_sql/builder_encounter.sql
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/encounter-type'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/encounter-type')
), --noqa: LT07

system_type_1 AS (
Expand All @@ -48,7 +48,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v2-0004'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v2-0004')
), --noqa: LT07

system_type_2 AS (
Expand All @@ -64,7 +64,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:2.16.840.1.113883.4.642.3.248'
REGEXP_LIKE(u.coding.system, 'urn:oid:2\.16\.840\.1\.113883\.4\.642\.3\.248')
), --noqa: LT07

system_type_3 AS (
Expand All @@ -80,7 +80,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_type_4 AS (
Expand All @@ -96,7 +96,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'https://fhir.cerner.com/%/codeSet/71'
REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/codeSet/71')
), --noqa: LT07

system_type_5 AS (
Expand All @@ -112,7 +112,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.10110'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.10110')
), --noqa: LT07

system_type_6 AS (
Expand All @@ -128,7 +128,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.18875'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.18875')
), --noqa: LT07

system_type_7 AS (
Expand All @@ -144,7 +144,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.30'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.30')
), --noqa: LT07

system_type_8 AS (
Expand All @@ -160,7 +160,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.808267'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.808267')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -340,7 +340,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v3-ActPriority'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v3-ActPriority')
), --noqa: LT07

system_reasoncode_1 AS (
Expand All @@ -356,7 +356,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_reasoncode_2 AS (
Expand All @@ -372,7 +372,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm')
), --noqa: LT07

system_reasoncode_3 AS (
Expand All @@ -388,7 +388,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm')
), --noqa: LT07

system_reasoncode_4 AS (
Expand All @@ -404,7 +404,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'https://fhir.cerner.com/%/nomenclature'
REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/nomenclature')
), --noqa: LT07

system_reasoncode_5 AS (
Expand All @@ -420,7 +420,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286')
), --noqa: LT07

union_table AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,17 @@ CREATE TABLE core__medication AS (
mr.id,
substring(mr.med_ref, 2) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%'
WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, '#.*')
)
),

external_refs AS (
SELECT DISTINCT
mr.id,
substring(mr.med_ref, 12) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%'
WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, 'Medication/.*')
)
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ CREATE TABLE {{ target_table }} AS (
UNNEST(s.{{ field_alias }}.coding) AS u (coding)
{%- if filter_priority %}
WHERE
u.coding.system LIKE '{{ system }}'
{{ syntax.like('u.coding.system', system) }}
{%- endif %}
), --noqa: LT07
{%- endfor %}
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/template_sql/coding_denormalize.sql.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ CREATE TABLE {{ target_table }} AS (
UNNEST(s.{{ parent_field }}) AS u (parent_col)
{%- if filter_priority %}
WHERE
u.parent_col.{{ column_name }}.system LIKE '{{ system }}'
{{ syntax.like("u.parent_col." + column_name + ".system", system) }}
{%- endif %}
), --noqa: LT07
{%- endfor %}
Expand Down
30 changes: 29 additions & 1 deletion cumulus_library/template_sql/shared_macros/syntax.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{# Commonly used macros related to basic SQL syntax formatting #}
{# Commonly used macros related to basic SQL syntax formatting #}-- noqa:disable=JJ01

{%- macro comma_delineate(loop) -%}
{%- if not loop.last -%}
Expand All @@ -17,4 +17,32 @@
UNION
{%- endif -%}
{%- endmacro -%}

{#- The intent of this macro is to convert sql wildcard format to regex syntax.
So for example, a value of macro_string like:

https://fhir.cerner.com/%/codeSet/71

will be converted into a regex-escaped pattern with a regex match anything pattern:

https:\/\/fhir\.cerner\.com\/(.*)\/codeSet\/71

See https://trino.io/docs/current/functions/regexp.html for more information. We
are applying a limited ruleset currently, since this is based on common patterns
in coding system fields, where this is currently used; if needed this could be
made more robust.
-#}
{%- macro wildcard_to_regex(match_string) -%}
'^{{ match_string.replace("\'", "\\\''").replace(".", "\\.").replace("%", ".*") }}$'
{%- endmacro -%}


{#- This macro provides a SQL LIKE-esque interface to abstract away having to
worry about the slightly more performant regex based matching we want to use
in most cases.
-#}
{%- macro like(field, match_string) -%}
REGEXP_LIKE({{ field }}, {{ wildcard_to_regex(match_string) }})
{%- endmacro -%}

--noqa: LT12
3 changes: 2 additions & 1 deletion cumulus_library/template_sql/show_tables.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{%- import 'syntax.sql.jinja' as syntax -%}
SELECT table_name
FROM information_schema.tables
WHERE
table_schema = '{{ schema_name }}'
AND table_type = 'BASE TABLE'
AND table_name LIKE '{{ prefix }}%';
AND {{ syntax.like('table_name', prefix +"%") }}
3 changes: 2 additions & 1 deletion cumulus_library/template_sql/show_views.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{%- import 'syntax.sql.jinja' as syntax -%}
SELECT table_name
FROM information_schema.tables
WHERE
table_schema = '{{ schema_name }}'
AND table_type = 'VIEW'
AND table_name LIKE '{{ prefix }}%';
AND {{ syntax.like('table_name', prefix + "%") }}
Loading