Skip to content

Commit

Permalink
CU-86964zm4d fix preprocessing (CogStack#496)
Browse files Browse the repository at this point in the history
* CU-86964zm4d: Use ignore tag correctly to ignore certain parts of UK release

* CU-86964zm4d: Use OPCS4 later refset ID by default (and switch to older if needed)

* CU-86964zm4d: Fix OPCS4 refset ID tests.

Fix the default value being tested for (i.e. the value that is shown in the case of an international release).
Add a test for old UK extension.

* CU-86964zm4d: Add note regarding OPCS refset ID relevance only for UK extensions.

* CU-86964zm4d: Fix checking of extension outside loops.

I.e. determine whether a UK release/bundle is used when splitting the OPCS4/ICD10 mappings.
Always return separate refsets for ICD10 and OPCS4 internally, even if the latter is None.
  • Loading branch information
mart-r authored Nov 1, 2024
1 parent 04efda5 commit e924798
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 14 deletions.
29 changes: 15 additions & 14 deletions medcat/utils/preprocess_snomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,15 +265,17 @@ def _determine_bundle(cls, data_path) -> Optional[SupportedBundles]:
return None

def _set_extension(self, release: str, extension: SupportedExtension) -> None:
self.opcs_refset_id = "1126441000000105"
# NOTE: now using the later refset ID by default
# NOTE: the OPCS4 refset ID is only relevant for UK releases
self.opcs_refset_id = '1382401000000109'
if (extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG) and
# using lexicographical comparison below
# e.g "20240101" > "20231122" results in True
# yet "20231121" > "20231122" results in False
len(release) == len("20231122") and release >= "20231122"):
len(release) == len("20231122") and release < "20231122"):
# NOTE for UK extensions released before 20231122 the
# OPCS4 refset ID is the older one
self.opcs_refset_id = '1382401000000109'
self.opcs_refset_id = "1126441000000105"
self._extension = extension

@classmethod
Expand Down Expand Up @@ -329,7 +331,7 @@ def to_concept_df(self):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
description_snapshot = self._extension.value.exp_files.get_description()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -404,7 +406,7 @@ def list_all_relationships(self):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
relationship_snapshot = self._extension.value.exp_files.get_relationship()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -440,7 +442,7 @@ def relationship2json(self, relationshipcode, output_jsonfile):
contents_path = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.concept])
concept_snapshot = self._extension.value.exp_files.get_concept()
relationship_snapshot = self._extension.value.exp_files.get_relationship()
if concept_snapshot in (None, _IGNORE_TAG) or (
if concept_snapshot is None or _IGNORE_TAG in concept_snapshot or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand Down Expand Up @@ -476,10 +478,7 @@ def map_snomed2icd10(self):
dict: A dictionary containing the SNOMED CT to ICD-10 mappings including metadata.
"""
snomed2icd10df = self._map_snomed2refset()
if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
return self._refset_df2dict(snomed2icd10df[0])
else:
return self._refset_df2dict(snomed2icd10df)
return self._refset_df2dict(snomed2icd10df[0])

def map_snomed2opcs4(self) -> dict:
"""
Expand All @@ -494,7 +493,8 @@ def map_snomed2opcs4(self) -> dict:
Returns:
dict: A dictionary containing the SNOMED CT to OPCS-4 mappings including metadata.
"""
if self._extension not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
if all(ext not in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG)
for ext in self.exts):
raise AttributeError(
"OPCS-4 mapping does not exist in this edition")
snomed2opcs4df = self._map_snomed2refset()[1]
Expand Down Expand Up @@ -566,7 +566,7 @@ def _map_snomed2refset(self):
self._set_extension(snomed_release, self.exts[i])
refset_terminology = os.path.join(self.paths[i], PER_FILE_TYPE_PATHS[RefSetFileType.refset])
icd10_ref_set = self._extension.value.exp_files.get_refset()
if icd10_ref_set in (None, _IGNORE_TAG) or (
if icd10_ref_set is None or _IGNORE_TAG in icd10_ref_set or (
self.bundle and self.bundle.value.has_invalid(
self._extension, [RefSetFileType.concept, RefSetFileType.description])):
continue
Expand All @@ -582,13 +582,14 @@ def _map_snomed2refset(self):
dfs2merge.append(icd_mappings)
mapping_df = pd.concat(dfs2merge)
del dfs2merge
if self._extension in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG):
if any(ext in (SupportedExtension.UK_CLINICAL, SupportedExtension.UK_DRUG)
for ext in self.exts):
opcs_df = mapping_df[mapping_df['refsetId'] == self.opcs_refset_id]
icd10_df = mapping_df[mapping_df['refsetId']
== '999002271000000101']
return icd10_df, opcs_df
else:
return mapping_df
return mapping_df, None


class UnkownSnomedReleaseException(ValueError):
Expand Down
8 changes: 8 additions & 0 deletions tests/utils/test_preprocess_snomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_example_no_codfe_fails(self):


EXAMPLE_SNOMED_PATH_OLD = "SnomedCT_InternationalRF2_PRODUCTION_20220831T120000Z"
EXAMPLE_SNOMED_PATH_OLD_UK = "SnomedCT_UKClinicalRF2_PRODUCTION_20220831T120000Z"
EXAMPLE_SNOMED_PATH_NEW = "SnomedCT_UKClinicalRF2_PRODUCTION_20231122T000001Z"


Expand Down Expand Up @@ -87,6 +88,13 @@ def test_old_gets_old_OPCS4_mapping(self):
snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD)
snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD),
snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD))
self.assertEqual(snomed.opcs_refset_id, "1382401000000109") # defaults to this now

def test_old_gets_old_OPCS4_mapping_UK(self):
with patch_fake_files(EXAMPLE_SNOMED_PATH_OLD_UK):
snomed = preprocess_snomed.Snomed(EXAMPLE_SNOMED_PATH_OLD_UK)
snomed._set_extension(snomed._determine_release(EXAMPLE_SNOMED_PATH_OLD_UK),
snomed._determine_extension(EXAMPLE_SNOMED_PATH_OLD_UK))
self.assertEqual(snomed.opcs_refset_id, "1126441000000105")

def test_new_gets_new_OCPS4_mapping(self):
Expand Down

0 comments on commit e924798

Please sign in to comment.