From a18194ec3ae25ea468e5363e5b99905e2e4a83f6 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 6 Mar 2023 13:20:54 +0200 Subject: [PATCH 01/17] CU-8677ge6j8 Version identification and updating (#313) * Expose example model card version in metadata test * Add version detection along with tests * Move to a more comprehensive version string parser (regex) * Add more comprehensive versioning tests * Move MedCAT unzip to a separate method * Separate getting semantic version from string * Add new CDB with version information and use that with versioning tests * Add methods to get version info from CDB dump and model pack zip/folder * Exposing CDB file name and adding custom dev patch version support * Fix config.linking.filters.cuis - from empty dict to empty set * Add logging to versioning * Fix f-strings instead of (intended) r-strings * Add creating model pack archive to versioning CDB fix * Fix logger initialising * Making versioning a runnable module that allows fixing the config * Add docstrings to CLI methods * CU-8677ge6j8 Make explicit check regards to empty dict when fixing config * CU-8677ge6j8 Add tests regarding versioning changes * CU-8677ge6j8 Add missing return type hint * CU-8677ge6j8 Simplify action handling for CLI input * CU-8677ge6j8 Simplifying archive making method --- examples/cdb_new.dat | Bin 0 -> 3366 bytes medcat/cat.py | 36 ++- medcat/utils/versioning.py | 314 ++++++++++++++++++++++++ tests/utils/regression/test_metadata.py | 1 + tests/utils/test_versioning.py | 163 ++++++++++++ 5 files changed, 504 insertions(+), 10 deletions(-) create mode 100644 examples/cdb_new.dat create mode 100644 medcat/utils/versioning.py create mode 100644 tests/utils/test_versioning.py diff --git a/examples/cdb_new.dat b/examples/cdb_new.dat new file mode 100644 index 0000000000000000000000000000000000000000..27957d62bcdbb0013c393684f2036a9f4d7c8f4b GIT binary patch literal 3366 zcmbVP-ESR76~EWl_KjUTj$;#AB}i9-NRE&{Qz0d)FpepP+}Mq5C!wv{(cay;cZR#W 
zv+V5pOHG7?sA4fvEp>Q+C`3)vDCz@!gQouh|3HBU5aN|T0)A(9*ET%xz+TR5Q>iue53CuF zy@<0No@%LLZQsMbp<8d=dvnv8q19V&u54It&Zj(5JDg!7k2Arw)V4ijrXv}%kY^b- zvy}fdmnjcydmuNHUL32;%OH}eJWKM-X)qZmpYgqn#{mzRjJwJnnlWgcfe1<6%-;ML z9y4CiQrQy~aWYi9JoP=zS-ET<=BC->F;Bg)usBpb$l2kcZBLs~nsrqgd6_leygcI; z$lNRX`sJB&(7HSmajm@`sMegJEs2*2YsTj0BOdr(HV2wDuj$0|_gNsd*9oD$ucAc7 za7W>@29&U77#`9zXzp+yE{4P=*#;@_G?-W-WAWRelop&kR%l*6Pr_(#;_=kI7I7 zxZtF-JV|)UT=>ZK+8*X+JV|+yDjxxmaRJ)1x-S!^GnMSBGysvO>Lbu3I4$c*7XL_dUw39ln3Qa8+0^qb$i zF;O$;0)>s)sm<=pyEndXP2=v34fnq7-fMW@yt(P#TlOdymniYZ=y;b`kZaow1PX^a z!^%Ll2h8*-hd7s#62yEVOZSmZl#-m0?F?UF0{&`}ODj(eWvM4)L~TeVnT%Wl3}*WY z2M)ozpT+pe2o=akg0EsuWZAOF0xI-8p*f^*a<&95TcFw<536)P=+6<}Or#t- zgs4t+bI}WTy?qTRhJGG;8L|V24!sn~iH)xnDgn|SLV&u!x&$zkGHXTy8HRH>Bt&wZ zV}8nM`|)$x z@6WIEV0C4?tYV;3W0*_!QRUtU5%s|Pu5HJncjZcEH?3b5yMk?$!AL)2Y*eU?4O+jC zkC|ZXq_S(qK+_-Ej=6Z^%U~bCm;Nc2!=)z#_}$dKZ-w*2>^`I&DVugf0N9&8jg=o_#YFnnmHP;9zDN|MURLyA#?V*giN z_n249PJ@RX3S+h)eDqNYX(xJ^cl#F5*ZSrO!NI2RKP3W zvhCJ~hz;tfv(*(^>=3^+qwl`&%^obwUVHD`6%%@rYw~m$S_!qbs_&yQEY2;=EzDAi zK!FBs4q(F-f}FJEG^6Rzj!g5S;hkAspy%o}sOE-?7;-c|;&>3bi^Tut>HimxczV43 zfR@CkL>bi_P7EISdI!YvtRuV6c1wgJXa*(4BYHhH_1jkb*3?&QE}jtO6Y;rt zDt?!XXEgh?nmzxI*^B#Dd?r4J3x+ibr+~~{?Y+QxPRqx zsgUASf++DwJQ2^uvm^181eg9i*;I{iR4J%$kHlxR^TknaMoD6+*eQ`;ir;>J`PuFQcJ43Y0>p8@Zxd|E$lW5eHK4x%xap@}XyuEbwb^2Lc zxCR?wFWHfs^R%nTHCz{qEOdL#&|QZv@zgZksPzumFA#fGiJ@gz2DmeE0&yAsqQX}i zj*-Ke)ULZ0csLYK4#l%W@%&J{KocGDf*>qzpE$ROlJtYc1(NAvrV?MjPP4OgI8s>Q z3P?9DpyM&ct+BrKLwlqPTNEy-GWJ4e$|*k_8&R3FZpB}#al3HIjQCqcSx3kVv)kY+ zf^J5)9Qs>?euT<}B;eXlC0af@5-DUp-rA>IIjU$Y5lZ75%0vdOT=SqM;}&L0D}cF$ zVn{Qrp?rG#s0Xb~wRDyT`=B^0QOR2ok2GWp#UHV|#1PYBNR!9?) str: + """Attempt unpack the zip to a folder and get the model pack path. + + If the folder already exists, no unpacking is done. 
+ + Args: + zip_path (str): The ZIP path + + Returns: + str: The model pack path + """ + base_dir = os.path.dirname(zip_path) + filename = os.path.basename(zip_path) + + foldername = filename.replace(".zip", '') + + model_pack_path = os.path.join(base_dir, foldername) + if os.path.exists(model_pack_path): + logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path)) + else: + logger.info("Unziping the model pack and loading models.") + shutil.unpack_archive(zip_path, extract_dir=model_pack_path) + return model_pack_path + @classmethod def load_model_pack(cls, zip_path: str, @@ -324,16 +349,7 @@ def load_model_pack(cls, from medcat.vocab import Vocab from medcat.meta_cat import MetaCAT - base_dir = os.path.dirname(zip_path) - filename = os.path.basename(zip_path) - foldername = filename.replace(".zip", '') - - model_pack_path = os.path.join(base_dir, foldername) - if os.path.exists(model_pack_path): - logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path)) - else: - logger.info("Unziping the model pack and loading models.") - shutil.unpack_archive(zip_path, extract_dir=model_pack_path) + model_pack_path = cls.attempt_unpack(zip_path) # Load the CDB cdb_path = os.path.join(model_pack_path, "cdb.dat") diff --git a/medcat/utils/versioning.py b/medcat/utils/versioning.py new file mode 100644 index 000000000..539af0339 --- /dev/null +++ b/medcat/utils/versioning.py @@ -0,0 +1,314 @@ +from typing import Tuple, List +import re +import os +import shutil +import argparse +import logging + +import dill + +from medcat.cat import CAT + +logger = logging.getLogger(__name__) + +SemanticVersion = Tuple[int, int, int] + + +# Regex as per: +# https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string +SEMANTIC_VERSION_REGEX = (r"^(0|[1-9]\d*)" # major + r"\.(0|[1-9]\d*)" # .minor + # CHANGE FROM NORM - allowing dev before 
patch version number + # but NOT capturing the group + r"\.(?:dev)?" + r"(0|[1-9]\d*)" # .patch + # and then some trailing stuff + r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?" + r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$") +SEMANTIC_VERSION_PATTERN = re.compile(SEMANTIC_VERSION_REGEX) + + +CDB_FILE_NAME = "cdb.dat" + + +def get_semantic_version(version: str) -> SemanticVersion: + """Get the semantiv version from the string. + + Args: + version (str): The version string. + + Raises: + ValueError: If the version string does not match the semantic versioning format. + + Returns: + SemanticVersion | Tuple[int, int, int]: The major, minor and patch version + """ + match = SEMANTIC_VERSION_PATTERN.match(version) + if not match: + raise ValueError(f"Unknown version string: {version}") + return int(match.group(1)), int(match.group(2)), int(match.group(3)) + + +def get_version_from_modelcard(d: dict) -> SemanticVersion: + """Gets the the major.minor.patch version from a model card. + + The version needs to be specified at: + model_card["MedCAT Version"] + The version is expected to be semantic (major.minor.patch). + + Args: + d (dict): The model card in dict format. + + Returns: + SemanticVersion | Tuple[int, int, int]: The major, minor and patch version + """ + version_str: str = d["MedCAT Version"] + return get_semantic_version(version_str) + + +def get_semantic_version_from_model(cat: CAT) -> SemanticVersion: + """Get the semantic version of a CAT model. + + This uses the `get_version_from_modelcard` method on the model's + model card. + + So it is equivalen to `get_version_from_modelcard(cat.get_model_card(as_dict=True))`. + + Args: + cat (CAT): The CAT model. 
+ + Returns: + SemanticVersion | Tuple[int, int, int]: The major, minor and patch version + """ + return get_version_from_modelcard(cat.get_model_card(as_dict=True)) + + +def get_version_from_cdb_dump(cdb_path: str) -> SemanticVersion: + """Get the version from a CDB dump (cdb.dat). + + The version information is expected in the following location: + cdb["config"]["version"]["medcat_version"] + + Args: + cdb_path (str): The path to cdb.dat + + Returns: + SemanticVersion | Tuple[int, int, int]: The major, minor and patch version + """ + with open(cdb_path, 'rb') as f: + d = dill.load(f) + config: dict = d["config"] + version = config["version"]["medcat_version"] + return get_semantic_version(version) + + +def get_version_from_modelpack_zip(zip_path: str, cdb_file_name=CDB_FILE_NAME) -> SemanticVersion: + """Get the semantic version from a MedCAT model pack zip file. + + This involves simply reading the config on file and reading the version information from there. + + The zip file is extracted if it has not yet been extracted. + + Args: + zip_path (str): The zip file path for the model pack. + cdb_file_name (str, optional): The CDB file name to use. Defaults to "cdb.dat". + + Returns: + SemanticVersion | Tuple[int, int, int]: The major, minor and patch version + """ + model_pack_path = CAT.attempt_unpack(zip_path) + return get_version_from_cdb_dump(os.path.join(model_pack_path, cdb_file_name)) + + +UPDATE_VERSION = (1, 3, 0) + + +class ConfigUpgrader: + """Config updater. + + Attempts to upgrade pre 1.3.0 medcat configs to the newer format. + + Args: + zip_path (str): The model pack zip path. + cdb_file_name (str, optional): The CDB file name. Defaults to "cdb.dat". 
+ """ + + def __init__(self, zip_path: str, cdb_file_name: str = CDB_FILE_NAME) -> None: + self.model_pack_path = CAT.attempt_unpack(zip_path) + self.cdb_path = os.path.join(self.model_pack_path, cdb_file_name) + self.current_version = get_version_from_cdb_dump(self.cdb_path) + logger.debug("Loaded model from %s at version %s", + self.model_pack_path, self.current_version) + + def needs_upgrade(self) -> bool: + """Check if the specified modelpack needs an upgrade. + + It needs an upgrade if its version is less than 1.3.0. + + Returns: + bool: Whether or not an upgrade is needed. + """ + return self.current_version < UPDATE_VERSION + + def _get_relevant_files(self, ignore_hidden: bool = True) -> List[str]: + """Get the list of relevant files with full path names. + + By default this will ignore hidden files (those that start with '.'). + + Args: + ignore_hidden (bool, optional): Whether to ignore hidden files. Defaults to True. + + Returns: + List[str]: The list of relevant file names to copy. + """ + return [os.path.join(self.model_pack_path, fn) # ignores hidden files + for fn in os.listdir(self.model_pack_path) if (ignore_hidden and not fn.startswith("."))] + + def _check_existance(self, files_to_copy: List[str], new_path: str, overwrite: bool): + if overwrite: + return # ignore all + if not os.path.exists(new_path): + os.makedirs(new_path) + return # all good, new folder + # check file existance in new (existing) path + for file_to_copy in files_to_copy: + new_file_name = os.path.join( + new_path, os.path.basename(file_to_copy)) + if os.path.exists(new_file_name): + raise ValueError(f"File already exists: {new_file_name}. 
" + "Pass overwrite=True to overwrite") + + def _copy_files(self, files_to_copy: List[str], new_path: str) -> None: + for file_to_copy in files_to_copy: + new_file_name = os.path.join( + new_path, os.path.basename(file_to_copy)) + if os.path.isdir(file_to_copy): + # if exists is OK since it should have been checked before + # if it was not to be overwritten + logger.debug("Copying folder %s to %s", + file_to_copy, new_file_name) + shutil.copytree(file_to_copy, new_file_name, + dirs_exist_ok=True) + else: + logger.debug("Copying file %s to %s", + file_to_copy, new_file_name) + shutil.copy(file_to_copy, new_file_name) + + def upgrade(self, new_path: str, overwrite: bool = False) -> None: + """Upgrade the model. + + The upgrade copies all the files from the original folder + to the new folder. + + After copying, it changes the config into the format + required by MedCAT after version 1.3.0. + + Args: + new_path (str): The path for the new model pack folder. + overwrite (bool, optional): Whether to overwrite new path. Defaults to False. + + Raises: + ValueError: If one of the target files exists and cannot be overwritten. 
+ ValueError: If model pack does not need an upgrade + """ + if not self.needs_upgrade(): + raise ValueError(f"Model pack does not need ugprade: {self.model_pack_path} " + f"since it's at version: {self.current_version}") + logger.info("Starting to upgrade %s at (version %s)", + self.model_pack_path, self.current_version) + files_to_copy = self._get_relevant_files() + self._check_existance(files_to_copy, new_path, overwrite) + logger.debug("Copying files from %s", self.model_pack_path) + self._copy_files(files_to_copy, new_path) + logger.info("Going to try and fix CDB") + self._fix_cdb(new_path) + self._make_archive(new_path) + + def _fix_cdb(self, new_path: str) -> None: + new_cdb_path = os.path.join(new_path, os.path.basename(self.cdb_path)) + with open(new_cdb_path, 'rb') as f: + data = dill.load(f) + # make the changes + + logger.debug("Fixing CDB issue #1 (linking.filters.cui)") + # Number 1 + # the linking.filters.cuis is set to "{}" + # which is assumed to be an empty set, but actually + # evaluates to an empty dict instead + cuis = data['config']['linking']['filters']['cuis'] + if cuis == {}: + # though it _should_ be the empty set + data['config']['linking']['filters']['cuis'] = set(cuis) + # save modified version + logger.debug("Saving CDB back into %s", new_cdb_path) + with open(new_cdb_path, 'wb') as f: + dill.dump(data, f) + + def _make_archive(self, new_path: str): + logger.debug("Taking data from %s and writing it to %s.zip", + new_path, new_path) + shutil.make_archive( + base_name=new_path, format='zip', base_dir=new_path) + + +def parse_args() -> argparse.Namespace: + """Parse the arguments from the CLI. + + Returns: + argparse.Namespace: The parsed arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "action", help="The action. 
Currently, only 'fix-config' is available.", choices=['fix-config'], type=str.lower) + parser.add_argument("modelpack", help="MedCAT modelpack zip path") + parser.add_argument("newpath", help="The path for the new modelpack") + parser.add_argument( + "--overwrite", help="Allow overvwriting existing files", action="store_true") + parser.add_argument( + "--silent", help="Disable logging", action="store_true") + parser.add_argument( + "--verbose", help="Show debug output", action="store_true") + return parser.parse_args() + + +def setup_logging(args: argparse.Namespace) -> None: + """Setup logging for the runnable based on CLI arguments. + + Args: + args (argparse.Namespace): The parsed arguments. + """ + if not args.silent: + logger.addHandler(logging.StreamHandler()) + if args.verbose: + logger.setLevel(logging.DEBUG) + + +def fix_config(args: argparse.Namespace) -> None: + """Perform the fix-config action based on the CLI arguments. + + Args: + args (argparse.Namespace): The parsed arguments. + """ + logger.debug("Setting up upgrader") + upgrader = ConfigUpgrader(args.modelpack) + logger.debug("Starting the upgrade process") + upgrader.upgrade(args.newpath, overwrite=args.overwrite) + + +def main() -> None: + """Run the CLI associated with this module. + + Raises: + ValueError: If an unknown action is provided. 
+ """ + args = parse_args() + setup_logging(args) + logger.debug("Will attempt to perform action %s", args.action) + if args.action == 'fix-config': + fix_config(args) + else: + raise ValueError(f"Unknown action: {args.action}") + + +if __name__ == "__main__": + main() diff --git a/tests/utils/regression/test_metadata.py b/tests/utils/regression/test_metadata.py index ad3a83533..7dd6f0911 100644 --- a/tests/utils/regression/test_metadata.py +++ b/tests/utils/regression/test_metadata.py @@ -4,6 +4,7 @@ # from Anthony's 1.4 model +EXAMPLE_VERSION = 1, 3, 0 MODEL_CARD_EXAMPLE = { "Model ID": "acd0dfc2f0df45de", "Last Modified On": "04 October 2022", diff --git a/tests/utils/test_versioning.py b/tests/utils/test_versioning.py new file mode 100644 index 000000000..30a3afdd3 --- /dev/null +++ b/tests/utils/test_versioning.py @@ -0,0 +1,163 @@ +import unittest +import os +import tempfile +import shutil + +import dill +import pydantic + +from medcat.utils.versioning import get_version_from_modelcard, get_semantic_version_from_model +from medcat.utils.versioning import get_version_from_cdb_dump, get_version_from_modelpack_zip +from medcat.utils.versioning import ConfigUpgrader +from medcat.cat import CAT +from medcat.cdb import CDB +from medcat.vocab import Vocab + +from .regression.test_metadata import MODEL_CARD_EXAMPLE, EXAMPLE_VERSION + + +CORRECT_SEMANTIC_VERSIONS = [("1.0.1-alpha-1", (1, 0, 1)), ("0.0.1-alpha-1", (0, 0, 1)), + ("1.0.0-alpha.1", (1, 0, 0) + ), ("1.0.0-0.3.7", (1, 0, 0)), + ("1.0.0-x.7.z.92", (1, 0, 0) + ), ("1.0.0-x-y-z.--", (1, 0, 0)), + ("1.0.0-alpha+001", (1, 0, 0) + ), ("1.0.0+20130313144700", (1, 0, 0)), + ("1.0.0-beta+exp.sha.5114f85", (1, 0, 0)), + ("1.0.0+21AF26D3----117B344092BD", (1, 0, 0))] +INCORRECT_SEMANTIC_VERSIONS = ["01.0.0", "0.01.0", "0.0.01", "0.0.0\nSOMETHING", + "1.0.space", "1.0.0- space"] + + +class VersionGettingFromModelCardTests(unittest.TestCase): + FAKE_MODEL_CARD1 = {"Something": "value"} + FAKE_MODEL_CARD2 = {"MedCAT 
Version": "not semantic"} + FAKE_MODEL_CARD3 = {"MedCAT Version": "almost.semantic"} + FAKE_MODEL_CARD4 = {"MedCAT Version": "closest.to.semantic"} + WRONG_VERSION_FAKE_MODELS = [FAKE_MODEL_CARD2, + FAKE_MODEL_CARD3, FAKE_MODEL_CARD4] + + def test_gets_correct_version(self): + maj, minor, patch = get_version_from_modelcard(MODEL_CARD_EXAMPLE) + self.assertEqual(EXAMPLE_VERSION, (maj, minor, patch)) + + def test_fails_upon_model_card_with_no_version_defined(self): + with self.assertRaises(KeyError): + get_version_from_modelcard(self.FAKE_MODEL_CARD1) + + def test_fails_upon_model_card_with_incorrect_version(self): + cntr = 0 + for fake_model_card in self.WRONG_VERSION_FAKE_MODELS: + with self.assertRaises(ValueError): + get_version_from_modelcard(fake_model_card) + cntr += 1 + self.assertEqual(cntr, len(self.WRONG_VERSION_FAKE_MODELS)) + + def test_fails_upon_wrong_version(self): + cntr = 0 + for wrong_version in INCORRECT_SEMANTIC_VERSIONS: + d = {"MedCAT Version": wrong_version} + with self.subTest(f"With version: {wrong_version}"): + with self.assertRaises(ValueError): + get_version_from_modelcard(d) + cntr += 1 + self.assertEqual(cntr, len(INCORRECT_SEMANTIC_VERSIONS)) + + def test_gets_version_from_correct_versions(self): + cntr = 0 + for version, expected in CORRECT_SEMANTIC_VERSIONS: + d = {"MedCAT Version": version} + with self.subTest(f"With version: {version}"): + got_version = get_version_from_modelcard(d) + self.assertEqual(got_version, expected) + cntr += 1 + self.assertEqual(cntr, len(CORRECT_SEMANTIC_VERSIONS)) + + +NEW_CDB_NAME = "cdb_new.dat" +CDB_PATH = os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", NEW_CDB_NAME) +EXPECTED_CDB_VERSION = (1, 0, 0) + + +class VersionGettingFromCATTests(unittest.TestCase): + + def setUp(self) -> None: + self.cdb = CDB.load(CDB_PATH) + self.vocab = Vocab.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "vocab.dat")) + 
self.cdb.config.general.spacy_model = "en_core_web_md" + self.cdb.config.ner.min_name_len = 2 + self.cdb.config.ner.upper_case_limit_len = 3 + self.cdb.config.general.spell_check = True + self.cdb.config.linking.train_count_threshold = 10 + self.cdb.config.linking.similarity_threshold = 0.3 + self.cdb.config.linking.train = True + self.cdb.config.linking.disamb_length_limit = 5 + self.cdb.config.general.full_unlink = True + self.meta_cat_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "tmp") + self.undertest = CAT( + cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[]) + + def test_gets_correct_version(self): + version = get_semantic_version_from_model(self.undertest) + self.assertEqual(EXPECTED_CDB_VERSION, version) + + +class VersionGetterFromCDBTests(unittest.TestCase): + + def test_gets_version_from_cdb(self): + version = get_version_from_cdb_dump(CDB_PATH) + self.assertEqual(EXPECTED_CDB_VERSION, version) + + +class VersionGettFromModelPackTests(unittest.TestCase): + + def test_gets_version_from_model_pack(self): + # not strictly speaking a ZIP, but should work currently + # since the folder exists + model_pack_zip = os.path.dirname(CDB_PATH) + version = get_version_from_modelpack_zip( + model_pack_zip, cdb_file_name=NEW_CDB_NAME) + self.assertEqual(EXPECTED_CDB_VERSION, version) + + +class VersioningFixTests(unittest.TestCase): + + def break_cdb(self): + with open(self.broken_cdb_path, 'rb') as rf: + data = dill.load(rf) + data['config']['linking']['filters']['cuis'] = {} + with open(self.broken_cdb_path, 'wb') as wf: + dill.dump(data, wf) + + def setUp(self) -> None: + self.temp_folder = tempfile.TemporaryDirectory() + self.broken_cdb_path = os.path.join(self.temp_folder.name, "cdb.dat") + self.new_temp_folder = tempfile.TemporaryDirectory() + shutil.copyfile(CDB_PATH, self.broken_cdb_path) + self.break_cdb() + + def tearDown(self) -> None: + self.temp_folder.cleanup() + self.new_temp_folder.cleanup() + + def 
test_new_format_does_not_change_when_upgraded(self): + fixer = ConfigUpgrader(os.path.dirname( + CDB_PATH), cdb_file_name=NEW_CDB_NAME) + fixer.upgrade(self.new_temp_folder.name) + old_cdb = CDB.load(CDB_PATH) + new_cdb = CDB.load(os.path.join( + self.new_temp_folder.name, NEW_CDB_NAME)) + self.assertEqual(old_cdb.config.get_hash(), new_cdb.config.get_hash()) + + def test_old_format_needs_upgrade(self): + fixer = ConfigUpgrader(self.temp_folder.name) + self.assertTrue(fixer.needs_upgrade()) + + def test_fixes_old_format(self): + fixer = ConfigUpgrader(self.temp_folder.name) + fixer.upgrade(self.new_temp_folder.name) + new_cdb = CDB.load(os.path.join(self.new_temp_folder.name, "cdb.dat")) + self.assertIsInstance(new_cdb, CDB) From 642411d348680e3d2328a753efe52eb0639468d5 Mon Sep 17 00:00:00 2001 From: Xi Bai <82581439+baixiac@users.noreply.github.com> Date: Mon, 13 Mar 2023 11:51:19 +0000 Subject: [PATCH 02/17] Pin down transformers for the de-identification model (#314) * NO-TICKET pin down transformers for the de-id model --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e0fd4ba1b..8b75d5554 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ 'gensim>=4.3.0', # first to support 3.11 'spacy>=3.1.0', 'scipy~=1.9.2', # first to support 3.11 - 'transformers>=4.19.2', + 'transformers>=4.19.2,<4.22.0', # upper bound is needed for the de-id model until it is retrained 'torch>=1.13.0', # first to support 3.11 'tqdm>=4.27', 'scikit-learn>=1.1.3', # first to supporrt 3.11 From 1fc032ef3930c1f7a70cab9a23d7884f885fabea Mon Sep 17 00:00:00 2001 From: Anthony Shek <55877857+antsh3k@users.noreply.github.com> Date: Tue, 4 Apr 2023 11:28:29 +0100 Subject: [PATCH 03/17] Added function to remove CUI from cdb (#316) * Added function to remove CUI from cdb * Unit test for remove_cui --- medcat/cdb.py | 37 +++++++++++++++++++++++++++++++++++-- tests/test_cdb.py | 10 ++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) 
diff --git a/medcat/cdb.py b/medcat/cdb.py index 60fa1aff6..8a58166b8 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -98,7 +98,7 @@ def __init__(self, config: Union[Config, None] = None) -> None: def get_name(self, cui: str) -> str: """Returns preferred name if it exists, otherwise it will return - the logest name assigend to the concept. + the longest name assigned to the concept. Args: cui @@ -118,7 +118,7 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None: self.is_dirty = True def remove_names(self, cui: str, names: Dict) -> None: - """Remove names from an existing concept - efect is this name will never again be used to link to this concept. + """Remove names from an existing concept - effect is this name will never again be used to link to this concept. This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else. Why? Because it is bothersome to remove it from everywhere, but could also be useful to keep the removed names in e.g. cui2names. @@ -153,6 +153,39 @@ def remove_names(self, cui: str, names: Dict) -> None: self.name2cuis2status[name][_cui] = 'PD' self.is_dirty = True + def remove_cui(self, cui: str) -> None: + """This function takes a `CUI` as an argument and removes it from all the internal objects that reference it. 
+ Args: + cui + """ + if cui in self.cui2names: + del self.cui2names[cui] + if cui in self.cui2snames: + del self.cui2snames[cui] + if cui in self.cui2context_vectors: + del self.cui2context_vectors[cui] + if cui in self.cui2count_train: + del self.cui2count_train[cui] + if cui in self.cui2tags: + del self.cui2tags[cui] + if cui in self.cui2type_ids: + del self.cui2type_ids[cui] + if cui in self.cui2preferred_name: + del self.cui2preferred_name[cui] + if cui in self.cui2average_confidence: + del self.cui2average_confidence[cui] + for name, cuis in self.name2cuis.items(): + if cui in cuis: + cuis.remove(cui) + for name, cuis2status in self.name2cuis2status.items(): + if cui in cuis2status: + del cuis2status[cui] + self.snames = set() + for cuis in self.cui2snames.values(): + self.snames |= cuis + self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()} + self.is_dirty = True + def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: bool = False) -> None: """Adds a name to an existing concept. 
diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 219e43d33..505320c6a 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -64,6 +64,16 @@ def test_empty_count_train(self): stats = self.undertest.make_stats() self.assertFalse(np.isnan(stats["Average training examples per concept"])) self.undertest.cui2count_train = copied + + def test_remove_cui(self): + self.undertest.remove_cui('C0000039') + assert 'C0000039' not in self.undertest.cui2names + assert 'C0000039' not in self.undertest.cui2snames + assert 'C0000039' not in self.undertest.cui2count_train + assert 'C0000039' not in self.undertest.cui2type_ids + assert 'C0000039' not in self.undertest.cui2preferred_name + assert 'C0000039' not in self.undertest.name2cuis['virus~z'] + assert 'C0000039' not in self.undertest.name2cuis2status['virus~z'] if __name__ == '__main__': unittest.main() From 9ea067549ff41242aaf4df53236221a0f2c06898 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 17 Apr 2023 17:28:11 +0300 Subject: [PATCH 04/17] CU-862jjprjw Fix github actions failures (#317) * Added function to remove CUI from cdb --------- Co-authored-by: antsh3k --- .github/workflows/main.yml | 2 +- .github/workflows/production.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3c05835d0..c769dfc2e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -42,7 +42,7 @@ jobs: github.ref == 'refs/heads/master' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') != true - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 concurrency: publish-to-test-pypi needs: [build] diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index ad229ce9c..5088c1000 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -8,7 +8,7 @@ on: jobs: build-n-publish-to-pypi: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 concurrency: build-n-publish-to-pypi if: 
github.repository == 'CogStack/MedCAT' From e259b0c232e15d88abe72f82dce6673212cc2d73 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 4 May 2023 11:49:23 +0300 Subject: [PATCH 05/17] CU-862jr8wkk Pin pydantic dependency to avoid conflicts with v2.0 (#318) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8b75d5554..4e73b2f89 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ 'xxhash>=3.0.0', # allow later versions, tested with 3.1.0 'blis>=0.7.5', # allow later versions, tested with 0.7.9 'click>=8.0.4', # allow later versions, tested with 8.1.3 - 'pydantic>=1.10.0', # for spacy compatibility + 'pydantic>=1.10.0,<2.0', # for spacy compatibility; avoid 2.0 due to breaking changes # the following are not direct dependencies of MedCAT but needed for docs/building # hopefully will no longer need the transitive dependencies 'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec <- datasets <- medcat From 9625ec0749f1a3b74c37f508ab111dda4d491707 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 22:16:11 +0000 Subject: [PATCH 06/17] Bump django from 3.2.18 to 3.2.19 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.18 to 3.2.19. - [Commits](https://github.com/django/django/compare/3.2.18...3.2.19) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index 9534e025a..b4adaa104 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.18 +Django==3.2.19 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From 564d15caa6d328733f8da8b56b4dfebc2982bc4d Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 5 Jun 2023 13:06:44 +0300 Subject: [PATCH 07/17] CU-863gntc58 Umlspt2ch (#322) * CU-863gntc58 Add parent to child relationship getter to UMLS preprocessing * CU-863gntc58 Only use ISA relationships * Make sure parents do not have themselves as children * CU-863gntc58 Only keep preferred names * CU-863gntc58 Fix typing issues * CU-863gntc58 Fix child-parent relationships being saved instea * Better system for avoiding parent-child being the same --- medcat/utils/preprocess_umls.py | 126 +++++++++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 9 deletions(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 0b3505981..9cf0ccea4 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -1,6 +1,9 @@ from typing import List, Union import pandas as pd +import tqdm +import os +from typing import Dict, Set _DEFAULT_COLUMNS: list = [ "CUI", @@ -20,7 +23,7 @@ "STR", "SRL", "SUPPRESS", - "CVF", + "CVF", ] _DEFAULT_SEM_TYPE_COLUMNS: list = [ @@ -32,12 +35,24 @@ "CVF", ] +_DEFAULT_MRHIER_COLUMNS: list = [ + "CUI", + "AUI", + "CXN", + "PAUI", + "SAB", + "RELA", + "PTR", + "HCD", + "CVF", +] + medcat_csv_mapper: dict = { 'CUI': 'cui', 'STR': 'name', 'SAB': 'ontologies', 'ISPREF': 'name_status', - 'TUI': 'type_ids', # from MRSTY.RRF + 'TUI': 'type_ids', # from MRSTY.RRF } @@ -57,11 +72,13 @@ class UMLS: def __init__(self, main_file_name: str, sem_types_file: str, allow_languages: list = 
['ENG'], sep: str = '|'): self.main_file_name = main_file_name self.sem_types_file = sem_types_file - self.main_columns = list(_DEFAULT_COLUMNS) # copy - self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS) # copy + self.main_columns = list(_DEFAULT_COLUMNS) # copy + self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS) # copy + self.mrhier_columns = list(_DEFAULT_MRHIER_COLUMNS) # copy self.sep = sep # copy in case of default list - self.allow_langugages = list(allow_languages) if allow_languages else allow_languages + self.allow_langugages = list( + allow_languages) if allow_languages else allow_languages def to_concept_df(self) -> pd.DataFrame: """Create a concept DataFrame. @@ -72,7 +89,8 @@ def to_concept_df(self) -> pd.DataFrame: """ # target columns: # cui, name, name_status, ontologies, description_type_ids, type_ids - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False) + df = pd.read_csv(self.main_file_name, + names=self.main_columns, sep=self.sep, index_col=False) # filter languages if self.allow_langugages: @@ -82,7 +100,8 @@ def to_concept_df(self) -> pd.DataFrame: # get TUI - sem_types = pd.read_csv(self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False) + sem_types = pd.read_csv( + self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False) df = df.merge(sem_types) # rename columns @@ -109,7 +128,8 @@ def map_umls2snomed(self) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe that contains the SCUI (source CUI) as well as the UMLS CUI for each applicable concept """ - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'SCUI': 'str'}) + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'SCUI': 'str'}) # get only SNOMED-CT US based concepts that have a SNOMED-CT (source) CUI df = df[df.SAB == 'SNOMEDCT_US'][df.SCUI.notna()] # sort by SCUI @@ -154,7 +174,8 
@@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: Returns: pd.DataFrame: DataFrame that has the target source codes """ - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'CODE': 'str'}) + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'CODE': 'str'}) # get the specified source(s) if isinstance(sources, list): df = df[df.SAB.isin(sources)][df.CODE.notna()] @@ -166,6 +187,76 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: df = df[['CODE',] + [col for col in df.columns.values if col != 'CODE']] return df + def get_pt2ch(self) -> dict: + """Generates a parent to children dict. + + It goes through all the < # TODO + + The resulting dictionary maps a CUI to a list of CUIs that + consider that CUI as their parent. + + PS: + This expects the MRHIER.RRF file to also exist in the same folder + as the MRCONSO.RRF file. + + Raises: + ValueError: If the MRHIER.RRF file wasn't found + + Returns: + dict: The dictionary of parent CUI and their children. 
+ """ + path = self.main_file_name.rsplit('/', 1)[0] + hier_file = f"{path}/MRHIER.RRF" + + if not os.path.exists(hier_file): + raise ValueError( + f'Expected MRHIER.RRF to exist within the same parent folder ({path})') + + conso_df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False) + + hier_df = pd.read_csv(hier_file, sep=self.sep, index_col=False, + header=None, names=self.mrhier_columns) + + # filter languages + if self.allow_langugages: + conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)] + + # create a AUI -> CUI map + aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"])) + + # remove non-preferred from conso + conso_df = conso_df[conso_df['ISPREF'] == 'Y'] + + # filter ISA relationships + hier_df = hier_df[hier_df['RELA'] == 'isa'] + + # merge dataframes + merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI']) + + # only keep CUI and parent AUI + cui_parent = merged_df[['CUI', 'PAUI']] + # only include CUIs with a parent + cui_parent = cui_parent[cui_parent['PAUI'].notna()] + + # create dict + pt2ch: Dict[str, Set[str]] = {} + for _, row in tqdm.tqdm(cui_parent.iterrows(), total=len(cui_parent.index)): + cur_cui = row['CUI'] + paui = row['PAUI'] + parent_cui = aui_cui[paui] + # avoid self as parent/child + if parent_cui == cur_cui: + continue + if parent_cui not in pt2ch: + pt2ch[parent_cui] = set() + pt2ch[parent_cui].add(cur_cui) + # move from set to list for consistency with SNOMED + pt2ch: Dict[str, List[str]] = pt2ch # type: ignore + for k, v in pt2ch.items(): + pt2ch[k] = list(v) + return pt2ch + if __name__ == '__main__': import sys @@ -187,3 +278,20 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: to_ICD10_man = umls.map_umls2source(sources=['ICD10']) print('As ICD-10(MAN):') print(to_ICD10_man.head()) + pt2ch = umls.get_pt2ch() + print('Get parent-child dict', len(pt2ch), + '' if len(pt2ch) > 1_000 else pt2ch) + all_vals = [len(v) for v in pt2ch.values()] + 
print('LEN of VALS:', sum(all_vals), 'max', + max(all_vals), 'min', min(all_vals), 'mean', sum(all_vals) / len(all_vals)) + import random + random_4_keys = random.sample(list(pt2ch.keys()), k=4) + + def _get_name(cui: str) -> str: + matches = df[df['cui'] == cui] + if len(matches.index) == 0: + return 'N/A' # UNKNOWN + return matches['name'].iloc[0] + print('FF RAW ', [f"{k}:{pt2ch[k]}" for k in random_4_keys]) + print('FIRST FEW', [ + (f"{_get_name(key)} ({key})", [f"{_get_name(child)} ({child})" for child in pt2ch[key]]) for key in random_4_keys]) From 5610ec5b0228fad6bb836f3dc80a67f32bd445a2 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 26 Jun 2023 16:32:27 +0300 Subject: [PATCH 08/17] Fix for Issue 325 (#326) * Issue-325 Add check for old/new spacy; fix code for nested entities * Issue-325 Fix a typing issue * Issue-325 Improve nested entity extraction in _doc_to_out; add type hint for individual entities * Issue-325 Remove unneccessary whitespace * Issue-325 Move spacy version detection from cat to utils.helpers --- medcat/cat.py | 53 +++++++++++++++++++++++++++++++++-------- medcat/utils/helpers.py | 20 ++++++++++++++++ 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index cd6aa769d..bd19a6f0b 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -28,7 +28,7 @@ from medcat.utils.data_utils import make_mc_train_test, get_false_positives from medcat.utils.normalizers import BasicSpellChecker from medcat.utils.checkpoint import Checkpoint, CheckpointConfig, CheckpointManager -from medcat.utils.helpers import tkns_from_doc, get_important_config_parameters +from medcat.utils.helpers import tkns_from_doc, get_important_config_parameters, has_new_spacy from medcat.utils.hasher import Hasher from medcat.ner.vocab_based_ner import NER from medcat.linking.context_based_linker import Linker @@ -46,6 +46,9 @@ logger = logging.getLogger(__name__) # separate logger from the package-level one +HAS_NEW_SPACY = has_new_spacy() 
+ + class CAT(object): """The main MedCAT class used to annotate documents, it is built on top of spaCy and works as a spaCy pipline. Creates an instance of a spaCy pipline that can @@ -1505,6 +1508,43 @@ def _mp_cons(self, in_q: Queue, out_list: List, min_free_memory: int, lock: Lock logger.warning(str(e)) sleep(2) + def _add_nested_ent(self, doc: Doc, _ents: List[Span], _ent: Union[Dict, Span]) -> None: + # if the entities are serialised (PipeRunner.serialize_entities) + # then the entities are dicts + # otherwise they're Span objects + meta_anns = None + if isinstance(_ent, dict): + start = _ent['start'] + end =_ent['end'] + label = _ent['label'] + cui = _ent['cui'] + detected_name = _ent['detected_name'] + context_similarity = _ent['context_similarity'] + id = _ent['id'] + if 'meta_anns' in _ent: + meta_anns = _ent['meta_anns'] + else: + start = _ent.start + end = _ent.end + label = _ent.label + cui = _ent._.cui + detected_name = _ent._.detected_name + context_similarity = _ent._.context_similarity + if _ent._.has('meta_anns'): + meta_anns = _ent._.meta_anns + if HAS_NEW_SPACY: + id = _ent.id + else: + id = _ent.ent_id + entity = Span(doc, start, end, label=label) + entity._.cui = cui + entity._.detected_name = detected_name + entity._.context_similarity = context_similarity + entity._.id = id + if meta_anns is not None: + entity._.meta_anns = meta_anns + _ents.append(entity) + def _doc_to_out(self, doc: Doc, only_cui: bool, @@ -1515,16 +1555,9 @@ def _doc_to_out(self, if doc is not None: out_ent: Dict = {} if self.config.general.show_nested_entities: - _ents = [] + _ents: List[Span] = [] for _ent in doc._.ents: - entity = Span(doc, _ent['start'], _ent['end'], label=_ent['label']) - entity._.cui = _ent['cui'] - entity._.detected_name = _ent['detected_name'] - entity._.context_similarity = _ent['context_similarity'] - entity._.id = _ent['id'] - if 'meta_anns' in _ent: - entity._.meta_anns = _ent['meta_anns'] - _ents.append(entity) + self._add_nested_ent(doc, 
_ents, _ent) else: _ents = doc.ents # type: ignore diff --git a/medcat/utils/helpers.py b/medcat/utils/helpers.py index 04f92730b..f783a9b06 100644 --- a/medcat/utils/helpers.py +++ b/medcat/utils/helpers.py @@ -3,6 +3,8 @@ from medcat.preprocessing.cleaners import clean_name from medcat.utils.other import TPL_ENT, TPL_ENTS +from spacy import __version__ as spacy_version + import logging logger = logging.getLogger(__name__) @@ -517,3 +519,21 @@ def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, l fns[key] = [fn.get(key, 0)] return fps, fns, tps, ps, rs, f1s, cui_counts, examples + + +def has_new_spacy() -> bool: + """Figures out whether or not a newer version of spacy is installed. + + This plays a role in how some parts of the Span needs to be interacted with. + + As of writing, the new version starts at v3.3.1. + + Returns: + bool: Whether new version was detected. + """ + major, minor, patch_plus = spacy_version.split('.') + major, minor = int(major), int(minor) + patch = int(patch_plus) + return (major > 3 or + (major == 3 and minor > 3) or + (major == 3 and minor == 3 and patch >= 1)) From 65645b69cc2d34f438e32c819a1761318ab9c4c8 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 26 Jun 2023 16:39:24 +0300 Subject: [PATCH 09/17] CU-86783u6d9 Add wrapper to simplify De-ID model usage (#324) * CU-2wgnqg5 Add javadoc to a method * CU-2wgnqg5 Fix issues with typing * CU-2wgnqg5 Add (potential) progress bar to regression testing * CU-2wgnqg5 Add runnable regression checker with command line arguments * CU-2wgnqg5 Add better help message for a CLI argument * CU-2wgnqg5 Fix import to use proper namespace * CU-2wgnqg5 Add parent-child functionality for filters * CU-2wgnqg5 Add cui and children option to the config example * Revert "CU-2wgnqg5 Fix import to use proper namespace" This reverts commit 882be443fd45a33ea708000014f74b3a6554c3ce. 
* CU-2wgnqg5 Add default / empty children to translation layer * CU-2wgnqg5 Remove use of deprecated warning method * CU-2wgnqg5 Add new default test case that checks for 'heart rate' and its children 4 deep * CU-2wgnqg5 Remove unneccessary TODO comment * CU-2wgnqg5 Add possibility of using result reporting for regression checks * CU-2wgnqg5 Fix issue with delegations not shown for reports * CU-2wgnqg5 Add possibility of using reports for CLI regression testing * CU-2wgnqg5 Fix minor typing issues * CU-2wgnqg5 Fix typo in default regression config * CU-2wgnqg5 Make sure imports work both when running directly as well as when using as part of the project * CU-2wgnqg5 Add a new test case with the ANY strategy * CU-2wgnqg5 Fixing imports so that absolute imports are used * CU-2wgnqg5 Add new package to setup.py * CU-2wgnqg5 Fix typing issues * CU-2wgnqg5 Fix report output formating * CU-2vzhd93 Remove logging tutorials (move to MedCATtutorials) * CU-2wgnqg5 Move to a simpler filter design * CU-2wgnqg5 Add (optional) per-phrase results to results/reporting * CU-2wgnqg5 Add per-phrase information toggle to CLI * CU-2wgnqg5 Fix method signature changes between inherited classes * CU-2q50k3c: add contact email address. 
* added latest release news / accepted paper * Update README.md * CU-2zj4czk Move to a class based linking filter approach * CU-2zj4czk Move to identifier based linking filter access * CU-2zj4czk Use MCT filters when training supervised * New UMLS Full Model * CU-2zj4czk Make sure excluded CUIs are always specified (even if by an empty set) * CU-2zj4czk Add possibility of creating a copy of linking filters * CU-2zj4czk Use copies of linking.filters in train_supervised and _print_stats * CU-2zj4czk Add linking.filters merging functionality * CU-2zj4czk Add parameter to retain MCT filters within train_supervised * CU-2zj4czk Rename filters variable within print_stats method for better consistency and readability * CU-2zj4czk Consolidate some duplicate code between train_supervised and _print_stats * CU-2zj4czk Fix multi-project detection * CU-2zj4czk Fix linking filter merging * CU-2zj4czk Add tests for retaining filters from MCT along with a test-trainer export * CU-2zj4czk Remove debug print outputs from some tests * CU-2wgnqg5 Separate some of the regression code into different modules * Add URL of paper for Dutch model (#275) * CU-2wgnqg5 Add serialisation code along with tests * CU-2wgnqg5 Fix regression checker and case serialisation and add tests * CU-2wgnqg5 Add conversion code from MCT export to regression YAML along with tests * CU-2wgnqg5 Fix minor import and typing issues * CU-2wgnqg5 Add runnable to convert from MedCATtrainer to regression YAML * CU-2wgnqg5 Add for number of cases read from MCT export * CU-2wgnqg5 Add context selectors for conversion from MCT * CU-2wgnqg5 Add use of context selector to converter * CU-2wgnqg5 Add use of context selector to runnable * CU-2wgnqg5 Fix issue with typing * CU-2wgnqg5 Add regression case based progress bar in case the total of sub-cases is unknown * CU-2wgnqg5 Make sure (and test) that only 1 replacement '%s' is in each phrase for regression tests * CU-2wgnqg5 Add test cases for '%' replacement in context and 
some minor optimisation * CU-2wgnqg5 Add option to not show empty cases in report * CU-2wgnqg5 Fix verbose output mode/logging * CU-2wgnqg5 Fix name clashes in test cases * CU-2wgnqg5 Make conversion filter for both CUI and NAME * CU-2wgnqg5 Use different approach for generating targets for regression cases * CU-2wgnqg5 Add warning when no parent-child information is present (but continue to run) * Fix issue with typing * Add TODO comment regarding more comprehensive reporting * Fix whitespace issue * CU-2wgnqg5 Translation layer now able to confirm if a set of CUIs has a parent or child of a specified one * CU-2wgnqg5 Add reasons for failure of a regression case * CU-2wgnqg5 Make hiding failures a possibility from the CLI * CU-2wgnqg5 Use better report output for failures with summary * CU-2wgnqg5 Fix typing issues * CU-2wgnqg5 Add description to failed cases where applicable * CU-2wgnqg5 Fix successes not being reported on * CU-2wgnqg5 Rename some fail reasons for better readability * CU-2wgnqg5 Add test cases for specifeid CUI and name if/when none are found from the CDB * CU-2wgnqg5 Add extra information (names) in case of failure becasue name not in CDB * CU-2wgnqg5 Make converter consolidate different test cases with identical filters (CUI and name) into one with multiple phrases * CU-2wgnqg5 Remove use of TargetInfo and using a tuple instead * CU-2wgnqg5 Fix remnant targetinfo * CU-2wgnqg5 Fix remnant targetinfo stuff * CU-2wgnqg5 Fix remnant targetinfo in docstrings * CU-2wgnqg5 Fix missing argumnet in docstrings * CU-2wgnqg5 Allow only reports in regression checker * CU-2wgnqg5 Add medcat.utils.regression level parent logger * CU-2wgnqg5 Use medcat.utils.regression parent logger for verbose output in regression checker * CU-2wgnqg5 Move from logger.warn to logger.warning * CU-2wgnqg5 Fix issue with wrong targets being generated * CU-2wgnqg5 Fix checking tests * CU-2wgnqg5 Add dunder init to test (utils) packages to make the tests within discoverable * 
CU-2wgnqg5 Fix serialisation tests (add missing argument) * CU-2wgnqg5 Fix regression results tests (change method owner) * CU-2wgnqg5 Fix regression results tests (make names ordered) * CU-2wgnqg5 Remove unnecessary print output in test * CU-2wgnqg5 Update conversion code to not use target info * CU-2wgnqg5 Attempt to fix automated build on github actions (bin sklearn version) * CU-2wgnqg5 Move from sklearn to scikit-learn dependency * CU-2wgnqg5 Separate some code in converting, add docs * CU-2wgnqg5 Make yaml dumping save for yaml representation of regression checker * CU-2wgnqg5 Add initial editing code with some simple tests * CU-2wgnqg5 Add possibility for combinations to ignore identicals * CU-2wgnqg5 Add docs to the editing/combining methods * CU-2wgnqg5 Add runnable python file for combining different regression YAMLs * CU-2wgnqg5 Minor codebase improvements * CU-2wgnqg5 Make FailReasons serializable * CU-2wgnqg5 Add json output to regression checking * Make stats reporting not have np.nan values on empty train count (#277) * CU-327vb66 make stats reporting not have np.nan values on empty train count * CU-327vb66 start using scikit-learn instead of deprecated sklearn * Bump django from 3.2.15 to 3.2.16 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.15 to 3.2.16. - [Release notes](https://github.com/django/django/releases) - [Commits](https://github.com/django/django/compare/3.2.15...3.2.16) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update ReadMe.md to show Licence change Updated News Section * CU-2wgnqg5 Add docstring to fail descriptor getter method * CU-2wgnqg5 Removed handled TODO * CU-33g09h4 Make strides towards PEP 257. 
Make all docstrings use triple double quotes; remove preceding whitespace from docstrings; remove raw-string docstrings where applicable; remove empty docstrings * CU-2zj4czk Add documentation regarding config.linking.filters * CU-2zj4czk Add test for leakage of extra_cui_filters * CU-33g09h4 Remove leftover whitespace from start of docstring * include joblib dep * CU-2zj4czk Add parameter to retain extra_cui_filters (instead of MCT filters). Make sure tests pass. * CU-33g09h4 Some docstring unification for config(s) * CU-33g09h4 Some docstring unification for pipe, meta_cat and vocab * CU-33g09h4 Some docstring unification for cdb * CU-33g09h4 Some docstring unification for cdb maker * CU-33g09h4 Some docstring unification for cdb and maker (Return: to Returns:) * CU-33g09h4 Some docstring unification for cat * CU-33g09h4 Fix typo in docstring * CU-33g09h4 Some docstring unification for utils * CU-33g09h4 Some docstring unification for tokenizers * CU-33g09h4 Some docstring unification for preprocessors * CU-33g09h4 Some docstring unification for NER parts * CU-33g09h4 Some docstring unification for NEO parts * CU-33g09h4 Some docstring unification for linking parts * CU-33g09h4 Some docstring unification for cogstack connection part * CU-33g09h4 Remove some leftover backticks from docstring types * CU-33g09h4 Remove some leftover 'Return:' -> 'Returns:' changes * CU-33g09h4 Fix typo in a return type name * CU-384mewq match post release branches in the production workflow (#283) * CU-346mpxm Add new JSON based (faster) serialization for CDB along with tests * CU-346mpxm Add new package to setup.py; add logger and docstrings to serializer; remove dead code and comments * CU-346mpxm Remove leftover codel; Fix type safety regarding optinal json path * CU-346mpxm Add logging on writing to serializer * CU-346mpxm Add logging on reading to serializer * CU-346mpxm Make deserializing consistent with previous CDB deserialising * CU-346mpxm Add JSON serialisation to CDB * 
CU-346mpxm Remove issue with circular imports * CU-346mpxm Make sure json files end with .json * CU-346mpxm Add json type format to modelpack creation * CU-346mpxm Add tests for json format modelpack creation * CU-346mpxm Add logging output to model pack creation and loading * CU-346mpxm Add model pack converter / runnable * Update README.md * CU-862hyd5wx Unify rosalind/vocab downloading in tests, identify and fail meaningfully in case of 503 * CU-862hyd5wx Remove unused imports in tests due to last commit * CU-862hyd5wx Add possibility of generating and using a simply vocab when Rosalind is down * CU-862hyd5wx Fix small typo in tests * Loosen dependency restrictions (#289) Signed-off-by: zethson Signed-off-by: zethson * bug found in snomed2OPCS func * markdown improvements * Mapping icd10 and opcs complete * get all children func added * pep8 fixes * Update README.md * Add confusion matrix to meta model evaluation * CU-862j0jcdu / CU-862j0jd2n Cdb json (#295) * CU-862j0jcdu Rename format parameter in model creation to specify it only applys to the CDB * CU-862j0jd2n Add addl_info to be JSON serialised when required * CU-862j0jd2n Add addl_info to docstring of CDB serializer * CU-38g55wn / CU-39cmv82 Support for python3.11 (and 3.10) (#285) * CU-38g55wn Move dependencies to (hopefully) support python 3.11 on Ubuntu * CU-38g55wn Attempt to fix dependencies for github dependency (gensim) * CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x2 * CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x3 * CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x4 * CU-38g55wn Attempt to fix dependencies for github dependency (gensim) x5 - fix missing comma * CU-38g55wn Remove errorenous package from setup.py * CU-38g55wn Bump spacy version so as to (hopefully) fix pydantic issues * CU-38g55wn Bump spacy en_core_web_md version so as to (hopefully) fix requirements issues * CU-38g55wn Fix test typo that was fixed on newere 
en_core_web_md * CU-38g55wn Fix small issue in NER test * CU-38g55wn Fix small issue with NER test (int conversion) * CU-38g55wn Mark some places as ignore where newer mypy complains * CU-38g55wn Bump mypy dev requirement version * CU-38g55wn Add python 3.11 and 3.10 to workflow * CU-38g55wn Trying to install gensim over https rather tha ssh * CU-38g55wn Make python versions strings in GH worfklow so 3.10 doesn't get 'rounded' to 3.10 when read * CU-38g55wn Remove python 3.7 from workflow since it's not compatible with required versions of numpy and scipy * CU-38g55wn Universally fixing NER test regarding the 'movar~viruse' -> 'movar~virus' thing * CU-38g55wn Bump gensim version to 4.3.0 - the first to support 3.11 * CU-862hyd5wx Unify rosalind/vocab downloading in tests, identify and fail meaningfully in case of 503 * CU-862hyd5wx Remove unused imports in tests due to last commit * CU-862hyd5wx Add possibility of generating and using a simply vocab when Rosalind is down * CU-862hyd5wx Remove python 3.7 and add 3.10/3.11 to classifiers * CU-862hyd5wx Reorder python versions in GitHub workflow * CU-862hyd5wx Attempt to fix GHA by importing unittest.mock explicitly * CU-39cmvru Faster hashing (#286) * CU-39cmvru Add marking of CDB dirty if/when concepts change. Avoid calculating its hash separately if it hasn't been dirtied. Add tests to verify behaviour. * CU-39cmvru Add possibility to force recalculation of hash for CDB (inlcuding when getting hash for CAT) * CU-39cmvru Add possibility to force recalculation of hash for CDB through modelcat creation (new parameter, propageting through _versioning) * CU-39cmvru Remove previous hash from influencing hashing of CDB to produce consistent hash on every recalculation Add tests to make sure that is the case on the CDB level as well as the CAT/modelpack level. 
* CU-39cmvru Add logging around the (re)calclulation of the CDB hash * CU-39cmvru Fix typo in log message * CU-39cmvru Add test to make sure the CDB hash is saved to disk and loaded from disk * CU-39cmvru Add possibility to calculate hash upon saving of CDB if/when the hash is unknown (i.e when saving outside a model pack) * CU-39cmvru Add CDB dirty flag to all other methods that modify the CDB * Change confusion matrix to DF and add labels * Fix model config * CU-86777ey74 No elastic dependency (#298) * Removed elastic dependency * CU-86777ey74 Remove module that depends on elastic (cogstack/cogstack_conn) * CU-86777ey74 Remove medcat.cogstack package from setup.py packages * Docstring updated to google-style docstring * CU-2e77a2k Remove unused utility modules * CU-2e77a2k Remove deprecated utils * Bump django from 3.2.16 to 3.2.17 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.16 to 3.2.17. - [Release notes](https://github.com/django/django/releases) - [Commits](https://github.com/django/django/compare/3.2.16...3.2.17) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * CU-33g0f3w Read the docs build failures (#306) * CU-33g0f3w Pin aiohttp dependency version for docs * CU-33g0f3w Pin aiohttp dependency version for docs (#303) * CU-33g0f3w Pin aiohttp dependency version for docs in setup.py * Read the docs build failures (#304) * CU-33g0f3w Pin aiohttp dependency version for docs * CU-33g0f3w Pin aiohttp dependency version for docs in setup.py * CU-33g0f3w Pin blis dependency version for docs in setup.py * Add options for loading meta models and additional NERs (#300) * CU-8677aud63 add options for loading meta models and addl NERs * CU-8677aud63 reduce memory usage during test * Style fix * NO-TICKET reduce the false positives on pushing to test pypi (#307) * CU-862j5by9q Regression touchup - metadata and ability to split suites into categories (#301) * CU-862j5by9q Add metadata to regression suite, loaded from model card if/when specified. A model can be specified upon creation to get the model card from. * CU-862j5by9q Remove f-string from string with no placeholders * CU-862j5by9q Make regression case hashable * CU-862j5by9q Add category separation to regression test suite along with automated tests and test example * CU-862j5by9q Add missing docstringgs to category separation * CU-862j5by9q Add saving to category separator and a convenience method for separation based on regression test YAML file and categories YAML file * CU-862j5by9q Add missing docstrings to new methods * CU-862j5by9q Fix typo in class name * CU-862j5by9q Fix saving issue for separation results * CU-862j5by9q Add runnable category separator * CU-862j5by9q Separate some file location constants in separation tests * CU-862j5by9q Add test for separation that checks that no information gets lost (in the specific situation) * CU-862j5by9q Add an anything-goes category description * CU-862j5by9q Fix anything-goes option * CU-862j5by9q Add tests for anything-goes category description * CU-862j5by9q Add possibility of using an 
overflow category when separating regression suite * CU-862j5by9q Add use of the overflow category to the runnable * CU-862j5by9q Fix linting and typing issues * CU-862j5by9q Add test for each individual separated suite * CU-862j5by9q Fix minor abstract class issues * CU-862j5by9q Rename categoryseparation module as category_separation * CU-862j5by9q Add docstrings to category_separator * CU-8677craqe make transformer_ner continue processing other entities after the first non-matching * Bump django from 3.2.17 to 3.2.18 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.17 to 3.2.18. - [Release notes](https://github.com/django/django/releases) - [Commits](https://github.com/django/django/compare/3.2.17...3.2.18) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] * CU-862j7b9jc Mypy full release - 1.0.0 (#308) * CU-862j7b9jc Add abstract base class to regression converting strategy where necessary * CU-862j7b9jc Bump mypy to version 1.0.0 * CU-862j7b9jc Mypy abc hotfix (#311) * CU-862j7b9jc Fix issue with duplicate imports * CU-862j7b9jc Fix issue with no whitespace after keyword (E275) * CU-862j7b9jc Remove unnecessary brackets from if statement * CU-8677ge6j8 Version identification and updating (#313) * Expose example model card version in metadata test * Add version detection along with tests * Move to a more comprehensive version string parser (regex) * Add more comprehensive versioning tests * Move MedCAT unzip to a separate method * Separate getting semantic version from string * Add new CDB with version information and use that with versioning tests * Add methods to get version info from CDB dump and model pack zip/folder * Exposing CDB file name and adding custom dev patch version support * Fix config.linking.filters.cuis - from empty dict to empty set * Add logging to versioning * Fix f-strings instead of (intended) r-strings * Add creating model pack archive to 
versioning CDB fix * Fix logger initialising * Making versioning a runnable module that allows fixing the config * Add docstrings to CLI methods * CU-8677ge6j8 Make explicit check regards to empty dict when fixing config * CU-8677ge6j8 Add tests regarding versioning changes * CU-8677ge6j8 Add missing return type hint * CU-8677ge6j8 Simplify action handling for CLI input * CU-8677ge6j8 Simplifying archive making method * Pin down transformers for the de-identification model (#314) * NO-TICKET pin down transformers for the de-id model * Added function to remove CUI from cdb (#316) * Added function to remove CUI from cdb * Unit test for remove_cui * CU-862jjprjw Fix github actions failures (#317) * Added function to remove CUI from cdb --------- Co-authored-by: antsh3k * CU-862jr8wkk Pin pydantic dependency to avoid conflicts with v2.0 (#318) * Bump django from 3.2.18 to 3.2.19 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.18 to 3.2.19. - [Commits](https://github.com/django/django/compare/3.2.18...3.2.19) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * CU-863gntc58 Umlspt2ch (#322) * CU-863gntc58 Add parent to child relationship getter to UMLS preprocessing * CU-863gntc58 Only use ISA relationships * Make sure parents do not have themselves as children * CU-863gntc58 Only keep preferred names * CU-863gntc58 Fix typing issues * CU-863gntc58 Fix child-parent relationships being saved instea * Better system for avoiding parent-child being the same * CU-86783u6d9 Add wrapper to simplify De-ID model usage * CU-86783u6d9 Add wrapper to simplify De-ID model usage * CU-86783u6d9 Fix typoe (nod vs not) * CU-86783u6d9 Fix typo in docstring * CU-86783u6d9 Change loading method name to match CAT * CU-86783u6d9 Separate NER model from DeID model * Better separation of NER models from DeID models * CU-86783u6d9 Move deid method from helpers module to deid model and deprecated the use of the wrappers in the helpers module * Fix imports in deid model * Fix deid training method return value * CU-86783u6d9 Fix dunder call defaults for redaction * CU-86783u6d9 Add a few simple tests for the DeID model * CU-86783u6d9 Add redaction test for the DeID model * CU-86783u6d9 Add remove senitive data * CU-86783u6d9 Fix deid model validation * CU-86783u6d9 Add ChatGPT generated DeId trian data * CU-86783u6d9 Add Warning regarding deid training data * CU-86783u6d9 Fix model issue with multiple NER models * CU-86783u6d9 Fix merge conflict in docstring * CU-86783u6d9 Try and fix keyword argument duplication * CU-86783u6d9 Ignore mypy where needed * CU-86783u6d9 Fix issue with NER model being returned when loading a DeID model * CU-86783u6d9 Remove unused import * CU-86783u6d9 Update training data with some more examples * CU-86783u6d9 Add type hints and doc string to deid method * CU-86783u6d9 Add comment regarding deid_text method being outside the model class * CU-86783u6d9 Add missing return type * CU-86783u6d9 Expose get_entities in NER model * CU-86783u6d9 Expose dunder call in NER model * CU-86783u6d9 
Remove dunder call in override in deid model * CU-86783u6d9 Fix deid model tests * CU-86783u6d9 Fix a few typos in docstrings * CU-86783u6d9 Fix a method name in docstrings --------- Signed-off-by: dependabot[bot] Signed-off-by: zethson Co-authored-by: tomolopolis Co-authored-by: Zeljko Co-authored-by: Sander Tan Co-authored-by: Xi Bai <82581439+baixiac@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Anthony Shek <55877857+antsh3k@users.noreply.github.com> Co-authored-by: Lukas Heumos Co-authored-by: antsh3k Co-authored-by: James Brandreth Co-authored-by: Xi Bai --- medcat/utils/ner/deid.py | 144 +++++++++++++++++++++++++++ medcat/utils/ner/helpers.py | 15 +-- medcat/utils/ner/model.py | 109 ++++++++++++++++++++ tests/resources/deid_train_data.json | 1 + tests/utils/ner/__init__.py | 0 tests/utils/ner/test_deid.py | 118 ++++++++++++++++++++++ 6 files changed, 380 insertions(+), 7 deletions(-) create mode 100644 medcat/utils/ner/deid.py create mode 100644 medcat/utils/ner/model.py create mode 100644 tests/resources/deid_train_data.json create mode 100644 tests/utils/ner/__init__.py create mode 100644 tests/utils/ner/test_deid.py diff --git a/medcat/utils/ner/deid.py b/medcat/utils/ner/deid.py new file mode 100644 index 000000000..122433b13 --- /dev/null +++ b/medcat/utils/ner/deid.py @@ -0,0 +1,144 @@ +"""De-identification model. + +This describes a wrapper on the regular CAT model. +The idea is to simplify the use of a DeId-specific model. 
+ +It tackles two use cases +1) Creation of a deid model +2) Loading and use of a deid model + +I.e for use case 1: + +Instead of: +cat = CAT(cdb=ner.cdb, addl_ner=ner) + +You can use: +deid = DeIdModel.create(ner) + + +And for use case 2: + +Instead of: +cat = CAT.load_model_pack(model_pack_path) +anon_text = deid_text(cat, text) + +You can use: +deid = DeIdModel.load_model_pack(model_pack_path) +anon_text = deid.deid_text(text) + +Or if/when structured output is desired: +deid = DeIdModel.load_model_pack(model_pack_path) +anon_doc = deid(text) # the spacy document + +The wrapper also exposes some CAT parts directly: +- config +- cdb +""" +from typing import Union, Tuple, Any + +from medcat.cat import CAT +from medcat.utils.ner.model import NerModel + + +class DeIdModel(NerModel): + """The DeID model. + + This wraps a CAT instance and simplifies its use as a + de-identification model. + + It provies methods for creating one from a TransformersNER + as well as loading from a model pack (along with some validation). + + It also exposes some useful parts of the CAT it wraps such as + the config and the concept database. + """ + + def __init__(self, cat: CAT) -> None: + self.cat = cat + + def train(self, json_path: Union[str, list, None], + *args, **kwargs) -> Tuple[Any, Any, Any]: + return super().train(json_path, *args, train_nr=0, **kwargs) # type: ignore + + def deid_text(self, text: str, redact: bool = False) -> str: + """Deidentify text and potentially redact information. + + Args: + text (str): The text to deidentify. + redact (bool): Whether to redact the information. + + Returns: + str: The deidentified text. + """ + return deid_text(self.cat, text, redact=redact) + + @classmethod + def load_model_pack(cls, model_pack_path: str) -> 'DeIdModel': + """Load DeId model from model pack. + + The method first loads the CAT instance. + + It then makes sure that the model pack corresponds to a + valid DeId model. + + Args: + model_pack_path (str): The model pack path. 
+ + Raises: + ValueError: If the model pack does not correspond to a DeId model. + + Returns: + DeIdModel: The resulting DeId model. + """ + ner_model = NerModel.load_model_pack(model_pack_path) + cat = ner_model.cat + if not cls._is_deid_model(cat): + raise ValueError( + f"The model saved at {model_pack_path} is not a deid model " + f"({cls._get_reason_not_deid(cat)})") + model = cls(ner_model.cat) + return model + + @classmethod + def _is_deid_model(cls, cat: CAT) -> bool: + return not bool(cls._get_reason_not_deid(cat)) + + @classmethod + def _get_reason_not_deid(cls, cat: CAT) -> str: + if cat.vocab is not None: + return "Has vocab" + if len(cat._addl_ner) != 1: + return f"Incorrect number of addl_ner: {len(cat._addl_ner)}" + return "" + + +# For now, we will keep this method separate from the above class +# This is so that we wouldn't need to create a throwaway object +# when calling the method from .helpers where it used to be. +# After the deprecated method in .helpers is removed, we can +# move this to a proper class method. +def deid_text(cat: CAT, text: str, redact: bool = False) -> str: + """De-identify text. + + De-identified text. + If redaction is enabled, identifiable entities will be + replaced with stars (e.g. `*****`). + Otherwise, the replacement will be the CUI or in other words, + the type of information that was hidden (e.g. [PATIENT]). + + + Args: + cat (CAT): The CAT object to use for deid. + text (str): The input document. + redact (bool, optional): Whether to redact. Defaults to False. + + Returns: + str: The de-identified document. 
+ """ + new_text = str(text) + entities = cat.get_entities(text)['entities'] + for ent in sorted(entities.values(), key=lambda ent: ent['start'], reverse=True): + r = "*"*(ent['end']-ent['start'] + ) if redact else cat.cdb.get_name(ent['cui']) + new_text = new_text[:ent['start']] + f'[{r}]' + new_text[ent['end']:] + return new_text diff --git a/medcat/utils/ner/helpers.py b/medcat/utils/ner/helpers.py index 65b8660e9..518aecc22 100644 --- a/medcat/utils/ner/helpers.py +++ b/medcat/utils/ner/helpers.py @@ -1,14 +1,15 @@ from medcat.utils.data_utils import count_annotations from medcat.cdb import CDB +from medcat.utils.ner.deid import deid_text as _deid_text +from medcat.utils.decorators import deprecated -def deid_text(cat, text, redact=False): - new_text = str(text) - entities = cat.get_entities(text)['entities'] - for ent in sorted(entities.values(), key=lambda ent: ent['start'], reverse=True): - r = "*"*(ent['end']-ent['start']) if redact else cat.cdb.get_name(ent['cui']) - new_text = new_text[:ent['start']] + f'[{r}]' + new_text[ent['end']:] - return new_text + +@deprecated("API now allows creating a DeId model (medcat.utils.ner.deid.DeIdModel). " + "It aims to simplify the usage of DeId models. " + "The use of this model is encouraged over the use of this method.") +def deid_text(*args, **kwargs) -> str: + return _deid_text(*args, **kwargs) def make_or_update_cdb(json_path, cdb=None, min_count=0): diff --git a/medcat/utils/ner/model.py b/medcat/utils/ner/model.py new file mode 100644 index 000000000..553fb4c65 --- /dev/null +++ b/medcat/utils/ner/model.py @@ -0,0 +1,109 @@ +from typing import Any, List, Tuple, Union, Optional + +from spacy.tokens import Doc + +from medcat.ner.transformers_ner import TransformersNER +from medcat.cat import CAT +from medcat.cdb import CDB +from medcat.config import Config + + +class NerModel: + + """The NER model. + + This wraps a CAT instance and simplifies its use as a + NER model. 
+ + It provides methods for creating one from a TransformersNER + as well as loading from a model pack (along with some validation). + + It also exposes some useful parts of the CAT it wraps such as + the config and the concept database. + """ + + def __init__(self, cat: CAT) -> None: + self.cat = cat + + def train(self, json_path: Union[str, list, None], train_nr: int = 0, + *args, **kwargs) -> Tuple[Any, Any, Any]: + """Train the underlying transformers NER model. + + All the extra arguments are passed to the TransformersNER train method. + + Args: + json_path (Union[str, list, None]): The JSON file path to read the training data from. + train_nr (int, optional): The number of the NER object in cat._addl_ner to train. Defaults to 0. + + Returns: + Tuple[Any, Any, Any]: df, examples, dataset + """ + return self.cat._addl_ner[train_nr].train(json_path, *args, **kwargs) + + def __call__(self, text: Optional[str], *args, **kwargs) -> Optional[Doc]: + """Get the annotated document for text. + + Undefined arguments and keyword arguments get passed on to + the equivalent `CAT` method. + + Args: + text (Optional[str]): The input text. + + Returns: + Optional[Doc]: The annotated document. + """ + return self.cat(text, *args, **kwargs) + + def get_entities(self, text: str, *args, **kwargs) -> dict: + """Gets the entities recognized within a given text. + + The output format is identical to `CAT.get_entities`. + + Undefined arguments and keyword arguments get passed on to + CAT.get_entities. + + Args: + text (str): The input text. + + Returns: + dict: The output entities. 
+ """ + return self.cat.get_entities(text, *args, **kwargs) + + @property + def config(self) -> Config: + return self.cat.config + + @property + def cdb(self) -> CDB: + return self.cat.cdb + + @classmethod + def create(cls, ner: Union[TransformersNER, List[TransformersNER]]) -> 'NerModel': + """Create a NER model with a TransformersNER + + Args: + ner (Union[TransformersNER, List[TransformersNER]]): The TransformersNER instance(s). + + Returns: + NerModel: The resulting model + """ + # expecting all to have the same CDB + cdb = ner.cdb if isinstance(ner, TransformersNER) else ner[0].cdb + cat = CAT(cdb=cdb, addl_ner=ner) + return cls(cat) + + @classmethod + def load_model_pack(cls, model_pack_path: str) -> 'NerModel': + """Load NER model from model pack. + + The method first wraps the loaded CAT instance. + + Args: + model_pack_path (str): The model pack path. + + Returns: + NerModel: The resulting NER model. + """ + cat = CAT.load_model_pack(model_pack_path) + return cls(cat) diff --git a/tests/resources/deid_train_data.json b/tests/resources/deid_train_data.json new file mode 100644 index 000000000..55310bd5d --- /dev/null +++ b/tests/resources/deid_train_data.json @@ -0,0 +1 @@ +{"projects": [{"name": "chatGPT-gen", "documents": [{"name": "doc_0", "text": "\nPatient Name: John Smith\nAddress: 15 Maple Avenue\nCity: New York\nCC: Chronic back pain\n\nHX: Mr. Smith is a 52-year-old male who has been experiencing chronic back pain for the past six months. The pain initially started after a lifting incident at work. He describes the pain as a dull ache in the lower back, which worsens with prolonged sitting or standing. He has tried over-the-counter pain medications with limited relief. Mr. Smith decided to seek medical attention due to the persistent nature of his symptoms.\n\nFHX: No significant family history of back pain or spinal conditions.\n\nSHX: Office worker. Non-smoker. 
Occasional alcohol consumption.\n\nPhysical examination revealed tenderness over the lumbar spine with no signs of neurological deficit. X-rays performed on 6/10/2023 showed degenerative changes in the lumbar spine, consistent with spondylosis.\n\nSeen by Dr. R. Johnson on 6/15/2023.\n\n", "annotations": [{"start": 15, "end": 25, "cui": "PATIENT", "value": " John Smith"}, {"start": 35, "end": 50, "cui": "HOSPITAL", "value": " 15 Maple Avenue"}, {"start": 57, "end": 65, "cui": "HOSPITAL", "value": " New York"}, {"start": 97, "end": 102, "cui": "PATIENT", "value": " Smith"}, {"start": 433, "end": 438, "cui": "PATIENT", "value": " Smith"}, {"start": 879, "end": 880, "cui": "DOCTOR", "value": " R"}, {"start": 882, "end": 889, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_1", "text": "\nPatient Name: Emily Davis\nAddress: 22 Willow Lane\nCity: Los Angeles\nCC: Allergic rhinitis\n\nHX: Miss Davis is a 28-year-old female who presents with symptoms of allergic rhinitis. She complains of frequent sneezing, nasal congestion, and itchy eyes, which have been bothering her for the past two years. Symptoms are worse during the spring and fall seasons and improve with over-the-counter antihistamines. Miss Davis seeks medical advice to explore other treatment options.\n\nFHX: No significant family history of allergic rhinitis or other allergic conditions.\n\nSHX: Office administrator. Non-smoker. No alcohol or drug use.\n\nNasal examination revealed pale, boggy nasal mucosa with clear nasal discharge. Skin prick testing conducted on 6/12/2023 demonstrated positive reactions to grass pollen and dust mites.\n\nSeen by Dr. S. 
Patel on 6/17/2023.\n\n", "annotations": [{"start": 15, "end": 26, "cui": "PATIENT", "value": " Emily Davis"}, {"start": 36, "end": 50, "cui": "HOSPITAL", "value": " 22 Willow Lane"}, {"start": 57, "end": 68, "cui": "HOSPITAL", "value": " Los Angeles"}, {"start": 101, "end": 106, "cui": "PATIENT", "value": " Davis"}, {"start": 413, "end": 418, "cui": "PATIENT", "value": " Davis"}, {"start": 827, "end": 828, "cui": "DOCTOR", "value": " S"}, {"start": 830, "end": 835, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_2", "text": "\nPatient Name: Michael Johnson\nAddress: 10 Oak Street\nCity: Chicago\nCC: Acute bronchitis\n\nHX: Mr. Johnson is a 42-year-old male who presents with symptoms of acute bronchitis. He reports a cough productive of yellowish sputum, mild chest discomfort, and low-grade fever for the past five days. He denies any shortness of breath or wheezing. Mr. Johnson sought medical attention due to the persistence of symptoms and concern about the nature of his illness.\n\nFHX: No significant family history of respiratory conditions or chronic lung diseases.\n\nSHX: Construction worker. Non-smoker. Occasional alcohol consumption.\n\nPulmonary examination revealed scattered coarse breath sounds with no signs of consolidation. Chest X-ray performed on 6/13/2023 showed no evidence of pneumonia.\n\nSeen by Dr. L. 
Anderson on 6/16/2023.\n\n", "annotations": [{"start": 15, "end": 30, "cui": "PATIENT", "value": " Michael Johnson"}, {"start": 40, "end": 53, "cui": "HOSPITAL", "value": " 10 Oak Street"}, {"start": 60, "end": 67, "cui": "HOSPITAL", "value": " Chicago"}, {"start": 98, "end": 105, "cui": "PATIENT", "value": " Johnson"}, {"start": 345, "end": 352, "cui": "PATIENT", "value": " Johnson"}, {"start": 793, "end": 794, "cui": "DOCTOR", "value": " L"}, {"start": 796, "end": 804, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_3", "text": "\nPatient Name: Sarah Thompson\nAddress: 5 Elm Street\nCity: San Francisco\nCC: Migraine headaches\n\nHX: Miss Thompson is a 30-year-old female who complains of recurrent migraine headaches. She describes the headaches as pulsating, moderate to severe in intensity, lasting for several hours to a day. The headaches are usually accompanied by nausea, vomiting, and sensitivity to light and sound. Miss Thompson reports experiencing these episodes once or twice a month for the past two years. She seeks medical advice to explore treatment options and alleviate her symptoms.\n\nFHX: Maternal aunt had a history of migraines. No other significant family history of neurological conditions.\n\nSHX: Graphic designer. Non-smoker. Rare alcohol consumption.\n\nNeurological examination revealed no focal deficits. Miss Thompson's headache characteristics and frequency are consistent with a diagnosis of migraines.\n\nSeen by Dr. K. 
Roberts on 6/19/2023.\n\n", "annotations": [{"start": 15, "end": 29, "cui": "PATIENT", "value": " Sarah Thompson"}, {"start": 39, "end": 51, "cui": "HOSPITAL", "value": " 5 Elm Street"}, {"start": 58, "end": 71, "cui": "HOSPITAL", "value": " San Francisco"}, {"start": 105, "end": 113, "cui": "PATIENT", "value": " Thompson"}, {"start": 396, "end": 404, "cui": "PATIENT", "value": " Thompson"}, {"start": 802, "end": 810, "cui": "PATIENT", "value": " Thompson"}, {"start": 911, "end": 912, "cui": "DOCTOR", "value": " K"}, {"start": 914, "end": 921, "cui": "PATIENT", "value": " Roberts"}]}, {"name": "doc_4", "text": "\nPatient Name: David Wilson\nAddress: 3 Pine Street\nCity: Houston\nCC: Gastroesophageal reflux disease (GERD)\n\nHX: Mr. Wilson is a 48-year-old male who presents with symptoms of gastroesophageal reflux disease. He complains of frequent heartburn, regurgitation, and a bitter taste in his mouth, particularly after meals. Symptoms have been bothering him for the past six months, and he has noticed a decrease in his appetite and unintentional weight loss. Mr. Wilson seeks medical advice to manage his symptoms and address the weight loss.\n\nFHX: No significant family history of gastrointestinal conditions.\n\nSHX: Accountant. Non-smoker. Occasional alcohol consumption.\n\nAbdominal examination revealed epigastric tenderness. Upper endoscopy performed on 6/16/2023 demonstrated evidence of esophagitis and hiatal hernia.\n\nSeen by Dr. J. 
Anderson on 6/21/2023.\n\n", "annotations": [{"start": 15, "end": 27, "cui": "PATIENT", "value": " David Wilson"}, {"start": 37, "end": 50, "cui": "HOSPITAL", "value": " 3 Pine Street"}, {"start": 57, "end": 64, "cui": "HOSPITAL", "value": " Houston"}, {"start": 117, "end": 123, "cui": "PATIENT", "value": " Wilson"}, {"start": 458, "end": 464, "cui": "PATIENT", "value": " Wilson"}, {"start": 831, "end": 832, "cui": "DOCTOR", "value": " J"}, {"start": 834, "end": 842, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_5", "text": "\nPatient Name: Olivia Martinez\nAddress: 12 Rose Lane\nCity: Miami\nCC: Depression\n\nHX: Miss Martinez is a 36-year-old female who presents with symptoms of depression. She reports feeling persistent sadness, loss of interest in activities, decreased energy, changes in appetite and sleep patterns, and difficulty concentrating for the past six months. These symptoms have significantly affected her daily functioning and overall quality of life. Miss Martinez seeks medical assistance to address her depressive symptoms.\n\nFHX: No significant family history of mood disorders.\n\nSHX: Teacher. Non-smoker. No alcohol or drug use.\n\nPsychiatric evaluation revealed a depressed mood, anhedonia, and impaired concentration. Based on the clinical presentation, Miss Martinez meets the criteria for major depressive disorder.\n\nSeen by Dr. A. 
Ramirez on 6/23/2023.\n\n", "annotations": [{"start": 15, "end": 30, "cui": "PATIENT", "value": " Olivia Martinez"}, {"start": 40, "end": 52, "cui": "HOSPITAL", "value": " 12 Rose Lane"}, {"start": 59, "end": 64, "cui": "HOSPITAL", "value": " Miami"}, {"start": 90, "end": 98, "cui": "PATIENT", "value": " Martinez"}, {"start": 448, "end": 456, "cui": "PATIENT", "value": " Martinez"}, {"start": 755, "end": 763, "cui": "PATIENT", "value": " Martinez"}, {"start": 827, "end": 828, "cui": "DOCTOR", "value": " A"}, {"start": 830, "end": 837, "cui": "PATIENT", "value": " Ramirez"}]}, {"name": "doc_6", "text": "\nPatient Name: Daniel Lee\nAddress: 8 Maple Street\nCity: Seattle\nCC: Hypertension\n\nHX: Mr. Lee is a 58-year-old male who presents with elevated blood pressure readings during routine check-ups. He has a family history of hypertension and is concerned about his cardiovascular health. Mr. Lee has no associated symptoms but seeks medical advice to manage his blood pressure and reduce the risk of complications.\n\nFHX: Father and paternal grandfather had hypertension. No other significant family history of cardiovascular diseases.\n\nSHX: Engineer. Non-smoker. Occasional alcohol consumption.\n\nPhysical examination revealed blood pressure consistently above the normal range. Further investigations, including 24-hour ambulatory blood pressure monitoring, confirmed the diagnosis of essential hypertension.\n\nSeen by Dr. H. 
Johnson on 6/25/2023.\n\n", "annotations": [{"start": 15, "end": 25, "cui": "PATIENT", "value": " Daniel Lee"}, {"start": 35, "end": 49, "cui": "HOSPITAL", "value": " 8 Maple Street"}, {"start": 56, "end": 63, "cui": "HOSPITAL", "value": " Seattle"}, {"start": 90, "end": 93, "cui": "PATIENT", "value": " Lee"}, {"start": 287, "end": 290, "cui": "PATIENT", "value": " Lee"}, {"start": 817, "end": 818, "cui": "DOCTOR", "value": " H"}, {"start": 820, "end": 827, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_7", "text": "\nPatient Name: Sophia Adams\nAddress: 18 Cedar Avenue\nCity: Boston\nCC: Urinary tract infection (UTI)\n\nHX: Miss Adams is a 24-year-old female who complains of urinary frequency, urgency, and a burning sensation during urination. Symptoms started two days ago and have progressively worsened. She denies any hematuria or fever. Miss Adams seeks medical attention due to the persistence of symptoms and concern about a possible urinary tract infection.\n\nFHX: No significant family history of urinary tract infections.\n\nSHX: Marketing executive. Non-smoker. No alcohol or drug use.\n\nUrinalysis revealed pyuria and positive leukocyte esterase, indicating a urinary tract infection. A midstream urine culture confirmed the presence of Escherichia coli.\n\nSeen by Dr. M. Patel on 6/28/2023.\n\n", "annotations": [{"start": 15, "end": 27, "cui": "PATIENT", "value": " Sophia Adams"}, {"start": 37, "end": 52, "cui": "HOSPITAL", "value": " 18 Cedar Avenue"}, {"start": 59, "end": 65, "cui": "HOSPITAL", "value": " Boston"}, {"start": 110, "end": 115, "cui": "PATIENT", "value": " Adams"}, {"start": 330, "end": 335, "cui": "PATIENT", "value": " Adams"}, {"start": 759, "end": 760, "cui": "DOCTOR", "value": " M"}, {"start": 762, "end": 767, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_8", "text": "\nPatient Name: Benjamin Thompson\nAddress: 25 Oak Street\nCity: Chicago\nCC: Seasonal allergies\n\nHX: Mr. 
Thompson is a 40-year-old male who presents with symptoms of seasonal allergies. He reports sneezing, itching, and a runny nose, particularly during the spring and summer months. Symptoms significantly interfere with his daily activities and sleep. Mr. Thompson seeks medical advice to manage his allergic symptoms.\n\nFHX: Mother had a history of seasonal allergies. No other significant family history of allergic conditions.\n\nSHX: IT specialist. Non-smoker. No alcohol or drug use.\n\nAllergy testing conducted on 6/26/2023 demonstrated positive reactions to grass pollen and tree pollen.\n\nSeen by Dr. E. Anderson on 6/30/2023.\n\n", "annotations": [{"start": 15, "end": 32, "cui": "PATIENT", "value": " Benjamin Thompson"}, {"start": 42, "end": 55, "cui": "HOSPITAL", "value": " 25 Oak Street"}, {"start": 62, "end": 69, "cui": "HOSPITAL", "value": " Chicago"}, {"start": 102, "end": 110, "cui": "PATIENT", "value": " Thompson"}, {"start": 355, "end": 363, "cui": "PATIENT", "value": " Thompson"}, {"start": 703, "end": 704, "cui": "DOCTOR", "value": " E"}, {"start": 706, "end": 714, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_9", "text": "\nPatient Name: Emma Davis\nAddress: 6 Willow Lane\nCity: Los Angeles\nCC: Anxiety\n\nHX: Miss Davis is a 32-year-old female who presents with symptoms of anxiety. She reports excessive worrying, restlessness, irritability, muscle tension, and difficulty concentrating. These symptoms have been present for the past six months and have\n\n\n", "annotations": [{"start": 15, "end": 25, "cui": "PATIENT", "value": " Emma Davis"}, {"start": 35, "end": 48, "cui": "HOSPITAL", "value": " 6 Willow Lane"}, {"start": 55, "end": 66, "cui": "HOSPITAL", "value": " Los Angeles"}, {"start": 89, "end": 94, "cui": "PATIENT", "value": " Davis"}]}, {"name": "doc_10", "text": "\nPatient Name: Alexander Johnson\nAddress: 9 Elm Street\nCity: San Francisco\nCC: Asthma\n\nHX: Mr. Johnson is a 28-year-old male who presents with symptoms of asthma. 
He complains of recurrent episodes of wheezing, shortness of breath, and chest tightness, particularly during physical activity and exposure to triggers such as dust and pollen. Symptoms have been present since childhood and have recently worsened. Mr. Johnson seeks medical assistance to manage his asthma symptoms and improve his quality of life.\n\nFHX: Mother and paternal uncle have a history of asthma. No other significant family history of respiratory conditions.\n\nSHX: Sales representative. Non-smoker. No alcohol or drug use.\n\nPulmonary function tests revealed airflow obstruction with significant reversibility after bronchodilator administration, confirming the diagnosis of asthma.\n\nSeen by Dr. N. Patel on 7/2/2023.\n\n", "annotations": [{"start": 15, "end": 32, "cui": "PATIENT", "value": " Alexander Johnson"}, {"start": 42, "end": 54, "cui": "HOSPITAL", "value": " 9 Elm Street"}, {"start": 61, "end": 74, "cui": "HOSPITAL", "value": " San Francisco"}, {"start": 95, "end": 102, "cui": "PATIENT", "value": " Johnson"}, {"start": 416, "end": 423, "cui": "PATIENT", "value": " Johnson"}, {"start": 869, "end": 870, "cui": "DOCTOR", "value": " N"}, {"start": 872, "end": 877, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_11", "text": "\nPatient Name: Lily Wilson\nAddress: 4 Pine Street\nCity: Houston\nCC: Gastroenteritis\n\nHX: Miss Wilson is a 22-year-old female who presents with symptoms of gastroenteritis. She reports diarrhea, abdominal cramping, nausea, and vomiting, which started after consuming a meal at a local restaurant. Symptoms have been ongoing for the past 24 hours, and she is concerned about dehydration and the persistence of symptoms. Miss Wilson seeks medical advice for symptom relief and to ensure appropriate management.\n\nFHX: No significant family history of gastrointestinal conditions.\n\nSHX: Student. Non-smoker. No alcohol or drug use.\n\nPhysical examination revealed mild abdominal tenderness with no signs of peritonitis. 
Based on the clinical presentation and recent food exposure, the diagnosis of gastroenteritis is likely.\n\nSeen by Dr. K. Roberts on 7/5/2023.\n\n", "annotations": [{"start": 15, "end": 26, "cui": "PATIENT", "value": " Lily Wilson"}, {"start": 36, "end": 49, "cui": "HOSPITAL", "value": " 4 Pine Street"}, {"start": 56, "end": 63, "cui": "HOSPITAL", "value": " Houston"}, {"start": 94, "end": 100, "cui": "PATIENT", "value": " Wilson"}, {"start": 423, "end": 429, "cui": "PATIENT", "value": " Wilson"}, {"start": 832, "end": 833, "cui": "DOCTOR", "value": " K"}, {"start": 835, "end": 842, "cui": "PATIENT", "value": " Roberts"}]}, {"name": "doc_12", "text": "\nPatient Name: Noah Thompson\nAddress: 19 Cedar Avenue\nCity: Boston\nCC: Insomnia\n\nHX: Mr. Thompson is a 45-year-old male who complains of difficulty falling asleep and maintaining sleep. He reports frequent awakenings during the night and feeling unrefreshed upon waking up. These symptoms have been present for the past three months and significantly affect his daytime functioning. Mr. Thompson seeks medical assistance to address his insomnia and improve his sleep quality.\n\nFHX: No significant family history of sleep disorders.\n\nSHX: Financial analyst. Non-smoker. Occasional alcohol consumption.\n\nSleep diary records revealed prolonged sleep latency and frequent awakenings during the night. Based on the clinical presentation, Mr. Thompson meets the criteria for chronic insomnia disorder.\n\nSeen by Dr. S. 
Ramirez on 7/8/2023.\n\n", "annotations": [{"start": 15, "end": 28, "cui": "PATIENT", "value": " Noah Thompson"}, {"start": 38, "end": 53, "cui": "HOSPITAL", "value": " 19 Cedar Avenue"}, {"start": 60, "end": 66, "cui": "HOSPITAL", "value": " Boston"}, {"start": 89, "end": 97, "cui": "PATIENT", "value": " Thompson"}, {"start": 387, "end": 395, "cui": "PATIENT", "value": " Thompson"}, {"start": 737, "end": 745, "cui": "PATIENT", "value": " Thompson"}, {"start": 809, "end": 810, "cui": "DOCTOR", "value": " S"}, {"start": 812, "end": 819, "cui": "PATIENT", "value": " Ramirez"}]}, {"name": "doc_13", "text": "\nPatient Name: Chloe Adams\nAddress: 14 Cedar Avenue\nCity: Boston\nCC: Sinusitis\n\nHX: Miss Adams is a 26-year-old female who presents with symptoms of sinusitis. She reports nasal congestion, facial pressure, headache, and thick nasal discharge, which have been bothering her for the past week. Miss Adams tried over-the-counter nasal decongestants with minimal relief. She seeks medical assistance to manage her symptoms and prevent complications.\n\nFHX: No significant family history of sinusitis or chronic sinus conditions.\n\nSHX: Graphic designer. Non-smoker. No alcohol or drug use.\n\nNasal examination revealed erythematous nasal mucosa with purulent discharge. Based on the clinical presentation, Miss Adams is diagnosed with acute sinusitis.\n\nSeen by Dr. L. 
Anderson on 7/11/2023.\n\n", "annotations": [{"start": 15, "end": 26, "cui": "PATIENT", "value": " Chloe Adams"}, {"start": 36, "end": 51, "cui": "HOSPITAL", "value": " 14 Cedar Avenue"}, {"start": 58, "end": 64, "cui": "HOSPITAL", "value": " Boston"}, {"start": 89, "end": 94, "cui": "PATIENT", "value": " Adams"}, {"start": 298, "end": 303, "cui": "PATIENT", "value": " Adams"}, {"start": 705, "end": 710, "cui": "PATIENT", "value": " Adams"}, {"start": 759, "end": 760, "cui": "DOCTOR", "value": " L"}, {"start": 762, "end": 770, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_14", "text": "\nPatient Name: Grace Turner\nAddress: 11 Maple Avenue\nCity: New York\nCC: Rheumatoid arthritis\n\nHX: Miss Turner\n", "annotations": [{"start": 15, "end": 27, "cui": "PATIENT", "value": " Grace Turner"}, {"start": 37, "end": 52, "cui": "HOSPITAL", "value": " 11 Maple Avenue"}, {"start": 59, "end": 67, "cui": "HOSPITAL", "value": " New York"}, {"start": 103, "end": 109, "cui": "PATIENT", "value": " Turner"}]}, {"name": "doc_15", "text": "\nPatient Name: Ethan Harris\nAddress: 16 Pine Street\nCity: Houston\nCC: Gout\n\nHX: Mr. Harris is a 55-year-old male who presents with symptoms of gout. He reports sudden and severe joint pain, swelling, and redness in his right big toe. The symptoms started yesterday, and he has a history of similar episodes in the past. Mr. Harris seeks medical assistance to manage his acute gout attack and prevent future flares.\n\nFHX: No significant family history of gout or other rheumatic conditions.\n\nSHX: Retired. Non-smoker. Occasional alcohol consumption.\n\nPhysical examination revealed warmth, tenderness, and erythema in the affected joint. Based on the clinical presentation and history of recurrent episodes, Mr. Harris is diagnosed with acute gouty arthritis.\n\nSeen by Dr. M. 
Johnson on 7/14/2023.\n\n", "annotations": [{"start": 15, "end": 27, "cui": "PATIENT", "value": " Ethan Harris"}, {"start": 37, "end": 51, "cui": "HOSPITAL", "value": " 16 Pine Street"}, {"start": 58, "end": 65, "cui": "HOSPITAL", "value": " Houston"}, {"start": 84, "end": 90, "cui": "PATIENT", "value": " Harris"}, {"start": 324, "end": 330, "cui": "PATIENT", "value": " Harris"}, {"start": 710, "end": 716, "cui": "PATIENT", "value": " Harris"}, {"start": 771, "end": 772, "cui": "DOCTOR", "value": " M"}, {"start": 774, "end": 781, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_16", "text": "\nPatient Name: Mia Clark\nAddress: 7 Willow Lane\nCity: Los Angeles\nCC: Urinary incontinence\n\nHX: Miss Clark is a 62-year-old female who complains of urinary incontinence. She reports involuntary urine leakage, particularly with coughing, sneezing, and physical exertion. Symptoms have been present for the past six months and have progressively worsened. Miss Clark seeks medical advice to address her urinary incontinence and improve her quality of life.\n\nFHX: No significant family history of urinary incontinence or pelvic floor disorders.\n\nSHX: Retired. Non-smoker. No alcohol or drug use.\n\nPelvic examination revealed weakened pelvic floor muscles. Based on the clinical presentation, Miss Clark is diagnosed with stress urinary incontinence.\n\nSeen by Dr. E. 
Patel on 7/17/2023.\n\n", "annotations": [{"start": 15, "end": 24, "cui": "PATIENT", "value": " Mia Clark"}, {"start": 34, "end": 47, "cui": "HOSPITAL", "value": " 7 Willow Lane"}, {"start": 54, "end": 65, "cui": "HOSPITAL", "value": " Los Angeles"}, {"start": 101, "end": 106, "cui": "PATIENT", "value": " Clark"}, {"start": 359, "end": 364, "cui": "PATIENT", "value": " Clark"}, {"start": 694, "end": 699, "cui": "PATIENT", "value": " Clark"}, {"start": 760, "end": 761, "cui": "DOCTOR", "value": " E"}, {"start": 763, "end": 768, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_17", "text": "\nPatient Name: Samuel Wright\nAddress: 20 Oak Street\nCity: Chicago\nCC: Osteoarthritis\n\nHX: Mr. Wright is a 70-year-old male who presents with symptoms of osteoarthritis. He reports joint pain, stiffness, and reduced range of motion in his knees and hands. Symptoms have been progressively worsening over the past year and significantly affect his daily activities. Mr. Wright seeks medical assistance to manage his osteoarthritis symptoms and improve his functional ability.\n\nFHX: No significant family history of musculoskeletal conditions.\n\nSHX: Retired. Non-smoker. No alcohol or drug use.\n\nPhysical examination revealed crepitus, bony enlargement, and limited range of motion in the affected joints. Based on the clinical presentation and imaging findings, Mr. Wright is diagnosed with osteoarthritis.\n\nSeen by Dr. R. 
Anderson on 7/20/2023.\n\n", "annotations": [{"start": 15, "end": 28, "cui": "PATIENT", "value": " Samuel Wright"}, {"start": 38, "end": 51, "cui": "HOSPITAL", "value": " 20 Oak Street"}, {"start": 58, "end": 65, "cui": "HOSPITAL", "value": " Chicago"}, {"start": 94, "end": 100, "cui": "PATIENT", "value": " Wright"}, {"start": 368, "end": 374, "cui": "PATIENT", "value": " Wright"}, {"start": 764, "end": 770, "cui": "PATIENT", "value": " Wright"}, {"start": 818, "end": 819, "cui": "DOCTOR", "value": " R"}, {"start": 821, "end": 829, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_18", "text": "\nPatient Name: Harper Turner\nAddress: 13 Maple Avenue\nCity: New York\nCC: Hypothyroidism\n\nHX: Miss Turner is a 30-year-old female who presents with symptoms of hypothyroidism. She reports fatigue, weight gain, cold intolerance, constipation, and dry skin. These symptoms have been present for the past six months and have gradually worsened. Miss Turner seeks medical assistance to evaluate her thyroid function and explore appropriate treatment options.\n\nFHX: No significant family history of thyroid disorders.\n\nSHX: Office manager. Non-smoker. No alcohol or drug use.\n\nLaboratory tests revealed elevated thyroid-stimulating hormone (TSH) levels and decreased free thyroxine (T4) levels, confirming the diagnosis of primary hypothyroidism.\n\nSeen by Dr. S. 
Johnson on 7/23/2023.\n\n", "annotations": [{"start": 15, "end": 28, "cui": "PATIENT", "value": " Harper Turner"}, {"start": 38, "end": 53, "cui": "HOSPITAL", "value": " 13 Maple Avenue"}, {"start": 60, "end": 68, "cui": "HOSPITAL", "value": " New York"}, {"start": 98, "end": 104, "cui": "PATIENT", "value": " Turner"}, {"start": 346, "end": 352, "cui": "PATIENT", "value": " Turner"}, {"start": 754, "end": 755, "cui": "DOCTOR", "value": " S"}, {"start": 757, "end": 764, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_19", "text": "\nPatient Name: Ava Lewis\nAddress: 10 Pine Street\nCity: Houston\n\n", "annotations": [{"start": 15, "end": 24, "cui": "PATIENT", "value": " Ava Lewis"}, {"start": 34, "end": 48, "cui": "HOSPITAL", "value": " 10 Pine Street"}]}, {"name": "doc_20", "text": "\nPatient Name: Henry Adams\nAddress: 5 Elm Street\nCity: San Francisco\nCC: Type 2 diabetes mellitus\n\nHX: Mr. Adams is a 50-year-old male who presents with symptoms of increased thirst, frequent urination, and unintentional weight loss. He reports feeling fatigued and has a family history of diabetes. Laboratory tests revealed elevated fasting blood glucose levels and HbA1c levels, indicating poor glycemic control. Mr. Adams seeks medical assistance to manage his diabetes and prevent complications.\n\nFHX: Father and two siblings have a history of type 2 diabetes.\n\nSHX: Teacher. Non-smoker. No alcohol or drug use.\n\nBased on the clinical presentation and laboratory findings, Mr. Adams is diagnosed with type 2 diabetes mellitus.\n\nSeen by Dr. N. 
Patel on 7/26/2023.\n\n", "annotations": [{"start": 15, "end": 26, "cui": "PATIENT", "value": " Henry Adams"}, {"start": 36, "end": 48, "cui": "HOSPITAL", "value": " 5 Elm Street"}, {"start": 55, "end": 68, "cui": "HOSPITAL", "value": " San Francisco"}, {"start": 107, "end": 112, "cui": "PATIENT", "value": " Adams"}, {"start": 420, "end": 425, "cui": "PATIENT", "value": " Adams"}, {"start": 682, "end": 687, "cui": "PATIENT", "value": " Adams"}, {"start": 745, "end": 746, "cui": "DOCTOR", "value": " N"}, {"start": 748, "end": 753, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_21", "text": "\nPatient Name: Emily Wright\nAddress: 21 Oak Street\nCity: Chicago\nCC: Migraine headaches\n\nHX: Miss Wright is a 25-year-old female who presents with recurrent episodes of severe headache accompanied by nausea, vomiting, and sensitivity to light and sound. She reports experiencing these symptoms since adolescence and seeks medical assistance to manage her migraines and improve her quality of life.\n\nFHX: Mother has a history of migraines.\n\nSHX: Graphic designer. Non-smoker. Occasional alcohol consumption.\n\nThe clinical presentation and symptom pattern are consistent with a diagnosis of migraine headaches.\n\nSeen by Dr. E. Anderson on 7/29/2023.\n\n", "annotations": [{"start": 15, "end": 27, "cui": "PATIENT", "value": " Emily Wright"}, {"start": 37, "end": 50, "cui": "HOSPITAL", "value": " 21 Oak Street"}, {"start": 57, "end": 64, "cui": "HOSPITAL", "value": " Chicago"}, {"start": 98, "end": 104, "cui": "PATIENT", "value": " Wright"}, {"start": 622, "end": 623, "cui": "DOCTOR", "value": " E"}, {"start": 625, "end": 633, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_22", "text": "\nPatient Name: Oliver Mitchell\nAddress: 15 Cedar Avenue\nCity: Boston\nCC: Plantar fasciitis\n\nHX: Mr. Mitchell is a 42-year-old male who presents with heel pain that is worse in the morning and improves with activity. 
He reports experiencing pain for the past three months, particularly after prolonged periods of standing or walking. Mr. Mitchell seeks medical assistance to alleviate his foot pain and restore his normal daily activities.\n\nFHX: No significant family history of foot or musculoskeletal conditions.\n\nSHX: IT specialist. Non-smoker. No alcohol or drug use.\n\nPhysical examination revealed tenderness and pain along the plantar fascia. Based on the clinical presentation, Mr. Mitchell is diagnosed with plantar fasciitis.\n\nSeen by Dr. L. Patel on 8/2/2023.\n\n", "annotations": [{"start": 15, "end": 30, "cui": "PATIENT", "value": " Oliver Mitchell"}, {"start": 40, "end": 55, "cui": "HOSPITAL", "value": " 15 Cedar Avenue"}, {"start": 62, "end": 68, "cui": "HOSPITAL", "value": " Boston"}, {"start": 100, "end": 108, "cui": "PATIENT", "value": " Mitchell"}, {"start": 337, "end": 345, "cui": "PATIENT", "value": " Mitchell"}, {"start": 688, "end": 696, "cui": "PATIENT", "value": " Mitchell"}, {"start": 747, "end": 748, "cui": "DOCTOR", "value": " L"}, {"start": 750, "end": 755, "cui": "PATIENT", "value": " Patel"}]}, {"name": "doc_23", "text": "\nPatient Name: Victoria Turner\nAddress: 17 Maple Avenue\nCity: New York\nCC: Chronic obstructive pulmonary disease (COPD)\n\nHX: Miss Turner is a 60-year-old female who presents with symptoms of chronic cough, sputum production, and shortness of breath, particularly during physical exertion. She reports a history of smoking for 30 years. Pulmonary function tests revealed airflow limitation and reduced forced expiratory volume. Miss Turner seeks medical assistance to manage her COPD symptoms and optimize her respiratory function.\n\nFHX: No significant family history of respiratory conditions.\n\nSHX: Retired. Former smoker. No alcohol or drug use.\n\nBased on the clinical presentation, smoking history, and pulmonary function test results, Miss Turner is diagnosed with chronic obstructive pulmonary disease.\n\nSeen by Dr. S. 
Johnson on 8/5/2023.\n\n", "annotations": [{"start": 15, "end": 30, "cui": "PATIENT", "value": " Victoria Turner"}, {"start": 40, "end": 55, "cui": "HOSPITAL", "value": " 17 Maple Avenue"}, {"start": 62, "end": 70, "cui": "HOSPITAL", "value": " New York"}, {"start": 130, "end": 136, "cui": "PATIENT", "value": " Turner"}, {"start": 432, "end": 438, "cui": "PATIENT", "value": " Turner"}, {"start": 744, "end": 750, "cui": "PATIENT", "value": " Turner"}, {"start": 821, "end": 822, "cui": "DOCTOR", "value": " S"}, {"start": 824, "end": 831, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_24", "text": "\nPatient Name: Oliver Parker\nAddress: 22 Oak Street\nCity: Chicago\nCC: Allergic rhinitis\n\nHX: Mr. Parker is a 32-year-old male who presents with symptoms of allergic rhinitis. He reports sneezing, nasal congestion, itching, and a runny nose, particularly during the spring and fall seasons. Symptoms significantly interfere with his daily activities\n\n", "annotations": [{"start": 15, "end": 28, "cui": "PATIENT", "value": " Oliver Parker"}, {"start": 38, "end": 51, "cui": "HOSPITAL", "value": " 22 Oak Street"}, {"start": 58, "end": 65, "cui": "HOSPITAL", "value": " Chicago"}, {"start": 97, "end": 103, "cui": "PATIENT", "value": " Parker"}]}, {"name": "doc_25", "text": "\nPatient Name: Isabella Cooper\nAddress: 12 Willow Lane\nCity: Los Angeles\nCC: Anxiety disorder\n\nHX: Miss Cooper is a 27-year-old female who presents with symptoms of anxiety. She reports excessive worry, restlessness, irritability, muscle tension, and difficulty sleeping. These symptoms have been present for the past year and have progressively worsened. Miss Cooper seeks medical assistance to address her anxiety symptoms and improve her overall well-being.\n\nFHX: No significant family history of anxiety disorders.\n\nSHX: Accountant. Non-smoker. Occasional alcohol consumption.\n\nPsychiatric evaluation revealed symptoms consistent with generalized anxiety disorder. 
Miss Cooper is experiencing significant distress and impairment in multiple areas of her life.\n\nSeen by Dr. E. Ramirez on 8/8/2023.\n\n", "annotations": [{"start": 15, "end": 30, "cui": "PATIENT", "value": " Isabella Cooper"}, {"start": 40, "end": 54, "cui": "HOSPITAL", "value": " 12 Willow Lane"}, {"start": 61, "end": 72, "cui": "HOSPITAL", "value": " Los Angeles"}, {"start": 104, "end": 110, "cui": "PATIENT", "value": " Cooper"}, {"start": 361, "end": 367, "cui": "PATIENT", "value": " Cooper"}, {"start": 674, "end": 680, "cui": "PATIENT", "value": " Cooper"}, {"start": 777, "end": 778, "cui": "DOCTOR", "value": " E"}, {"start": 780, "end": 787, "cui": "PATIENT", "value": " Ramirez"}]}, {"name": "doc_26", "text": "\nPatient Name: Jacob Martinez\nAddress: 18 Elm Street\nCity: San Francisco\nCC: Hypertensive crisis\n\nHX: Mr. Martinez is a 60-year-old male with a known history of hypertension. He presents with severe headache, chest pain, and shortness of breath. He reports missing his antihypertensive medication for the past three days. Upon measurement, his blood pressure is significantly elevated. Mr. Martinez seeks urgent medical attention to manage his hypertensive crisis.\n\nFHX: Father had a history of hypertension and stroke.\n\nSHX: Retired. Non-smoker. Occasional alcohol consumption.\n\nPhysical examination and blood pressure measurements confirm the diagnosis of hypertensive crisis. Immediate interventions are initiated to lower blood pressure and prevent complications.\n\nSeen by Dr. H. 
Johnson on 8/11/2023.\n\n", "annotations": [{"start": 15, "end": 29, "cui": "PATIENT", "value": " Jacob Martinez"}, {"start": 39, "end": 52, "cui": "HOSPITAL", "value": " 18 Elm Street"}, {"start": 59, "end": 72, "cui": "HOSPITAL", "value": " San Francisco"}, {"start": 106, "end": 114, "cui": "PATIENT", "value": " Martinez"}, {"start": 390, "end": 398, "cui": "PATIENT", "value": " Martinez"}, {"start": 781, "end": 782, "cui": "DOCTOR", "value": " H"}, {"start": 784, "end": 791, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_27", "text": "\nPatient Name: Ava Foster\nAddress: 14 Pine Street\nCity: Houston\nCC: Peptic ulcer disease\n\nHX: Miss Foster is a 35-year-old female who presents with symptoms of abdominal pain, particularly in the upper abdomen. She reports a burning sensation and occasional nausea. Symptoms worsen after meals. Miss Foster seeks medical assistance to evaluate her abdominal pain and determine the underlying cause.\n\nFHX: No significant family history of gastrointestinal conditions.\n\nSHX: Marketing executive. Non-smoker. Occasional alcohol consumption.\n\nGastroscopy reveals a duodenal ulcer. Helicobacter pylori testing is performed, and the results confirm the presence of H. pylori infection.\n\nSeen by Dr. M. Johnson on 8/14/2023.\n\n", "annotations": [{"start": 15, "end": 25, "cui": "PATIENT", "value": " Ava Foster"}, {"start": 35, "end": 49, "cui": "HOSPITAL", "value": " 14 Pine Street"}, {"start": 56, "end": 63, "cui": "HOSPITAL", "value": " Houston"}, {"start": 99, "end": 105, "cui": "PATIENT", "value": " Foster"}, {"start": 300, "end": 306, "cui": "PATIENT", "value": " Foster"}, {"start": 693, "end": 694, "cui": "DOCTOR", "value": " M"}, {"start": 696, "end": 703, "cui": "PATIENT", "value": " Johnson"}]}, {"name": "doc_28", "text": "\nPatient Name: William Turner\nAddress: 11 Cedar Avenue\nCity: Boston\nCC: Major depressive disorder\n\nHX: Mr. Turner is a 38-year-old male who presents with symptoms of depression. 
He reports a persistent depressed mood, loss of interest in activities, feelings of worthlessness, changes in appetite, and difficulty concentrating. These symptoms have been present for the past six months and significantly impair his daily functioning. Mr. Turner seeks medical assistance to address his depressive symptoms.\n\nFHX: No significant family history of mood disorders.\n\nSHX: Software engineer. Non-smoker. No alcohol or drug use.\n\nPsychiatric evaluation reveals symptoms consistent with major depressive disorder. Mr. Turner exhibits significant distress and impairment in multiple areas of his life.\n\nSeen by Dr. L. Anderson on 8/17/2023.\n\n", "annotations": [{"start": 15, "end": 29, "cui": "PATIENT", "value": " William Turner"}, {"start": 39, "end": 54, "cui": "HOSPITAL", "value": " 11 Cedar Avenue"}, {"start": 61, "end": 67, "cui": "HOSPITAL", "value": " Boston"}, {"start": 107, "end": 113, "cui": "PATIENT", "value": " Turner"}, {"start": 437, "end": 443, "cui": "PATIENT", "value": " Turner"}, {"start": 709, "end": 715, "cui": "PATIENT", "value": " Turner"}, {"start": 805, "end": 806, "cui": "DOCTOR", "value": " L"}, {"start": 808, "end": 816, "cui": "PATIENT", "value": " Anderson"}]}, {"name": "doc_29", "text": "\nPatient Name: Sophia Reed\nAddress: 9 Willow Lane\nCity: Los Angeles\nCC: Iron-deficiency anemia\n\nHX: Miss Reed is a 29-year-old female who presents with symptoms of fatigue, weakness, and shortness of breath. She reports heavy menstrual bleeding and follows a vegetarian diet. 
Miss Reed seeks medical assistance to evaluate her symptoms and determine the cause of her anemia.\n\nFHX: No significant\n\n", "annotations": [{"start": 15, "end": 26, "cui": "PATIENT", "value": " Sophia Reed"}, {"start": 36, "end": 49, "cui": "HOSPITAL", "value": " 9 Willow Lane"}, {"start": 56, "end": 67, "cui": "HOSPITAL", "value": " Los Angeles"}, {"start": 105, "end": 109, "cui": "PATIENT", "value": " Reed"}, {"start": 281, "end": 285, "cui": "PATIENT", "value": " Reed"}]}, {"name": "doc_30", "text": "\nName: Olivia Davis\nAddress: 12 Elm Street\nCity: Springfield\nCC: Chronic back pain.\n\nHX: Ms. Davis is a 45-year-old female who presents with chronic lower back pain for the past six months. The pain is described as dull and aching, primarily localized to the lumbar region. It worsens with prolonged sitting or physical activity. She has tried over-the-counter pain medications with limited relief.\n\nFHX: No family history of chronic back pain or spinal disorders.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nOn examination, there is tenderness on palpation over the lumbar spine. Range of motion is slightly restricted. No neurological deficits are noted.\n\nSeen by Dr. R. Martinez on 10/15/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Olivia Davis"}, {"start": 29, "end": 42, "cui": "HOSPITAL", "value": " 12 Elm Street"}, {"start": 49, "end": 60, "cui": "HOSPITAL", "value": " Springfield"}, {"start": 93, "end": 98, "cui": "PATIENT", "value": " Davis"}, {"start": 686, "end": 687, "cui": "DOCTOR", "value": " R"}, {"start": 689, "end": 697, "cui": "PATIENT", "value": " Martinez"}]}, {"name": "doc_31", "text": "\nName: Ethan Thompson\nAddress: 18 Oak Avenue\nCity: Riverside\nCC: Abdominal pain.\n\nHX: Mr. Thompson is a 32-year-old male presenting with intermittent abdominal pain for the past two weeks. The pain is localized to the right lower quadrant and is associated with occasional nausea. 
It is not aggravated by food intake. No changes in bowel movements or urinary symptoms.\n\nFHX: No significant family history of abdominal disorders.\n\nSHX: Office worker. Non-smoker. Occasional alcohol consumption.\n\nAbdominal examination reveals tenderness and mild guarding in the right lower quadrant. No rebound tenderness or palpable masses are noted.\n\nSeen by Dr. S. Reynolds on 10/18/2023.\n\n", "annotations": [{"start": 7, "end": 21, "cui": "PATIENT", "value": " Ethan Thompson"}, {"start": 31, "end": 44, "cui": "HOSPITAL", "value": " 18 Oak Avenue"}, {"start": 51, "end": 60, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 90, "end": 98, "cui": "PATIENT", "value": " Thompson"}, {"start": 648, "end": 649, "cui": "DOCTOR", "value": " S"}, {"start": 651, "end": 659, "cui": "PATIENT", "value": " Reynolds"}]}, {"name": "doc_32", "text": "\nName: Sophia Walker\nAddress: 9 Maple Lane\nCity: Willowville\nCC: Fatigue and weakness.\n\nHX: Ms. Walker is a 52-year-old female who presents with complaints of persistent fatigue and weakness for the past two months. She reports feeling tired even after a good night's sleep and experiences difficulty in performing routine tasks. No specific triggers or alleviating factors identified.\n\nFHX: No family history of chronic fatigue or neuromuscular disorders.\n\nSHX: Homemaker. Non-smoker. No alcohol consumption.\n\nPhysical examination reveals generalized weakness without focal neurological deficits. No abnormal findings on cardiovascular or respiratory examination.\n\nSeen by Dr. L. 
Carter on 10/21/2023.\n\n", "annotations": [{"start": 7, "end": 20, "cui": "PATIENT", "value": " Sophia Walker"}, {"start": 30, "end": 42, "cui": "HOSPITAL", "value": " 9 Maple Lane"}, {"start": 49, "end": 60, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 96, "end": 102, "cui": "PATIENT", "value": " Walker"}, {"start": 678, "end": 679, "cui": "DOCTOR", "value": " L"}, {"start": 681, "end": 687, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_33", "text": "\nName: Benjamin Harris\nAddress: 5 Pine Street\nCity: Meadowville\nCC: Headache and dizziness.\n\nHX: Mr. Harris is a 38-year-old male presenting with recurrent headaches and dizziness for the past month. The headaches are described as throbbing in nature and occur mostly in the afternoon. Dizziness is experienced upon standing up quickly or with sudden head movements.\n\nFHX: No significant family history of migraines or vestibular disorders.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNeurological examination is unremarkable. No abnormal findings on visual acuity, coordination, or gait.\n\nSeen by Dr. M. Rodriguez on 10/24/2023.\n\n", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Benjamin Harris"}, {"start": 32, "end": 45, "cui": "HOSPITAL", "value": " 5 Pine Street"}, {"start": 52, "end": 63, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 101, "end": 107, "cui": "PATIENT", "value": " Harris"}, {"start": 618, "end": 619, "cui": "DOCTOR", "value": " M"}, {"start": 621, "end": 630, "cui": "PATIENT", "value": " Rodriguez"}]}, {"name": "doc_34", "text": "\nName: Lily Green\nAddress: 23 Cedar Road\nCity: Woodville\nCC: Allergic rhinitis.\n\nHX: Ms. Green is a 28-year-old female presenting with symptoms of sneezing, nasal congestion, and itchy, watery eyes for the past few weeks. Symptoms are worse in the morning and improve throughout the day. 
She reports a history of seasonal allergies.\n\nFHX: No significant family history of allergic rhinitis or respiratory disorders.\n\nSHX: Teacher. Non-smoker. No alcohol consumption.\n\nPhysical examination reveals clear nasal discharge, congestion, and allergic shiners. No signs of respiratory distress.\n\nSeen by Dr. K. Mitchell on\n\n", "annotations": [{"start": 7, "end": 17, "cui": "PATIENT", "value": " Lily Green"}, {"start": 27, "end": 40, "cui": "HOSPITAL", "value": " 23 Cedar Road"}, {"start": 47, "end": 56, "cui": "HOSPITAL", "value": " Woodville"}, {"start": 89, "end": 94, "cui": "PATIENT", "value": " Green"}, {"start": 601, "end": 602, "cui": "DOCTOR", "value": " K"}, {"start": 604, "end": 612, "cui": "PATIENT", "value": " Mitchell"}]}, {"name": "doc_35", "text": "\nName: Henry Foster\nAddress: 14 Willow Street\nCity: Meadowville\nCC: Cough and shortness of breath.\n\nHX: Mr. Foster is a 62-year-old male presenting with a persistent cough and shortness of breath for the past two weeks. The cough is productive of yellowish sputum. He reports feeling breathless even with minimal exertion. No fever or chest pain.\n\nFHX: No significant family history of respiratory disorders.\n\nSHX: Retired. Former smoker (quit 10 years ago). No alcohol consumption.\n\nChest auscultation reveals decreased breath sounds and scattered crackles. No wheezing or dullness on percussion.\n\nSeen by Dr. S. Adams on 10/27/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Henry Foster"}, {"start": 29, "end": 45, "cui": "HOSPITAL", "value": " 14 Willow Street"}, {"start": 52, "end": 63, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 108, "end": 114, "cui": "PATIENT", "value": " Foster"}, {"start": 611, "end": 612, "cui": "DOCTOR", "value": " S"}, {"start": 614, "end": 619, "cui": "PATIENT", "value": " Adams"}]}, {"name": "doc_36", "text": "\nName: Emily Evans\nAddress: 8 Cherry Lane\nCity: Riverside\nCC: Sleep disturbances.\n\nHX: Ms. 
Evans is a 35-year-old female presenting with complaints of sleep disturbances for the past month. She reports difficulty falling asleep and frequent awakenings during the night. No daytime sleepiness or snoring. No significant life stressors identified.\n\nFHX: No family history of sleep disorders or psychiatric conditions.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNo significant findings on physical examination. Normal mental status and intact concentration.\n\nSeen by Dr. L. Carter on 10/30/2023.\n\n", "annotations": [{"start": 7, "end": 18, "cui": "PATIENT", "value": " Emily Evans"}, {"start": 28, "end": 41, "cui": "HOSPITAL", "value": " 8 Cherry Lane"}, {"start": 48, "end": 57, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 91, "end": 96, "cui": "PATIENT", "value": " Evans"}, {"start": 585, "end": 586, "cui": "DOCTOR", "value": " L"}, {"start": 588, "end": 594, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_37", "text": "\nName: Samuel Hayes\nAddress: 11 Elm Street\nCity: Springfield\nCC: Abnormal mole.\n\nHX: Mr. Hayes is a 42-year-old male who noticed an abnormal mole on his back. The mole has increased in size and has an irregular border. He reports occasional itching but no pain or bleeding.\n\nFHX: No significant family history of skin cancer or melanoma.\n\nSHX: Construction worker. Non-smoker. Occasional alcohol consumption.\n\nSkin examination reveals a dark, asymmetrical mole with irregular borders and uneven coloration. No palpable lymph nodes in the surrounding area.\n\nSeen by Dr. R. 
Martinez on 11/2/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Samuel Hayes"}, {"start": 29, "end": 42, "cui": "HOSPITAL", "value": " 11 Elm Street"}, {"start": 49, "end": 60, "cui": "HOSPITAL", "value": " Springfield"}, {"start": 89, "end": 94, "cui": "PATIENT", "value": " Hayes"}, {"start": 569, "end": 570, "cui": "DOCTOR", "value": " R"}, {"start": 572, "end": 580, "cui": "PATIENT", "value": " Martinez"}]}, {"name": "doc_38", "text": "\nName: Isabella Simmons\nAddress: 6 Oak Avenue\nCity: Willowville\nCC: Joint pain and swelling.\n\nHX: Ms. Simmons is a 55-year-old female presenting with joint pain and swelling in her hands and knees for the past three months. The pain is worse in the morning and improves with movement. No history of trauma or recent infections.\n\nFHX: No significant family history of autoimmune disorders or arthritis.\n\nSHX: Teacher. Non-smoker. Rare alcohol consumption.\n\nJoint examination reveals swelling and tenderness in the proximal and distal interphalangeal joints and knees. No erythema or warmth.\n\nSeen by Dr. S. Reynolds on 11/5/2023.\n\n", "annotations": [{"start": 7, "end": 23, "cui": "PATIENT", "value": " Isabella Simmons"}, {"start": 33, "end": 45, "cui": "HOSPITAL", "value": " 6 Oak Avenue"}, {"start": 52, "end": 63, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 102, "end": 109, "cui": "PATIENT", "value": " Simmons"}, {"start": 603, "end": 604, "cui": "DOCTOR", "value": " S"}, {"start": 606, "end": 614, "cui": "PATIENT", "value": " Reynolds"}]}, {"name": "doc_39", "text": "\nName: Daniel Thompson\nAddress: 9 Maple Lane\nCity: Willowville\nCC: Epigastric pain and heartburn.\n\nHX: Mr. Thompson is a 48-year-old male presenting with epigastric pain and heartburn for the past two weeks. The pain is described as a burning sensation and is aggravated by spicy foods and lying down after meals. 
No vomiting or black, tarry stools.\n\nFHX: No significant family history of gastrointestinal disorders.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nAbdominal examination reveals epigastric tenderness on palpation. No rebound tenderness or organomegaly.\n\nSeen by Dr. L. Carter on 11/8/2023.\n\n", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Daniel Thompson"}, {"start": 32, "end": 44, "cui": "HOSPITAL", "value": " 9 Maple Lane"}, {"start": 51, "end": 62, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 107, "end": 115, "cui": "PATIENT", "value": " Thompson"}, {"start": 595, "end": 596, "cui": "DOCTOR", "value": " L"}, {"start": 598, "end": 604, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_40", "text": "\nName: Emily Turner\nAddress: 15 Pine Street\nCity: Meadowville\nCC: Fatigue and weight gain.\n\nHX: Ms. Turner is a 30-year-old female presenting with persistent fatigue and unexplained weight gain over the past six months. She reports feeling tired despite getting adequate sleep and has noticed a significant increase in her weight without changes in her diet or exercise routine.\n\nFHX: No significant family history of endocrine disorders or autoimmune conditions.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nPhysical examination reveals no specific abnormalities. No edema or thyroid enlargement palpable.\n\nSeen by Dr. M. 
Rodriguez on 11/11/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Emily Turner"}, {"start": 29, "end": 43, "cui": "HOSPITAL", "value": " 15 Pine Street"}, {"start": 50, "end": 61, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 100, "end": 106, "cui": "PATIENT", "value": " Turner"}, {"start": 635, "end": 636, "cui": "DOCTOR", "value": " M"}, {"start": 638, "end": 647, "cui": "PATIENT", "value": " Rodriguez"}]}, {"name": "doc_41", "text": "\nName: Oliver Clark\nAddress: 7 Cedar Road\nCity: Woodville\nCC: Swollen lymph nodes.\n\nHX: Mr. Clark is a 44-year-old male presenting with enlarged lymph nodes in his neck and groin for the past two weeks. The lymph nodes are painless and progressively increasing in size. No fever or night sweats reported.\n\nFHX: No significant family history of lymphatic disorders or malignancies.\n\nSHX: Teacher. Non-smoker. Rare alcohol consumption.\n\nLymph node examination reveals palpable, enlarged lymph nodes in the neck and groin regions. No other abnormal findings.\n\nSeen by Dr. K. Mitchell on 11/14/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Oliver Clark"}, {"start": 29, "end": 41, "cui": "HOSPITAL", "value": " 7 Cedar Road"}, {"start": 48, "end": 57, "cui": "HOSPITAL", "value": " Woodville"}, {"start": 92, "end": 97, "cui": "PATIENT", "value": " Clark"}, {"start": 569, "end": 570, "cui": "DOCTOR", "value": " K"}, {"start": 572, "end": 580, "cui": "PATIENT", "value": " Mitchell"}]}, {"name": "doc_42", "text": "\nName: Ava Patterson\nAddress: 13 Willow Street\nCity: Meadowville\nCC: Irregular menstrual cycles.\n\nHX: Ms. Patterson is a 27-year-old female presenting with irregular menstrual cycles for the past six months. She reports unpredictable timing, varying durations, and occasional heavy bleeding during her periods. 
No significant pain or other associated symptoms.\n\nFHX: No significant family history of gynecological disorders or hormonal imbalances.\n\nSHX: Office worker. Non-smoker. No alcohol consumption.\n\nPelvic examination reveals no palpable masses or tenderness. Normal external genitalia and vaginal walls.\n\nSeen by Dr. S. Adams on 11/17/2023.\n\n", "annotations": [{"start": 7, "end": 20, "cui": "PATIENT", "value": " Ava Patterson"}, {"start": 30, "end": 46, "cui": "HOSPITAL", "value": " 13 Willow Street"}, {"start": 53, "end": 64, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 106, "end": 115, "cui": "PATIENT", "value": " Patterson"}, {"start": 625, "end": 626, "cui": "DOCTOR", "value": " S"}, {"start": 628, "end": 633, "cui": "PATIENT", "value": " Adams"}]}, {"name": "doc_43", "text": "\nName: Noah Turner\nAddress: 16 Oak Avenue\nCity: Riverside\nCC: Frequent urination and increased thirst.\n\nHX: Mr. Turner is a 58-year-old male presenting with frequent urination and increased thirst for the past month. He reports waking up multiple times during the night to urinate and feeling constantly thirsty throughout the day. No significant weight changes or other urinary symptoms.\n\nFHX: No significant family history of diabetes or renal disorders.\n\nSHX: Retired. Non-smoker. Occasional alcohol consumption.\n\nNo specific findings on physical examination. No edema or signs of dehydration.\n\nSeen by Dr. L. 
Carter on 11/20/2023.\n\n", "annotations": [{"start": 7, "end": 18, "cui": "PATIENT", "value": " Noah Turner"}, {"start": 28, "end": 41, "cui": "HOSPITAL", "value": " 16 Oak Avenue"}, {"start": 48, "end": 57, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 112, "end": 118, "cui": "PATIENT", "value": " Turner"}, {"start": 610, "end": 611, "cui": "DOCTOR", "value": " L"}, {"start": 613, "end": 619, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_44", "text": "\nName: Mia Mitchell\nAddress: 10 Cherry Lane\nCity: Riverside\nCC: Skin rash and itching.\n\nHX: Ms. Mitchell is a 36-year-old female presenting with a skin rash and intense itching for the past week. The rash is characterized by red, raised bumps and appears primarily on her arms and legs. It worsens at night and with exposure to heat.\n\nFHX: No significant family history of skin conditions or allergies.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nSkin examination reveals multiple erythematous papules and plaques with excoriation marks. No signs of infection.\n\nSeen by Dr. R. Martinez on 11/23/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Mia Mitchell"}, {"start": 29, "end": 43, "cui": "HOSPITAL", "value": " 10 Cherry Lane"}, {"start": 50, "end": 59, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 96, "end": 104, "cui": "PATIENT", "value": " Mitchell"}, {"start": 590, "end": 591, "cui": "DOCTOR", "value": " R"}, {"start": 593, "end": 601, "cui": "PATIENT", "value": " Martinez"}]}, {"name": "doc_45", "text": "\nName: Ethan Johnson\nAddress: 11 Maple Lane\nCity: Willowville\nCC: Abdominal bloating and constipation.\n\nHX: Mr. Johnson is a 50-year-old male presenting with complaints of abdominal bloating and constipation for the past two months. He reports feeling full quickly after eating and experiences infrequent bowel movements. 
No significant changes in diet or exercise.\n\nFHX: No significant family history of gastrointestinal disorders.\n\nSHX: Construction worker. Non-smoker. Occasional alcohol consumption.\n\nAbdominal examination reveals distension and mild tenderness on palpation. No masses or organomegaly appreciated.\n\nSeen by Dr. S. Reynolds on 11/26/2023.\n\n", "annotations": [{"start": 7, "end": 20, "cui": "PATIENT", "value": " Ethan Johnson"}, {"start": 30, "end": 43, "cui": "HOSPITAL", "value": " 11 Maple Lane"}, {"start": 50, "end": 61, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 112, "end": 119, "cui": "PATIENT", "value": " Johnson"}, {"start": 632, "end": 633, "cui": "DOCTOR", "value": " S"}, {"start": 635, "end": 643, "cui": "PATIENT", "value": " Reynolds"}]}, {"name": "doc_46", "text": "\nName: Sophia Nelson\nAddress: 17 Elm Street\nCity: Springfield\nCC: Anxiety and panic attacks.\n\nHX: Ms. Nelson is a 33-year-old female presenting with symptoms of anxiety and recurrent panic attacks for the past six months. She describes episodes of sudden fear, rapid heartbeat, shortness of breath, and sweating. No specific triggers identified.\n\nFHX: No significant family history of anxiety or psychiatric disorders.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNormal findings on physical examination. No signs of distress during the evaluation.\n\nSeen by Dr. M. Rodriguez on 11/29/2023.\n\n", "annotations": [{"start": 7, "end": 20, "cui": "PATIENT", "value": " Sophia Nelson"}, {"start": 30, "end": 43, "cui": "HOSPITAL", "value": " 17 Elm Street"}, {"start": 50, "end": 61, "cui": "HOSPITAL", "value": " Springfield"}, {"start": 102, "end": 108, "cui": "PATIENT", "value": " Nelson"}, {"start": 577, "end": 578, "cui": "DOCTOR", "value": " M"}, {"start": 580, "end": 589, "cui": "PATIENT", "value": " Rodriguez"}]}, {"name": "doc_47", "text": "\nName: Olivia Clark\nAddress: 9 Cedar Road\nCity: Woodville\nCC: Knee pain and swelling.\n\nHX: Ms. 
Clark is a 42-year-old female presenting with pain and swelling in her right knee for the past month. She reports that the symptoms started gradually and worsen with prolonged activity or climbing stairs. No history of trauma or previous knee issues.\n\nFHX: No significant family history of joint disorders or arthritis.\n\nSHX: Teacher. Non-smoker. No alcohol consumption.\n\nOn examination, there is swelling and tenderness in the right knee joint. Limited range of motion due to pain.\n\nSeen by Dr. K. Mitchell on 12/2/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Olivia Clark"}, {"start": 29, "end": 41, "cui": "HOSPITAL", "value": " 9 Cedar Road"}, {"start": 48, "end": 57, "cui": "HOSPITAL", "value": " Woodville"}, {"start": 95, "end": 100, "cui": "PATIENT", "value": " Clark"}, {"start": 591, "end": 592, "cui": "DOCTOR", "value": " K"}, {"start": 594, "end": 602, "cui": "PATIENT", "value": " Mitchell"}]}, {"name": "doc_48", "text": "\nName: Benjamin Anderson\nAddress: 12 Oak Avenue\nCity: Riverside\nCC: Sore throat and difficulty swallowing.\n\nHX: Mr. Anderson is a 28-year-old male presenting with a sore throat and difficulty swallowing for the past week. He reports pain and discomfort with swallowing, especially with solid foods. No fever, cough, or other respiratory symptoms.\n\nFHX: No significant family history of throat infections or inflammatory conditions.\n\nSHX: Office worker. Non-smoker. Occasional alcohol consumption.\n\nThroat examination reveals erythema and swelling of the posterior pharynx. No tonsillar enlargement or exudate.\n\nSeen by Dr. L. 
Carter on 12/5/2023.\n\n", "annotations": [{"start": 7, "end": 24, "cui": "PATIENT", "value": " Benjamin Anderson"}, {"start": 34, "end": 47, "cui": "HOSPITAL", "value": " 12 Oak Avenue"}, {"start": 54, "end": 63, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 116, "end": 124, "cui": "PATIENT", "value": " Anderson"}, {"start": 623, "end": 624, "cui": "DOCTOR", "value": " L"}, {"start": 626, "end": 632, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_49", "text": "\nName: Lily Cooper\nAddress: 14 Cherry Lane\nCity: Riverside\nCC: Frequent headaches.\n\nHX: Ms. Cooper is a 25-year-old female presenting with recurrent headaches for the past three months. The headaches occur several times a week and are described as throbbing in nature. No specific triggers or associated symptoms identified.\n\nFHX: No significant family history of migraines or neurological disorders.\n\nSHX: Teacher. Non-smoker. Rare alcohol consumption.\n\nNormal neurological examination. No focal deficits or abnormalities.\n\nSeen by Dr. R. Martinez on 12/8/2023.\n\n", "annotations": [{"start": 7, "end": 18, "cui": "PATIENT", "value": " Lily Cooper"}, {"start": 28, "end": 42, "cui": "HOSPITAL", "value": " 14 Cherry Lane"}, {"start": 49, "end": 58, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 92, "end": 98, "cui": "PATIENT", "value": " Cooper"}, {"start": 537, "end": 538, "cui": "DOCTOR", "value": " R"}, {"start": 540, "end": 548, "cui": "PATIENT", "value": " Martinez"}]}, {"name": "doc_50", "text": "\nName: Sophia Williams\nAddress: 15 Elm Street\nCity: Springfield\nCC: Fatigue and muscle weakness.\n\nHX: Ms. Williams is a 42-year-old female presenting with persistent fatigue and muscle weakness for the past two months. She reports feeling tired even after getting sufficient rest and experiences difficulty performing daily activities. 
No significant weight changes or other associated symptoms.\n\nFHX: No significant family history of muscular disorders or autoimmune conditions.\n\nSHX: Office worker. Non-smoker. Occasional alcohol consumption.\n\nPhysical examination reveals decreased muscle strength and generalized weakness. No specific findings on neurological evaluation.\n\nSeen by Dr. S. Adams on 12/11/2023.\n\n", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Sophia Williams"}, {"start": 32, "end": 45, "cui": "HOSPITAL", "value": " 15 Elm Street"}, {"start": 52, "end": 63, "cui": "HOSPITAL", "value": " Springfield"}, {"start": 106, "end": 114, "cui": "PATIENT", "value": " Williams"}, {"start": 689, "end": 690, "cui": "DOCTOR", "value": " S"}, {"start": 692, "end": 697, "cui": "PATIENT", "value": " Adams"}]}, {"name": "doc_51", "text": "\nName: Benjamin Turner\nAddress: 16 Pine Street\nCity: Meadowville\nCC: Chest pain and shortness of breath.\n\nHX: Mr. Turner is a 52-year-old male presenting with complaints of chest pain and shortness of breath for the past week. The chest pain is described as a squeezing sensation and is accompanied by breathlessness during exertion. No associated symptoms of dizziness or palpitations.\n\nFHX: No significant family history of cardiac disorders or cardiovascular conditions.\n\nSHX: Retired. Non-smoker. Rare alcohol consumption.\n\nCardiovascular examination reveals regular heart sounds and no murmurs. No signs of respiratory distress.\n\nSeen by Dr. M. 
Rodriguez on 12/14/2023.\n\n", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Benjamin Turner"}, {"start": 32, "end": 46, "cui": "HOSPITAL", "value": " 16 Pine Street"}, {"start": 53, "end": 64, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 114, "end": 120, "cui": "PATIENT", "value": " Turner"}, {"start": 647, "end": 648, "cui": "DOCTOR", "value": " M"}, {"start": 650, "end": 659, "cui": "PATIENT", "value": " Rodriguez"}]}, {"name": "doc_52", "text": "\nName: Chloe Parker\nAddress: 13 Cedar Road\nCity: Woodville\nCC: Frequent urination and burning sensation.\n\nHX: Ms. Parker is a 30-year-old female presenting with frequent urination and a burning sensation during urination for the past week. She reports a sense of urgency to urinate and occasional lower abdominal discomfort. No fever or back pain.\n\nFHX: No significant family history of urinary tract infections or urological conditions.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNo specific findings on physical examination. No costovertebral angle tenderness.\n\nSeen by Dr. K. Mitchell on 12/17/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Chloe Parker"}, {"start": 29, "end": 42, "cui": "HOSPITAL", "value": " 13 Cedar Road"}, {"start": 49, "end": 58, "cui": "HOSPITAL", "value": " Woodville"}, {"start": 114, "end": 120, "cui": "PATIENT", "value": " Parker"}, {"start": 593, "end": 594, "cui": "DOCTOR", "value": " K"}, {"start": 596, "end": 604, "cui": "PATIENT", "value": " Mitchell"}]}, {"name": "doc_53", "text": "\nName: Oliver Lewis\nAddress: 10 Maple Lane\nCity: Willowville\nCC: Vision changes and eye pain.\n\nHX: Mr. Lewis is a 60-year-old male presenting with vision changes and intermittent eye pain in his right eye for the past month. He reports blurred vision and the sensation of pressure in the eye. 
No redness or discharge noted.\n\nFHX: No significant family history of eye disorders or ocular conditions.\n\nSHX: Construction worker. Non-smoker. Occasional alcohol consumption.\n\nVisual acuity testing reveals decreased vision in the right eye. No external abnormalities or conjunctival injection.\n\nSeen by Dr. S. Reynolds on 12/20/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Oliver Lewis"}, {"start": 29, "end": 42, "cui": "HOSPITAL", "value": " 10 Maple Lane"}, {"start": 49, "end": 60, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 103, "end": 108, "cui": "PATIENT", "value": " Lewis"}, {"start": 602, "end": 603, "cui": "DOCTOR", "value": " S"}, {"start": 605, "end": 613, "cui": "PATIENT", "value": " Reynolds"}]}, {"name": "doc_54", "text": "\nName: Emma Peterson\nAddress: 17 Oak Avenue\nCity: Riverside\nCC: Abdominal pain and diarrhea.\n\nHX: Ms. Peterson is a 38-year-old female presenting with abdominal pain and frequent episodes of diarrhea for the past week. The abdominal pain is crampy in nature and is associated with loose, watery stools. No blood or mucus in the stool.\n\nFHX: No significant family history of gastrointestinal disorders.\n\nSHX: Teacher. Non-smoker. No alcohol consumption.\n\nAbdominal examination reveals tenderness in the lower abdomen. No rebound tenderness or palpable masses.\n\nSeen by Dr. L. Carter on 12/23/2023.\n\n", "annotations": [{"start": 7, "end": 20, "cui": "PATIENT", "value": " Emma Peterson"}, {"start": 30, "end": 43, "cui": "HOSPITAL", "value": " 17 Oak Avenue"}, {"start": 50, "end": 59, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 102, "end": 110, "cui": "PATIENT", "value": " Peterson"}, {"start": 572, "end": 573, "cui": "DOCTOR", "value": " L"}, {"start": 575, "end": 581, "cui": "PATIENT", "value": " Carter"}]}, {"name": "doc_55", "text": "\nName: Amelia Adams\nAddress: 11 Willow Street\nCity: Meadowville\nCC: Depression and loss of interest.\n\nHX: Ms. 
Adams is a 35-year-old female presenting with symptoms of depression and loss of interest in activities for the past six months. She reports feeling sad, hopeless, and having a decreased motivation to engage in previously enjoyed hobbies. No suicidal thoughts or changes in appetite.\n\nFHX: No significant family history of mood disorders or psychiatric conditions.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNo specific findings on physical examination. No signs of distress during the evaluation.\n\nSeen by Dr. R. Martinez on 12/26/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Amelia Adams"}, {"start": 29, "end": 45, "cui": "HOSPITAL", "value": " 11 Willow Street"}, {"start": 52, "end": 63, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 110, "end": 115, "cui": "PATIENT", "value": " Adams"}, {"start": 638, "end": 639, "cui": "DOCTOR", "value": " R"}, {"start": 641, "end": 649, "cui": "PATIENT", "value": " Martinez"}]}, {"name": "doc_56", "text": "\nName: Henry Turner\nAddress: 12 Pine Street\nCity: Meadowville\nCC: Joint pain and stiffness.\n\nHX: Mr. Turner is a 60-year-old male presenting with joint pain and stiffness in his hands and knees for the past three months. He reports difficulty with movements, especially in the mornings, and occasional swelling in the affected joints. No history of trauma or previous joint disorders.\n\nFHX: No significant family history of arthritis or rheumatic conditions.\n\nSHX: Retired. Non-smoker. Rare alcohol consumption.\n\nOn examination, there is tenderness, warmth, and swelling in the affected joints. Limited range of motion due to pain.\n\nSeen by Dr. M. 
Rodriguez on 12/29/2023.\n\n", "annotations": [{"start": 7, "end": 19, "cui": "PATIENT", "value": " Henry Turner"}, {"start": 29, "end": 43, "cui": "HOSPITAL", "value": " 12 Pine Street"}, {"start": 50, "end": 61, "cui": "HOSPITAL", "value": " Meadowville"}, {"start": 101, "end": 107, "cui": "PATIENT", "value": " Turner"}, {"start": 645, "end": 646, "cui": "DOCTOR", "value": " M"}, {"start": 648, "end": 657, "cui": "PATIENT", "value": " Rodriguez"}]}, {"name": "doc_57", "text": "\nName: Harper Mitchell\nAddress: 14 Cedar Road\nCity: Woodville\nCC: Allergic rhinitis and nasal congestion.\n\nHX: Ms. Mitchell is a 28-year-old female presenting with symptoms of allergic rhinitis, including nasal congestion, sneezing, and itchy eyes, for the past two weeks. She reports these symptoms are worse in the morning and in certain environments. No history of sinus infections or nasal polyps.\n\nFHX: No significant family history of allergies or respiratory conditions.\n\nSHX: Office worker. Non-smoker. Rare alcohol consumption.\n\nNasal examination reveals nasal congestion, clear rhinorrhea, and pale, boggy nasal mucosa. No signs of septal deviation or polyps.\n\nSeen by Dr. K. Mitchell on 1/2/2024.\n\n", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Harper Mitchell"}, {"start": 32, "end": 45, "cui": "HOSPITAL", "value": " 14 Cedar Road"}, {"start": 52, "end": 61, "cui": "HOSPITAL", "value": " Woodville"}, {"start": 115, "end": 123, "cui": "PATIENT", "value": " Mitchell"}, {"start": 683, "end": 684, "cui": "DOCTOR", "value": " K"}, {"start": 686, "end": 694, "cui": "PATIENT", "value": " Mitchell"}]}, {"name": "doc_58", "text": "\nName: Jackson Turner\nAddress: 9 Maple Lane\nCity: Willowville\nCC: Sleep disturbances and daytime sleepiness.\n\nHX: Mr. Turner is a 45-year-old male presenting with complaints of sleep disturbances and excessive daytime sleepiness for the past three months. 
He reports difficulty falling asleep, frequent awakenings during the night, and feeling tired during the day despite sufficient hours of sleep.\n\nFHX: No significant family history of sleep disorders or neurological conditions.\n\nSHX: Construction worker. Non-smoker. Occasional alcohol consumption.\n\nNo specific findings on physical examination. No signs of respiratory disorders.\n\nSeen by Dr. S. Reynolds on 1/5/2024.\n\n", "annotations": [{"start": 7, "end": 21, "cui": "PATIENT", "value": " Jackson Turner"}, {"start": 31, "end": 43, "cui": "HOSPITAL", "value": " 9 Maple Lane"}, {"start": 50, "end": 61, "cui": "HOSPITAL", "value": " Willowville"}, {"start": 118, "end": 124, "cui": "PATIENT", "value": " Turner"}, {"start": 649, "end": 650, "cui": "DOCTOR", "value": " S"}, {"start": 652, "end": 660, "cui": "PATIENT", "value": " Reynolds"}]}, {"name": "doc_59", "text": "\nName: Penelope Walker\nAddress: 13 Oak Avenue\nCity: Riverside\nCC: Nausea and vomiting.\n\nHX: Ms. Walker is a 42-year-old female presenting with symptoms of nausea and vomiting for the past two days. She reports episodes of sudden, uncontrollable vomiting and a persistent feeling of queasiness. No abdominal pain or changes in bowel movements.\n\nFHX: No significant family history of gastrointestinal disorders.\n\nSHX: Teacher. Non-smoker. No alcohol consumption.\n\nAbdominal examination reveals no tenderness or palpable masses. No signs of dehydration.\n\nSeen by Dr. L. 
Carter on 1/8/2024.", "annotations": [{"start": 7, "end": 22, "cui": "PATIENT", "value": " Penelope Walker"}, {"start": 32, "end": 45, "cui": "HOSPITAL", "value": " 13 Oak Avenue"}, {"start": 52, "end": 61, "cui": "HOSPITAL", "value": " Riverside"}, {"start": 96, "end": 102, "cui": "PATIENT", "value": " Walker"}, {"start": 564, "end": 565, "cui": "DOCTOR", "value": " L"}, {"start": 567, "end": 573, "cui": "PATIENT", "value": " Carter"}]}]}]} \ No newline at end of file diff --git a/tests/utils/ner/__init__.py b/tests/utils/ner/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/utils/ner/test_deid.py b/tests/utils/ner/test_deid.py new file mode 100644 index 000000000..dcc8938b8 --- /dev/null +++ b/tests/utils/ner/test_deid.py @@ -0,0 +1,118 @@ +from medcat.utils.ner import deid +from medcat.utils.ner import make_or_update_cdb + +from medcat.ner import transformers_ner + +from spacy.tokens import Doc + +from typing import Any, List, Tuple +import os + +import unittest + +FILE_DIR = os.path.dirname(os.path.realpath(__file__)) + + +# NB! 
This 'training data' is extremely flawed +# it is only (somewhat) useful for the purpose of this +# test +# DO NOT USE THIS DATA ELSEWHERE - IT WILL NOT BE USEFUL +TRAIN_DATA = os.path.join(FILE_DIR, "..", "..", + "resources", "deid_train_data.json") + + +class DeIDmodelCreationTests(unittest.TestCase): + + def test_can_make_cdb(self): + cdb = make_or_update_cdb(TRAIN_DATA) + self.assertIsNotNone(cdb) + + def test_can_create_model(self): + cdb = make_or_update_cdb(TRAIN_DATA) + config = transformers_ner.ConfigTransformersNER() + config.general['test_size'] = 0.1 # Usually set this to 0.1-0.2 + ner = transformers_ner.TransformersNER(cdb=cdb, config=config) + deid_model = deid.DeIdModel.create(ner) + self.assertIsNotNone(deid_model) + + +def _add_model(cls): + cdb = make_or_update_cdb(TRAIN_DATA) + config = transformers_ner.ConfigTransformersNER() + config.general['test_size'] = 0.1 # Usually set this to 0.1-0.2 + cls.ner = transformers_ner.TransformersNER(cdb=cdb, config=config) + cls.ner.training_arguments.num_train_epochs = 1 # Use 5-10 normally + # As we are NOT training on a GPU that can, we'll set it to 1 + cls.ner.training_arguments.per_device_train_batch_size = 1 + cls.ner.training_arguments.gradient_accumulation_steps = 1 # No need for acc + cls.ner.training_arguments.per_device_eval_batch_size = 1 + # For the metric to be used for best model we pick Recall here, as for deid that is most important + cls.ner.training_arguments.metric_for_best_model = 'eval_recall' + cls.deid_model = deid.DeIdModel.create(cls.ner) + + +def train_model_once(model: deid.DeIdModel, + _trained: List[Tuple[Tuple[Any, Any, Any], + deid.DeIdModel]] = [] + ) -> Tuple[Tuple[Any, Any, Any], deid.DeIdModel]: + if not _trained: + retval = model.train(TRAIN_DATA) + _trained.append((retval, model)) + return _trained[0] + + +class DeIDModelTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + _add_model(cls) + + def test_training(self): + df, examples, dataset = 
train_model_once(self.deid_model)[0] + self.assertIsNotNone(df) + self.assertIsNotNone(examples) + self.assertIsNotNone(dataset) + + +input_text = ''' +James Joyce +7 Eccles Street, +Dublin +CC: Memory difficulty. + +HX: Mr James is a 64 y/o RHM, had difficulty remembering names, phone numbers and events for 12 months prior to presentation, on 2/28/95. He had visited London recently and had had no professional or social faux pas or mishaps due to his memory. J.J. could not tell whether his problem was becoming worse, so he brought himself to the Neurology clinic on his own referral. + +FHX: Both parents (Mary and John) experienced memory problems in their ninth decades, but not earlier. 5 siblings have had no memory trouble. There are no neurological illnesses in his family. + +SHX: Writer and Poet. Tobacco/ETOH/illicit drug use. + +The rest of the neurologic exam was unremarkable and there were no extrapyramidal signs or primitive reflexes noted. +11/1996 in Dublin. + +The findings indicated multiple areas of cerebral dysfunction. With the exception of the patient's report of minimal occupational dysfunction ( which may reflect poor insight), the clinical picture is consistent with a progressive dementia syndrome such as Alzheimer disease. MRI brain, 3/6/95, showed mild generalized atrophy, more severe in the occipital-parietal regions. + +Seen by Dr. M. Sully on 11/11/1996. 
+''' + + +class DeIDModelWorks(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + _add_model(cls) + cls.deid_model = train_model_once(cls.deid_model)[1] + + def test_model_works_deid_text(self): + anon_text = self.deid_model.deid_text(input_text) + self.assertIn("[DOCTOR]", anon_text) + self.assertIn("[HOSPITAL]", anon_text) + + def test_model_works_dunder_call(self): + anon_doc = self.deid_model(input_text) + self.assertIsInstance(anon_doc, Doc) + + def test_model_works_deid_text_redact(self): + anon_text = self.deid_model.deid_text(input_text, redact=True) + self.assertIn("****", anon_text) + self.assertNotIn("[DOCTOR]", anon_text) + self.assertNotIn("[HOSPITAL]", anon_text) From 221b61d2af522644f734fb6915d27e64949387c4 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 5 Jul 2023 18:42:24 +0300 Subject: [PATCH 10/17] CU-862k1tt90 Fix circular imports by moving raw deid method back to helpers module (#328) * CU-862k1tt90 Fix circular imports by moving raw deid method back to helpers module * CU-862k1tt90 Fix missing import regarding deid * CU-862k1tt90 Remove unnecessary newline --- medcat/utils/ner/deid.py | 34 ++-------------------------------- medcat/utils/ner/helpers.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/medcat/utils/ner/deid.py b/medcat/utils/ner/deid.py index 122433b13..7c5d0231c 100644 --- a/medcat/utils/ner/deid.py +++ b/medcat/utils/ner/deid.py @@ -39,6 +39,8 @@ from medcat.cat import CAT from medcat.utils.ner.model import NerModel +from medcat.utils.ner.helpers import _deid_text as deid_text + class DeIdModel(NerModel): """The DeID model. 
@@ -110,35 +112,3 @@ def _get_reason_not_deid(cls, cat: CAT) -> str: if len(cat._addl_ner) != 1: return f"Incorrect number of addl_ner: {len(cat._addl_ner)}" return "" - - -# For now, we will keep this method separate from the above class -# This is so that we wouldn't need to create a thorwaway object -# when calling the method from .helpers where it used to be. -# After the deprecated method in .helpers is removed, we can -# move this to a proper class method. -def deid_text(cat: CAT, text: str, redact: bool = False) -> str: - """De-identify text. - - De-identified text. - If redaction is enabled, identifiable entities will be - replaced with starts (e.g `*****`). - Otherwise, the replacement will be the CUI or in other words, - the type of information that was hidden (e.g [PATIENT]). - - - Args: - cat (CAT): The CAT object to use for deid. - text (str): The input document. - redact (bool, optional): Whether to redact. Defaults to False. - - Returns: - str: The de-identified document. - """ - new_text = str(text) - entities = cat.get_entities(text)['entities'] - for ent in sorted(entities.values(), key=lambda ent: ent['start'], reverse=True): - r = "*"*(ent['end']-ent['start'] - ) if redact else cat.cdb.get_name(ent['cui']) - new_text = new_text[:ent['start']] + f'[{r}]' + new_text[ent['end']:] - return new_text diff --git a/medcat/utils/ner/helpers.py b/medcat/utils/ner/helpers.py index 518aecc22..7dcada3dd 100644 --- a/medcat/utils/ner/helpers.py +++ b/medcat/utils/ner/helpers.py @@ -1,10 +1,41 @@ from medcat.utils.data_utils import count_annotations from medcat.cdb import CDB -from medcat.utils.ner.deid import deid_text as _deid_text from medcat.utils.decorators import deprecated +# For now, we will keep this method separate from the above class +# This is so that we wouldn't need to create a thorwaway object +# when calling the method from .helpers where it used to be. 
+# After the deprecated method in .helpers is removed, we can +# move this to a proper class method. +def _deid_text(cat, text: str, redact: bool = False) -> str: + """De-identify text. + + De-identified text. + If redaction is enabled, identifiable entities will be + replaced with starts (e.g `*****`). + Otherwise, the replacement will be the CUI or in other words, + the type of information that was hidden (e.g [PATIENT]). + + + Args: + cat (CAT): The CAT object to use for deid. + text (str): The input document. + redact (bool, optional): Whether to redact. Defaults to False. + + Returns: + str: The de-identified document. + """ + new_text = str(text) + entities = cat.get_entities(text)['entities'] + for ent in sorted(entities.values(), key=lambda ent: ent['start'], reverse=True): + r = "*"*(ent['end']-ent['start'] + ) if redact else cat.cdb.get_name(ent['cui']) + new_text = new_text[:ent['start']] + f'[{r}]' + new_text[ent['end']:] + return new_text + + @deprecated("API now allows creating a DeId model (medcat.utils.ner.deid.DeIdModel). " "It aims to simplify the usage of DeId models. 
" "The use of this model is encouraged over the use of this method.") From 4573312e6154932a5c96a89bb0d6737ee50ce15d Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 5 Jul 2023 19:02:16 +0300 Subject: [PATCH 11/17] Cu 863h30jyb separate train from data load (#329) * CU-863h30jyb Deprecated train_supervised method in favour of train_supervised_from_json method * CU-863h30jyb Shuffle around docstrings for supoervised training methods * CU-863h30jyb Create new train_supervised_raw method for raw data based training * CU-863h30jyb In MetaCat deprecate train method and replace with train_from_json method * CU-863h30jyb In MetaCat add train_raw method and move most of the training logic into that one * CU-863h30jyb Fix type hint --- medcat/cat.py | 99 ++++++++++++++++++++++++++++++++++++++++++---- medcat/meta_cat.py | 50 ++++++++++++++++++++++- 2 files changed, 140 insertions(+), 9 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index bd19a6f0b..36fc265bb 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -842,6 +842,8 @@ def add_and_train_concept(self, for _cui in cuis: self.linker.context_model.train(cui=_cui, entity=spacy_entity, doc=spacy_doc, negative=True) # type: ignore + @deprecated(message="Use train_supervised_from_json to train based on data " + "loaded from a json file") def train_supervised(self, data_path: str, reset_cui_count: bool = False, @@ -861,9 +863,93 @@ def train_supervised(self, checkpoint: Optional[Checkpoint] = None, retain_filters: bool = False, is_resumed: bool = False) -> Tuple: - """TODO: Refactor, left from old - Run supervised training on a dataset from MedCATtrainer. Please take care that this is more a simulated - online training then supervised. + """Train supervised by reading data from a json file. + + Refer to `train_supervvised_from_json` and/or `train_supervised_raw` + for further details. 
+ """ + return self.train_supervised_from_json(data_path, reset_cui_count, nepochs, + print_stats, use_filters, terminate_last, + use_overlaps, use_cui_doc_limit, test_size, + devalue_others, use_groups, never_terminate, + train_from_false_positives, extra_cui_filter, + retain_extra_cui_filter, checkpoint, + retain_filters, is_resumed) + + def train_supervised_from_json(self, + data_path: str, + reset_cui_count: bool = False, + nepochs: int = 1, + print_stats: int = 0, + use_filters: bool = False, + terminate_last: bool = False, + use_overlaps: bool = False, + use_cui_doc_limit: bool = False, + test_size: int = 0, + devalue_others: bool = False, + use_groups: bool = False, + never_terminate: bool = False, + train_from_false_positives: bool = False, + extra_cui_filter: Optional[Set] = None, + retain_extra_cui_filter: bool = False, + checkpoint: Optional[Checkpoint] = None, + retain_filters: bool = False, + is_resumed: bool = False) -> Tuple: + """ + Run supervised training on a dataset from MedCATtrainer in JSON format. + + Refer to `train_supervised_raw` for more details. 
+ """ + with open(data_path) as f: + data = json.load(f) + return self.train_supervised_raw(data, reset_cui_count, nepochs, + print_stats, use_filters, terminate_last, + use_overlaps, use_cui_doc_limit, test_size, + devalue_others, use_groups, never_terminate, + train_from_false_positives, extra_cui_filter, + retain_extra_cui_filter, checkpoint, + retain_filters, is_resumed) + + def train_supervised_raw(self, + data: Dict[str, List[Dict[str, dict]]], + reset_cui_count: bool = False, + nepochs: int = 1, + print_stats: int = 0, + use_filters: bool = False, + terminate_last: bool = False, + use_overlaps: bool = False, + use_cui_doc_limit: bool = False, + test_size: int = 0, + devalue_others: bool = False, + use_groups: bool = False, + never_terminate: bool = False, + train_from_false_positives: bool = False, + extra_cui_filter: Optional[Set] = None, + retain_extra_cui_filter: bool = False, + checkpoint: Optional[Checkpoint] = None, + retain_filters: bool = False, + is_resumed: bool = False) -> Tuple: + """Train supervised based on the raw data provided. + + The raw data is expected in the following format: + {'projects': + [ # list of projects + { # project 1 + 'name': '', + # list of documents + 'documents': [{'name': '', # document 1 + 'text': '', + # list of annotations + 'annotations': [{'start': -1, # annotation 1 + 'end': 1, + 'cui': 'cui', + 'value': ''}, ...], + }, ...] + }, ... + ] + } + + Please take care that this is more a simulated online training then supervised. When filtering, the filters within the CAT model are used first, then the ones from MedCATtrainer (MCT) export filters, @@ -872,8 +958,8 @@ def train_supervised(self, extra_cui_filter ⊆ MCT filter ⊆ Model/config filter. Args: - data_path (str): - The path to the json file that we get from MedCATtrainer on export. + data (Dict[str, List[Dict[str, dict]]]): + The raw data, e.g from MedCATtrainer on export. reset_cui_count (boolean): Used for training with weight_decay (annealing). 
Each concept has a count that is there from the beginning of the CDB, that count is used for annealing. Resetting the count will @@ -942,8 +1028,7 @@ def train_supervised(self, local_filters = self.config.linking.filters.copy_of() fp = fn = tp = p = r = f1 = examples = {} - with open(data_path) as f: - data = json.load(f) + cui_counts = {} if retain_filters: diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index 374b55978..d92e6ea61 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -15,6 +15,7 @@ from medcat.pipeline.pipe_runner import PipeRunner from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase from medcat.utils.meta_cat.data_utils import Doc as FakeDoc +from medcat.utils.decorators import deprecated # It should be safe to do this always, as all other multiprocessing # will be finished before data comes to meta_cat @@ -98,6 +99,7 @@ def get_hash(self): hasher.update(self.config.get_hash()) return hasher.hexdigest() + @deprecated(message="Use `train_from_json` or `train_raw` instead") def train(self, json_path: Union[str, list], save_dir_path: Optional[str] = None) -> Dict: """Train or continue training a model give a json_path containing a MedCATtrainer export. It will continue training if an existing model is loaded or start new training if the model is blank/new. @@ -109,8 +111,19 @@ def train(self, json_path: Union[str, list], save_dir_path: Optional[str] = None In case we have aut_save_model (meaning during the training the best model will be saved) we need to set a save path. Defaults to `None`. """ - g_config = self.config.general - t_config = self.config.train + return self.train_from_json(json_path, save_dir_path) + + def train_from_json(self, json_path: Union[str, list], save_dir_path: Optional[str] = None) -> Dict: + """Train or continue training a model give a json_path containing a MedCATtrainer export. It will + continue training if an existing model is loaded or start new training if the model is blank/new. 
+ + Args: + json_path (Union[str, list]): + Path/Paths to a MedCATtrainer export containing the meta_annotations we want to train for. + save_dir_path (Optional[str]): + In case we have aut_save_model (meaning during the training the best model will be saved) + we need to set a save path. Defaults to `None`. + """ # Load the medcattrainer export if isinstance(json_path, str): @@ -131,6 +144,39 @@ def merge_data_loaded(base, other): for path in json_path: with open(path, 'r') as f: data_loaded = merge_data_loaded(data_loaded, json.load(f)) + return self.train_raw(data_loaded, save_dir_path) + + def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None) -> Dict: + """Train or continue training a model given raw data. It will + continue training if an existing model is loaded or start new training if the model is blank/new. + + The raw data is expected in the following format: + {'projects': + [ # list of projects + { # project 1 + 'name': '', + # list of documents + 'documents': [{'name': '', # document 1 + 'text': '', + # list of annotations + 'annotations': [{'start': -1, # annotation 1 + 'end': 1, + 'cui': 'cui', + 'value': ''}, ...], + }, ...] + }, ... + ] + } + + Args: + data_loaded (Dict): + The raw data we want to train for. + save_dir_path (Optional[str]): + In case we have aut_save_model (meaning during the training the best model will be saved) + we need to set a save path. Defaults to `None`. 
+ """ + g_config = self.config.general + t_config = self.config.train # Create directories if they don't exist if t_config['auto_save_model']: From 9a53faaa5c48c3c326ac68b39965acc5c06d8729 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 5 Jul 2023 19:21:55 +0300 Subject: [PATCH 12/17] CU-86785yhfk Add method to populate cui2snames with data from cui2names (#327) * CU-86785yhfk Add method to populate cui2snames with data from cui2names * CU-86785yhfk Add test for cui2sname population method --- medcat/cdb.py | 21 +++++++++++++++++++++ tests/test_cdb.py | 7 +++++++ 2 files changed, 28 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 8a58166b8..a4f87edf2 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -533,6 +533,27 @@ def reset_training(self) -> None: self.reset_concept_similarity() self.is_dirty = True + def populate_cui2snames(self, force: bool = True) -> None: + """Populate the cui2snames dict if it's empty. + + If the dict is not empty and the population is not force, + nothing will happen. + + For now, this method simply populates all the names form + cui2names into cui2snames. + + Args: + force (bool, optional): Whether to force the (re-)population. Defaults to True. + """ + if not force and self.cui2snames: + return + self.cui2snames.clear() # in case forced re-population + # run through cui2names + # and create new sets so that they can be independently modified + for cui, names in self.cui2names.items(): + self.cui2snames[cui] = set(names) # new set + self.is_dirty = True + def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None: """Subset the core CDB fields (dictionaries/maps). Note that this will potenitally keep a bit more CUIs then in cuis_to_keep. 
It will first find all names that link to the cuis_to_keep and then diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 505320c6a..96425bc8c 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -75,5 +75,12 @@ def test_remove_cui(self): assert 'C0000039' not in self.undertest.name2cuis['virus~z'] assert 'C0000039' not in self.undertest.name2cuis2status['virus~z'] + def test_cui2snames_population(self): + self.undertest.cui2snames.clear() + self.undertest.populate_cui2snames() + for cui in self.undertest.cui2names: + with self.subTest(cui): + self.assertIn(cui, self.undertest.cui2snames) + if __name__ == '__main__': unittest.main() From 7862819ee5a57cdd99701b11b942c3c2bc755f88 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 23:27:31 +0000 Subject: [PATCH 13/17] Bump django from 3.2.19 to 3.2.20 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.19 to 3.2.20. - [Commits](https://github.com/django/django/compare/3.2.19...3.2.20) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index b4adaa104..a4b7827ad 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.19 +Django==3.2.20 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From 8631ae3aa45467185a1d83d8dc4dded4a865c328 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 6 Jul 2023 14:14:58 +0300 Subject: [PATCH 14/17] CU-346mpwz Improving memory usage of MedCAT models (#323) * CU-863gntc58 Add parent to child relationship getter to UMLS preprocessing * CU-863gntc58 Only use ISA relationships * Make sure parents do not have themselves as children * CU-863gntc58 Only keep preferred names * CU-346mpwz Add memory optimiser for CDB * CU-346mpwz Add name2 to memory optimiser for CDB * CU-346mpwz Add keys/items/values views to memory optimiser fake dicts * CU-346mpwz Fix keys/items/values views in memory optimiser fake dicts * CU-346mpwz Add option to optimise or not cui and/or name based dicts in memory optimiser * CU-346mpwz Make default memory optimiser omit name2... 
optimising; add comment regarding this in docstring * CU-346mpwz Remove unused/legacy code from memory optimiser * CU-346mpwz Add tests for memory optimiser * CU-346mpwz Add tests memory optimised CDB * CU-346mpwz Make dict names available within memory optimiser * CU-346mpwz Add separate tests for memory optimised CDB * CU-346mpwz Remove unused imports in memory optimiser * CU-346mpwz Move some encoding and decoing stuff within serialisation to their own module * CU-346mpwz Add tests for encoding/decoding stuff * CU-346mpwz Add encoding/decoding for delegating dict as well as postprocessing for delegation linking with json serialisation * CU-346mpwz Fix decision upon JSON deserialisation of CDB when loading model pack * CU-346mpwz Adapt serialisation tests to the potential one2many mappings * CU-346mpwz Add tests for memory optimisation, including JSON serialisation ones * CU-346mpwz Remove debug print statements * CU-346mpwz Remove debug methods from tests * CU-346mpwz Fix method signatures in encoding/decoding methods * CU-346mpwz Fix typing issue in serialiser when passing encoder * CU-346mpwz Relax typing restrictions for umls preprocessing / parent2child mapping * CU-346mpwz Remove some debug variables * CU-346mpwz Fix remnant merge conflict * CU-346mpwz Add item removal and popping to delegating dict * CU-346mpwz Add item removal and popping tests to delegating dict * CU-346mpwz Add item adding/setting tests to delegating dict * CU-346mpwz Fix typing issue (List vs list) * CU-346mpwz Add possibility of memory-optimising for snames as well * CU-346mpwz Add comment regarding memory-optimising for filtering by CUI to CDB * CU-346mpwz Add sname based memory optimisation tests * CU-346mpwz Add json serialisation capabilities to snames delegation * CU-346mpwz Make sname optimisation default for memory optimisation * CU-346mpwz Fix typo in serialisation tests * CU-346mpwz Add variable to keep track of current memory optimisation info to CDB * CU-346mpwz Add default 
cui2snames to sname optimisations; make sure sname optimisation dirties the CDB * CU-346mpwz Add method to undo CDB memory optimisation * CU-346mpwz Add tests for undoing CDB memory optimisation * CU-346mpwz Clear memory optimised parts if/when undoing optimisations * CU-346mpwz Remove accidentally added file/module * CU-346mpwz Add more straight forward optimisation part names; Fix memory optimisation part clearing * CU-346mpwz Add further tests for memory optimisation (dirty state, checking optimised parts) --- medcat/cat.py | 5 +- medcat/cdb.py | 17 +- medcat/utils/memory_optimiser.py | 366 ++++++++++++++++++++++ medcat/utils/preprocess_umls.py | 4 +- medcat/utils/saving/coding.py | 146 +++++++++ medcat/utils/saving/serializer.py | 51 ++- tests/utils/saving/test_coding.py | 77 +++++ tests/utils/saving/test_serialization.py | 28 +- tests/utils/test_memory_optimiser.py | 375 +++++++++++++++++++++++ 9 files changed, 1026 insertions(+), 43 deletions(-) create mode 100644 medcat/utils/memory_optimiser.py create mode 100644 medcat/utils/saving/coding.py create mode 100644 tests/utils/saving/test_coding.py create mode 100644 tests/utils/test_memory_optimiser.py diff --git a/medcat/cat.py b/medcat/cat.py index 36fc265bb..b2d3f7cb3 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,7 +40,7 @@ from medcat.vocab import Vocab from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER -from medcat.utils.saving.serializer import SPECIALITY_NAMES +from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY logger = logging.getLogger(__name__) # separate logger from the package-level one @@ -356,7 +356,8 @@ def load_model_pack(cls, # Load the CDB cdb_path = os.path.join(model_pack_path, "cdb.dat") - has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) >= len(SPECIALITY_NAMES) + nr_of_jsons_expected = len(SPECIALITY_NAMES) - len(ONE2MANY) + has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) 
>= nr_of_jsons_expected json_path = model_pack_path if has_jsons else None logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format') cdb = CDB.load(cdb_path, json_path) diff --git a/medcat/cdb.py b/medcat/cdb.py index a4f87edf2..44d4fd9dd 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -95,6 +95,7 @@ def __init__(self, config: Union[Config, None] = None) -> None: self._optim_params = None self.is_dirty = False self._hash: Optional[str] = None + self._memory_optimised_parts: Set[str] = set() def get_name(self, cui: str) -> str: """Returns preferred name if it exists, otherwise it will return @@ -180,9 +181,13 @@ def remove_cui(self, cui: str) -> None: for name, cuis2status in self.name2cuis2status.items(): if cui in cuis2status: del cuis2status[cui] - self.snames = set() - for cuis in self.cui2snames.values(): - self.snames |= cuis + if isinstance(self.snames, set): + # if this is a memory optimised CDB, this won't be a set + # but it also won't need to be changed since it + # relies directly on cui2snames + self.snames = set() + for cuis in self.cui2snames.values(): + self.snames |= cuis self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()} self.is_dirty = True @@ -561,6 +566,10 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None: This also will not remove any data from cdb.addl_info - as this field can contain data of unknown structure. + As a side note, if the CDB has been memory-optimised, filtering will undo this memory optimisation. + This is because the dicts being involved will be rewritten. + However, the memory optimisation can be performed again afterwards. + Args: cuis_to_keep (List[str]): CUIs that will be kept, the rest will be removed (not completely, look above). 
@@ -624,6 +633,8 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None: self.cui2type_ids = new_cui2type_ids self.cui2preferred_name = new_cui2preferred_name self.is_dirty = True + # reset memory optimisation state + self._memory_optimised_parts.clear() def make_stats(self): stats = {} diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py new file mode 100644 index 000000000..e8328734d --- /dev/null +++ b/medcat/utils/memory_optimiser.py @@ -0,0 +1,366 @@ +from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union, Optional, Set + +from medcat.cdb import CDB +from medcat.utils.saving.coding import EncodeableObject, PartEncoder, PartDecoder, UnsuitableObject, register_encoder_decoder + + +CUI_DICT_NAMES_TO_COMBINE = [ + "cui2names", "cui2snames", "cui2context_vectors", + "cui2count_train", "cui2tags", "cui2type_ids", + "cui2preferred_name", "cui2average_confidence", +] +ONE2MANY = 'cui2many' + +NAME_DICT_NAMES_TO_COMBINE = [ + "cui2names", "name2cuis2status", "cui2preferred_name", +] +NAME2MANY = 'name2many' + +DELEGATING_DICT_IDENTIFIER = '==DELEGATING_DICT==' + +DELEGATING_SET_IDENTIFIER = '==DELEGATING_SET==' + +# these will be used in CDB._memory_optimised_parts +CUIS_PART = 'CUIS' +NAMES_PART = 'NAMES' +SNAMES_PART = 'snames' + + +class _KeysView: + def __init__(self, keys: KeysView, parent: 'DelegatingDict'): + self._keys = keys + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._keys: + if key in self._parent: + yield key + + def __len__(self) -> int: + return len([_ for _ in self]) + + +class _ItemsView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._parent: + yield key, self._parent[key] + + def __len__(self) -> int: + return len(self._parent) + + +class _ValuesView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> 
Iterator[Any]: + for key in self._parent: + yield self._parent[key] + + def __len__(self) -> int: + return len(self._parent) + + +class DelegatingDict: + + def __init__(self, delegate: Dict[str, List[Any]], nr: int, + nr_of_overall_items: int = 8) -> None: + self.delegate = delegate + self.nr = nr + self.nr_of_overall_items = nr_of_overall_items + + def _generate_empty_entry(self) -> List[Any]: + return [None for _ in range(self.nr_of_overall_items)] + + def __getitem__(self, key: str) -> Any: + val = self.delegate[key][self.nr] + if val is None: + raise KeyError + return val + + def get(self, key: str, default: Any) -> Any: + try: + return self[key] + except KeyError: + return default + + def __setitem__(self, key: str, value: Any) -> None: + if key not in self.delegate: + self.delegate[key] = self._generate_empty_entry() + self.delegate[key][self.nr] = value + + def __contains__(self, key: str) -> bool: + return key in self.delegate and self.delegate[key][self.nr] is not None + + def keys(self) -> _KeysView: + return _KeysView(self.delegate.keys(), self) + + def items(self) -> _ItemsView: + return _ItemsView(self) + + def values(self) -> _ValuesView: + return _ValuesView(self) + + def __iter__(self) -> Iterator[str]: + yield from self.keys() + + def __len__(self) -> int: + return len(self.keys()) + + def to_dict(self) -> dict: + return {'delegate': None, + 'nr': self.nr, + 'nr_of_overall_items': self.nr_of_overall_items} + + def __eq__(self, __value: object) -> bool: + if not isinstance(__value, DelegatingDict): + return False + return self.delegate == __value.delegate and self.nr == __value.nr + + def __hash__(self) -> int: + return hash((self.delegate, self.nr)) + + def __delitem__(self, key: str) -> None: + self[key] = None + + def pop(self, key: str, default: Optional[Any] = None) -> Any: + if key in self: + item = self[key] + else: + item = default + del self[key] + return item + + +class DelegatingValueSet: + + def __init__(self, delegate: Dict[str, 
Set[str]]) -> None: + self.delegate = delegate + + def update(self, other: Any) -> None: + # do nothing since the value will be updated in delegate + pass + + def __contains__(self, value: str) -> bool: + for cui_value in self.delegate.values(): + if value in cui_value: + return True + return False + + def to_dict(self) -> dict: + return {'delegate': None} + + +class DelegatingDictEncoder(PartEncoder): + + def try_encode(self, obj): + if isinstance(obj, DelegatingDict): + return {DELEGATING_DICT_IDENTIFIER: obj.to_dict()} + raise UnsuitableObject() + + +class DelegatingDictDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, EncodeableObject]: + if DELEGATING_DICT_IDENTIFIER in dct: + info = dct[DELEGATING_DICT_IDENTIFIER] + delegate = info['delegate'] + nr = info['nr'] + overall = info['nr_of_overall_items'] + return DelegatingDict(delegate, nr, overall) + return dct + + +class DelegatingValueSetEncoder(PartEncoder): + + def try_encode(self, obj): + if isinstance(obj, DelegatingValueSet): + return {DELEGATING_SET_IDENTIFIER: obj.to_dict()} + raise UnsuitableObject() + + +class DelegatingValueSetDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, EncodeableObject]: + if DELEGATING_SET_IDENTIFIER in dct: + info = dct[DELEGATING_SET_IDENTIFIER] + delegate = info['delegate'] + return DelegatingValueSet(delegate) + return dct + + +def attempt_fix_after_load(cdb: CDB): + _attempt_fix_after_load(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + _attempt_fix_after_load(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + + +def attempt_fix_snames_after_load(cdb: CDB, snames_attr_name: str = 'snames'): + snames = getattr(cdb, snames_attr_name) + if isinstance(snames, DelegatingValueSet) and snames.delegate is None: + snames = DelegatingValueSet(cdb.cui2snames) + setattr(cdb, snames_attr_name, snames) + + +# register encoder and decoders +register_encoder_decoder(encoder=DelegatingDictEncoder, + decoder=DelegatingDictDecoder, + 
loading_postprocessor=attempt_fix_after_load) +register_encoder_decoder(encoder=DelegatingValueSetEncoder, + decoder=DelegatingValueSetDecoder, + loading_postprocessor=attempt_fix_snames_after_load) + + +def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: + dicts = [getattr(cdb, dict_name) + for dict_name in dict_names_to_combine] + one2many, delegators = map_to_many(dicts) + for delegator, name in zip(delegators, dict_names_to_combine): + setattr(cdb, name, delegator) + setattr(cdb, to_many_name, one2many) + cdb.is_dirty = True + + +def _optimise_snames(cdb: CDB, cui2snames: str = 'cui2snames', + snames_attr: str = 'snames') -> None: + """Optimise the snames part of a CDB. + + Args: + cdb (CDB): The CDB to optimise snames on. + one2many_name (str): The cui2snames dict name to delegate to. Defaults to 'cui2snames'. + snames_attr (str, optional): The `snames` attribute name. Defaults to 'snames'. + """ + delegate = getattr(cdb, cui2snames) + dvs = DelegatingValueSet(delegate) + setattr(cdb, snames_attr, dvs) + cdb.is_dirty = True + + +def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, + optimise_names: bool = False, + optimise_snames: bool = True) -> None: + """Attempts to optimise the memory footprint of the CDB. + + This can perform optimisation for cui2<...> and name2<...> dicts. + However, by default, only cui2many optimisation will be done. + This is because at the time of writing, there were not enough name2<...> + dicts to be able to benefit from the optimisation. + + Does so by unifying the following dicts: + + cui2names (Dict[str, Set[str]]): + From cui to all names assigned to it. Mainly used for subsetting (maybe even only). + cui2snames (Dict[str, Set[str]]): + From cui to all sub-names assigned to it. Only used for subsetting. + cui2context_vectors (Dict[str, Dict[str, np.array]]): + From cui to a dictionary of different kinds of context vectors. 
Normally you would have here + a short and a long context vector - they are calculated separately. + cui2count_train (Dict[str, int]): + From CUI to the number of training examples seen. + cui2tags (Dict[str, List[str]]): + From CUI to a list of tags. This can be used to tag concepts for grouping of whatever. + cui2type_ids (Dict[str, Set[str]]): + From CUI to type id (e.g. TUI in UMLS). + cui2preferred_name (Dict[str, str]): + From CUI to the preferred name for this concept. + cui2average_confidence (Dict[str, str]): + Used for dynamic thresholding. Holds the average confidence for this CUI given the training examples. + + name2cuis (Dict[str, List[str]]): + Map fro concept name to CUIs - one name can map to multiple CUIs. + name2cuis2status (Dict[str, Dict[str, str]]): + What is the status for a given name and cui pair - each name can be: + P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common. + name2count_train (Dict[str, str]): + Counts how often did a name appear during training. + + It can also delegate the `snames` set to use the various sets in `cui2snames` instead. + + They will all be included in 1 dict with CUI keys and a list of values for each pre-existing dict. + + Args: + cdb (CDB): The CDB to modify. + optimise_cuis (bool, optional): Whether to optimise cui2<...> dicts. Defaults to True. + optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to False. + optimise_snames (bool, optional): Whether to optimise `snames` set. Defaults to True. 
+ """ + # cui2<...> -> cui2many + if optimise_cuis: + _optimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + cdb._memory_optimised_parts.add(CUIS_PART) + # name2<...> -> name2many + if optimise_names: + _optimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + cdb._memory_optimised_parts.add(NAMES_PART) + if optimise_snames: + # check snames based on cui2sanmes + _optimise_snames(cdb) + cdb._memory_optimised_parts.add(SNAMES_PART) + + +def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]): + if not hasattr(cdb, one2many_name): + return + one2many = getattr(cdb, one2many_name) + for dict_name in dict_names: + d = getattr(cdb, dict_name) + if not isinstance(d, DelegatingDict): + raise ValueError(f'Unknown type for {dict_name}: {type(d)}') + d.delegate = one2many + + +def _unoptimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]): + # remove one2many attribute + # the references still exist on each delegator + delattr(cdb, to_many_name) + + delegating_dicts: List[Dict[str, Any]] = [getattr(cdb, dict_name) + for dict_name in dict_names_to_combine] + for del_dict, dict_name in zip(delegating_dicts, dict_names_to_combine): + raw_dict = dict(del_dict.items()) + setattr(cdb, dict_name, raw_dict) + cdb.is_dirty = True + + +def _unoptimise_snames(cdb: CDB, cui2snames: str = 'cui2snames', + snames_attr: str = 'snames') -> None: + # rebuild snames + delegate: Dict[str, Set[str]] = getattr(cdb, cui2snames) + snames = set() + for values in delegate.values(): + snames.update(values) + setattr(cdb, snames_attr, snames) + cdb.is_dirty = True + + +def unoptimise_cdb(cdb: CDB): + """This undoes all the (potential) memory optimisations done in `perform_optimisation`. + + This method relies on `CDB._memory_optimised_parts` to be up to date. + + Args: + cdb (CDB): The CDB to work on. 
+ """ + if CUIS_PART in cdb._memory_optimised_parts: + _unoptimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + if NAMES_PART in cdb._memory_optimised_parts: + _unoptimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + if SNAMES_PART in cdb._memory_optimised_parts: + _unoptimise_snames(cdb) + cdb._memory_optimised_parts.clear() + + +def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: + one2many: Dict[str, List[Any]] = {} + delegators: List[DelegatingDict] = [] + for nr, d in enumerate(dicts): + delegator = DelegatingDict( + one2many, nr, nr_of_overall_items=len(dicts)) + for key, value in d.items(): + if key not in one2many: + one2many[key] = delegator._generate_empty_entry() + one2many[key][nr] = value + delegators.append(delegator) + return one2many, delegators diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 9cf0ccea4..7c47f451a 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -3,7 +3,7 @@ import pandas as pd import tqdm import os -from typing import Dict, Set +from typing import Dict _DEFAULT_COLUMNS: list = [ "CUI", @@ -240,7 +240,7 @@ def get_pt2ch(self) -> dict: cui_parent = cui_parent[cui_parent['PAUI'].notna()] # create dict - pt2ch: Dict[str, Set[str]] = {} + pt2ch: dict = {} for _, row in tqdm.tqdm(cui_parent.iterrows(), total=len(cui_parent.index)): cur_cui = row['CUI'] paui = row['PAUI'] diff --git a/medcat/utils/saving/coding.py b/medcat/utils/saving/coding.py new file mode 100644 index 000000000..c03e6816f --- /dev/null +++ b/medcat/utils/saving/coding.py @@ -0,0 +1,146 @@ +from typing import Any, Protocol, runtime_checkable, List, Union, Type, Optional, Callable + +import json + + +@runtime_checkable +class EncodeableObject(Protocol): + + def to_dict(self) -> dict: + """Converts the object to a dict. + + Returns: + dict: The dict to be serialised. 
+ """ + + +class UnsuitableObject(ValueError): + pass + + +class PartEncoder(Protocol): + + def try_encode(self, obj: object) -> Any: + """Try to encode an object + + Args: + obj (object): The object to encode + + Raises: + UnsuitableObject: If the object is unsuitable for encoding. + + Returns: + Any: The encoded object + """ + + +SET_IDENTIFIER = '==SET==' + + +class SetEncoder(PartEncoder): + """JSONEncoder (and decoder) for sets. + + Generally, JSON doesn't support serializing of sets natively. + This encoder adds a set identifier to the data when being serialized + and provides a method to read said identifier upon decoding.""" + + def try_encode(self, obj): + if isinstance(obj, set): + return {SET_IDENTIFIER: list(obj)} + raise UnsuitableObject() + + +class PartDecoder(Protocol): + + def try_decode(self, dct: dict) -> Union[dict, Any]: + """Try to decode the dictionary. + + Args: + dct (dict): The dict to decode. + + Returns: + Union[dict, Any]: The dict if unable to decode, the decoded object otherwise + """ + + +class SetDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, set]: + """Decode sets from input dicts. 
+ + Args: + dct (dict): The input dict + + Returns: + Union[dict, set]: The original dict if this was not a serialized set, the set otherwise + """ + if SET_IDENTIFIER in dct: + return set(dct[SET_IDENTIFIER]) + return dct + + +PostProcessor = Callable[[Any], None] # CDB -> None + +DEFAULT_ENCODERS: List[Type[PartEncoder]] = [SetEncoder, ] +DEFAULT_DECODERS: List[Type[PartDecoder]] = [SetDecoder, ] +LOADING_POSTPROCESSORS: List[PostProcessor] = [] + + +def register_encoder_decoder(encoder: Optional[Type[PartEncoder]], + decoder: Optional[Type[PartDecoder]], + loading_postprocessor: Optional[PostProcessor]): + if encoder: + DEFAULT_ENCODERS.append(encoder) + if decoder: + DEFAULT_DECODERS.append(decoder) + if loading_postprocessor: + LOADING_POSTPROCESSORS.append(loading_postprocessor) + + +class CustomDelegatingEncoder(json.JSONEncoder): + + def __init__(self, delegates: List[PartEncoder], *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._delegates = delegates + + def default(self, obj): + for delegator in self._delegates: + try: + return delegator.try_encode(obj) + except UnsuitableObject: + pass + return json.JSONEncoder.default(self, obj) + + @classmethod + def def_inst(cls, *args, **kwargs) -> 'CustomDelegatingEncoder': + return cls([_cls() for _cls in DEFAULT_ENCODERS], *args, **kwargs) + + +class CustomDelegatingDecoder(json.JSONDecoder): + _def_inst: Optional['CustomDelegatingDecoder'] = None + + def __init__(self, delegates: List[PartDecoder]) -> None: + self._delegates = delegates + + def object_hook(self, dct: dict) -> Any: + for delegator in self._delegates: + ret_val = delegator.try_decode(dct) + if ret_val is not dct: + return ret_val + return dct + + @classmethod + def def_inst(cls) -> 'CustomDelegatingDecoder': + if cls._def_inst is None: + cls._def_inst = cls([_cls() for _cls in DEFAULT_DECODERS]) + return cls._def_inst + + +def default_hook(dct: dict) -> Any: + cdd = CustomDelegatingDecoder.def_inst() + return 
cdd.object_hook(dct) + + +def default_postprocessing(cdb) -> None: + for pp in LOADING_POSTPROCESSORS: + pp(cdb) diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index c08124831..d82df751c 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -5,11 +5,13 @@ """ import os import logging -from typing import cast, Dict, Optional, Union +from typing import cast, Dict, Optional, Type import dill import json from medcat.config import Config +from medcat.utils.saving.coding import CustomDelegatingEncoder, default_hook, default_postprocessing + logger = logging.getLogger(__name__) @@ -17,35 +19,8 @@ __SPECIALITY_NAMES_NAME = set( ["name2cuis", "name2cuis2status", "name_isupper"]) __SPECIALITY_NAMES_OTHER = set(["snames", "addl_info"]) -SPECIALITY_NAMES = __SPECIALITY_NAMES_CUI | __SPECIALITY_NAMES_NAME | __SPECIALITY_NAMES_OTHER - - -class SetEncode(json.JSONEncoder): - """JSONEncoder (and decoder) for sets. - - Generally, JSON doesn't support serializing of sets natively. - This encoder adds a set identifier to the data when being serialized - and provides a method to read said identifier upon decoding.""" - SET_IDENTIFIER = '==SET==' - - def default(self, obj): - if isinstance(obj, set): - return {SetEncode.SET_IDENTIFIER: list(obj)} - return json.JSONEncoder.default(self, obj) - - @staticmethod - def set_decode(dct: dict) -> Union[dict, set]: - """Decode sets from input dicts. 
- - Args: - dct (dict): The input dict - - Returns: - Union[dict, set]: The original dict if this was not a serialized set, the set otherwise - """ - if SetEncode.SET_IDENTIFIER in dct: - return set(dct[SetEncode.SET_IDENTIFIER]) - return dct +ONE2MANY = set(['cui2many', 'name2many']) # these may or may not exist +SPECIALITY_NAMES = __SPECIALITY_NAMES_CUI | __SPECIALITY_NAMES_NAME | __SPECIALITY_NAMES_OTHER | ONE2MANY class JsonSetSerializer: @@ -75,7 +50,11 @@ def write(self, d: dict) -> None: logger.info('Writing data for "%s" into "%s"', self.name, self.file_name) with open(self.file_name, 'w') as f: - json.dump(d, f, cls=SetEncode) + # the def_inst method, when called, + # returns the right type of object anyway + + json.dump(d, f, cls=cast(Type[json.JSONEncoder], + CustomDelegatingEncoder.def_inst)) def read(self) -> dict: """Read the json file specified by this serializer. @@ -85,7 +64,8 @@ def read(self) -> dict: """ logger.info('Reading data for %s from %s', self.name, self.file_name) with open(self.file_name, 'r') as f: - data = json.load(f, object_hook=SetEncode.set_decode) + data = json.load( + f, object_hook=default_hook) return data @@ -168,6 +148,8 @@ def serialize(self, cdb, overwrite: bool = False) -> None: dill.dump(to_save, f) if self.jsons is not None: for name in SPECIALITY_NAMES: + if name not in cdb.__dict__: + continue # in case cui2many doesn't exit self.jsons[name].write(cdb.__dict__[name]) def deserialize(self, cdb_cls): @@ -199,5 +181,10 @@ def deserialize(self, cdb_cls): # if applicable if self.jsons is not None: for name in SPECIALITY_NAMES: + if not os.path.exists(self.jsons[name].file_name): + continue # in case of non-memory-optimised where cui2many doesn't exist cdb.__dict__[name] = self.jsons[name].read() + # if anything has + # been registered to postprocess the CDBs + default_postprocessing(cdb) return cdb diff --git a/tests/utils/saving/test_coding.py b/tests/utils/saving/test_coding.py new file mode 100644 index 
000000000..c60a3b1f2 --- /dev/null +++ b/tests/utils/saving/test_coding.py @@ -0,0 +1,77 @@ +from medcat.utils.saving import coding + +import json + +import unittest + + +class SetEncodeTests(unittest.TestCase): + string2sets_dict1 = {'s1': set(['v1', 'v2', 'v3']), + 's2': set(['u1', 'u2', 'u3'])} + string2sets_dict2 = {'p1': set([1, 2, 3]), + 'p2': set([3, 4, 5])} + + def serialise(self, d: dict) -> str: + return json.dumps(d, cls=coding.CustomDelegatingEncoder.def_inst) + + def _helper_serialises(self, d: dict): + s = self.serialise(d) + self.assertIsInstance(s, str) + + def test_sets_of_strings_serialise(self): + self._helper_serialises(self.string2sets_dict1) + + def test_sets_of_ints_serialise(self): + self._helper_serialises(self.string2sets_dict2) + + def _helper_keys_in_json(self, d: dict): + s = self.serialise(d) + for k in d.keys(): + with self.subTest(k): + self.assertIn(str(k), s) + + def test_sos_keys_in_json(self): + self._helper_keys_in_json(self.string2sets_dict1) + + def test_soi_keys_in_json(self): + self._helper_keys_in_json(self.string2sets_dict2) + + def _helper_values_in_json(self, d: dict): + s = self.serialise(d) + for key, v in d.items(): + for nr, el in enumerate(v): + with self.subTest(f"Key: {key}; Element {nr}"): + self.assertIn(str(el), s) + + def test_sos_values_in_json(self): + self._helper_values_in_json(self.string2sets_dict1) + + def test_soi_values_in_json(self): + self._helper_values_in_json(self.string2sets_dict2) + + +class SetDecodeTests(unittest.TestCase): + + def deserialise(self, s: str) -> dict: + return json.loads(s, object_hook=coding.default_hook) + + def setUp(self) -> None: + self.encoder = SetEncodeTests() + self.encoded1 = self.encoder.serialise(self.encoder.string2sets_dict1) + self.encoded2 = self.encoder.serialise(self.encoder.string2sets_dict2) + + def test_sos_decodes(self): + d = self.deserialise(self.encoded1) + self.assertIsInstance(d, dict) + + def test_soi_decodes(self): + d = 
self.deserialise(self.encoded2) + self.assertIsInstance(d, dict) + + def test_sos_decodes_to_identical(self): + d = self.deserialise(self.encoded1) + self.assertEqual(d, self.encoder.string2sets_dict1) + + def test_soi_decodes_to_identical(self): + d = self.deserialise(self.encoded2) + self.assertEqual(d, self.encoder.string2sets_dict2) diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index 6313906dc..f0cc75de1 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -9,11 +9,13 @@ from medcat.cat import CAT from medcat.vocab import Vocab -from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES +from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES, ONE2MANY +import medcat.utils.saving.coding as _ -class JSONSerialoizationTests(unittest.TestCase): - folder = os.path.join('temp', 'JSONSerialoizationTests') + +class JSONSerializationTests(unittest.TestCase): + folder = os.path.join('temp', 'JSONSerializationTests') def setUp(self) -> None: return super().setUp() @@ -42,6 +44,11 @@ def test_round_trip(self): self.ser.serialize(self.cdb, overwrite=True) cdb = self.ser.deserialize(CDB) for name in SPECIALITY_NAMES: + if name in ONE2MANY: + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue with self.subTest(name): orig = getattr(self.cdb, name) now = getattr(cdb, name) @@ -82,11 +89,19 @@ def test_dill_to_json(self): json_path = os.path.join(model_pack_folder, "*.json") jsons = glob.glob(json_path) # there is also a model_card.json - self.assertGreaterEqual(len(jsons), len(SPECIALITY_NAMES)) + # but nothing for cui2many or name2many + # so can remove the length of ONE2MANY + self.assertGreaterEqual(len(jsons), len( + SPECIALITY_NAMES) - len(ONE2MANY)) for json in jsons: with self.subTest(f'JSON {json}'): if json.endswith('model_card.json'): 
continue # ignore model card here + if any(name in json for name in ONE2MANY): + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue self.assertTrue( any(special_name in json for special_name in SPECIALITY_NAMES)) return model_pack_folder @@ -128,6 +143,11 @@ def test_round_trip(self): self.assertEqual(cat.vocab.unigram_table, self.undertest.vocab.unigram_table) for name in SPECIALITY_NAMES: + if name in ONE2MANY: + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue with self.subTest(f'CDB Name {name}'): self.assertEqual(cat.cdb.__dict__[ name], self.undertest.cdb.__dict__[name]) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py new file mode 100644 index 000000000..5f59f5274 --- /dev/null +++ b/tests/utils/test_memory_optimiser.py @@ -0,0 +1,375 @@ +from medcat.utils import memory_optimiser + +import unittest +import tempfile +import os +import shutil +import json +from medcat.cat import CAT +from medcat.cdb import CDB +from medcat.vocab import Vocab +from medcat.utils.saving import coding + + +class DelegatingDictTests(unittest.TestCase): + _dict = {'c1': [None, 0], 'c2': [1, None]} + + def setUp(self) -> None: + # deep copy so that the origianl remains unchangeds + _dict = dict((k, v.copy() + ) for k, v in self._dict.items()) + self.del_dict1 = memory_optimiser.DelegatingDict(_dict, 0, 2) + self.del_dict2 = memory_optimiser.DelegatingDict(_dict, 1, 2) + self.delegators = [self.del_dict1, self.del_dict2] + self.names = ['delegator 1', 'delegator 2'] + self.expected_lens = [len( + [v[nr] for v in _dict.values() if v[nr] is not None] + ) for nr in range(len(_dict[list(_dict.keys())[0]]))] + + def test_removal(self, key='c2'): + self.assertIn(key, self.del_dict1) + del self.del_dict1[key] + self.assertNotIn(key, self.del_dict1) + + def test_pop_no_def_existing(self, key='c2'): + self.assertIn(key, 
self.del_dict1) + val = self.del_dict1.pop(key) + self.assertNotIn(key, self.del_dict1) + self.assertIs(val, self._dict[key][0]) + + def test_pop_def_non_existing(self, key='c1', def_val='DEF VAL'): + self.assertNotIn(key, self.del_dict1) + val = self.del_dict1.pop(key, def_val) + self.assertNotIn(key, self.del_dict1) + self.assertIs(val, def_val) + + def test_adding_exiting_key_nonexist_value(self, key: str = 'c1'): + self.assertNotIn(key, self.del_dict1) + self.del_dict1[key] = 'value' + self.assertIn(key, self.del_dict1) + + def test_adding_nonexiting_key(self, key: str = 'nek1'): + self.assertNotIn(key, self.del_dict1) + self.del_dict1[key] = 'value-NEW' + self.assertIn(key, self.del_dict1) + + def test_adding_nonexiting_key_not_affect_other(self, key: str = 'nek2'): + self.assertNotIn(key, self.del_dict2) + self.del_dict1[key] = 'value-NEW-2' + self.assertNotIn(key, self.del_dict2) + + def test_delegating_dict_has_correct_keys(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.keys()), exp_len) + + def test_delegating_dict_has_same_number_of_keys_and_values(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.keys()), exp_len) + self.assertEqual(len(delegator.values()), exp_len) + + def test_delegating_dict_has_same_number_of_items_and_iter_values(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.items()), exp_len) + # __iter__ -> list -> len + self.assertEqual(len(list(delegator)), exp_len) + + def test_delegator_do_not_have_None_values(self): + for delegator, name in zip(self.delegators, self.names): + for key, val in delegator.items(): + with self.subTest(f"{name}: {key}"): + self.assertIsNotNone(val) + + def test_delegator_keys_in_original(self): + 
for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + self.assertIn(key, self._dict) + + def test_delegator_keys_in_container(self): + for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + self.assertIn(key, delegator) + + def test_delegator_get_gets_key(self, def_value='#DEFAULT#'): + for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + val = delegator.get(key, def_value) + self.assertIsNot(val, def_value) + + def test_delegator_get_defaults_non_existant_key(self, def_value='#DEFAULT#'): + for delegator, name in zip(self.delegators, self.names): + for key in self._dict.keys(): + if key in delegator: + continue + with self.subTest(f"{name}: {key}"): + val = delegator.get(key, def_value) + self.assertIs(val, def_value) + + +class DelegatingDictJsonTests(unittest.TestCase): + _dict = {'c5': [None, 10], 'c6': [11, None]} + + def setUp(self) -> None: + self.del_dict1 = memory_optimiser.DelegatingDict(self._dict, 0, 2) + self.del_dict2 = memory_optimiser.DelegatingDict(self._dict, 1, 2) + self.delegators = [self.del_dict1, self.del_dict2] + self.master_dict = {'one2many': self._dict, + 'part1': self.del_dict1, + 'part2': self.del_dict2} + + def serialise_master(self) -> str: + return json.dumps(self.master_dict, + cls=coding.CustomDelegatingEncoder.def_inst) + + def deserialise(self, s: str, one2many_name='one2many') -> dict: + d = json.loads(s, object_hook=coding.default_hook) + one2many = d[one2many_name] + for key, value in d.items(): + if key == one2many_name: + continue + if value.delegate is None: + value.delegate = one2many + return d + + def test_dict_of_delegation_serialises(self): + s = self.serialise_master() + self.assertIsInstance(s, str) + + def test_dod_ser_has_keys(self): + s = self.serialise_master() + for key in self.master_dict: 
+ with self.subTest(key): + self.assertIn(key, s) + + def test_dod_ser_one2many_has_sub_keys(self): + s = self.serialise_master() + for key in self.master_dict['one2many']: + with self.subTest(key): + self.assertIn(key, s) + + def test_round_trip(self): + s = self.serialise_master() + d = self.deserialise(s) + self.assertIsInstance(d, dict) + + def test_round_trip_equal(self): + s = self.serialise_master() + d = self.deserialise(s) + self.assertEqual(d, self.master_dict) + + +class UnOptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + + def test_unoptimised_cdb_does_not_have_cui2many(self): + self.assertFalse(hasattr(self.cdb, 'cui2many')) + + def test_unoptmised_cdb_does_not_have_delegating_dicts(self): + for key, val in self.cdb.__dict__.items(): + with self.subTest(key): + self.assertNotIsInstance(val, memory_optimiser.DelegatingDict) + + def test_unoptimised_knows_has_no_optimsied_parts(self): + self.assertFalse(self.cdb._memory_optimised_parts, + "Should have empty optimised partss") + + def test_simply_loaded_model_not_dirty(self): + self.assertFalse(self.cdb.is_dirty) + + +class MemoryOptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + memory_optimiser.perform_optimisation(cls.cdb, optimise_snames=True) + + def test_is_dirty(self): + self.assertTrue(self.cdb.is_dirty, + "Should be dirty after optimisation") + + def test_knows_optimised(self): + self.assertTrue(self.cdb._memory_optimised_parts, + "Should have non-empty `_memory_optimised_parts`") + + def test_knows_correct_parts_optimsed(self, should_be=['CUIS', 'snames']): + for name in should_be: + with self.subTest(name): + self.assertIn(name, self.cdb._memory_optimised_parts) + + def 
test_knows_incorrect_parts_NOT_optimised(self, should_not_be=['NAMES']): + for name in should_not_be: + with self.subTest(name): + self.assertNotIn(name, self.cdb._memory_optimised_parts) + + def test_cdb_has_one2many(self, one2many_name='cui2many'): + self.assertTrue(hasattr(self.cdb, one2many_name)) + one2many = getattr(self.cdb, one2many_name) + self.assertIsInstance(one2many, dict) + + def test_cdb_has_delegating_dicts(self): + for dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE: + with self.subTest(dict_name): + d = getattr(self.cdb, dict_name) + self.assertIsInstance(d, memory_optimiser.DelegatingDict) + + def test_has_delegating_set(self): + self.assertIsInstance( + self.cdb.snames, memory_optimiser.DelegatingValueSet) + + def test_delegating_set_has_values(self): + for values in self.cdb.cui2snames.values(): + for val in values: + with self.subTest(f'Checking {val}'): + self.assertIn(val, self.cdb.snames) + + +class MemoryUnoptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + + def test_optimisation_round_trip_cuis(self): + cui_dicts_before = [getattr(self.cdb, dict_name) + for dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE] + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + cui_dicts_after = [getattr(self.cdb, dict_name) + for dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE] + for before, after, name in zip(cui_dicts_before, + cui_dicts_after, + memory_optimiser.CUI_DICT_NAMES_TO_COMBINE): + with self.subTest(f'{name}'): + self.assertIsInstance(before, dict) + self.assertIsInstance(after, dict) + self.assertEquals(len(before), len(after)) + self.assertEquals(before, after) + + def test_optimisation_round_trip_snames(self): + snames_before = self.cdb.snames + memory_optimiser.perform_optimisation(self.cdb) + 
memory_optimiser.unoptimise_cdb(self.cdb) + snames_after = self.cdb.snames + self.assertIsInstance(snames_before, set) + self.assertIsInstance(snames_after, set) + self.assertEquals(len(snames_before), len(snames_after)) + self.assertEquals(snames_before, snames_after) + + def test_optimisation_round_trip_dirty(self): + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + self.assertTrue(self.cdb.is_dirty) + + def test_optimisation_round_trip_no_optimised_parts(self): + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + self.assertFalse(self.cdb._memory_optimised_parts, + "Should have no optimised parts") + + +class OperationalTests(unittest.TestCase): + temp_folder = tempfile.TemporaryDirectory() + temp_cdb_path = os.path.join(temp_folder.name, 'cat.cdb') + json_path = temp_cdb_path.rsplit(os.path.sep, 1)[0] + # importing here so it's in the local namespace + # otherwise, all of its parts would get run again + from tests.test_cat import CATTests + test_callable_with_single_text = CATTests.test_callable_with_single_text + test_callable_with_single_empty_text = CATTests.test_callable_with_single_empty_text + test_callable_with_single_none_text = CATTests.test_callable_with_single_none_text + test_get_entities = CATTests.test_get_entities + test_get_entities_including_text = CATTests.test_get_entities_including_text + test_get_entities_multi_texts = CATTests.test_get_entities_multi_texts + test_get_entities_multi_texts_including_text = CATTests.test_get_entities_multi_texts_including_text + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + memory_optimiser.perform_optimisation(cls.cdb, optimise_snames=True) + cls.vocab = Vocab.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "vocab.dat")) + cls.cdb.config.general.spacy_model = 
"en_core_web_md" + cls.cdb.config.ner.min_name_len = 2 + cls.cdb.config.ner.upper_case_limit_len = 3 + cls.cdb.config.general.spell_check = True + cls.cdb.config.linking.train_count_threshold = 10 + cls.cdb.config.linking.similarity_threshold = 0.3 + cls.cdb.config.linking.train = True + cls.cdb.config.linking.disamb_length_limit = 5 + cls.cdb.config.general.full_unlink = True + cls.meta_cat_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "tmp") + cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, + vocab=cls.vocab, meta_cats=[]) + cls._linkng_filters = cls.undertest.config.linking.filters.copy_of() + + # # add tests from CAT tests + + @classmethod + def tearDownClass(cls) -> None: + cls.temp_folder.cleanup() + cls.undertest.destroy_pipe() + if os.path.exists(cls.meta_cat_dir): + shutil.rmtree(cls.meta_cat_dir) + + def tearDown(self) -> None: + self.cdb.config.annotation_output.include_text_in_output = False + # need to make sure linking filters are not retained beyond a test scope + self.undertest.config.linking.filters = self._linkng_filters.copy_of() + + def test_optimised_cdb_has_cui2many(self): + self.assertTrue(hasattr(self.cdb, 'cui2many')) + + def test_can_be_saved_as_json(self): + self.cdb.save(self.temp_cdb_path, json_path=self.json_path) + + def test_can_be_loaded_as_json(self): + self.test_can_be_saved_as_json() + cdb = CDB.load(self.temp_cdb_path, self.json_path) + self.assertEqual(self.cdb.cui2many, cdb.cui2many) + for del_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE: + d = getattr(cdb, del_name) + with self.subTest(del_name): + self.assertIsInstance(d, memory_optimiser.DelegatingDict) + self.assertIs(cdb.cui2many, d.delegate) + + +class DelegatingValueSetTests(unittest.TestCase): + + def setUp(self) -> None: + self.delegate = {'a': set('abcd'), + 'b': set('efghij'), + 'c': set('lm'), # skip k + 'd': set('qrst'), # skip a bunch + } + self.original = set([v for s in self.delegate for v in s]) + + def 
test_DelegatingValueSet_constructs(self): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + self.assertIsInstance(dvs, memory_optimiser.DelegatingValueSet) + + def test_DelegatingValueSet_contains_values(self): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + for v in self.original: + with self.subTest(f'Check: {v}'): + self.assertIn(v, dvs) + + def test_DelegatingValueSet_contains_incorrect_values(self, + to_check=set('kopuvwxyz')): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + for v in to_check: + with self.subTest(f'Check: {v}'): + self.assertNotIn(v, dvs) From 9711554cf57f032745717609fc9de4f25ac8b8cd Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Fri, 7 Jul 2023 16:53:44 +0300 Subject: [PATCH 15/17] Documentation fixes (#332) * Move C image in Medical oncept ... stuff to raw github image (to try and fix pypi image) * Remove out of date comment from README regarding logging tutorials --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 03e7ba3ec..395aecf69 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Medical oncept Annotation Tool +# Medical oncept Annotation Tool [![Build Status](https://github.com/CogStack/MedCAT/actions/workflows/main.yml/badge.svg?branch=master)](https://github.com/CogStack/MedCAT/actions/workflows/main.yml?query=branch%3Amaster) [![Documentation Status](https://readthedocs.org/projects/medcat/badge/?version=latest)](https://medcat.readthedocs.io/en/latest/?badge=latest) @@ -50,7 +50,6 @@ Since MedCAT is primarily a library, logging has been effectively disabled by de The idea is that the user can directly modify the logging behaviour of either the entire library or a certain set of modules within as they wish. We have provided a convenience method to add default handlers that log into the console as well as _medcat.log_ (`medcat.add_default_log_handlers`). 
Some details as to how one can configure the logging are described in the [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). -PS: Currently (temporarily!) the tutorial is in the `tutorials` folder. ## Acknowledgements Entity extraction was trained on [MedMentions](https://github.com/chanzuckerberg/MedMentions) In total it has ~ 35K entites from UMLS From a1dccf4d21f7b23996250cb7fde61531395dfec0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:48:04 +0100 Subject: [PATCH 16/17] Bump aiohttp from 3.8.3 to 3.8.5 (#333) Bumps [aiohttp](https://github.com/aio-libs/aiohttp) from 3.8.3 to 3.8.5. - [Release notes](https://github.com/aio-libs/aiohttp/releases) - [Changelog](https://github.com/aio-libs/aiohttp/blob/v3.8.5/CHANGES.rst) - [Commits](https://github.com/aio-libs/aiohttp/compare/v3.8.3...v3.8.5) --- updated-dependencies: - dependency-name: aiohttp dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 84ca5cc2a..be517876f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,4 @@ sphinx-rtd-theme~=1.0 myst-parser~=0.17 sphinx-autoapi~=1.8 setuptools>=60.0 -aiohttp==3.8.3 \ No newline at end of file +aiohttp==3.8.5 \ No newline at end of file diff --git a/setup.py b/setup.py index 4e73b2f89..cefc88572 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ 'pydantic>=1.10.0,<2.0', # for spacy compatibility; avoid 2.0 due to breaking changes # the following are not direct dependencies of MedCAT but needed for docs/building # hopefully will no longer need the transitive dependencies - 'aiohttp==3.8.3', # 3.8.3 is needed for compatibility with fsspec <- datasets <- medcat + 'aiohttp==3.8.5', # 3.8.3 is 
needed for compatibility with fsspec <- datasets <- medcat 'blis<0.8.0,>=0.7.8', # as required by thinc <- spacy <- medcat # 'smart-open==5.2.1', # 5.2.1 is needed for compatibility with pathy # 'joblib~=1.2', From 8fe9dfcdf8541149545faa683890bf234d2608c0 Mon Sep 17 00:00:00 2001 From: tomolopolis Date: Mon, 31 Jul 2023 12:08:23 +0100 Subject: [PATCH 17/17] CU-862k77jjj: changes needed for Trainer metrics page --- medcat/cat.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index b2d3f7cb3..5218e9d02 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -534,10 +534,14 @@ def _print_stats(self, anns_norm.append((ann['start'], cui)) anns_examples.append({"text": doc['text'][max(0, ann['start']-60):ann['end']+60], "cui": cui, + "start": ann['start'], + "end": ann['end'], "source value": ann['value'], "acc": 1, "project name": project.get('name'), - "document name": doc.get('name')}) + "document name": doc.get('name'), + "project id": project.get('id'), + "document id": doc.get('id')}) elif ann.get('validated', True) and (ann.get('killed', False) or ann.get('deleted', False)): anns_norm_neg.append((ann['start'], cui)) @@ -556,11 +560,14 @@ def _print_stats(self, p_anns_norm.append((ann.start_char, cui)) p_anns_examples.append({"text": doc['text'][max(0, ann.start_char-60):ann.end_char+60], "cui": cui, + "start": ann.start_char, + "end": ann.end_char, "source value": ann.text, "acc": float(ann._.context_similarity), "project name": project.get('name'), - "document name": doc.get('name')}) - + "document name": doc.get('name'), + "project id": project.get('id'), + "document id": doc.get('id')}) for iann, ann in enumerate(p_anns_norm): cui = ann[1] if ann in anns_norm: