Skip to content

Commit

Permalink
Cu 8692zguyq no preferred name (#367)
Browse files Browse the repository at this point in the history
* CU-8692zguyq: Slight simplification of minimum-name-length logic

* CU-8692zguyq: Add some tests for prepare_name preprocessor

* CU-8692zguyq: Add warning if no preferred name was added along a new CUI

* CU-8692zguyq: Add additional warning messages when adding/training a new CUI with no preferred name

* CU-8692zguyq: Make no preferred name warnings only run if name status is preferred

* CU-8692zguyq: Add tests for no-preferred name warnings

* CU-8692zguyq: Add Vocab.make_unigram_table to CAT tests

* CU-8692zguyq: Move to built in asserting for logging instead of patching the method

* CU-8692zguyq: Add workaround for assertNoLogs on python 3.8 and 3.9
  • Loading branch information
mart-r authored Nov 30, 2023
1 parent d8473d9 commit 76b75cc
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 3 deletions.
4 changes: 4 additions & 0 deletions medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,10 @@ def add_and_train_concept(self,
Refer to medcat.cat.cdb.CDB.add_concept
"""
names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P':
logger.warning("No names were able to be prepared in CAT.add_and_train_concept "
"method. As such no preferred name will be able to be specifeid. "
"The CUI: '%s' and raw name: '%s'", cui, name)
# Only if not negative, otherwise do not add the new name if in fact it should not be detected
if do_add_concept and not negative:
self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
Expand Down
15 changes: 15 additions & 0 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,21 @@ def _add_concept(self,
if name_status == 'P' and cui not in self.cui2preferred_name:
# Do not overwrite old preferred names
self.cui2preferred_name[cui] = name_info['raw_name']
elif names:
# if no name_info and names is NOT an empty dict
# this shouldn't really happen in the current setup
raise ValueError("Unknown state where there is no name_info, "
"yet the `names` dict is not empty (%s)", names)
elif name_status == 'P' and cui not in self.cui2preferred_name:
# this means names is an empty `names` dict
logger.warning("Did not manage to add a preferred name in `add_cui`. "
"Was trying to do so for cui: '%s'"
"This means that the `names` dict passed was empty. "
"This is _usually_ caused by either no name or too short "
"a name passed to the `prepare_name` method. "
"The minimum length is defined in: "
"'config.cdb_maker.min_letters_required' and "
"is currently set at %s", cui, self.config.cdb_maker['min_letters_required'])

# Add other fields if full_build
if full_build:
Expand Down
3 changes: 2 additions & 1 deletion medcat/preprocessing/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D
snames = set()
name = config.general['separator'].join(tokens)

if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0):
min_letters = config.cdb_maker.get('min_letters_required', 0)
if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters:
if name not in names:
sname = ""
for token in tokens:
Expand Down
Empty file added tests/preprocessing/__init__.py
Empty file.
104 changes: 104 additions & 0 deletions tests/preprocessing/test_cleaners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from medcat.preprocessing.cleaners import prepare_name
from medcat.config import Config
from medcat.cdb_maker import CDBMaker

import logging, os

import unittest


class BaseCDBMakerTests(unittest.TestCase):

@classmethod
def setUpClass(cls) -> None:
config = Config()
config.general['log_level'] = logging.DEBUG
config.general["spacy_model"] = "en_core_web_md"
cls.maker = CDBMaker(config)
csvs = [
os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'),
os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv')
]
cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)


class BasePrepareNameTest(BaseCDBMakerTests):
raw_name = 'raw'

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.do_prepare_name()

# method called after setup, when raw_name has been specified
@classmethod
def do_prepare_name(cls) -> None:
cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split())
cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config)

def _dict_has_key_val_type(self, d: dict, key, val_type):
self.assertIn(key, d)
self.assertIsInstance(d[key], val_type)

def _names_has_key_val_type(self, key, val_type):
self._dict_has_key_val_type(self.names, key, val_type)

def test_result_has_name(self):
self._names_has_key_val_type(self.name, dict)

def test_name_info_has_tokens(self):
self._dict_has_key_val_type(self.names[self.name], 'tokens', list)

def test_name_info_has_words_as_tokens(self):
name_info = self.names[self.name]
tokens = name_info['tokens']
for word in self.raw_name.split():
with self.subTest(word):
self.assertIn(word, tokens)


class NamePreparationTests_OneLetter(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "a"
# the minimum name length is defined by the following config option
# if I don't set this to 1 here, I would see the tests fail
# that would be because the result from `prepare_names` would be empty
cls.cdb.config.cdb_maker.min_letters_required = 1
super().do_prepare_name()


class NamePreparationTests_TwoLetters(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "an"
super().do_prepare_name()


class NamePreparationTests_MultiToken(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "this raw name"
super().do_prepare_name()


class NamePreparationTests_Empty(BaseCDBMakerTests):
"""In case of an empty name, I would expect the names dict
returned by `prepare_name` to be empty.
"""
empty_raw_name = ''

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config)

def test_names_dict_is_empty(self):
self.assertEqual(len(self.names), 0)
self.assertEqual(self.names, {})
55 changes: 53 additions & 2 deletions tests/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import unittest
import tempfile
import shutil
import logging
import contextlib
from transformers import AutoTokenizer
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.cdb import CDB, logger as cdb_logger
from medcat.cat import CAT, logger as cat_logger
from medcat.utils.checkpoint import Checkpoint
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
Expand All @@ -20,6 +22,7 @@ class CATTests(unittest.TestCase):
def setUpClass(cls) -> None:
cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat"))
cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat"))
cls.vocab.make_unigram_table()
cls.cdb.config.general.spacy_model = "en_core_web_md"
cls.cdb.config.ner.min_name_len = 2
cls.cdb.config.ner.upper_case_limit_len = 3
Expand Down Expand Up @@ -388,6 +391,54 @@ def test_hashing(self):
cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"))
self.assertEqual(cat.get_hash(), cat.config.version.id)

def _assertNoLogs(self, logger: logging.Logger, level: int):
if hasattr(self, 'assertNoLogs'):
return self.assertNoLogs(logger=logger, level=level)
else:
return self.__assertNoLogs(logger=logger, level=level)

@contextlib.contextmanager
def __assertNoLogs(self, logger: logging.Logger, level: int):
try:
with self.assertLogs(logger, level) as captured_logs:
yield
except AssertionError:
return
if captured_logs:
raise AssertionError("Logs were found: {}".format(captured_logs))

def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level,
name: str, name_status: str, nr_of_calls: int):
cui = 'CUI-%d'%(hash(name) + id(name))
with (self.assertLogs(logger=logger, level=log_level)
if nr_of_calls == 1
else self._assertNoLogs(logger=logger, level=log_level)):
self.undertest.add_and_train_concept(cui, name, name_status=name_status)

def test_add_and_train_concept_cat_nowarn_long_name(self):
long_name = 'a very long name'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cdb_nowarn_long_name(self):
long_name = 'a very long name'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cat_warns_short_name(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1)

def test_add_and_train_concept_cdb_warns_short_name(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1)


class ModelWithTwoConfigsLoadTests(unittest.TestCase):

Expand Down

0 comments on commit 76b75cc

Please sign in to comment.