Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cu 8692zguyq no preferred name #367

Merged
merged 9 commits into from
Nov 30, 2023
4 changes: 4 additions & 0 deletions medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,10 @@ def add_and_train_concept(self,
Refer to medcat.cat.cdb.CDB.add_concept
"""
names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P':
logger.warning("No names were able to be prepared in CAT.add_and_train_concept "
"method. As such no preferred name will be able to be specifeid. "
"The CUI: '%s' and raw name: '%s'", cui, name)
# Only if not negative, otherwise do not add the new name if in fact it should not be detected
if do_add_concept and not negative:
self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
Expand Down
15 changes: 15 additions & 0 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,21 @@ def add_concept(self,
if name_status == 'P' and cui not in self.cui2preferred_name:
# Do not overwrite old preferred names
self.cui2preferred_name[cui] = name_info['raw_name']
elif names:
# if no name_info and names is NOT an empty dict
# this shouldn't really happen in the current setup
raise ValueError("Unknown state where there is no name_info, "
"yet the `names` dict is not empty (%s)", names)
elif name_status == 'P' and cui not in self.cui2preferred_name:
# this means names is an empty `names` dict
logger.warning("Did not manage to add a preferred name in `add_cui`. "
"Was trying to do so for cui: '%s'"
"This means that the `names` dict passed was empty. "
"This is _usually_ caused by either no name or too short "
"a name passed to the `prepare_name` method. "
"The minimum length is defined in: "
"'config.cdb_maker.min_letters_required' and "
"is currently set at %s", cui, self.config.cdb_maker['min_letters_required'])

# Add other fields if full_build
if full_build:
Expand Down
3 changes: 2 additions & 1 deletion medcat/preprocessing/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D
snames = set()
name = config.general['separator'].join(tokens)

if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0):
min_letters = config.cdb_maker.get('min_letters_required', 0)
if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters:
if name not in names:
sname = ""
for token in tokens:
Expand Down
Empty file added tests/preprocessing/__init__.py
Empty file.
104 changes: 104 additions & 0 deletions tests/preprocessing/test_cleaners.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from medcat.preprocessing.cleaners import prepare_name
from medcat.config import Config
from medcat.cdb_maker import CDBMaker

import logging, os

import unittest


class BaseCDBMakerTests(unittest.TestCase):

@classmethod
def setUpClass(cls) -> None:
config = Config()
config.general['log_level'] = logging.DEBUG
config.general["spacy_model"] = "en_core_web_md"
cls.maker = CDBMaker(config)
csvs = [
os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'),
os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv')
]
cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)


class BasePrepareNameTest(BaseCDBMakerTests):
raw_name = 'raw'

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.do_prepare_name()

# method called after setup, when raw_name has been specified
@classmethod
def do_prepare_name(cls) -> None:
cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split())
cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config)

def _dict_has_key_val_type(self, d: dict, key, val_type):
self.assertIn(key, d)
self.assertIsInstance(d[key], val_type)

def _names_has_key_val_type(self, key, val_type):
self._dict_has_key_val_type(self.names, key, val_type)

def test_result_has_name(self):
self._names_has_key_val_type(self.name, dict)

def test_name_info_has_tokens(self):
self._dict_has_key_val_type(self.names[self.name], 'tokens', list)

def test_name_info_has_words_as_tokens(self):
name_info = self.names[self.name]
tokens = name_info['tokens']
for word in self.raw_name.split():
with self.subTest(word):
self.assertIn(word, tokens)


class NamePreparationTests_OneLetter(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "a"
# the minimum name length is defined by the following config option
# if I don't set this to 1 here, I would see the tests fail
# that would be because the result from `prepare_names` would be empty
cls.cdb.config.cdb_maker.min_letters_required = 1
super().do_prepare_name()


class NamePreparationTests_TwoLetters(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "an"
super().do_prepare_name()


class NamePreparationTests_MultiToken(BasePrepareNameTest):

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.raw_name = "this raw name"
super().do_prepare_name()


class NamePreparationTests_Empty(BaseCDBMakerTests):
"""In case of an empty name, I would expect the names dict
returned by `prepare_name` to be empty.
"""
empty_raw_name = ''

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config)

def test_names_dict_is_empty(self):
self.assertEqual(len(self.names), 0)
self.assertEqual(self.names, {})
55 changes: 53 additions & 2 deletions tests/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import unittest
import tempfile
import shutil
import logging
import contextlib
from transformers import AutoTokenizer
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.cdb import CDB, logger as cdb_logger
from medcat.cat import CAT, logger as cat_logger
from medcat.utils.checkpoint import Checkpoint
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
Expand All @@ -20,6 +22,7 @@ class CATTests(unittest.TestCase):
def setUpClass(cls) -> None:
cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat"))
cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat"))
cls.vocab.make_unigram_table()
cls.cdb.config.general.spacy_model = "en_core_web_md"
cls.cdb.config.ner.min_name_len = 2
cls.cdb.config.ner.upper_case_limit_len = 3
Expand Down Expand Up @@ -388,6 +391,54 @@ def test_hashing(self):
cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"))
self.assertEqual(cat.get_hash(), cat.config.version.id)

def _assertNoLogs(self, logger: logging.Logger, level: int):
if hasattr(self, 'assertNoLogs'):
return self.assertNoLogs(logger=logger, level=level)
else:
return self.__assertNoLogs(logger=logger, level=level)

@contextlib.contextmanager
def __assertNoLogs(self, logger: logging.Logger, level: int):
try:
with self.assertLogs(logger, level) as captured_logs:
yield
except AssertionError:
return
if captured_logs:
raise AssertionError("Logs were found: {}".format(captured_logs))

def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level,
name: str, name_status: str, nr_of_calls: int):
cui = 'CUI-%d'%(hash(name) + id(name))
with (self.assertLogs(logger=logger, level=log_level)
if nr_of_calls == 1
else self._assertNoLogs(logger=logger, level=log_level)):
self.undertest.add_and_train_concept(cui, name, name_status=name_status)

def test_add_and_train_concept_cat_nowarn_long_name(self):
long_name = 'a very long name'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cdb_nowarn_long_name(self):
long_name = 'a very long name'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0)

def test_add_and_train_concept_cat_warns_short_name(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1)

def test_add_and_train_concept_cdb_warns_short_name(self):
short_name = 'a'
self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1)


class ModelWithTwoConfigsLoadTests(unittest.TestCase):

Expand Down