Skip to content

Commit

Permalink
Merge branch 'master' into CU-8695hghww-backwards-compatibility-workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
mart-r committed Aug 30, 2024
2 parents 731693b + b8bb4e3 commit 1a94d2d
Show file tree
Hide file tree
Showing 38 changed files with 90 additions and 96 deletions.
14 changes: 7 additions & 7 deletions medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@

class CAT(object):
"""The main MedCAT class used to annotate documents, it is built on top of spaCy
and works as a spaCy pipline. Creates an instance of a spaCy pipline that can
and works as a spaCy pipeline. Creates an instance of a spaCy pipeline that can
be used as a spacy nlp model.
Args:
Expand Down Expand Up @@ -264,7 +264,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
if cdb_format.lower() == 'json':
json_path = save_dir_path # in the same folder!
else:
json_path = None # use dill formating
json_path = None # use dill formatting
logger.info('Saving model pack with CDB in %s format', cdb_format)

# expand user path to make this work with '~'
Expand Down Expand Up @@ -345,7 +345,7 @@ def attempt_unpack(cls, zip_path: str) -> str:

model_pack_path = os.path.join(base_dir, foldername)
if os.path.exists(model_pack_path):
logger.info("Found an existing unziped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
logger.info("Found an existing unzipped model pack at: {}, the provided zip will not be touched.".format(model_pack_path))
else:
logger.info("Unziping the model pack and loading models.")
shutil.unpack_archive(zip_path, extract_dir=model_pack_path)
Expand Down Expand Up @@ -554,7 +554,7 @@ def _print_stats(self,
Each project in MedCATtrainer can have filters, do we want to respect those filters
when calculating metrics.
use_overlaps (bool):
Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entites.
Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
use_cui_doc_limit (bool):
If True the metrics for a CUI will be only calculated if that CUI appears in a document, in other words
if the document was annotated for that CUI. Useful in very specific situations when during the annotation
Expand Down Expand Up @@ -670,7 +670,7 @@ def add_cui_to_group(self, cui: str, group_name: str) -> None:
cui (str):
The concept to be added.
group_name (str):
The group to whcih the concept will be added.
The group to which the concept will be added.
Examples:
Expand Down Expand Up @@ -1222,7 +1222,7 @@ def _run_nn_components(self, docs: Dict, nn_components: List, id2text: Dict) ->
for name, component in nn_components:
component.config.general['disable_component_lock'] = True

# For meta_cat compoments
# For meta_cat components
for name, component in [c for c in nn_components if isinstance(c[1], MetaCAT)]:
spacy_docs = component.pipe(spacy_docs)
for spacy_doc in spacy_docs:
Expand Down Expand Up @@ -1370,7 +1370,7 @@ def multiprocessing_batch_char_size(self,

docs = {}
_start_time = time.time()
_batch_counter = 0 # Used for splitting the output, counts batches inbetween saves
_batch_counter = 0 # Used for splitting the output, counts batches between saves
for batch in self._batch_generator(iterator, batch_size_chars, skip_ids=set(annotated_ids)):
logger.info("Annotated until now: %s docs; Current BS: %s docs; Elapsed time: %.2f minutes",
len(annotated_ids),
Expand Down
24 changes: 9 additions & 15 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from medcat.utils.matutils import unitvec
from medcat.utils.ml_utils import get_lr_linking
from medcat.config import Config, workers
from medcat.utils.decorators import deprecated
from medcat.utils.saving.serializer import CDBSerializer
from medcat.utils.config_utils import get_and_del_weighted_average_from_config
from medcat.utils.config_utils import default_weighted_average
Expand All @@ -29,7 +28,7 @@ class CDB(object):
Properties:
name2cuis (Dict[str, List[str]]):
Map fro concept name to CUIs - one name can map to multiple CUIs.
Map from concept name to CUIs - one name can map to multiple CUIs.
name2cuis2status (Dict[str, Dict[str, str]]):
What is the status for a given name and cui pair - each name can be:
P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common.
Expand Down Expand Up @@ -58,7 +57,7 @@ class CDB(object):
Any additional maps that are not part of the core CDB. These are usually not needed
for the base NER+L use-case, but can be useful for Debugging or some special stuff.
vocab (Dict[str, int]):
Stores all the words tha appear in this CDB and the count for each one.
Stores all the words that appear in this CDB and the count for each one.
is_dirty (bool):
Whether or not the CDB has been changed since it was loaded or created
"""
Expand Down Expand Up @@ -129,7 +128,7 @@ def get_name(self, cui: str) -> str:
Args:
cui (str):
Concept ID or unique identifer in this database.
Concept ID or unique identifier in this database.
Returns:
str: The name of the concept.
Expand All @@ -148,11 +147,6 @@ def update_cui2average_confidence(self, cui: str, new_sim: float) -> None:
(self.cui2count_train.get(cui, 0) + 1)
self.is_dirty = True

@deprecated("Deprecated. For internal use only. Use CAT.unlink_concept_name instead",
depr_version=(1, 12, 0), removal_version=(1, 13, 0))
def remove_names(self, cui: str, names: Iterable[str]) -> None:
self._remove_names(cui, names)

def _remove_names(self, cui: str, names: Iterable[str]) -> None:
"""Remove names from an existing concept - effect is this name will never again be used to link to this concept.
This will only remove the name from the linker (namely name2cuis and name2cuis2status), the name will still be present everywhere else.
Expand All @@ -161,7 +155,7 @@ def _remove_names(self, cui: str, names: Iterable[str]) -> None:
Args:
cui (str):
Concept ID or unique identifer in this database.
Concept ID or unique identifier in this database.
names (Iterable[str]):
Names to be removed (e.g list, set, or even a dict (in which case keys will be used)).
"""
Expand Down Expand Up @@ -194,7 +188,7 @@ def remove_cui(self, cui: str) -> None:
Args:
cui (str):
Concept ID or unique identifer in this database.
Concept ID or unique identifier in this database.
"""
if cui in self.cui2names:
del self.cui2names[cui]
Expand Down Expand Up @@ -233,7 +227,7 @@ def add_names(self, cui: str, names: Dict[str, Dict], name_status: str = 'A', fu
Args:
cui (str):
Concept ID or unique identifer in this database, all concepts that have
Concept ID or unique identifier in this database, all concepts that have
the same CUI will be merged internally.
names (Dict[str, Dict]):
Names for this concept, or the value that if found in free text can be linked to this concept.
Expand Down Expand Up @@ -318,7 +312,7 @@ def _add_concept(self,
self.name_isupper[name] = names[name]['is_upper']

if name in self.name2cuis:
# Means we have alrady seen this name
# Means we have already seen this name
if cui not in self.name2cuis[name]:
# If CUI is not already linked do it
self.name2cuis[name].append(cui)
Expand Down Expand Up @@ -421,7 +415,7 @@ def update_context_vector(self,
cui (str):
The concept in question.
vectors (Dict[str, np.ndarray]):
Vector represenation of the context, must have the format: {'context_type': np.array(<vector>), ...}
Vector representation of the context, must have the format: {'context_type': np.array(<vector>), ...}
context_type - is usually one of: ['long', 'medium', 'short']
negative (bool):
Is this negative context or positive (Default Value `False`).
Expand Down Expand Up @@ -601,7 +595,7 @@ def import_training(self, cdb: "CDB", overwrite: bool = True) -> None:
Examples:
>>> new_cdb.import_traininig(cdb=old_cdb, owerwrite=True)
>>> new_cdb.import_traininig(cdb=old_cdb, overwrite=True)
"""
# Import vectors and counts
for cui in cdb.cui2context_vectors:
Expand Down
4 changes: 2 additions & 2 deletions medcat/cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@

class CDBMaker(object):
"""Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example> it creates a CDB or
updates an exisitng one.
updates an existing one.
Args:
config (medcat.config.Config):
Global config for MedCAT.
cdb (medcat.cdb.CDB):
If set the `CDBMaker` will updat the existing `CDB` with
If set the `CDBMaker` will update the existing `CDB` with
new concepts in the CSV (Default value `None`).
"""

Expand Down
28 changes: 14 additions & 14 deletions medcat/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def merge_config(self, config_dict: Dict) -> None:
try:
setattr(self, key, value)
except AttributeError as err:
logger.warning('Issue with setting attribtue "%s":', key, exc_info=err)
logger.warning('Issue with setting attribute "%s":', key, exc_info=err)
self.rebuild_re()

def parse_config_file(self, path: str, extractor: ValueExtractor = _DEFAULT_EXTRACTOR) -> None:
Expand Down Expand Up @@ -281,7 +281,7 @@ class CDBMaker(MixingConfig, BaseModel):
name_versions: list = ['LOWER', 'CLEAN']
"""Name versions to be generated."""
multi_separator: str = '|'
"""If multiple names or type_ids for a concept present in one row of a CSV, they are separted
"""If multiple names or type_ids for a concept present in one row of a CSV, they are separated
by the character below."""
remove_parenthesis: int = 5
"""Should preferred names with parenthesis be cleaned 0 means no, else it means if longer than or equal
Expand Down Expand Up @@ -387,7 +387,7 @@ class General(MixingConfig, BaseModel):
should not be used when annotating millions of documents. If `None` it will be the string "concept", if `short` it will be CUI,
if `long` it will be CUI | Name | Confidence"""
map_cui_to_group: bool = False
"""If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be maped to the group"""
"""If the cdb.addl_info['cui2group'] is provided and this option enabled, each CUI will be mapped to the group"""
simple_hash: bool = False
"""Whether to use a simple hash.
Expand All @@ -402,7 +402,7 @@ class Config:
class Preprocessing(MixingConfig, BaseModel):
"""The preprocessing part of the config"""
words_to_skip: set = {'nos'}
"""This words will be completly ignored from concepts and from the text (must be a Set)"""
"""These words will be completely ignored from concepts and from the text (must be a Set)"""
keep_punct: set = {'.', ':'}
"""All punct will be skipped by default, here you can set what will be kept"""
do_not_normalize: set = {'VBD', 'VBG', 'VBN', 'VBP', 'JJS', 'JJR'}
Expand All @@ -411,7 +411,7 @@ class Preprocessing(MixingConfig, BaseModel):
- https://spacy.io/usage/linguistic-features#pos-tagging
- Label scheme section per model at https://spacy.io/models/en"""
skip_stopwords: bool = False
"""Should stopwords be skipped/ingored when processing input"""
"""Should stopwords be skipped/ignored when processing input"""
min_len_normalize: int = 5
"""Nothing below this length will ever be normalized (input tokens or concept names), normalized means lemmatized in this case"""
stopwords: Optional[set] = None
Expand All @@ -433,7 +433,7 @@ class Ner(MixingConfig, BaseModel):
min_name_len: int = 3
"""Do not detect names below this limit, skip them"""
max_skip_tokens: int = 2
"""When checkng tokens for concepts you can have skipped tokens inbetween
"""When checking tokens for concepts you can have skipped tokens between
used ones (usually spaces, new lines etc). This number tells you how many skipped can you have."""
check_upper_case_names: bool = False
"""Check uppercase to distinguish uppercase and lowercase words that have a different meaning."""
Expand Down Expand Up @@ -467,13 +467,13 @@ def __eq__(self, other):
class LinkingFilters(MixingConfig, BaseModel):
"""These describe the linking filters used alongside the model.
When no CUIs nor exlcuded CUIs are specified (the sets are empty),
When no CUIs nor excluded CUIs are specified (the sets are empty),
all CUIs are accepted.
If there are CUIs specified then only those will be accepted.
If there are excluded CUIs specified, they are excluded.
In some cases, there are extra filters as well as MedCATtrainer (MCT) export filters.
These are expcted to follow the following:
These are expected to follow the following:
extra_cui_filter ⊆ MCT filter ⊆ Model/config filter
While any other CUIs can be included in the extra CUI filter or the MCT filter,
Expand Down Expand Up @@ -555,10 +555,10 @@ class Linking(MixingConfig, BaseModel):
"""Concepts that have seen less training examples than this will not be used for
similarity calculation and will have a similarity of -1."""
always_calculate_similarity: bool = False
"""Do we want to calculate context similarity even for concepts that are not ambigous."""
"""Do we want to calculate context similarity even for concepts that are not ambiguous."""
calculate_dynamic_threshold: bool = False
"""Concepts below this similarity will be ignored. Type can be static/dynamic - if dynamic each CUI has a different TH
and it is calcualted as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
and it is calculated as the average confidence for that CUI * similarity_threshold. Take care that dynamic works only
if the cdb was trained with calculate_dynamic_threshold = True."""
similarity_threshold_type: str = 'static'
similarity_threshold: float = 0.25
Expand All @@ -569,14 +569,14 @@ class Linking(MixingConfig, BaseModel):
prefer_primary_name: float = 0.35
"""If >0 concepts for which a detection is its primary name will be preferred by that amount (0 to 1)"""
prefer_frequent_concepts: float = 0.35
"""If >0 concepts that are more frequent will be prefered by a multiply of this amount"""
"""If >0 concepts that are more frequent will be preferred by a multiple of this amount"""
subsample_after: int = 30000
"""DISABLED in code permanently: Subsample during unsupervised training if a concept has received more than"""
devalue_linked_concepts: bool = False
"""When adding a positive example, should it also be treated as Negative for concepts
which link to the postive one via names (ambigous names)."""
which link to the positive one via names (ambiguous names)."""
context_ignore_center_tokens: bool = False
"""If true when the context of a concept is calculated (embedding) the words making that concept are not taken into accout"""
"""If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account"""

class Config:
extra = Extra.allow
Expand Down Expand Up @@ -612,7 +612,7 @@ def rebuild_re(self) -> None:
# Some regex that we will need
self.word_skipper = re.compile('^({})$'.format(
'|'.join(self.preprocessing.words_to_skip)))
# Very agressive punct checker, input will be lowercased
# Very aggressive punct checker, input will be lowercased
self.punct_checker = re.compile(r'[^a-z0-9]+')

# Override
Expand Down
2 changes: 1 addition & 1 deletion medcat/config_meta_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class General(MixingConfig, BaseModel):
batch_size_eval: int = 5000
"""Number of annotations to be meta-annotated at once in eval"""
annotate_overlapping: bool = False
"""If set meta_anns will be calcualted for doc._.ents, otherwise for doc.ents"""
"""If set meta_anns will be calculated for doc._.ents, otherwise for doc.ents"""
tokenizer_name: str = 'bbpe'
"""
Tokenizer name used with MetaCAT.
Expand Down
2 changes: 1 addition & 1 deletion medcat/config_rel_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class Model(MixingConfig, BaseModel):
emb_grad: bool = True
"""If True the embeddings will also be trained"""
ignore_cpos: bool = False
"""If set to True center positions will be ignored when calculating represenation"""
"""If set to True center positions will be ignored when calculating representation"""

class Config:
extra = Extra.allow
Expand Down
2 changes: 1 addition & 1 deletion medcat/linking/context_based_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __call__(self, doc: Doc) -> Doc:

if len(name) >= cnf_l.disamb_length_limit:
if len(cuis) == 1:
# N - means name must be disambiguated, is not the prefered
# N - means name must be disambiguated, is not the preferred
#name of the concept, links to other concepts also.
if self.cdb.name2cuis2status[name][cuis[0]] != 'N':
self._train(cui=cuis[0], entity=entity, doc=doc)
Expand Down
2 changes: 1 addition & 1 deletion medcat/linking/vector_context_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def similarity(self, cui: str, entity: Span, doc: Doc) -> float:
doc (Doc): The document to look in.
Returns:
float: The simularity.
float: The similarity.
"""
vectors = self.get_context_vectors(entity, doc)
sim = self._similarity(cui, vectors)
Expand Down
4 changes: 2 additions & 2 deletions medcat/meta_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe
input_ids (List):
Input ids
offset_mapping (List):
Offset mapings
Offset mappings
lowercase (bool):
Whether to use lower case replace center
Expand All @@ -475,7 +475,7 @@ def prepare_document(self, doc: Doc, input_ids: List, offset_mapping: List, lowe

samples = []
last_ind = 0
ent_id2ind = {} # Map form entitiy ID to where is it in the samples array
ent_id2ind = {} # Map from entity ID to where it is in the samples array
for ent in sorted(ents, key=lambda ent: ent.start_char):
start = ent.start_char
end = ent.end_char
Expand Down
6 changes: 3 additions & 3 deletions medcat/ner/transformers_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def train(self,
# NOTE: The following is for backwards compatibility
# in datasets==2.20.0 `trust_remote_code=True` must be explicitly
# specified, otherwise an error is raised.
# On the other hand, the keyword argumnet was added in datasets==2.16.0
# On the other hand, the keyword argument was added in datasets==2.16.0
# yet we support datasets>=2.2.0.
# So we need to use the kwarg if applicable and omit its use otherwise.
if func_has_kwarg(datasets.load_dataset, 'trust_remote_code'):
Expand All @@ -196,7 +196,7 @@ def train(self,
split='train',
cache_dir='/tmp/')
# We split before encoding so the split is document level, as encoding
#does the document spliting into max_seq_len
#does the document splitting into max_seq_len
dataset = dataset.train_test_split(test_size=self.config.general['test_size']) # type: ignore

# Update labelmap in case the current dataset has more labels than what we had before
Expand Down Expand Up @@ -330,7 +330,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "Transf
config = cast(ConfigTransformersNER, ConfigTransformersNER.load(os.path.join(save_dir_path, 'cat_config.json')))
config.general['model_name'] = save_dir_path

# Overwrite loaded paramters with something new
# Overwrite loaded parameters with something new
if config_dict is not None:
config.merge_config(config_dict)

Expand Down
Loading

0 comments on commit 1a94d2d

Please sign in to comment.