From c83780efbe2f9e014d782fa98e22170840579c30 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Tue, 3 Sep 2024 18:01:17 +0100
Subject: [PATCH 1/4] Pushing bug fix for metacat

2-phase learning for MetaCAT utilises data_undersampled. Fixed a bug in the
eval function, which was incorrectly using data_undersampled instead of
full_data.
---
 medcat/meta_cat.py                  |  4 ++--
 medcat/utils/meta_cat/data_utils.py | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py
index 8c73e617..4047ba4c 100644
--- a/medcat/meta_cat.py
+++ b/medcat/meta_cat.py
@@ -257,12 +257,12 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
             category_value2id = g_config['category_value2id']
         if not category_value2id:
             # Encode the category values
-            data_undersampled, full_data, category_value2id = encode_category_values(data,
+            full_data, data_undersampled, category_value2id = encode_category_values(data,
                                                                                      category_undersample=self.config.model.category_undersample)
             g_config['category_value2id'] = category_value2id
         else:
             # We already have everything, just get the data
-            data_undersampled, full_data, category_value2id = encode_category_values(data,
+            full_data, data_undersampled, category_value2id = encode_category_values(data,
                                                                                      existing_category_value2id=category_value2id,
                                                                                      category_undersample=self.config.model.category_undersample)
             g_config['category_value2id'] = category_value2id
diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py
index 17059d7f..ec5d41ab 100644
--- a/medcat/utils/meta_cat/data_utils.py
+++ b/medcat/utils/meta_cat/data_utils.py
@@ -166,12 +166,12 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Name of class that should be used to undersample the data (for 2 phase learning)

     Returns:
-        dict:
-            New underesampled data (for 2 phase learning) with integers inplace of strings for category values
         dict:
             New data with integers inplace of strings for category values.
         dict:
-            Map rom category value to ID for all categories in the data.
+            New undersampled data (for 2 phase learning) with integers inplace of strings for category values
+        dict:
+            Map from category value to ID for all categories in the data.
     """
     data = list(data)
     if existing_category_value2id is not None:
@@ -210,6 +210,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
     for i in range(len(data)):
         if data[i][2] in category_value2id.values():
             label_data_[data[i][2]] = label_data_[data[i][2]] + 1
+
+    logger.info(f"Original label_data: {label_data_}")
     # Undersampling data
     if category_undersample is None or category_undersample == '':
         min_label = min(label_data_.values())
@@ -234,7 +236,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
     logger.info(f"Updated label_data: {label_data}")

-    return data_undersampled, data, category_value2id
+    return data, data_undersampled, category_value2id


 def json_to_fake_spacy(data: Dict, id2text: Dict) -> Iterable:
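The bug this first patch fixes is easy to hit: `encode_category_values` returns both the full dataset and the undersampled one, and a caller that unpacks the tuple in the wrong order silently trains or evaluates on the wrong split. Below is a minimal, self-contained sketch of the undersampling idea and the corrected return convention (full data first). This is not the MedCAT implementation; the `[token_ids, center_pos, label]` sample layout is assumed from the diff context, and `encode_and_undersample` is a hypothetical helper.

```python
import random
from collections import Counter

def encode_and_undersample(samples, category_value2id):
    """Toy sketch: map string labels to ids, then undersample to the
    smallest class. Returns (full_data, data_undersampled, mapping) --
    full data FIRST, mirroring the order fixed in this patch."""
    # Each sample is assumed to look like [token_ids, center_pos, label].
    data = [[s[0], s[1], category_value2id[s[2]]] for s in samples]

    counts = Counter(s[2] for s in data)
    min_count = min(counts.values())

    # Keep at most min_count examples per class (shuffled copy of data).
    seen = Counter()
    data_undersampled = []
    for s in random.sample(data, len(data)):
        if seen[s[2]] < min_count:
            seen[s[2]] += 1
            data_undersampled.append(s)

    return data, data_undersampled, category_value2id

full_data, data_undersampled, c2id = encode_and_undersample(
    [[[1, 2], 0, "Affirmed"], [[3], 1, "Affirmed"], [[4], 0, "Other"]],
    {"Affirmed": 0, "Other": 1},
)
assert len(full_data) == 3 and len(data_undersampled) == 2
```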
""" data = list(data) if existing_category_value2id is not None: @@ -210,6 +210,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data)): if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 + + logger.info(f"Original label_data: {label_data_}") # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +236,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 logger.info(f"Updated label_data: {label_data}") - return data_undersampled, data, category_value2id + return data, data_undersampled, category_value2id def json_to_fake_spacy(data: Dict, id2text: Dict) -> Iterable: From 231cccb682d996d998f42e92daa813eeaf78b631 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:44:34 +0100 Subject: [PATCH 2/4] Pushing change for lazy logging --- medcat/meta_cat.py | 3 +-- medcat/utils/meta_cat/data_utils.py | 6 +++--- medcat/utils/meta_cat/ml_utils.py | 4 ++-- medcat/utils/meta_cat/models.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index 4047ba4c..af4dfb9d 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -269,8 +269,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data # Make sure the config number of classes is the same as the one found in the data if len(category_value2id) != self.config.model['nclasses']: logger.warning( - "The number of classes set in the config is not the same as the one found in the data: {} vs {}".format( - self.config.model['nclasses'], len(category_value2id))) + "The number of classes set in the config is not the same as the one found in the data: %d vs %d" % (self.config.model['nclasses'], len(category_value2id))) logger.warning("Auto-setting the nclasses value in config and rebuilding the model.") self.config.model['nclasses'] = len(category_value2id) diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py index ec5d41ab..d6145af9 100644 --- a/medcat/utils/meta_cat/data_utils.py +++ b/medcat/utils/meta_cat/data_utils.py @@ -194,7 +194,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for k in keys_ls: category_value2id_[k] = len(category_value2id_) - logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping:", category_value2id_) + logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s" %category_value2id_) category_value2id = category_value2id_ for c in category_values: @@ -211,7 +211,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 - logger.info(f"Original label_data: {label_data_}") + logger.info("Original label_data: %s" %label_data_) # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +234,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data_undersampled)): if data_undersampled[i][2] in category_value2id.values(): label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - 
logger.info(f"Updated label_data: {label_data}") + logger.info("Updated label_data: %s" %label_data) return data, data_undersampled, category_value2id diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 3559ce1d..0bbbafdb 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights computed: {class_weights}") + logger.info(f"Class weights computed: %s" %class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy': @@ -259,7 +259,7 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): # Total number of training steps total_steps = int((len(data_) / batch_size_) * epochs) - logger.info('Total steps for optimizer: {}'.format(total_steps)) + logger.info('Total steps for optimizer: %d' %total_steps) # Set up the learning rate scheduler scheduler_ = get_linear_schedule_with_warmup(optimizer_, diff --git a/medcat/utils/meta_cat/models.py b/medcat/utils/meta_cat/models.py index 774cabff..f6a55b4f 100644 --- a/medcat/utils/meta_cat/models.py +++ b/medcat/utils/meta_cat/models.py @@ -91,7 +91,7 @@ def __init__(self, config): super(BertForMetaAnnotation, self).__init__() _bertconfig = AutoConfig.from_pretrained(config.model.model_variant,num_hidden_layers=config.model['num_layers']) if config.model['input_size'] != _bertconfig.hidden_size: - logger.warning(f"\nInput size for {config.model.model_variant} model should be {_bertconfig.hidden_size}, provided input size is {config.model['input_size']} Input size changed to {_bertconfig.hidden_size}") + logger.warning("Input size for %s model should be %d, provided input size is %d. 
Input size changed to %d" %(config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size)) bert = BertModel.from_pretrained(config.model.model_variant, config=_bertconfig) self.config = config From a75a7c630fac257a2ee04b4d57a612e163c44c92 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:22:31 +0100 Subject: [PATCH 3/4] Pushing update for lazy logging --- medcat/meta_cat.py | 2 +- medcat/utils/meta_cat/data_utils.py | 6 +++--- medcat/utils/meta_cat/ml_utils.py | 4 ++-- medcat/utils/meta_cat/models.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index af4dfb9d..386bbe0c 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -269,7 +269,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data # Make sure the config number of classes is the same as the one found in the data if len(category_value2id) != self.config.model['nclasses']: logger.warning( - "The number of classes set in the config is not the same as the one found in the data: %d vs %d" % (self.config.model['nclasses'], len(category_value2id))) + "The number of classes set in the config is not the same as the one found in the data: %d vs %d",self.config.model['nclasses'], len(category_value2id)) logger.warning("Auto-setting the nclasses value in config and rebuilding the model.") self.config.model['nclasses'] = len(category_value2id) diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py index d6145af9..83661748 100644 --- a/medcat/utils/meta_cat/data_utils.py +++ b/medcat/utils/meta_cat/data_utils.py @@ -194,7 +194,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for k in keys_ls: category_value2id_[k] = len(category_value2id_) - logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s" %category_value2id_) + logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s",category_value2id_) category_value2id = category_value2id_ for c in category_values: @@ -211,7 +211,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 - logger.info("Original label_data: %s" %label_data_) + logger.info("Original label_data: %s",label_data_) # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +234,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data_undersampled)): if data_undersampled[i][2] in category_value2id.values(): label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - logger.info("Updated label_data: %s" %label_data) + logger.info("Updated label_data: %s",label_data) return data, data_undersampled, category_value2id diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 0bbbafdb..25efc668 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights 
computed: %s" %class_weights) + logger.info(f"Class weights computed: %s",class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy': @@ -259,7 +259,7 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): # Total number of training steps total_steps = int((len(data_) / batch_size_) * epochs) - logger.info('Total steps for optimizer: %d' %total_steps) + logger.info('Total steps for optimizer: %d',total_steps) # Set up the learning rate scheduler scheduler_ = get_linear_schedule_with_warmup(optimizer_, diff --git a/medcat/utils/meta_cat/models.py b/medcat/utils/meta_cat/models.py index f6a55b4f..543e0ca6 100644 --- a/medcat/utils/meta_cat/models.py +++ b/medcat/utils/meta_cat/models.py @@ -91,7 +91,7 @@ def __init__(self, config): super(BertForMetaAnnotation, self).__init__() _bertconfig = AutoConfig.from_pretrained(config.model.model_variant,num_hidden_layers=config.model['num_layers']) if config.model['input_size'] != _bertconfig.hidden_size: - logger.warning("Input size for %s model should be %d, provided input size is %d. Input size changed to %d" %(config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size)) + logger.warning("Input size for %s model should be %d, provided input size is %d. Input size changed to %d",config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size) bert = BertModel.from_pretrained(config.model.model_variant, config=_bertconfig) self.config = config From f4341df646a89dda409a8b688e7449fcf709fa32 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:32:07 +0100 Subject: [PATCH 4/4] Pushing lint fix --- medcat/utils/meta_cat/ml_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 25efc668..0ba068d6 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights computed: %s",class_weights) + logger.info("Class weights computed: %s",class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy':