From c83780efbe2f9e014d782fa98e22170840579c30 Mon Sep 17 00:00:00 2001
From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com>
Date: Tue, 3 Sep 2024 18:01:17 +0100
Subject: [PATCH 1/4] Pushing bug fix for metacat

2-phase learning for MetaCAT utilises data_undersampled. Fixed a bug in the
eval function, which was incorrectly using data_undersampled instead of
full_data.
---
 medcat/meta_cat.py                  |  4 ++--
 medcat/utils/meta_cat/data_utils.py | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py
index 8c73e617..4047ba4c 100644
--- a/medcat/meta_cat.py
+++ b/medcat/meta_cat.py
@@ -257,12 +257,12 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
             category_value2id = g_config['category_value2id']
         if not category_value2id:
             # Encode the category values
-            data_undersampled, full_data, category_value2id = encode_category_values(data,
+            full_data, data_undersampled, category_value2id = encode_category_values(data,
                                                                                      category_undersample=self.config.model.category_undersample)
             g_config['category_value2id'] = category_value2id
         else:
             # We already have everything, just get the data
-            data_undersampled, full_data, category_value2id = encode_category_values(data,
+            full_data, data_undersampled, category_value2id = encode_category_values(data,
                                                                                      existing_category_value2id=category_value2id,
                                                                                      category_undersample=self.config.model.category_undersample)
             g_config['category_value2id'] = category_value2id
diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py
index 17059d7f..ec5d41ab 100644
--- a/medcat/utils/meta_cat/data_utils.py
+++ b/medcat/utils/meta_cat/data_utils.py
@@ -166,12 +166,12 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             Name of class that should be used to undersample the data (for 2 phase learning)

     Returns:
-        dict:
-            New underesampled data (for 2 phase learning) with integers inplace of strings for category values
         dict:
             New data with integers inplace of strings for category values.
         dict:
-            Map rom category value to ID for all categories in the data.
+            New undersampled data (for 2 phase learning) with integers inplace of strings for category values
+        dict:
+            Map from category value to ID for all categories in the data.
     """
     data = list(data)
     if existing_category_value2id is not None:
@@ -210,6 +210,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
     for i in range(len(data)):
         if data[i][2] in category_value2id.values():
             label_data_[data[i][2]] = label_data_[data[i][2]] + 1
+
+    logger.info(f"Original label_data: {label_data_}")
     # Undersampling data
     if category_undersample is None or category_undersample == '':
         min_label = min(label_data_.values())
@@ -234,7 +236,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict
             label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1
     logger.info(f"Updated label_data: {label_data}")

-    return data_undersampled, data, category_value2id
+    return data, data_undersampled, category_value2id


 def json_to_fake_spacy(data: Dict, id2text: Dict) -> Iterable:
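The bug this first patch fixes is easy to hit: `encode_category_values` returns both the full dataset and the undersampled one, and a caller that unpacks the tuple in the wrong order silently trains or evaluates on the wrong split. Below is a minimal, self-contained sketch of the undersampling idea and the corrected return convention (full data first). This is not the MedCAT implementation; the `[token_ids, center_pos, label]` sample layout is assumed from the diff context, and `encode_and_undersample` is a hypothetical helper.

```python
import random
from collections import Counter

def encode_and_undersample(samples, category_value2id):
    """Toy sketch: map string labels to ids, then undersample to the
    smallest class. Returns (full_data, data_undersampled, mapping) --
    full data FIRST, mirroring the order fixed in this patch."""
    # Each sample is assumed to look like [token_ids, center_pos, label].
    data = [[s[0], s[1], category_value2id[s[2]]] for s in samples]

    counts = Counter(s[2] for s in data)
    min_count = min(counts.values())

    # Keep at most min_count examples per class (shuffled copy of data).
    seen = Counter()
    data_undersampled = []
    for s in random.sample(data, len(data)):
        if seen[s[2]] < min_count:
            seen[s[2]] += 1
            data_undersampled.append(s)

    return data, data_undersampled, category_value2id

full_data, data_undersampled, c2id = encode_and_undersample(
    [[[1, 2], 0, "Affirmed"], [[3], 1, "Affirmed"], [[4], 0, "Other"]],
    {"Affirmed": 0, "Other": 1},
)
assert len(full_data) == 3 and len(data_undersampled) == 2
```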
""" data = list(data) if existing_category_value2id is not None: @@ -210,6 +210,8 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data)): if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 + + logger.info(f"Original label_data: {label_data_}") # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +236,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 logger.info(f"Updated label_data: {label_data}") - return data_undersampled, data, category_value2id + return data, data_undersampled, category_value2id def json_to_fake_spacy(data: Dict, id2text: Dict) -> Iterable: From 231cccb682d996d998f42e92daa813eeaf78b631 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:44:34 +0100 Subject: [PATCH 2/4] Pushing change for lazy logging --- medcat/meta_cat.py | 3 +-- medcat/utils/meta_cat/data_utils.py | 6 +++--- medcat/utils/meta_cat/ml_utils.py | 4 ++-- medcat/utils/meta_cat/models.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index 4047ba4c..af4dfb9d 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -269,8 +269,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data # Make sure the config number of classes is the same as the one found in the data if len(category_value2id) != self.config.model['nclasses']: logger.warning( - "The number of classes set in the config is not the same as the one found in the data: {} vs {}".format( - self.config.model['nclasses'], len(category_value2id))) + "The number of classes set in the config is not the same as the one found in the data: %d vs %d" % (self.config.model['nclasses'], len(category_value2id))) logger.warning("Auto-setting the nclasses value in config and rebuilding the model.") self.config.model['nclasses'] = len(category_value2id) diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py index ec5d41ab..d6145af9 100644 --- a/medcat/utils/meta_cat/data_utils.py +++ b/medcat/utils/meta_cat/data_utils.py @@ -194,7 +194,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for k in keys_ls: category_value2id_[k] = len(category_value2id_) - logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping:", category_value2id_) + logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s" %category_value2id_) category_value2id = category_value2id_ for c in category_values: @@ -211,7 +211,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 - logger.info(f"Original label_data: {label_data_}") + logger.info("Original label_data: %s" %label_data_) # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +234,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data_undersampled)): if data_undersampled[i][2] in category_value2id.values(): label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - 
logger.info(f"Updated label_data: {label_data}") + logger.info("Updated label_data: %s" %label_data) return data, data_undersampled, category_value2id diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 3559ce1d..0bbbafdb 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights computed: {class_weights}") + logger.info(f"Class weights computed: %s" %class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy': @@ -259,7 +259,7 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): # Total number of training steps total_steps = int((len(data_) / batch_size_) * epochs) - logger.info('Total steps for optimizer: {}'.format(total_steps)) + logger.info('Total steps for optimizer: %d' %total_steps) # Set up the learning rate scheduler scheduler_ = get_linear_schedule_with_warmup(optimizer_, diff --git a/medcat/utils/meta_cat/models.py b/medcat/utils/meta_cat/models.py index 774cabff..f6a55b4f 100644 --- a/medcat/utils/meta_cat/models.py +++ b/medcat/utils/meta_cat/models.py @@ -91,7 +91,7 @@ def __init__(self, config): super(BertForMetaAnnotation, self).__init__() _bertconfig = AutoConfig.from_pretrained(config.model.model_variant,num_hidden_layers=config.model['num_layers']) if config.model['input_size'] != _bertconfig.hidden_size: - logger.warning(f"\nInput size for {config.model.model_variant} model should be {_bertconfig.hidden_size}, provided input size is {config.model['input_size']} Input size changed to {_bertconfig.hidden_size}") + logger.warning("Input size for %s model should be %d, provided input size is %d. 
Input size changed to %d" %(config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size)) bert = BertModel.from_pretrained(config.model.model_variant, config=_bertconfig) self.config = config From a75a7c630fac257a2ee04b4d57a612e163c44c92 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:22:31 +0100 Subject: [PATCH 3/4] Pushing update for lazy logging --- medcat/meta_cat.py | 2 +- medcat/utils/meta_cat/data_utils.py | 6 +++--- medcat/utils/meta_cat/ml_utils.py | 4 ++-- medcat/utils/meta_cat/models.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index af4dfb9d..386bbe0c 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -269,7 +269,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data # Make sure the config number of classes is the same as the one found in the data if len(category_value2id) != self.config.model['nclasses']: logger.warning( - "The number of classes set in the config is not the same as the one found in the data: %d vs %d" % (self.config.model['nclasses'], len(category_value2id))) + "The number of classes set in the config is not the same as the one found in the data: %d vs %d",self.config.model['nclasses'], len(category_value2id)) logger.warning("Auto-setting the nclasses value in config and rebuilding the model.") self.config.model['nclasses'] = len(category_value2id) diff --git a/medcat/utils/meta_cat/data_utils.py b/medcat/utils/meta_cat/data_utils.py index d6145af9..83661748 100644 --- a/medcat/utils/meta_cat/data_utils.py +++ b/medcat/utils/meta_cat/data_utils.py @@ -194,7 +194,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for k in keys_ls: category_value2id_[k] = len(category_value2id_) - logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s" %category_value2id_) + logger.warning("Labels found with 0 data; updates made\nFinal label encoding mapping: %s",category_value2id_) category_value2id = category_value2id_ for c in category_values: @@ -211,7 +211,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict if data[i][2] in category_value2id.values(): label_data_[data[i][2]] = label_data_[data[i][2]] + 1 - logger.info("Original label_data: %s" %label_data_) + logger.info("Original label_data: %s",label_data_) # Undersampling data if category_undersample is None or category_undersample == '': min_label = min(label_data_.values()) @@ -234,7 +234,7 @@ def encode_category_values(data: Dict, existing_category_value2id: Optional[Dict for i in range(len(data_undersampled)): if data_undersampled[i][2] in category_value2id.values(): label_data[data_undersampled[i][2]] = label_data[data_undersampled[i][2]] + 1 - logger.info("Updated label_data: %s" %label_data) + logger.info("Updated label_data: %s",label_data) return data, data_undersampled, category_value2id diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 0bbbafdb..25efc668 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights 
computed: %s" %class_weights) + logger.info(f"Class weights computed: %s",class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy': @@ -259,7 +259,7 @@ def initialize_model(classifier, data_, batch_size_, lr_, epochs=4): # Total number of training steps total_steps = int((len(data_) / batch_size_) * epochs) - logger.info('Total steps for optimizer: %d' %total_steps) + logger.info('Total steps for optimizer: %d',total_steps) # Set up the learning rate scheduler scheduler_ = get_linear_schedule_with_warmup(optimizer_, diff --git a/medcat/utils/meta_cat/models.py b/medcat/utils/meta_cat/models.py index f6a55b4f..543e0ca6 100644 --- a/medcat/utils/meta_cat/models.py +++ b/medcat/utils/meta_cat/models.py @@ -91,7 +91,7 @@ def __init__(self, config): super(BertForMetaAnnotation, self).__init__() _bertconfig = AutoConfig.from_pretrained(config.model.model_variant,num_hidden_layers=config.model['num_layers']) if config.model['input_size'] != _bertconfig.hidden_size: - logger.warning("Input size for %s model should be %d, provided input size is %d. Input size changed to %d" %(config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size)) + logger.warning("Input size for %s model should be %d, provided input size is %d. Input size changed to %d",config.model.model_variant,_bertconfig.hidden_size,config.model['input_size'],_bertconfig.hidden_size) bert = BertModel.from_pretrained(config.model.model_variant, config=_bertconfig) self.config = config From f4341df646a89dda409a8b688e7449fcf709fa32 Mon Sep 17 00:00:00 2001 From: shubham-s-agarwal <66172189+shubham-s-agarwal@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:32:07 +0100 Subject: [PATCH 4/4] Pushing lint fix --- medcat/utils/meta_cat/ml_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/meta_cat/ml_utils.py b/medcat/utils/meta_cat/ml_utils.py index 25efc668..0ba068d6 100644 --- a/medcat/utils/meta_cat/ml_utils.py +++ b/medcat/utils/meta_cat/ml_utils.py @@ -201,7 +201,7 @@ def train_model(model: nn.Module, data: List, config: ConfigMetaCAT, save_dir_pa y_ = [x[2] for x in train_data] class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_), y=y_) config.train['class_weights'] = class_weights.tolist() - logger.info(f"Class weights computed: %s",class_weights) + logger.info("Class weights computed: %s",class_weights) class_weights = torch.FloatTensor(class_weights).to(device) if config.train['loss_funct'] == 'cross_entropy':