Fixing formatting issues

IITH-Compilers · Oct 7, 2024 · b8701f5 · b8701f5
1 parent a37b8bb
commit b8701f5
Show file tree

Hide file tree

Showing 8 changed files with 99 additions and 76 deletions.
diff --git a/seed_embeddings/OpenKE/analogy.py b/seed_embeddings/OpenKE/analogy.py
@@ -6,13 +6,14 @@
 import numpy as np
 from sklearn.metrics.pairwise import euclidean_distances
 
+
 class AnalogyScorer:
     def __init__(self, analogy_file="analogies.txt"):
         self.entity_dict = {}
         self.analogies = self._load_analogies(analogy_file)
 
     def _load_analogies(self, file_path):
-        with open(file_path, 'r') as f:
+        with open(file_path, "r") as f:
             return [tuple(line.strip().split()) for line in f if line.strip()]
 
     def find_vec(self, str1):
@@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
         keys = list(self.entity_dict.keys())
         entity_matrix = np.array(list(self.entity_dict.values()))
         vec = vec.reshape(1, -1)
-        
+
         # Calculate distances using euclidean_distances
         distances = euclidean_distances(vec, entity_matrix)[0]
-        
+
         return dict(zip(keys, distances))
-    
+
     def findTopk(self, dict1, k, values):
         sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
         del sortedByVal[values[0].upper()]
         del sortedByVal[values[1].upper()]
         del sortedByVal[values[2].upper()]
         return {k: sortedByVal[k] for k in list(sortedByVal)[:k]}
 
-    def get_analogy_score(self, entity_dict):        
+    def get_analogy_score(self, entity_dict):
         self.entity_dict = entity_dict
         total_count = len(self.analogies)
         correct_count = 0
-        
+
         for values in self.analogies:
             vecA = self.find_vec(values[0])
             vecB = self.find_vec(values[1])
@@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):
 
             if values[3].upper() in top_k_dict:
                 correct_count += 1
-        return correct_count
+        return correct_count
diff --git a/seed_embeddings/OpenKE/config/Trainer.py b/seed_embeddings/OpenKE/config/Trainer.py
@@ -52,10 +52,10 @@ def __init__(
         self.save_steps = save_steps
         self.checkpoint_dir = checkpoint_dir
         # self.out_path = out_path
-        
+
         self.entity_names = self.load_entity_names(index_dir)
         self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)
-        
+
     def load_entity_names(self, index_dir):
         with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
             content = fEntity.read()
@@ -93,7 +93,7 @@ def getEntityDict(self, ent_embeddings):
         mapping entity names to their corresponding embeddings.
         """
         entity_dict = {}
-        
+
         for i, entity_name in enumerate(self.entity_names):
             entity_dict[entity_name] = ent_embeddings[i].tolist()
 
@@ -180,25 +180,27 @@ def run(
                     # self.model => Negative Sampling object
                     # self.model.model => TransE model
 
-                    ent_embeddings = self.model.model.ent_embeddings.weight.data.cpu().numpy()
+                    ent_embeddings = (
+                        self.model.model.ent_embeddings.weight.data.cpu().numpy()
+                    )
                     entity_dict = self.getEntityDict(ent_embeddings)
                     analogy_score = self.analogies.get_analogy_score(entity_dict)
                     metrics.update({"AnalogiesScore": analogy_score})
                     print("Analogy Score completed")
-                    
+
                     del entity_dict
-                    
+
                     if best_metric_val <= analogy_score:
                         best_metric_val = analogy_score
                         save_ckpt = True
 
-                else: # loss
+                else:  # loss
                     if best_metric_val >= res:
                         best_metric_val = res
                         save_ckpt = True
-                
+
                 with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
-                        # Save the checkpoint...
+                    # Save the checkpoint...
                     checkpoint = None
                     if save_ckpt:
                         self.model.save_checkpoint(

diff --git a/seed_embeddings/OpenKE/generate_embedding_ray.py b/seed_embeddings/OpenKE/generate_embedding_ray.py
@@ -28,6 +28,7 @@
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
 
+
 def test_files(index_dir):
     entities = os.path.join(index_dir, "entity2id.txt")
     relations = os.path.join(index_dir, "relation2id.txt")
@@ -41,6 +42,7 @@ def test_files(index_dir):
     if not os.path.exists(train):
         raise Exception("train2id.txt not found")
 
+
 def train(config, args=None):
     # dataloader for training
     train_dataloader = TrainDataLoader(
@@ -110,37 +112,37 @@ def findRep(src, index_dir, src_type="json"):
     for i in range(1, int(entities[0])):
         toTxt += entities[i].split("\t")[0] + ":" + str(rep[i - 1]) + ",\n"
     toTxt += (
-        entities[int(entities[0])].split("\t")[0]
-        + ":"
-        + str(rep[int(entities[0]) - 1])
+        entities[int(entities[0])].split("\t")[0] + ":" + str(rep[int(entities[0]) - 1])
     )
     return toTxt
 
+
 def reformat_embeddings(input_str):
     # Split the string by '],' to isolate each object
-    entries = input_str.split('],')
-    
+    entries = input_str.split("],")
+
     formatted_entries = []
-    
+
     for entry in entries:
         # Remove any newline characters
-        cleaned_entry = entry.replace('\n', ' ')
-        
+        cleaned_entry = entry.replace("\n", " ")
+
         # Split the object name from the values part
-        obj_name, values = cleaned_entry.split(':[')
-        
+        obj_name, values = cleaned_entry.split(":[")
+
         # Remove extra spaces and replace multiple spaces with a single one using regex
-        values = re.sub(r'\s+', ' ', values.split(']')[0].strip())
-        
+        values = re.sub(r"\s+", " ", values.split("]")[0].strip())
+
         # Replace spaces between numbers with commas and add the closing bracket
-        formatted_values = values.replace(' ', ', ') + ']'
-        
+        formatted_values = values.replace(" ", ", ") + "]"
+
         # Recombine the object name with the formatted values
         formatted_entry = f"{obj_name.strip()}:[{formatted_values}"
         formatted_entries.append(formatted_entry)
-    
+
     # Join all entries back into one string, separated by newline
-    return '\n'.join(formatted_entries)
+    return "\n".join(formatted_entries)
+
 
 if __name__ == "__main__":
     ray.init()
@@ -220,10 +222,10 @@ def reformat_embeddings(input_str):
         type=str,
         default="./analogies.txt",
     )
-    
+
     arg_conf = parser.parse_args()
     arg_conf.index_dir = arg_conf.index_dir + "/"
-    
+
     try:
         test_files(arg_conf.index_dir)
         print("Files are OK")
@@ -239,7 +241,7 @@ def reformat_embeddings(input_str):
         # "neg_ent": tune.randint(1, 30),
         # "neg_rel": tune.randint(1, 30),
         # "bern": tune.randint(0, 2),
-        "opt_method": "Adam", #tune.choice(["SGD", "Adam"]),
+        "opt_method": "Adam",  # tune.choice(["SGD", "Adam"]),
     }
 
     try:
@@ -258,7 +260,7 @@ def reformat_embeddings(input_str):
     else:
         metric = "loss"
         mode = "min"
-        
+
     scheduler = ASHAScheduler(
         time_attr="training_iteration",
         max_t=arg_conf.epoch,
@@ -271,13 +273,12 @@ def reformat_embeddings(input_str):
 
     if arg_conf.use_gpu:
         train_with_resources = tune.with_resources(
-            tune.with_parameters(train, args = arg_conf),
-            resources={"cpu": 8, "gpu": 0.15}
+            tune.with_parameters(train, args=arg_conf),
+            resources={"cpu": 8, "gpu": 0.15},
         )
     else:
         train_with_resources = tune.with_resources(
-            tune.with_parameters(train, args = arg_conf),
-            resources={"cpu": 10, "gpu": 0}
+            tune.with_parameters(train, args=arg_conf), resources={"cpu": 10, "gpu": 0}
         )
 
     tuner = tune.Tuner(
@@ -296,17 +297,15 @@ def reformat_embeddings(input_str):
                 # *Best* checkpoints are determined by these params:
                 checkpoint_score_attribute=metric,
                 checkpoint_score_order=mode,
-            )
+            ),
         ),
     )
     results = tuner.fit()
 
     # Write the best result to a file, best_result.txt
     fin_res = results.get_best_result(metric=metric, mode=mode)
     with open(os.path.join(arg_conf.index_dir, "best_result.txt"), "a") as f:
-        f.write(
-            "\n" + str(fin_res)
-        )
+        f.write("\n" + str(fin_res))
 
     if arg_conf.is_analogy:
         print(
@@ -323,7 +322,7 @@ def reformat_embeddings(input_str):
             "Best Config Based on Loss : ",
             fin_res,
         )
-    
+
     # Get the best configuration
     best_config = fin_res.config
     print("best_config: ", best_config)
@@ -366,10 +365,10 @@ def reformat_embeddings(input_str):
                 margin,
             ),
         )
-        
+
         data = findRep(outfile, index_dir, src_type="ckpt")
         formatted_data = reformat_embeddings(data)
-        
+
         # Write the embeddings to outfile
         embeddings_path = embeddings_path.replace(".ckpt", ".txt")
         print("embeddings_path: ", embeddings_path)

diff --git a/src/IR2Vec.cpp b/src/IR2Vec.cpp
@@ -38,7 +38,8 @@ cl::opt<bool> cl_collectIR(
 cl::opt<std::string> cl_iname(cl::Positional, cl::desc("Input file path"),
                               cl::Required, cl::cat(category));
 cl::opt<unsigned> cl_dim("dim", cl::Optional, cl::init(300),
-                         cl::desc("Dimension of the embeddings"), cl::cat(category));                           
+                         cl::desc("Dimension of the embeddings"),
+                         cl::cat(category));
 cl::opt<std::string> cl_oname("o", cl::Required, cl::desc("Output file path"),
                               cl::cat(category));
 // for on demand generation of embeddings taking function name