From 6bd3ddc18d0707e6187895af4e8727e9ee4141df Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Tue, 1 Oct 2024 17:07:20 +0530 Subject: [PATCH] Refactoring analogy.py for time/mem optimization --- seed_embeddings/OpenKE/analogy.py | 240 +++++------------------ seed_embeddings/OpenKE/config/Trainer.py | 43 ++-- 2 files changed, 79 insertions(+), 204 deletions(-) diff --git a/seed_embeddings/OpenKE/analogy.py b/seed_embeddings/OpenKE/analogy.py index aead96bdf..3ce055335 100644 --- a/seed_embeddings/OpenKE/analogy.py +++ b/seed_embeddings/OpenKE/analogy.py @@ -2,190 +2,58 @@ # Exceptions. See the LICENSE file for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -import heapq -import sys, re -import numpy as np -import pandas as pd -from collections import OrderedDict -from scipy import spatial -import os -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances - - -def findVec(str1, src): - with open(src) as f: - content = f.read() - # print("searching for ", str1) - start = content.upper().find("\n" + str1.upper() + ":[") + len(str1) + 3 - if start == -1: - print(str1, " not found") - return - else: - end = content.find("]", start) - vecstr = content[start:end].split(", ") - vec = [float(element) for element in vecstr] - # print(vec) - return vec - - -def findVecFromDict(str1, entity_dict): - """ - Finds the vector for a given entity dictionary - """ - if str1.upper() in entity_dict: - return np.array(entity_dict[str1.upper()]) - else: - print(f"{str1} not found in entity_dict") - return None - - -# def genSimilarityTableFromDict(vec, entity_dict): -# """ -# Generates cosine and Euclidean similarity tables based on the entity embeddings dictionary. 
-# """ -# cosineDict = {} -# euclDict = {} - -# for opcode, value in entity_dict.items(): -# value = np.array(value) -# cosineDict[opcode] = spatial.distance.cosine(vec, value) -# # euclDict[opcode] = spatial.distance.euclidean(vec, value) # Ignored for now -# return cosineDict, euclDict -def genSimilarityTableFromDict(vec, entities, entity_matrix): - """ - Generates cosine table based on the entities and entity matrix. - """ - - vec = vec.reshape(1, -1) - - # Compute cosine similarity between vec and all entity vectors at once - cosine_similarities = cosine_similarity(vec, entity_matrix)[ - 0 - ] # [0] to get the first row - # euclidean_dist = euclidean_distances(vec, entity_matrix)[0] - - # Create the cosine similarity dictionary using the entity keys - cosineDict = dict(zip(entities, 1 - cosine_similarities)) - # euclDict = {entities[i]: euclidean_dist[i] for i in range(len(entities))} # Keeping this empty since Euclidean part is ignored - euclDict = {} - - return cosineDict, euclDict - - -def genSimilarityTable(vec, src): - # opcVec = findVec(opc, src) - with open(src) as f: - lines = [line.strip("\n\t") for line in f] - cosineDict = {} - euclDict = {} - for line in lines: - opcode = line[0 : line.find(":[")].upper() - valueStr = line[line.find(":[") + 2 : -2].split(", ") - value = [float(element) for element in valueStr] - cosineDict[opcode] = spatial.distance.cosine(vec, value) - euclDict[opcode] = spatial.distance.euclidean(vec, value) - return cosineDict, euclDict - - -def findTopk(dict1, k, values): - # print(sorted(dict.items(), key=lambda x: x[1])) - # k_keys_sorted_by_values = heapq.nsmallest(k+1, dict1, key=dict1.get) - # topKDict = OrderedDict((keys, dict1[keys]) for keys in k_keys_sorted_by_values) - - sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1])) - # sortedByVal.pop('AND') - del sortedByVal[values[0].upper()] - del sortedByVal[values[1].upper()] - del sortedByVal[values[2].upper()] - return {k: sortedByVal[k] for k in 
list(sortedByVal)[:k]} - - -def getAnalogyScoreFromDict(entity_dict, index_dir): - """ - Computes the analogy score directly from entity embeddings dict and analogies.txt. - """ - script_dir = os.path.dirname(os.path.abspath(__file__)) - refFile = os.path.join(script_dir, "analogies.txt") - # Read analogies - with open(refFile) as f: - analogies = [line.strip("\n") for line in f] - - totalCnt = 0 - correctCnt = 0 - avg = [] - - # Convert the entity_dict values to a matrix for vectorized operations - keys = list(entity_dict.keys()) - entity_matrix = np.array(list(entity_dict.values())) - - # Iterate through the analogies - for analogy in analogies: - totalCnt += 1 - values = analogy.split(" ") - - vecA = findVecFromDict(values[0], entity_dict) - vecB = findVecFromDict(values[1], entity_dict) - vecC = findVecFromDict(values[2], entity_dict) - - if vecA is None or vecB is None or vecC is None: - continue - - vecD = vecB - vecA + vecC - - cosineDict, euclDict = genSimilarityTableFromDict(vecD, keys, entity_matrix) - topKCosineDict = findTopk(cosineDict, 5, values) - - if values[3].upper() in topKCosineDict: - correctCnt += 1 - avg.append(topKCosineDict[values[3].upper()]) - - return correctCnt - - -def getAnalogyScore(fileName): - # Get the directory of the current script - script_dir = os.path.dirname(os.path.abspath(__file__)) - - # Construct the full path to 'analogies.txt' - refFile = os.path.join(script_dir, "analogies.txt") - # refFile = "analogies.txt" - with open(refFile) as f: - analogies = [line.strip("\n") for line in f] - totalCnt = 0 - fileCorrectCnt = {} - - avg = [] - correctCnt = 0 - for analogy in analogies: - totalCnt = totalCnt + 1 - # values = [val for val in analogy.strip('\t')] - values = analogy.split(" ") - # print(values) - # fileName = argv[0] - - vecA = findVec(values[0], fileName) - vecB = findVec(values[1], fileName) - vecC = findVec(values[2], fileName) - # vecD = np.asarray(vecA) - np.asarray(vecB) + np.asarray(vecC) - vecD = 
np.asarray(vecB) - np.asarray(vecA) + np.asarray(vecC) - - del vecA - del vecB - del vecC - - # print(vecD) - cosineDict, euclDict = genSimilarityTable(vecD, fileName) - topKCosineDict = findTopk(cosineDict, 5, values) - - if values[3].upper() in topKCosineDict: - correctCnt = correctCnt + 1 - # print(values, ' : ', '\033[92m' + u'\u2713' + '\033[0m', topKCosineDict[values[3].upper()]) - avg.append(topKCosineDict[values[3].upper()]) - else: - # print(values, ' : ', '\033[91m' + u'\u00D7' + '\033[0m', topKCosineDict.keys()) - pass - fileCorrectCnt[fileName] = correctCnt - # fileCorrectCnt['averagedist_'+fileName] = sum(avg)/len(avg) - - return fileCorrectCnt[fileName] +import numpy as np +from sklearn.metrics.pairwise import euclidean_distances + +class AnalogyScorer: + def __init__(self, analogy_file="analogies.txt"): + self.entity_dict = {} + self.analogies = self._load_analogies(analogy_file) + + def _load_analogies(self, file_path): + with open(file_path, 'r') as f: + return [tuple(line.strip().split()) for line in f if line.strip()] + + def find_vec(self, str1): + return np.array(v) if (v := self.entity_dict.get(str1.upper())) is not None else None + + def gen_similarity_table(self, vec): + keys = list(self.entity_dict.keys()) + entity_matrix = np.array(list(self.entity_dict.values())) + vec = vec.reshape(1, -1) + + # Calculate distances using euclidean_distances + distances = euclidean_distances(vec, entity_matrix)[0] + + return dict(zip(keys, distances)) + + def findTopk(self, dict1, k, values): + sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1])) + del sortedByVal[values[0].upper()] + del sortedByVal[values[1].upper()] + del sortedByVal[values[2].upper()] + return {k: sortedByVal[k] for k in list(sortedByVal)[:k]} + + def get_analogy_score(self, entity_dict): + self.entity_dict = entity_dict + total_count = len(self.analogies) + correct_count = 0 + + for values in self.analogies: + vecA = self.find_vec(values[0]) + vecB = self.find_vec(values[1]) + vecC = 
self.find_vec(values[2]) + + if vecA is None or vecB is None or vecC is None: + print(f"Skipping analogy due to missing vector: {values}") + continue + + # Calculate vecD based on the analogy + vecD = vecB - vecA + vecC + similarity_dict = self.gen_similarity_table(vecD) + top_k_dict = self.findTopk(similarity_dict, 5, values) + + if values[3].upper() in top_k_dict: + correct_count += 1 + return correct_count \ No newline at end of file diff --git a/seed_embeddings/OpenKE/config/Trainer.py b/seed_embeddings/OpenKE/config/Trainer.py index 0ea91a1c9..6d05f1fc3 100755 --- a/seed_embeddings/OpenKE/config/Trainer.py +++ b/seed_embeddings/OpenKE/config/Trainer.py @@ -52,6 +52,25 @@ def __init__( self.save_steps = save_steps self.checkpoint_dir = checkpoint_dir # self.out_path = out_path + + self.entity_names = self.load_entity_names(index_dir) + self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt") + + def load_entity_names(self, index_dir): + with open(os.path.join(index_dir, "entity2id.txt")) as fEntity: + content = fEntity.read() + + entities = content.split("\n") + entity_dict = {} + + for i in range(1, int(entities[0])): + entity_name = entities[i].split("\t")[0].upper() + entity_dict[entity_name] = None # Placeholder for embeddings + + last_entity_name = entities[int(entities[0])].split("\t")[0].upper() + entity_dict[last_entity_name] = None + + return entity_dict def train_one_step(self, data): self.optimizer.zero_grad() @@ -68,25 +87,15 @@ def train_one_step(self, data): self.optimizer.step() return loss.item() - def getEntityDict(self, ent_embeddings, index_dir): + def getEntityDict(self, ent_embeddings): """ Reads the entity embeddings and returns an dictionary mapping entity names to their corresponding embeddings. 
""" - rep = ent_embeddings - - with open(os.path.join(index_dir, "entity2id.txt")) as fEntity: - content = fEntity.read() - - entities = content.split("\n") entity_dict = {} - - for i in range(1, int(entities[0])): - entity_name = entities[i].split("\t")[0] - entity_dict[entity_name.upper()] = rep[i - 1].tolist() - - last_entity_name = entities[int(entities[0])].split("\t")[0] - entity_dict[last_entity_name.upper()] = rep[int(entities[0]) - 1].tolist() + + for i, entity_name in enumerate(self.entity_names): + entity_dict[entity_name] = ent_embeddings[i].tolist() return entity_dict @@ -168,10 +177,8 @@ def run( # self.mode.model => Transe model ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy() - entity_dict = self.getEntityDict(ent_embeddings, self.index_dir) - analogy_score = analogy.getAnalogyScoreFromDict( - entity_dict, self.index_dir - ) + entity_dict = self.getEntityDict(ent_embeddings) + analogy_score = self.analogies.get_analogy_score(entity_dict) metrics.update({"AnalogiesScore": analogy_score}) print("Analogy Score Completed")