From 6bd3ddc18d0707e6187895af4e8727e9ee4141df Mon Sep 17 00:00:00 2001 From: svkeerthy Date: Tue, 1 Oct 2024 17:07:20 +0530 Subject: [PATCH] Refactoring analogy.py for time/mem optimization --- seed_embeddings/OpenKE/analogy.py | 240 +++++------------------ seed_embeddings/OpenKE/config/Trainer.py | 43 ++-- 2 files changed, 79 insertions(+), 204 deletions(-) diff --git a/seed_embeddings/OpenKE/analogy.py b/seed_embeddings/OpenKE/analogy.py index aead96bdf..3ce055335 100644 --- a/seed_embeddings/OpenKE/analogy.py +++ b/seed_embeddings/OpenKE/analogy.py @@ -2,190 +2,58 @@ # Exceptions. See the LICENSE file for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -import heapq -import sys, re -import numpy as np -import pandas as pd -from collections import OrderedDict -from scipy import spatial -import os -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances - - -def findVec(str1, src): - with open(src) as f: - content = f.read() - # print("searching for ", str1) - start = content.upper().find("\n" + str1.upper() + ":[") + len(str1) + 3 - if start == -1: - print(str1, " not found") - return - else: - end = content.find("]", start) - vecstr = content[start:end].split(", ") - vec = [float(element) for element in vecstr] - # print(vec) - return vec - - -def findVecFromDict(str1, entity_dict): - """ - Finds the vector for a given entity dictionary - """ - if str1.upper() in entity_dict: - return np.array(entity_dict[str1.upper()]) - else: - print(f"{str1} not found in entity_dict") - return None - - -# def genSimilarityTableFromDict(vec, entity_dict): -# """ -# Generates cosine and Euclidean similarity tables based on the entity embeddings dictionary. 
-# """ -# cosineDict = {} -# euclDict = {} - -# for opcode, value in entity_dict.items(): -# value = np.array(value) -# cosineDict[opcode] = spatial.distance.cosine(vec, value) -# # euclDict[opcode] = spatial.distance.euclidean(vec, value) # Ignored for now -# return cosineDict, euclDict -def genSimilarityTableFromDict(vec, entities, entity_matrix): - """ - Generates cosine table based on the entities and entity matrix. - """ - - vec = vec.reshape(1, -1) - - # Compute cosine similarity between vec and all entity vectors at once - cosine_similarities = cosine_similarity(vec, entity_matrix)[ - 0 - ] # [0] to get the first row - # euclidean_dist = euclidean_distances(vec, entity_matrix)[0] - - # Create the cosine similarity dictionary using the entity keys - cosineDict = dict(zip(entities, 1 - cosine_similarities)) - # euclDict = {entities[i]: euclidean_dist[i] for i in range(len(entities))} # Keeping this empty since Euclidean part is ignored - euclDict = {} - - return cosineDict, euclDict - - -def genSimilarityTable(vec, src): - # opcVec = findVec(opc, src) - with open(src) as f: - lines = [line.strip("\n\t") for line in f] - cosineDict = {} - euclDict = {} - for line in lines: - opcode = line[0 : line.find(":[")].upper() - valueStr = line[line.find(":[") + 2 : -2].split(", ") - value = [float(element) for element in valueStr] - cosineDict[opcode] = spatial.distance.cosine(vec, value) - euclDict[opcode] = spatial.distance.euclidean(vec, value) - return cosineDict, euclDict - - -def findTopk(dict1, k, values): - # print(sorted(dict.items(), key=lambda x: x[1])) - # k_keys_sorted_by_values = heapq.nsmallest(k+1, dict1, key=dict1.get) - # topKDict = OrderedDict((keys, dict1[keys]) for keys in k_keys_sorted_by_values) - - sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1])) - # sortedByVal.pop('AND') - del sortedByVal[values[0].upper()] - del sortedByVal[values[1].upper()] - del sortedByVal[values[2].upper()] - return {k: sortedByVal[k] for k in 
list(sortedByVal)[:k]} - - -def getAnalogyScoreFromDict(entity_dict, index_dir): - """ - Computes the analogy score directly from entity embeddings dict and analogies.txt. - """ - script_dir = os.path.dirname(os.path.abspath(__file__)) - refFile = os.path.join(script_dir, "analogies.txt") - # Read analogies - with open(refFile) as f: - analogies = [line.strip("\n") for line in f] - - totalCnt = 0 - correctCnt = 0 - avg = [] - - # Convert the entity_dict values to a matrix for vectorized operations - keys = list(entity_dict.keys()) - entity_matrix = np.array(list(entity_dict.values())) - - # Iterate through the analogies - for analogy in analogies: - totalCnt += 1 - values = analogy.split(" ") - - vecA = findVecFromDict(values[0], entity_dict) - vecB = findVecFromDict(values[1], entity_dict) - vecC = findVecFromDict(values[2], entity_dict) - - if vecA is None or vecB is None or vecC is None: - continue - - vecD = vecB - vecA + vecC - - cosineDict, euclDict = genSimilarityTableFromDict(vecD, keys, entity_matrix) - topKCosineDict = findTopk(cosineDict, 5, values) - - if values[3].upper() in topKCosineDict: - correctCnt += 1 - avg.append(topKCosineDict[values[3].upper()]) - - return correctCnt - - -def getAnalogyScore(fileName): - # Get the directory of the current script - script_dir = os.path.dirname(os.path.abspath(__file__)) - - # Construct the full path to 'analogies.txt' - refFile = os.path.join(script_dir, "analogies.txt") - # refFile = "analogies.txt" - with open(refFile) as f: - analogies = [line.strip("\n") for line in f] - totalCnt = 0 - fileCorrectCnt = {} - - avg = [] - correctCnt = 0 - for analogy in analogies: - totalCnt = totalCnt + 1 - # values = [val for val in analogy.strip('\t')] - values = analogy.split(" ") - # print(values) - # fileName = argv[0] - - vecA = findVec(values[0], fileName) - vecB = findVec(values[1], fileName) - vecC = findVec(values[2], fileName) - # vecD = np.asarray(vecA) - np.asarray(vecB) + np.asarray(vecC) - vecD = 
np.asarray(vecB) - np.asarray(vecA) + np.asarray(vecC) - - del vecA - del vecB - del vecC - - # print(vecD) - cosineDict, euclDict = genSimilarityTable(vecD, fileName) - topKCosineDict = findTopk(cosineDict, 5, values) - - if values[3].upper() in topKCosineDict: - correctCnt = correctCnt + 1 - # print(values, ' : ', '\033[92m' + u'\u2713' + '\033[0m', topKCosineDict[values[3].upper()]) - avg.append(topKCosineDict[values[3].upper()]) - else: - # print(values, ' : ', '\033[91m' + u'\u00D7' + '\033[0m', topKCosineDict.keys()) - pass - fileCorrectCnt[fileName] = correctCnt - # fileCorrectCnt['averagedist_'+fileName] = sum(avg)/len(avg) - - return fileCorrectCnt[fileName] +import numpy as np +from sklearn.metrics.pairwise import euclidean_distances + +class AnalogyScorer: + def __init__(self, analogy_file="analogies.txt"): + self.entity_dict = {} + self.analogies = self._load_analogies(analogy_file) + + def _load_analogies(self, file_path): + with open(file_path, 'r') as f: + return [tuple(line.strip().split()) for line in f if line.strip()] + + def find_vec(self, str1): + return np.array(v) if (v := self.entity_dict.get(str1.upper())) is not None else None + + def gen_similarity_table(self, vec): + keys = list(self.entity_dict.keys()) + entity_matrix = np.array(list(self.entity_dict.values())) + vec = vec.reshape(1, -1) + + # Calculate distances using euclidean_distances + distances = euclidean_distances(vec, entity_matrix)[0] + + return dict(zip(keys, distances)) + + def findTopk(self, dict1, k, values): + sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1])) + del sortedByVal[values[0].upper()] + del sortedByVal[values[1].upper()] + del sortedByVal[values[2].upper()] + return {k: sortedByVal[k] for k in list(sortedByVal)[:k]} + + def get_analogy_score(self, entity_dict): + self.entity_dict = entity_dict + total_count = len(self.analogies) + correct_count = 0 + + for values in self.analogies: + vecA = self.find_vec(values[0]) + vecB = self.find_vec(values[1]) + vecC = 
self.find_vec(values[2]) + + if vecA is None or vecB is None or vecC is None: + print(f"Skipping analogy due to missing vector: {values}") + continue + + # Calculate vecD based on the analogy + vecD = vecB - vecA + vecC + similarity_dict = self.gen_similarity_table(vecD) + top_k_dict = self.findTopk(similarity_dict, 5, values) + + if values[3].upper() in top_k_dict: + correct_count += 1 + return correct_count \ No newline at end of file diff --git a/seed_embeddings/OpenKE/config/Trainer.py b/seed_embeddings/OpenKE/config/Trainer.py index 0ea91a1c9..6d05f1fc3 100755 --- a/seed_embeddings/OpenKE/config/Trainer.py +++ b/seed_embeddings/OpenKE/config/Trainer.py @@ -52,6 +52,25 @@ def __init__( self.save_steps = save_steps self.checkpoint_dir = checkpoint_dir # self.out_path = out_path + + self.entity_names = self.load_entity_names(index_dir) + self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt") + + def load_entity_names(self, index_dir): + with open(os.path.join(index_dir, "entity2id.txt")) as fEntity: + content = fEntity.read() + + entities = content.split("\n") + entity_dict = {} + + for i in range(1, int(entities[0])): + entity_name = entities[i].split("\t")[0].upper() + entity_dict[entity_name] = None # Placeholder for embeddings + + last_entity_name = entities[int(entities[0])].split("\t")[0].upper() + entity_dict[last_entity_name] = None + + return entity_dict def train_one_step(self, data): self.optimizer.zero_grad() @@ -68,25 +87,15 @@ def train_one_step(self, data): self.optimizer.step() return loss.item() - def getEntityDict(self, ent_embeddings, index_dir): + def getEntityDict(self, ent_embeddings): """ Reads the entity embeddings and returns an dictionary mapping entity names to their corresponding embeddings. 
""" - rep = ent_embeddings - - with open(os.path.join(index_dir, "entity2id.txt")) as fEntity: - content = fEntity.read() - - entities = content.split("\n") entity_dict = {} - - for i in range(1, int(entities[0])): - entity_name = entities[i].split("\t")[0] - entity_dict[entity_name.upper()] = rep[i - 1].tolist() - - last_entity_name = entities[int(entities[0])].split("\t")[0] - entity_dict[last_entity_name.upper()] = rep[int(entities[0]) - 1].tolist() + + for i, entity_name in enumerate(self.entity_names): + entity_dict[entity_name] = ent_embeddings[i].tolist() return entity_dict @@ -168,10 +177,8 @@ def run( # self.mode.model => Transe model ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy() - entity_dict = self.getEntityDict(ent_embeddings, self.index_dir) - analogy_score = analogy.getAnalogyScoreFromDict( - entity_dict, self.index_dir - ) + entity_dict = self.getEntityDict(ent_embeddings) + analogy_score = self.analogies.get_analogy_score(entity_dict) metrics.update({"AnalogiesScore": analogy_score}) print("Analogy Score Completed")