Skip to content

Commit

Permalink
Refactoring analogy.py for time/mem optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
svkeerthy committed Oct 1, 2024
1 parent ce60eb3 commit 6bd3ddc
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 204 deletions.
240 changes: 54 additions & 186 deletions seed_embeddings/OpenKE/analogy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,190 +2,58 @@
# Exceptions. See the LICENSE file for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
import heapq
import sys, re
import numpy as np
import pandas as pd
from collections import OrderedDict
from scipy import spatial
import os
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances


def findVec(str1, src):
    """Return the embedding vector for entity ``str1`` parsed from file ``src``.

    The file is expected to contain lines of the form ``NAME:[v1, v2, ...]``.
    Matching is case-insensitive and anchored on a preceding newline (so the
    very first line of the file will not match — preserved from the original).
    Returns the vector as a list of floats, or None when the entity is absent.
    """
    with open(src) as f:
        content = f.read()
    # BUG FIX: the original added the offset BEFORE testing for -1, so a
    # failed find() produced start == len(str1) + 2 and silently parsed
    # garbage. Check the raw match position first, then apply the offset.
    marker = "\n" + str1.upper() + ":["
    pos = content.upper().find(marker)
    if pos == -1:
        print(str1, " not found")
        return None
    start = pos + len(marker)  # first character after ":["
    end = content.find("]", start)
    vecstr = content[start:end].split(", ")
    return [float(element) for element in vecstr]


def findVecFromDict(str1, entity_dict):
    """Look up the embedding for ``str1`` (case-insensitive) in ``entity_dict``.

    Returns the vector as a numpy array, or None (after printing a notice)
    when the entity is not present.
    """
    key = str1.upper()
    if key not in entity_dict:
        print(f"{str1} not found in entity_dict")
        return None
    return np.array(entity_dict[key])


# def genSimilarityTableFromDict(vec, entity_dict):
# """
# Generates cosine and Euclidean similarity tables based on the entity embeddings dictionary.
# """
# cosineDict = {}
# euclDict = {}

# for opcode, value in entity_dict.items():
# value = np.array(value)
# cosineDict[opcode] = spatial.distance.cosine(vec, value)
# # euclDict[opcode] = spatial.distance.euclidean(vec, value) # Ignored for now
# return cosineDict, euclDict
def genSimilarityTableFromDict(vec, entities, entity_matrix):
    """Build a cosine-distance table for ``vec`` against ``entity_matrix``.

    Returns ``(cosineDict, euclDict)`` where ``cosineDict`` maps each name in
    ``entities`` to the cosine distance (1 - similarity) between ``vec`` and
    the corresponding row of ``entity_matrix``. ``euclDict`` is returned
    empty; the Euclidean part is currently unused.
    """
    query = vec.reshape(1, -1)
    # One vectorized sklearn call against the whole matrix; row 0 holds the
    # similarities of the single query vector.
    sims = cosine_similarity(query, entity_matrix)[0]
    cosineDict = {name: dist for name, dist in zip(entities, 1 - sims)}
    euclDict = {}
    return cosineDict, euclDict


def genSimilarityTable(vec, src):
    """Parse every ``NAME:[...]`` line in file ``src`` and build distance tables.

    Returns ``(cosineDict, euclDict)`` mapping each opcode name (upper-cased)
    to its cosine / Euclidean distance from ``vec``.
    """
    cosineDict, euclDict = {}, {}
    with open(src) as f:
        for raw in f:
            line = raw.strip("\n\t")
            sep = line.find(":[")
            opcode = line[:sep].upper()
            # Value substring spans from just past ":[" to -2, exactly as
            # the original sliced it.
            tokens = line[sep + 2 : -2].split(", ")
            value = [float(tok) for tok in tokens]
            cosineDict[opcode] = spatial.distance.cosine(vec, value)
            euclDict[opcode] = spatial.distance.euclidean(vec, value)
    return cosineDict, euclDict


def findTopk(dict1, k, values):
    """Return the ``k`` smallest-distance entries of ``dict1`` as a dict.

    The three query entities (``values[0..2]``, upper-cased) are removed
    first so an analogy can never be answered by one of its own inputs.
    Raises KeyError if any of them is missing from ``dict1``.
    """
    ranked = dict(sorted(dict1.items(), key=lambda kv: kv[1]))
    for query in (values[0], values[1], values[2]):
        del ranked[query.upper()]
    top_keys = list(ranked)[:k]
    return {key: ranked[key] for key in top_keys}


def getAnalogyScoreFromDict(entity_dict, index_dir):
    """Count analogies from analogies.txt solved by ``entity_dict``.

    For each analogy line "A B C D", the query vector B - A + C is computed
    and the analogy counts as correct when D appears among the 5 nearest
    entities by cosine distance. Analogies with unknown entities are skipped.

    ``index_dir`` is unused but kept for backward compatibility with callers.
    Returns the number of correctly solved analogies.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    refFile = os.path.join(script_dir, "analogies.txt")
    with open(refFile) as f:
        analogies = [line.strip("\n") for line in f]

    # Materialize the embedding matrix once so each query is a single
    # vectorized similarity computation rather than a per-entity loop.
    keys = list(entity_dict.keys())
    entity_matrix = np.array(list(entity_dict.values()))

    correctCnt = 0
    for analogy in analogies:
        values = analogy.split(" ")
        vecA = findVecFromDict(values[0], entity_dict)
        vecB = findVecFromDict(values[1], entity_dict)
        vecC = findVecFromDict(values[2], entity_dict)
        if vecA is None or vecB is None or vecC is None:
            continue
        vecD = vecB - vecA + vecC
        # The Euclidean table is always empty; discard it.
        cosineDict, _ = genSimilarityTableFromDict(vecD, keys, entity_matrix)
        topKCosineDict = findTopk(cosineDict, 5, values)
        if values[3].upper() in topKCosineDict:
            correctCnt += 1
    return correctCnt


def getAnalogyScore(fileName):
    """Count analogies from analogies.txt solved by the embeddings in ``fileName``.

    Same scoring as getAnalogyScoreFromDict, but vectors are re-parsed from
    the embeddings file for every analogy. Returns the number of correctly
    solved analogies.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    refFile = os.path.join(script_dir, "analogies.txt")
    with open(refFile) as f:
        analogies = [line.strip("\n") for line in f]

    correctCnt = 0
    for analogy in analogies:
        values = analogy.split(" ")
        vecA = findVec(values[0], fileName)
        vecB = findVec(values[1], fileName)
        vecC = findVec(values[2], fileName)
        # BUG FIX: findVec returns None for unknown entities; the original
        # fed None straight into np.asarray arithmetic and raised TypeError.
        if vecA is None or vecB is None or vecC is None:
            continue
        vecD = np.asarray(vecB) - np.asarray(vecA) + np.asarray(vecC)
        cosineDict, _ = genSimilarityTable(vecD, fileName)
        topKCosineDict = findTopk(cosineDict, 5, values)
        if values[3].upper() in topKCosineDict:
            correctCnt += 1
    # The original stashed the count in a single-entry dict keyed by
    # fileName and immediately read it back; return the count directly.
    return correctCnt
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

class AnalogyScorer:
    """Scores entity embeddings against a file of analogy quadruples.

    Each non-blank line of ``analogy_file`` holds four entity names
    "A B C D"; the score is the number of lines for which D is among the
    5 entities closest (by Euclidean distance) to B - A + C.
    """

    def __init__(self, analogy_file="analogies.txt"):
        # Mapping of UPPER-CASED entity name -> embedding; replaced on every
        # call to get_analogy_score.
        self.entity_dict = {}
        self.analogies = self._load_analogies(analogy_file)

    def _load_analogies(self, file_path):
        """Read the analogy file into a list of tuples, skipping blank lines."""
        with open(file_path, "r") as f:
            return [tuple(line.strip().split()) for line in f if line.strip()]

    def find_vec(self, str1):
        """Return the embedding for ``str1`` (case-insensitive) or None.

        BUG FIX: the original returned ``np.array(dict.get(key, None))`` on a
        miss — np.array(None) is NOT None, so the ``is None`` guard in
        get_analogy_score never fired and missing entities silently corrupted
        the query vector. A miss must return None itself.
        """
        value = self.entity_dict.get(str1.upper())
        return None if value is None else np.array(value)

    def gen_similarity_table(self, vec):
        """Map every known entity name to its Euclidean distance from ``vec``."""
        keys = list(self.entity_dict.keys())
        entity_matrix = np.array(list(self.entity_dict.values()))
        vec = vec.reshape(1, -1)
        # One vectorized call against the whole matrix; row 0 holds the
        # distances for the single query vector.
        distances = euclidean_distances(vec, entity_matrix)[0]
        return dict(zip(keys, distances))

    def findTopk(self, dict1, k, values):
        """Return the k nearest entities, excluding the three query entities."""
        sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
        # pop(..., None) instead of del: robust if a query entity is absent.
        for name in values[:3]:
            sortedByVal.pop(name.upper(), None)
        return {key: sortedByVal[key] for key in list(sortedByVal)[:k]}

    def get_analogy_score(self, entity_dict):
        """Return the number of analogies solved by ``entity_dict``.

        ``entity_dict`` maps UPPER-CASED entity names to embedding vectors.
        """
        self.entity_dict = entity_dict
        correct_count = 0
        for values in self.analogies:
            vecA = self.find_vec(values[0])
            vecB = self.find_vec(values[1])
            vecC = self.find_vec(values[2])
            if vecA is None or vecB is None or vecC is None:
                print(f"Skipping analogy due to missing vector: {values}")
                continue
            # Classic word-analogy query: D should be near B - A + C.
            vecD = vecB - vecA + vecC
            similarity_dict = self.gen_similarity_table(vecD)
            top_k_dict = self.findTopk(similarity_dict, 5, values)
            if values[3].upper() in top_k_dict:
                correct_count += 1
        return correct_count
43 changes: 25 additions & 18 deletions seed_embeddings/OpenKE/config/Trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,25 @@ def __init__(
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt")

def load_entity_names(self, index_dir):
    """Read entity names from ``<index_dir>/entity2id.txt``.

    The file's first line is the entity count; each of the following lines
    is a tab-separated "<name> <id>" pair. Returns an ordered dict mapping
    UPPER-CASED entity names to None placeholders (embeddings are filled
    in later by getEntityDict).
    """
    with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
        content = fEntity.read()

    entities = content.split("\n")
    num_entities = int(entities[0])
    entity_dict = {}

    # Lines 1..num_entities each hold one entity. The original looped to
    # num_entities - 1 and then special-cased the final line; one inclusive
    # range covers all entries without duplication.
    for i in range(1, num_entities + 1):
        entity_name = entities[i].split("\t")[0].upper()
        entity_dict[entity_name] = None  # placeholder for embeddings

    return entity_dict

def train_one_step(self, data):
self.optimizer.zero_grad()
Expand All @@ -68,25 +87,15 @@ def train_one_step(self, data):
self.optimizer.step()
return loss.item()

def getEntityDict(self, ent_embeddings):
    """Map each known entity name to its embedding vector.

    ``ent_embeddings`` is the embedding matrix; row ``i`` corresponds to the
    i-th entity in ``self.entity_names`` (as built by load_entity_names).
    Returns a dict mapping UPPER-CASED entity names to plain-list embeddings.
    """
    entity_dict = {}
    # NOTE(review): this span was a garbled diff mixing the old file-reading
    # version with the new one; the new code iterated the nonexistent
    # ``self.entity_dict`` and assigned into an undefined local. The
    # attribute populated in __init__ is ``entity_names`` — iterate that.
    for i, entity_name in enumerate(self.entity_names):
        entity_dict[entity_name] = ent_embeddings[i].tolist()
    return entity_dict

Expand Down Expand Up @@ -168,10 +177,8 @@ def run(
# self.mode.model => Transe model

ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy()
entity_dict = self.getEntityDict(ent_embeddings, self.index_dir)
analogy_score = analogy.getAnalogyScoreFromDict(
entity_dict, self.index_dir
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score Completed")

Expand Down

0 comments on commit 6bd3ddc

Please sign in to comment.