Skip to content

Commit

Permalink
Fixing formatting issues
Browse files Browse the repository at this point in the history
  • Loading branch information
svkeerthy committed Oct 7, 2024
1 parent a37b8bb commit b8701f5
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 76 deletions.
15 changes: 8 additions & 7 deletions seed_embeddings/OpenKE/analogy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class AnalogyScorer:
def __init__(self, analogy_file="analogies.txt"):
# Start with an empty entity -> embedding mapping; get_analogy_score()
# replaces it with the mapping it is given on every call.
self.entity_dict = {}
# Parse the analogy file once, at construction time, into a list of word
# tuples (see _load_analogies).
self.analogies = self._load_analogies(analogy_file)

def _load_analogies(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
return [tuple(line.strip().split()) for line in f if line.strip()]

def find_vec(self, str1):
Expand All @@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
keys = list(self.entity_dict.keys())
entity_matrix = np.array(list(self.entity_dict.values()))
vec = vec.reshape(1, -1)

# Calculate distances using euclidean_distances
distances = euclidean_distances(vec, entity_matrix)[0]

return dict(zip(keys, distances))

def findTopk(self, dict1, k, values):
    """Return the k closest entities, ignoring the analogy's own query words.

    Args:
        dict1: Mapping of entity name to distance (smaller is closer).
        k: Number of entries to keep.
        values: Analogy words; the upper-cased first three are excluded
            from the result.

    Returns:
        A dict of at most ``k`` entries of ``dict1``, ordered by ascending
        distance. ``dict1`` itself is not modified.
    """
    # A set makes the exclusion tolerant of repeated query words (e.g.
    # "a b a c") and of words missing from dict1; deleting the keys one by
    # one raised KeyError in both situations.
    excluded = {word.upper() for word in values[:3]}
    # sorted() is stable, so filtering before sorting yields the same order
    # as sorting everything and removing the excluded keys afterwards.
    ranked = sorted(
        (item for item in dict1.items() if item[0] not in excluded),
        key=lambda item: item[1],
    )
    return dict(ranked[:k])

def get_analogy_score(self, entity_dict):
def get_analogy_score(self, entity_dict):
self.entity_dict = entity_dict
total_count = len(self.analogies)
correct_count = 0

for values in self.analogies:
vecA = self.find_vec(values[0])
vecB = self.find_vec(values[1])
Expand All @@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):

if values[3].upper() in top_k_dict:
correct_count += 1
return correct_count
return correct_count
20 changes: 11 additions & 9 deletions seed_embeddings/OpenKE/config/Trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ def __init__(
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)

def load_entity_names(self, index_dir):
with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
content = fEntity.read()
Expand Down Expand Up @@ -93,7 +93,7 @@ def getEntityDict(self, ent_embeddings):
mapping entity names to their corresponding embeddings.
"""
entity_dict = {}

for i, entity_name in enumerate(self.entity_names):
entity_dict[entity_name] = ent_embeddings[i].tolist()

Expand Down Expand Up @@ -180,25 +180,27 @@ def run(
# self.model => Negative Sampling object
# self.model.model => TransE model

ent_embeddings = self.model.model.ent_embeddings.weight.data.cpu().numpy()
ent_embeddings = (
self.model.model.ent_embeddings.weight.data.cpu().numpy()
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score completed")

del entity_dict

if best_metric_val <= analogy_score:
best_metric_val = analogy_score
save_ckpt = True

else: # loss
else: # loss
if best_metric_val >= res:
best_metric_val = res
save_ckpt = True

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
# Save the checkpoint...
# Save the checkpoint...
checkpoint = None
if save_ckpt:
self.model.save_checkpoint(
Expand Down
61 changes: 30 additions & 31 deletions seed_embeddings/OpenKE/generate_embedding_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"


def test_files(index_dir):
entities = os.path.join(index_dir, "entity2id.txt")
relations = os.path.join(index_dir, "relation2id.txt")
Expand All @@ -41,6 +42,7 @@ def test_files(index_dir):
if not os.path.exists(train):
raise Exception("train2id.txt not found")


def train(config, args=None):
# dataloader for training
train_dataloader = TrainDataLoader(
Expand Down Expand Up @@ -110,37 +112,37 @@ def findRep(src, index_dir, src_type="json"):
for i in range(1, int(entities[0])):
toTxt += entities[i].split("\t")[0] + ":" + str(rep[i - 1]) + ",\n"
toTxt += (
entities[int(entities[0])].split("\t")[0]
+ ":"
+ str(rep[int(entities[0]) - 1])
entities[int(entities[0])].split("\t")[0] + ":" + str(rep[int(entities[0]) - 1])
)
return toTxt


def reformat_embeddings(input_str):
    """Normalise an embedding dump to one ``name:[v1, v2, ...]`` per line.

    The input is a sequence of ``name:[values]`` entries separated by
    ``],``. The values of one entry may wrap over several lines and may be
    separated by whitespace, commas, or both.

    Args:
        input_str: Raw embedding text.

    Returns:
        The entries joined by newlines, each with its values separated by
        ``", "``. Blank entries are dropped, so empty input yields ``""``.

    Raises:
        ValueError: If a non-blank entry has no ``:[`` separator.
    """
    formatted_entries = []

    # Splitting on "]," consumes the closing bracket of every entry except
    # the last one; the bracket is re-added when the entry is rebuilt.
    for entry in input_str.split("],"):
        # Values may wrap over several lines; treat newlines as spaces.
        cleaned_entry = entry.replace("\n", " ").strip()
        if not cleaned_entry:
            # Empty input or a trailing "]," leaves a blank entry behind.
            continue

        # Split the object name from the values part.
        obj_name, separator, values = cleaned_entry.partition(":[")
        if not separator:
            raise ValueError(f"malformed embedding entry: {cleaned_entry!r}")

        # Drop the closing bracket (only present on the last entry).
        values = values.split("]")[0]

        # Tokenise on commas and/or whitespace so that values which are
        # already comma-separated are not turned into "1,, 2".
        tokens = [tok for tok in re.split(r"[,\s]+", values) if tok]

        # Recombine the object name with the formatted values.
        formatted_entries.append(f"{obj_name.strip()}:[{', '.join(tokens)}]")

    # Join all entries back into one string, separated by newline.
    return "\n".join(formatted_entries)


if __name__ == "__main__":
ray.init()
Expand Down Expand Up @@ -220,10 +222,10 @@ def reformat_embeddings(input_str):
type=str,
default="./analogies.txt",
)

arg_conf = parser.parse_args()
arg_conf.index_dir = arg_conf.index_dir + "/"

try:
test_files(arg_conf.index_dir)
print("Files are OK")
Expand All @@ -239,7 +241,7 @@ def reformat_embeddings(input_str):
# "neg_ent": tune.randint(1, 30),
# "neg_rel": tune.randint(1, 30),
# "bern": tune.randint(0, 2),
"opt_method": "Adam", #tune.choice(["SGD", "Adam"]),
"opt_method": "Adam", # tune.choice(["SGD", "Adam"]),
}

try:
Expand All @@ -258,7 +260,7 @@ def reformat_embeddings(input_str):
else:
metric = "loss"
mode = "min"

scheduler = ASHAScheduler(
time_attr="training_iteration",
max_t=arg_conf.epoch,
Expand All @@ -271,13 +273,12 @@ def reformat_embeddings(input_str):

if arg_conf.use_gpu:
train_with_resources = tune.with_resources(
tune.with_parameters(train, args = arg_conf),
resources={"cpu": 8, "gpu": 0.15}
tune.with_parameters(train, args=arg_conf),
resources={"cpu": 8, "gpu": 0.15},
)
else:
train_with_resources = tune.with_resources(
tune.with_parameters(train, args = arg_conf),
resources={"cpu": 10, "gpu": 0}
tune.with_parameters(train, args=arg_conf), resources={"cpu": 10, "gpu": 0}
)

tuner = tune.Tuner(
Expand All @@ -296,17 +297,15 @@ def reformat_embeddings(input_str):
# *Best* checkpoints are determined by these params:
checkpoint_score_attribute=metric,
checkpoint_score_order=mode,
)
),
),
)
results = tuner.fit()

# Write the best result to a file, best_result.txt
fin_res = results.get_best_result(metric=metric, mode=mode)
with open(os.path.join(arg_conf.index_dir, "best_result.txt"), "a") as f:
f.write(
"\n" + str(fin_res)
)
f.write("\n" + str(fin_res))

if arg_conf.is_analogy:
print(
Expand All @@ -323,7 +322,7 @@ def reformat_embeddings(input_str):
"Best Config Based on Loss : ",
fin_res,
)

# Get the best configuration
best_config = fin_res.config
print("best_config: ", best_config)
Expand Down Expand Up @@ -366,10 +365,10 @@ def reformat_embeddings(input_str):
margin,
),
)

data = findRep(outfile, index_dir, src_type="ckpt")
formatted_data = reformat_embeddings(data)

# Write the embeddings to outfile
embeddings_path = embeddings_path.replace(".ckpt", ".txt")
print("embeddings_path: ", embeddings_path)
Expand Down
3 changes: 2 additions & 1 deletion src/IR2Vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ cl::opt<bool> cl_collectIR(
cl::opt<std::string> cl_iname(cl::Positional, cl::desc("Input file path"),
cl::Required, cl::cat(category));
cl::opt<unsigned> cl_dim("dim", cl::Optional, cl::init(300),
cl::desc("Dimension of the embeddings"), cl::cat(category));
cl::desc("Dimension of the embeddings"),
cl::cat(category));
cl::opt<std::string> cl_oname("o", cl::Required, cl::desc("Output file path"),
cl::cat(category));
// for on demand generation of embeddings taking function name
Expand Down
Loading

0 comments on commit b8701f5

Please sign in to comment.