Extending support for vocabularies of different dimensions #127

Merged: 19 commits, Oct 9, 2024

Commits
0615c8e
Fixing minor bugs in seedemb training
svkeerthy Oct 2, 2024
a379155
Fixing minor bugs in vocab gen
svkeerthy Oct 5, 2024
533445d
Vocab for different dims and fixing minor issues in vocab training
svkeerthy Oct 7, 2024
c196692
Changes to support vocabulary of different dimensions
svkeerthy Oct 7, 2024
5b2f06d
Minor change in the position of DIM in constructor
svkeerthy Oct 7, 2024
9da8c71
Update README.md to reflect multiple dims support
svkeerthy Oct 7, 2024
87fd952
Merge pull request #124 from IITH-Compilers/svkeerthy-patch-1
svkeerthy Oct 7, 2024
a37b8bb
Update README.md of seed_embeddings to reflect multi-dimension support.
svkeerthy Oct 7, 2024
b8701f5
Fixing formatting issues
svkeerthy Oct 7, 2024
6cf732b
BugFix
nishant-sachdeva Oct 8, 2024
792067b
Fixed cp build/vocabulary.h import error
nishant-sachdeva Oct 8, 2024
d4444ab
Debug commit - compile error in core.cpp
nishant-sachdeva Oct 8, 2024
b7973f6
Extending API to accommodate DIM parameter
nishant-sachdeva Oct 8, 2024
1c6d22b
Modifying solitary test to verify DIM param functionality
nishant-sachdeva Oct 8, 2024
f3db69a
bugfix - segfault error from output_file = None. Status - fixed
nishant-sachdeva Oct 8, 2024
5bb4de3
test commit - syntax check in test_ir2vec.py
nishant-sachdeva Oct 9, 2024
564d364
format changes
nishant-sachdeva Oct 9, 2024
dd4d9ae
Merge pull request #125 from nishant-sachdeva/seedemb-opt
svkeerthy Oct 9, 2024
12da415
BugFix - accommodating core.cpp to changed ordering of function params
nishant-sachdeva Oct 9, 2024
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/build.sh
@@ -18,7 +18,7 @@ cd ..
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install

cd ..
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/include/Vocabulary*.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/IR2Vec.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
26 changes: 14 additions & 12 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp
@@ -20,7 +20,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CFG.h"
@@ -75,19 +74,21 @@ class IR2VecHandler {
std::string outputFile;
std::string mode;
std::string level;
unsigned dim;

public:
IR2VecHandler(std::string fileName, std::string outputFile, std::string mode,
std::string level)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level) {}
std::string level, unsigned dim)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level),
dim(dim) {}

std::string getFile() { return fileName; }
std::string getOutputFile() { return outputFile; }
std::string getMode() { return mode; }
std::string getLevel() { return level; }

// Function to get Program Vector List
PyObject *createProgramVectorList(llvm::SmallVector<double, DIM> llvmPgmVec) {
PyObject *createProgramVectorList(IR2Vec::Vector llvmPgmVec) {
// for PgmVector
PyObject *PgmList = PyList_New(0);
for (auto &Pgm_it : llvmPgmVec)
@@ -138,7 +139,6 @@ class IR2VecHandler {
PyObject *instructionVectorList = PyList_New(0);
for (auto &Inst_it : llvmInstVecMap) {
PyObject *instructionVector = PyList_New(0);
// copy this SmallVector into c++ Vector
for (auto &Vec_it : Inst_it.second) {
PyList_Append(instructionVector, PyFloat_FromDouble(Vec_it));
}
@@ -166,10 +166,10 @@ class IR2VecHandler {
ofstream output;
output.open(outFile, ios_base::app);
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], &output, funcName));
*Module, ir2vecMode, (this->level)[0], &output, this->dim, funcName));
} else {
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
*Module, ir2vecMode, (this->level)[0], nullptr, this->dim, funcName));
}

if (!emb) {
@@ -178,7 +178,7 @@ }
}

if (type == OpType::Program) {
llvm::SmallVector<double, DIM> progVector = emb->getProgramVector();
IR2Vec::Vector progVector = emb->getProgramVector();
return this->createProgramVectorList(progVector);
} else if (type == OpType::Function) {
llvm::SmallMapVector<const llvm::Function *, IR2Vec::Vector, 16>
@@ -293,9 +293,10 @@ PyObject *getFunctionVectors(PyObject *self, PyObject *args) {

IR2VecHandlerObject *createIR2VECObject(const char *filename,
const char *output_file,
const char *mode, const char *level) {
const char *mode, const char *level,
unsigned dim) {
IR2VecHandler *ir2vecObj =
new IR2VecHandler(filename, output_file, mode, level);
new IR2VecHandler(filename, output_file, mode, level, dim);
if (!ir2vecObj) {
return nullptr;
}
@@ -314,8 +315,9 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
const char *mode = "\0";
const char *level = "\0";
const char *output_file = "\0";
unsigned dim = 300;

if (!PyArg_ParseTuple(args, "sss|s", &filename, &mode, &level,
if (!PyArg_ParseTuple(args, "sss|Is", &filename, &mode, &level, &dim,
&output_file)) {
// raise error here
PyErr_SetString(PyExc_TypeError, "Invalid Arguments");
Expand Down Expand Up @@ -348,7 +350,7 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
}

IR2VecHandlerObject *ir2vecObj =
createIR2VECObject(filename, output_file, mode, level);
createIR2VECObject(filename, output_file, mode, level, dim);

if (!ir2vecObj) {
PyErr_SetString(PyExc_TypeError, "Embedding Object not created");
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py
@@ -160,7 +160,7 @@ def test_fa_f():
path = (TEST_SUITE_DIR / file).resolve()
full_path = str(path).strip()

initObj = ir2vec.initEmbedding(full_path, "fa", "f")
initObj = ir2vec.initEmbedding(full_path, "fa", "f", 300)
assert initObj is not None

functionVectorMap = ir2vec.getFunctionVectors(initObj)
24 changes: 18 additions & 6 deletions README.md
@@ -113,13 +113,16 @@ To ensure the correctness, run `make check`
instructions.

### Using Binary
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
> ir2vec -\<mode\> -dim \<dimensions\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>

#### Command-Line options

- `mode` - can be one of `sym`/`fa`
  - `sym` denotes Symbolic representation
  - `fa` denotes Flow-Aware representation
- `dim` - Dimensions of embeddings
  - This is an optional argument. Defaults to `300`.
  - Other supported dimensions are `75` and `100`.
- `o` - file in which the embeddings are to be appended (Note: if the file doesn't exist, a new file is created; otherwise the embeddings are appended)
- `level` - can be one of chars `p`/`f`.
  - `p` denotes `program level` encoding
@@ -141,16 +144,16 @@ Please use `--help` for further details.

#### Flow-Aware Embeddings
For all functions
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``

For a specific function
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=\<function-name\><input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\><input_ll_file>``

#### Symbolic Embeddings
For all functions
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
For a specific function
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``

## Using Libraries
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
@@ -178,7 +181,7 @@ The following example snippet shows how to query the exposed vector representations

// Creating object to generate FlowAware representation
auto ir2vec =
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware, <DIM>);

// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
auto instVecMap = ir2vec.getInstVecMap();
@@ -218,6 +221,8 @@ for (auto val : pgmVec)
* `file_path`: str - Path to the `.ll` or `.bc` file.
* `encoding_type`: str - Choose `fa` (Flow-Aware) or `sym` (Symbolic).
* `level`: str - Choose `p` for program-level or `f` for function-level.
* `dim`: uint - Choose from `[300, 100, 75]`. Default value is `300`.
* `output_file`: str - If provided, embeddings are saved to this file. Default is an empty string.

**Returns:**

@@ -228,7 +233,14 @@ for (auto val : pgmVec)
```python
import ir2vec

# Approach 1
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p")

# Approach 2
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100)

# Approach 3
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100, "output.txt")
```
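
A brief sketch tying these calls together: the function-level usage mirrors `test_ir2vec.py` above, while the `getProgramVector(initObj)` call is assumed from the section heading that follows and should be read as indicative rather than definitive. The file path is hypothetical.

```python
import ir2vec

# Function-level, Flow-Aware embeddings at 100 dimensions (hypothetical .ll path),
# following the call pattern used in test_ir2vec.py.
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "f", 100)
functionVectorMap = ir2vec.getFunctionVectors(initObj)

# For program-level vectors, initialize with level "p"; the getProgramVector
# call below is assumed from the section that follows.
progObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100)
progVector = ir2vec.getProgramVector(progObj)
print(len(progVector))  # expected to match the requested dimension (100)
```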

### getProgramVector
15 changes: 8 additions & 7 deletions seed_embeddings/OpenKE/analogy.py
@@ -6,13 +6,14 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class AnalogyScorer:
def __init__(self, analogy_file="analogies.txt"):
self.entity_dict = {}
self.analogies = self._load_analogies(analogy_file)

def _load_analogies(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
return [tuple(line.strip().split()) for line in f if line.strip()]

def find_vec(self, str1):
@@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
keys = list(self.entity_dict.keys())
entity_matrix = np.array(list(self.entity_dict.values()))
vec = vec.reshape(1, -1)

# Calculate distances using euclidean_distances
distances = euclidean_distances(vec, entity_matrix)[0]

return dict(zip(keys, distances))

def findTopk(self, dict1, k, values):
sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
del sortedByVal[values[0].upper()]
del sortedByVal[values[1].upper()]
del sortedByVal[values[2].upper()]
return {k: sortedByVal[k] for k in list(sortedByVal)[:k]}

def get_analogy_score(self, entity_dict):
def get_analogy_score(self, entity_dict):
self.entity_dict = entity_dict
total_count = len(self.analogies)
correct_count = 0

for values in self.analogies:
vecA = self.find_vec(values[0])
vecB = self.find_vec(values[1])
@@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):

if values[3].upper() in top_k_dict:
correct_count += 1
return correct_count
return correct_count
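
For context, a minimal driving sketch for the scorer above; it assumes `analogy.py` is importable as a module (as `Trainer.py` does) and that every entity named in `analogies.txt` is present in the supplied dictionary. The entity names and random vectors are hypothetical stand-ins for the trained embeddings.

```python
import numpy as np

from analogy import AnalogyScorer  # assumes analogy.py is on the path

# Hypothetical entity dictionary: entity name -> embedding list, in the same
# shape produced by Trainer.getEntityDict() from the trained model weights.
rng = np.random.default_rng(0)
entity_dict = {name: rng.random(300).tolist() for name in ("ADD", "SUB", "MUL", "DIV")}

scorer = AnalogyScorer(analogy_file="analogies.txt")

# Counts the analogies whose fourth entity appears among the top-k entities
# nearest (by Euclidean distance) to the queried vector.
score = scorer.get_analogy_score(entity_dict)
print(score, "of", len(scorer.analogies), "analogies answered correctly")
```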
51 changes: 35 additions & 16 deletions seed_embeddings/OpenKE/config/Trainer.py
@@ -34,7 +34,7 @@ def __init__(
save_steps=None,
checkpoint_dir=None,
index_dir=None,
out_path=None,
analogy_file="analogies.txt",
):

self.work_threads = 8
@@ -52,10 +52,10 @@ def __init__(
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt")
self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)

def load_entity_names(self, index_dir):
with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
content = fEntity.read()
@@ -93,8 +93,8 @@ def getEntityDict(self, ent_embeddings):
mapping entity names to their corresponding embeddings.
"""
entity_dict = {}
for i, entity_name in enumerate(self.entity_dict):

for i, entity_name in enumerate(self.entity_names):
entity_dict[entity_name] = ent_embeddings[i].tolist()

return entity_dict
@@ -139,7 +139,7 @@ def run(
weight_decay=self.weight_decay,
)
print("Finish initializing...")

best_metric_val = 0.0
training_range = tqdm(range(self.train_times))
for epoch in training_range:
res = 0.0
@@ -148,6 +148,7 @@
res += loss
training_range.set_description("Epoch %d | loss: %f" % (epoch, res))
checkpoint = None
save_ckpt = False
if ray and epoch % freq == 0:
metrics = {"loss": res}
# Link Prediction
@@ -170,27 +171,45 @@
"hit1": hit1,
}
)
if best_metric_val <= hit1:
best_metric_val = hit1
save_ckpt = True
print("Link Prediction Scores Completed")

if is_analogy:
elif is_analogy:
# self.model => Negative Sampling object
# self.mode.model => Transe model

ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy()
ent_embeddings = (
self.model.model.ent_embeddings.weight.data.cpu().numpy()
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score Completed")
print("Analogy Score completed")

del entity_dict

if best_metric_val <= analogy_score:
best_metric_val = analogy_score
save_ckpt = True

else: # loss
if best_metric_val >= res:
best_metric_val = res
save_ckpt = True

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
# Save the checkpoint...
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
checkpoint = None
if save_ckpt:
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
)
)
)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

train.report(metrics, checkpoint=checkpoint)

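
The `Trainer.run` change above amounts to a checkpoint-on-improvement policy: metrics are reported at every evaluation step, but a checkpoint is attached only when the tracked metric improves. Below is a condensed, standalone sketch of that policy using the same Ray Train calls as the diff; it is meant to run inside a Ray Tune/Train session, and `model.save_checkpoint` is assumed to behave like the OpenKE model wrapper used in `Trainer.run`.

```python
import os
import tempfile

from ray import train
from ray.train import Checkpoint


def report_epoch(model, epoch, metrics, metric_value, best_so_far):
    """Report `metrics` for this epoch, attaching a checkpoint only when
    `metric_value` improves on `best_so_far` (higher is better, as with hit1
    or the analogy score; the loss branch in Trainer.run inverts the test).
    Returns the updated best value."""
    save_ckpt = best_so_far <= metric_value
    best_so_far = max(best_so_far, metric_value)

    with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
        checkpoint = None
        if save_ckpt:
            model.save_checkpoint(
                os.path.join(temp_checkpoint_dir, f"checkpoint-{epoch}.ckpt")
            )
            checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
        # Report while the temporary directory still exists, as Trainer.run
        # does, so Ray can persist the checkpoint contents.
        train.report(metrics, checkpoint=checkpoint)

    return best_so_far
```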