Exposing vocab of different dims #126

Merged 18 commits on Oct 9, 2024
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/build.sh
@@ -18,7 +18,7 @@ cd ..
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install

cd ..
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/include/Vocabulary*.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/IR2Vec.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
26 changes: 14 additions & 12 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp
@@ -20,7 +20,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CFG.h"
@@ -75,19 +74,21 @@ class IR2VecHandler {
std::string outputFile;
std::string mode;
std::string level;
uint dim;

public:
IR2VecHandler(std::string fileName, std::string outputFile, std::string mode,
std::string level)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level) {}
std::string level, uint dim)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level),
dim(dim) {}

std::string getFile() { return fileName; }
std::string getOutputFile() { return outputFile; }
std::string getMode() { return mode; }
std::string getLevel() { return level; }

// Function to get Program Vector List
PyObject *createProgramVectorList(llvm::SmallVector<double, DIM> llvmPgmVec) {
PyObject *createProgramVectorList(IR2Vec::Vector llvmPgmVec) {
// for PgmVector
PyObject *PgmList = PyList_New(0);
for (auto &Pgm_it : llvmPgmVec)
@@ -138,7 +139,6 @@ class IR2VecHandler {
PyObject *instructionVectorList = PyList_New(0);
for (auto &Inst_it : llvmInstVecMap) {
PyObject *instructionVector = PyList_New(0);
// copy this SmallVector into c++ Vector
for (auto &Vec_it : Inst_it.second) {
PyList_Append(instructionVector, PyFloat_FromDouble(Vec_it));
}
@@ -166,10 +166,10 @@ class IR2VecHandler {
ofstream output;
output.open(outFile, ios_base::app);
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], &output, funcName));
*Module, ir2vecMode, (this->level)[0], &output, funcName, this->dim));
} else {
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
*Module, ir2vecMode, (this->level)[0], nullptr, funcName, this->dim));
}

if (!emb) {
@@ -178,7 +178,7 @@
}

if (type == OpType::Program) {
llvm::SmallVector<double, DIM> progVector = emb->getProgramVector();
IR2Vec::Vector progVector = emb->getProgramVector();
return this->createProgramVectorList(progVector);
} else if (type == OpType::Function) {
llvm::SmallMapVector<const llvm::Function *, IR2Vec::Vector, 16>
@@ -293,9 +293,10 @@ PyObject *getFunctionVectors(PyObject *self, PyObject *args) {

IR2VecHandlerObject *createIR2VECObject(const char *filename,
const char *output_file,
const char *mode, const char *level) {
const char *mode, const char *level,
uint dim) {
IR2VecHandler *ir2vecObj =
new IR2VecHandler(filename, output_file, mode, level);
new IR2VecHandler(filename, output_file, mode, level, dim);
if (!ir2vecObj) {
return nullptr;
}
@@ -314,8 +315,9 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
const char *mode = "\0";
const char *level = "\0";
const char *output_file = "\0";
uint dim = 300;

if (!PyArg_ParseTuple(args, "sss|s", &filename, &mode, &level,
if (!PyArg_ParseTuple(args, "sss|Is", &filename, &mode, &level, &dim,
&output_file)) {
// raise error here
PyErr_SetString(PyExc_TypeError, "Invalid Arguments");
@@ -348,7 +350,7 @@ }
}

IR2VecHandlerObject *ir2vecObj =
createIR2VECObject(filename, output_file, mode, level);
createIR2VECObject(filename, output_file, mode, level, dim);

if (!ir2vecObj) {
PyErr_SetString(PyExc_TypeError, "Embedding Object not created");
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py
@@ -160,7 +160,7 @@ def test_fa_f():
path = (TEST_SUITE_DIR / file).resolve()
full_path = str(path).strip()

initObj = ir2vec.initEmbedding(full_path, "fa", "f")
initObj = ir2vec.initEmbedding(full_path, "fa", "f", 300)
assert initObj is not None

functionVectorMap = ir2vec.getFunctionVectors(initObj)
24 changes: 18 additions & 6 deletions README.md
@@ -113,13 +113,16 @@ To ensure the correctness, run `make check`
instructions.

### Using Binary
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
> ir2vec -\<mode\> -dim \<dimensions\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>

#### Command-Line options

- `mode` - can be one of `sym`/`fa`
- `sym` denotes Symbolic representation
- `fa` denotes Flow-Aware representation
- `dim` - Dimensions of embeddings
- This is an optional argument. Defaults to `300`.
- Other supported dimensions are `75` and `100`
- `o` - file to which the embeddings are appended. (Note: if the file doesn't exist, a new file is created; otherwise the embeddings are appended.)
- `level` - can be one of chars `p`/`f`.
- `p` denotes `program level` encoding
@@ -141,16 +144,16 @@ Please use `--help` for further details.

#### Flow-Aware Embeddings
For all functions
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``

For a specific function
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=\<function-name\><input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=<function-name> <input_ll_file>``

#### Symbolic Embeddings
For all functions
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
For a specific function
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=<function-name> <input_ll_file>``
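
For instance, with the new `-dim` flag, a hypothetical Symbolic, program-level invocation on an input `test.ll` (writing 100-dimensional embeddings to `embeddings.txt`) could look like `` ir2vec -sym -dim 100 -o embeddings.txt -level p -class <class-number> test.ll ``.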

## Using Libraries
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
@@ -178,7 +181,7 @@ The following example snippet shows how to query the exposed vector representations

// Creating object to generate FlowAware representation
auto ir2vec =
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware, <DIM>);

// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
auto instVecMap = ir2vec.getInstVecMap();
@@ -218,6 +221,8 @@ for (auto val : pgmVec)
* `file_path`: str - Path to the `.ll` or `.bc` file.
* `encoding_type`: str - Choose `fa` (Flow-Aware) or `sym` (Symbolic).
* `level`: str - Choose `p` for program-level or `f` for function-level.
* `dim`: uint - Choose from `[300, 100, 75]`. Default value is `300`.
* `output_file`: str - If provided, embeddings are saved to this file. Default is an empty string.

**Returns:**

@@ -228,7 +233,14 @@
```python
import ir2vec

# Approach 1
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p")

# Approach 2
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100)

# Approach 3
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100, "output.txt")
```
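
The object returned by `initEmbedding` is then handed to the query helpers such as `getFunctionVectors` and `getProgramVector` (described below). A minimal sketch, assuming a hypothetical `.ll` path and the supported `75`-dimensional vocabulary:

```python
import ir2vec

# Function-level, Flow-Aware embeddings backed by the 75-dimensional vocabulary
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "f", 75)
assert initObj is not None

# Query the per-function vectors from the initialized object
functionVectorMap = ir2vec.getFunctionVectors(initObj)
```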

### getProgramVector
15 changes: 8 additions & 7 deletions seed_embeddings/OpenKE/analogy.py
@@ -6,13 +6,14 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class AnalogyScorer:
def __init__(self, analogy_file="analogies.txt"):
self.entity_dict = {}
self.analogies = self._load_analogies(analogy_file)

def _load_analogies(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
return [tuple(line.strip().split()) for line in f if line.strip()]

def find_vec(self, str1):
@@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
keys = list(self.entity_dict.keys())
entity_matrix = np.array(list(self.entity_dict.values()))
vec = vec.reshape(1, -1)

# Calculate distances using euclidean_distances
distances = euclidean_distances(vec, entity_matrix)[0]

return dict(zip(keys, distances))

def findTopk(self, dict1, k, values):
sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
del sortedByVal[values[0].upper()]
del sortedByVal[values[1].upper()]
del sortedByVal[values[2].upper()]
return {k: sortedByVal[k] for k in list(sortedByVal)[:k]}

def get_analogy_score(self, entity_dict):
def get_analogy_score(self, entity_dict):
self.entity_dict = entity_dict
total_count = len(self.analogies)
correct_count = 0

for values in self.analogies:
vecA = self.find_vec(values[0])
vecB = self.find_vec(values[1])
@@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):

if values[3].upper() in top_k_dict:
correct_count += 1
return correct_count
return correct_count
51 changes: 35 additions & 16 deletions seed_embeddings/OpenKE/config/Trainer.py
@@ -34,7 +34,7 @@ def __init__(
save_steps=None,
checkpoint_dir=None,
index_dir=None,
out_path=None,
analogy_file="analogies.txt",
):

self.work_threads = 8
@@ -52,10 +52,10 @@
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt")
self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)

def load_entity_names(self, index_dir):
with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
content = fEntity.read()
@@ -93,8 +93,8 @@ def getEntityDict(self, ent_embeddings):
mapping entity names to their corresponding embeddings.
"""
entity_dict = {}
for i, entity_name in enumerate(self.entity_dict):

for i, entity_name in enumerate(self.entity_names):
entity_dict[entity_name] = ent_embeddings[i].tolist()

return entity_dict
@@ -139,7 +139,7 @@ def run(
weight_decay=self.weight_decay,
)
print("Finish initializing...")

best_metric_val = 0.0
training_range = tqdm(range(self.train_times))
for epoch in training_range:
res = 0.0
@@ -148,6 +148,7 @@
res += loss
training_range.set_description("Epoch %d | loss: %f" % (epoch, res))
checkpoint = None
save_ckpt = False
if ray and epoch % freq == 0:
metrics = {"loss": res}
# Link Prediction
@@ -170,27 +171,45 @@
"hit1": hit1,
}
)
if best_metric_val <= hit1:
best_metric_val = hit1
save_ckpt = True
print("Link Prediction Scores Completed")

if is_analogy:
elif is_analogy:
# self.model => Negative Sampling object
# self.mode.model => Transe model

ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy()
ent_embeddings = (
self.model.model.ent_embeddings.weight.data.cpu().numpy()
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score Completed")
print("Analogy Score completed")

del entity_dict

if best_metric_val <= analogy_score:
best_metric_val = analogy_score
save_ckpt = True

else: # loss
if best_metric_val >= res:
best_metric_val = res
save_ckpt = True

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
# Save the checkpoint...
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
checkpoint = None
if save_ckpt:
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
)
)
)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

train.report(metrics, checkpoint=checkpoint)
