Skip to content

Commit

Permalink
Merge pull request #98 from IITH-Compilers/inMemoryVocabulary
Browse files Browse the repository at this point in the history
created file to generated vocabulary as map
  • Loading branch information
svkeerthy authored Apr 23, 2024
2 parents 3e2f3af + c8dc365 commit 4b7cfe6
Show file tree
Hide file tree
Showing 18 changed files with 124 additions and 129 deletions.
3 changes: 1 addition & 2 deletions Manylinux2014_Compliant_Source/pkg/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ cd ..
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install

cd ..
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp vocabulary/seedEmbeddingVocab.txt Manylinux2014_Compliant_Source/pkg/ir2vec/

bash Manylinux2014_Compliant_Source/pkg/regen-oracle.sh
2 changes: 0 additions & 2 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@
__version__ = getVersion()
__copyright__ = "Copyright The Contributors of IR2Vec"
__license__ = "BSD 4-Clause License"

setSeedEmbdPath(preparation.install_loc_pkg)
22 changes: 4 additions & 18 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "IR2Vec.h"
#include "utils.h"
#include "version.h"

#include <Python.h>
#include <cstring>
#include <fstream>
Expand Down Expand Up @@ -58,14 +57,6 @@ static PyObject *getIR2VecVersion(PyObject *self, PyObject *args) {
NULL);
}

// Python-callable setter for the seed embedding path.
//
// Expects exactly one string argument and stores it in `seed_emb_path`.
//
// Returns a new Python string on success. If the arguments cannot be parsed,
// PyArg_ParseTuple has already raised a Python exception, so a null pointer
// is returned to propagate it; returning a value while an exception is
// pending violates the CPython calling convention.
PyObject *setSeedEmbeddingPath(PyObject *self, PyObject *args) {
  const char *vocab_path2 = nullptr;
  if (!PyArg_ParseTuple(args, "s", &vocab_path2)) {
    return nullptr;
  }
  seed_emb_path = string(vocab_path2);
  return PyUnicode_FromString("Seed Embedding Path is Set");
}

bool fileNotValid(const char *filename) {
ifstream temp;
temp.open(filename, ios_base::in);
Expand Down Expand Up @@ -169,21 +160,18 @@ class IR2VecHandler {
// The scope of this Module object is extremely crucial
std::unique_ptr<llvm::Module> Module;
Module = IR2Vec::getLLVMIR();
std::string vocab_path = seed_emb_path + "/seedEmbeddingVocab.txt";

IR2Vec::Embeddings *emb = new IR2Vec::Embeddings();
// if output file is provided
if (this->outputFile != "") {
string outFile = this->outputFile;
ofstream output;
output.open(outFile, ios_base::app);
emb = std::move(new IR2Vec::Embeddings(*Module, ir2vecMode, vocab_path,
(this->level)[0], &output,
funcName));
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], &output, funcName));
} else {
emb = std::move(new IR2Vec::Embeddings(*Module, ir2vecMode, vocab_path,
(this->level)[0], nullptr,
funcName));
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
}

if (!emb) {
Expand Down Expand Up @@ -381,8 +369,6 @@ PyMethodDef IR2Vec_core_Methods[] = {
"Get Program Vector"},
{"getFunctionVectors", (PyCFunction)getFunctionVectors, METH_VARARGS,
"Get Function Vectors"},
{"setSeedEmbdPath", (PyCFunction)setSeedEmbeddingPath, METH_VARARGS,
"Set Seed Embedding Path"},
{"getVersion", getIR2VecVersion, METH_VARARGS, "Get IR2Vec Version"},
{NULL, NULL, 0, NULL} /* Sentinel */
};
Expand Down
9 changes: 4 additions & 5 deletions Manylinux2014_Compliant_Source/pkg/regen-oracle.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,12 @@ mkdir -p ${DEST_FOLDER_SYM_P}
mkdir -p ${DEST_FOLDER_FA_P}

IR2VEC_PATH=../../build/bin/ir2vec
VOCAB_PATH="../../vocabulary/seedEmbeddingVocab.txt"

while IFS= read -r d; do
echo "Generating embeddings for ${d}"
${IR2VEC_PATH} -sym -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_SYM}/ir2vec.txt -level f ${d} &>/dev/null
${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA}/ir2vec.txt -level f ${d} &>/dev/null
${IR2VEC_PATH} -sym -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_SYM_P}/ir2vec.txt -level p ${d} >/dev/null
${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA_P}/ir2vec.txt -level p ${d} >/dev/null
${IR2VEC_PATH} -sym -o ${DEST_FOLDER_SYM}/ir2vec.txt -level f ${d} &>/dev/null
${IR2VEC_PATH} -fa -o ${DEST_FOLDER_FA}/ir2vec.txt -level f ${d} &>/dev/null
${IR2VEC_PATH} -sym -o ${DEST_FOLDER_SYM_P}/ir2vec.txt -level p ${d} >/dev/null
${IR2VEC_PATH} -fa -o ${DEST_FOLDER_FA_P}/ir2vec.txt -level p ${d} >/dev/null
done <index-${SEED_VERSION}.files
wait
1 change: 0 additions & 1 deletion Manylinux2014_Compliant_Source/pkg/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,5 @@ def get_llvm_files():
],
ext_modules=[IR2Vec_core],
packages=["ir2vec"],
package_data={"": ["seedEmbeddingVocab.txt"]},
include_package_data=True,
)
14 changes: 6 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,13 @@ To ensure the correctness, run `make verify-all`
instructions.

### Using Binary
> ir2vec -\<mode\> -vocab \<seedEmbedding-file-path\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
#### Command-Line options

- `mode` - can be one of `sym`/`fa`
- `sym` denotes Symbolic representation
- `fa` denotes Flow-Aware representation
- `vocab` - the path to the seed embeddings file
- `o` - file to which the embeddings are written (Note: if the file doesn’t exist, it is created; otherwise the new embeddings are appended to it)
- `level` - can be one of chars `p`/`f`.
- `p` denotes `program level` encoding
Expand All @@ -139,16 +138,16 @@ Please use `--help` for further details.
#### Flow-Aware Embeddings
For all functions
* `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``

For a specific function
* `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level f -class <class-number> -funcName=\<function-name\><input_ll_file>``
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=<function-name> <input_ll_file>``

#### Symbolic Embeddings
For all functions
* `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
For a specific function
* `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=<function-name> <input_ll_file>``

## Using Libraries
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
Expand Down Expand Up @@ -176,8 +175,7 @@ The following example snippet shows how to query the exposed vector representati

// Creating object to generate FlowAware representation
auto ir2vec =
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware,
"./vocabulary/seedEmbeddingVocab.txt");
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);

// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
auto instVecMap = ir2vec.getInstVecMap();
Expand Down
4 changes: 1 addition & 3 deletions experiments/generate_IR2Vec_embeddings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ fi

BUILD=$(realpath ${BUILD})

Absolute_path_of_RepresentationFile=$(realpath ../vocabulary/seedEmbeddingVocab.txt)

TASK_DIR=$1
if [ -z "${TASK_DIR}" ]; then
echo "Task is not mentioned. Please enter value of DM for Device Mapping or TC for Thread_Coarsening."
Expand Down Expand Up @@ -88,7 +86,7 @@ ulimit -s unlimited
for d in ./*.ll; do
let "a++"
echo "$a $d" >>${ALL_FILE}
${BUILD}/bin/ir2vec -${PASS} -vocab $Absolute_path_of_RepresentationFile -class $a -o res_$Trans_type.txt -level p $WEIGHTS $d &>/dev/null
${BUILD}/bin/ir2vec -${PASS} -class $a -o res_$Trans_type.txt -level p $WEIGHTS $d &>/dev/null
done
cd ../..

Expand Down
12 changes: 6 additions & 6 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@

configure_file (./include/version.h.cmake version.h @ONLY)
include_directories(./include ${CMAKE_CURRENT_BINARY_DIR})

include_directories(${CMAKE_BINARY_DIR})
set(commonsrc FlowAware.cpp Symbolic.cpp utils.cpp)
set(libsrc libIR2Vec.cpp ${commonsrc})
set(binsrc CollectIR.cpp IR2Vec.cpp)

file(GLOB RESOURCE_FILES ../vocabulary/seedEmbeddingVocab.txt)

option(LLVM_IR2VEC "where to enable IR2Vec as subproject for LLVM" OFF)

execute_process(
COMMAND python3 generate_vocabulary.py -o ${CMAKE_BINARY_DIR}/vocabulary.h
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND echo "Vocabulary file generated."
)
if(NOT LLVM_IR2VEC)

set(LT_LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory")
Expand Down Expand Up @@ -37,7 +38,6 @@ if(NOT LLVM_IR2VEC)
VERSION ${PROJECT_VERSION}
SOVERSION 1
PUBLIC_HEADER "./include/IR2Vec.h"
RESOURCE ${RESOURCE_FILES}
OUTPUT_NAME ${IR2VEC_LIB}
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
Expand Down
12 changes: 0 additions & 12 deletions src/IR2Vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,6 @@ cl::opt<bool> cl_collectIR(
"collectIR", cl::Optional,
cl::desc("Generate triplets for training seed embedding vocabulary"),
cl::init(false), cl::cat(category));

cl::opt<std::string> cl_vocab("vocab", cl::Optional, cl::init(""),
cl::desc("Use embeddings from file path"),
cl::cat(category));

cl::opt<std::string> cl_iname(cl::Positional, cl::desc("Input file path"),
cl::Required, cl::cat(category));

Expand Down Expand Up @@ -81,7 +76,6 @@ int main(int argc, char **argv) {
fa = cl_fa;
sym = cl_sym;
collectIR = cl_collectIR;
vocab = cl_vocab;
iname = cl_iname;
oname = cl_oname;
// newly added
Expand All @@ -105,18 +99,12 @@ int main(int argc, char **argv) {
errs() << "Invalid level specified: Use either p or f\n";
failed = true;
}
if (vocab.empty()) {
errs() << "Should specify vocab pointing to the path of vocabulary\n";
failed = true;
}
} else {
if (!collectIR) {
errs() << "Either of sym, fa or collectIR should be specified\n";
failed = true;
} else if (level)
errs() << "[WARNING] level would not be used in collectIR mode\n";
else if (!vocab.empty())
errs() << "[WARNING] vocab would not be used in collectIR mode\n";
}

if (failed)
Expand Down
76 changes: 76 additions & 0 deletions src/generate_vocabulary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) 2024, The Contributors of IR2Vec.
#
# Part of the IR2Vec project. This software is available under the BSD 4-Clause
# License. Please see LICENSE file in the top-level directory for more details.
#
import argparse
import sys

# Seed embedding vocabulary, resolved relative to the current working
# directory of the process running this script.
VOCAB_FILE = "../vocabulary/seedEmbeddingVocab.txt"

# Fixed preamble of the generated C++ header: include guard, includes and the
# declaration of the Vocabulary class.
HEADER = """\
// Generated by IR2Vec. DO NOT EDIT!
// This file contains the learned vocabulary used by IR2Vec.
//
// clang-format off
#ifndef __VOCABULARY__
#define __VOCABULARY__
#include <map>
#include <string>
#include <vector>
#include "IR2Vec.h"
namespace IR2Vec {
class Vocabulary {
public:
static const std::map<std::string, IR2Vec::Vector>& getVocabulary() {
return vocabulary;
}
private:
static const std::map<std::string, IR2Vec::Vector> vocabulary;
};
"""

# Start of the map definition; one `{ "key", key_vector }` entry follows per
# vocabulary line.
OPENING = "\nconst std::map<std::string, IR2Vec::Vector> Vocabulary::vocabulary = {\n"

# Closes the map, the namespace and the include guard.
CLOSING = """\
};
} // namespace IR2Vec
#endif // __VOCABULARY__
"""


def parse_vocab_line(line):
    """Split one `key:[v1, v2, ...]` vocabulary line.

    Returns a `(key, values)` tuple where `values` is the text between the
    first character after the colon and the first `]`, or `None` when the
    line is blank (so a trailing empty line does not abort generation).

    Raises:
        ValueError: if a non-blank line contains no `:` separator.
    """
    stripped = line.strip()
    if not stripped:
        return None
    key, sep, val = stripped.partition(":")
    if not sep:
        raise ValueError(f"malformed vocabulary entry (missing ':'): {stripped!r}")
    # val[0] is skipped as the opening bracket of the vector.
    return key, val[1 : val.find("]")]


def render_header(entries):
    """Build the full text of the generated header.

    `entries` is a sequence of `(key, values)` pairs. The output contains the
    preamble, one `<key>_vector` constant per entry, and then the map that
    associates each key string with its constant.
    """
    parts = [HEADER]
    parts.extend(
        f"const IR2Vec::Vector {key}_vector = {{ {values} }};\n"
        for key, values in entries
    )
    parts.append(OPENING)
    parts.extend(f' {{ "{key}", {key}_vector }},\n' for key, _ in entries)
    parts.append(CLOSING)
    return "".join(parts)


def main(argv=None):
    """Generate the vocabulary header; return the process exit status.

    The vocabulary is read completely before the output file is opened, so a
    missing or malformed vocabulary never leaves a truncated header behind,
    and every failure yields a non-zero status so that the calling build can
    detect it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", type=str, help="Output file name")
    args = parser.parse_args(argv)

    if args.output is None:
        print("Error: Output file path not provided.")
        return 1

    try:
        with open(VOCAB_FILE, "r") as fr:
            entries = [e for e in map(parse_vocab_line, fr) if e is not None]
    except FileNotFoundError:
        print(f"Error: Vocabulary file '{VOCAB_FILE}' not found.")
        return 1
    except ValueError as err:
        print(f"Error: {err}")
        return 1

    # Errors opening the output path propagate unchanged so they are not
    # misreported as a missing vocabulary file.
    with open(args.output, "w") as fw:
        fw.write(render_header(entries))

    print(f"Generated {args.output}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
3 changes: 0 additions & 3 deletions src/include/FlowAware.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ class IR2Vec_FA {

pgmVector = IR2Vec::Vector(DIM, 0);
res = "";
IR2Vec::collectDataFromVocab(opcMap);

memWriteOps.try_emplace("store", 1);
memWriteOps.try_emplace("cmpxchg", 0);
Expand Down Expand Up @@ -176,8 +175,6 @@ class IR2Vec_FA {
std::ostream *o = nullptr, std::string name = "",
std::ostream *missCount = nullptr, std::ostream *cyclicCount = nullptr);

std::map<std::string, IR2Vec::Vector> opcMap;

llvm::SmallMapVector<const llvm::Instruction *, IR2Vec::Vector, 128>
getInstVecMap() {
return instVecMap;
Expand Down
23 changes: 11 additions & 12 deletions src/include/IR2Vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,29 @@ using Vector = llvm::SmallVector<double, DIM>;
enum IR2VecMode { FlowAware, Symbolic };

class Embeddings {
int generateEncodings(llvm::Module &M, IR2VecMode mode, std::string vocab,
char level = '\0', std::string funcName = "",
std::ostream *o = nullptr, int cls = -1, float WO = 1,
float WA = 0.2, float WT = 0.5);
int generateEncodings(llvm::Module &M, IR2VecMode mode, char level = '\0',
std::string funcName = "", std::ostream *o = nullptr,
int cls = -1, float WO = 1, float WA = 0.2,
float WT = 0.5);

llvm::SmallMapVector<const llvm::Instruction *, Vector, 128> instVecMap;
llvm::SmallMapVector<const llvm::Function *, Vector, 16> funcVecMap;
Vector pgmVector;

public:
Embeddings() = default;
Embeddings(llvm::Module &M, IR2VecMode mode, std::string vocab,
std::string funcName = "", float WO = 1, float WA = 0.2,
float WT = 0.5) {
generateEncodings(M, mode, vocab, '\0', funcName, nullptr, -1, WO, WA, WT);
Embeddings(llvm::Module &M, IR2VecMode mode, std::string funcName = "",
float WO = 1, float WA = 0.2, float WT = 0.5) {
generateEncodings(M, mode, '\0', funcName, nullptr, -1, WO, WA, WT);
}

// Use this constructor if the representations ought to be written to a
// file. Analogous to the command line options that are being used in IR2Vec
// binary.
Embeddings(llvm::Module &M, IR2VecMode mode, std::string vocab, char level,
std::ostream *o, std::string funcName = "", float WO = 1,
float WA = 0.2, float WT = 0.5) {
generateEncodings(M, mode, vocab, level, funcName, o, -1, WO, WA, WT);
Embeddings(llvm::Module &M, IR2VecMode mode, char level, std::ostream *o,
std::string funcName = "", float WO = 1, float WA = 0.2,
float WT = 0.5) {
generateEncodings(M, mode, level, funcName, o, -1, WO, WA, WT);
}

// Returns a map containing instructions and the corresponding vector
Expand Down
2 changes: 0 additions & 2 deletions src/include/Symbolic.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,11 @@ class IR2Vec_Symbolic {
llvm::SmallMapVector<const llvm::Function *, IR2Vec::Vector, 16> funcVecMap;
llvm::SmallMapVector<const llvm::Instruction *, IR2Vec::Vector, 128>
instVecMap;
std::map<std::string, IR2Vec::Vector> opcMap;

public:
IR2Vec_Symbolic(llvm::Module &M) : M{M} {
pgmVector = IR2Vec::Vector(DIM, 0);
res = "";
IR2Vec::collectDataFromVocab(opcMap);
}

void generateSymbolicEncodings(std::ostream *o = nullptr);
Expand Down
Loading

0 comments on commit 4b7cfe6

Please sign in to comment.