diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..b77c2418 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu +{ + "name": "LLVM Manylinux", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "ghcr.io/iith-compilers/manylinux2014-llvm/manylinux2014-llvm:x86-llvm12" + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "uname -a", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/.github/workflows/upload-pypi.yml b/.github/workflows/upload-pypi.yml new file mode 100644 index 00000000..f4efaecd --- /dev/null +++ b/.github/workflows/upload-pypi.yml @@ -0,0 +1,59 @@ +name: Upload to PyPI + +on: + release: + types: + - published + workflow_dispatch: + inputs: + pypi_repo: + description: 'Repo to upload to (testpypi or pypi)' + default: 'testpypi' + required: true + type: choice + options: + - testpypi + - pypi + +jobs: + build_wheels: + uses: ./.github/workflows/wheel.yml + + + build_sdist: + runs-on: ubuntu-latest + container: ghcr.io/iith-compilers/manylinux2014-llvm/manylinux2014-llvm:x86-llvm12 + steps: + - uses: actions/checkout@v3 + + - name: Build IR2Vec + run: bash Manylinux2014_Compliant_Source/pkg/build.sh + + - name: Build sdist + run: cd Manylinux2014_Compliant_Source/pkg && pipx run build --sdist + + - uses: actions/upload-artifact@v3 + with: + path: Manylinux2014_Compliant_Source/pkg/dist/*.tar.gz + + + upload_pypi: + permissions: + id-token: write + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v3 + with: + name: artifact + path: dist + + - name: Publish package to TestPyPI + uses: pypa/gh-action-pypi-publish@v1.8.5 + with: + repository-url: https://test.pypi.org/legacy/ + if: ${{ github.event.inputs.pypi_repo != 'pypi' }} + + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@v1.8.5 + if: ${{ github.event.inputs.pypi_repo == 'pypi' }} diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml new file mode 100644 index 00000000..db5ea130 --- /dev/null +++ b/.github/workflows/wheel.yml @@ -0,0 +1,30 @@ +name: Build wheels + +on: [push, workflow_dispatch, workflow_call] + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-20.04] + + steps: + - uses: actions/checkout@v3 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.13.1 + with: + package-dir: Manylinux2014_Compliant_Source/pkg + env: + CIBW_SKIP: "pp* *-musllinux_*" + CIBW_ARCHS: "x86_64" + CIBW_MANYLINUX_X86_64_IMAGE: "ghcr.io/iith-compilers/manylinux2014-llvm/manylinux2014-llvm:x86-llvm12" + CIBW_BEFORE_ALL: "bash Manylinux2014_Compliant_Source/pkg/build.sh" + CIBW_TEST_REQUIRES: pytest + CIBW_TEST_COMMAND: "pytest {project}/Manylinux2014_Compliant_Source/pkg/tests" + + - uses: actions/upload-artifact@v3 + with: + path: ./wheelhouse/*.whl diff --git a/.gitignore b/.gitignore index 2fa33929..e38fa620 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ build/ __pycache__/ .vscode experiments/*/output +.cache/ diff --git a/IR2Vec_Wheels/Python310/IR2Vec_pkg-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python310/IR2Vec_pkg-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index b88f296a..00000000 Binary files a/IR2Vec_Wheels/Python310/IR2Vec_pkg-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/Python311/IR2Vec_pkg-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python311/IR2Vec_pkg-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index 7ac6791f..00000000 Binary files a/IR2Vec_Wheels/Python311/IR2Vec_pkg-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/Python36/IR2Vec_pkg-1.0.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python36/IR2Vec_pkg-1.0.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index 82004c03..00000000 Binary files a/IR2Vec_Wheels/Python36/IR2Vec_pkg-1.0.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/Python37/IR2Vec_pkg-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python37/IR2Vec_pkg-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index 0bf29a2f..00000000 Binary files a/IR2Vec_Wheels/Python37/IR2Vec_pkg-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/Python38/IR2Vec_pkg-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python38/IR2Vec_pkg-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index 94957871..00000000 Binary files a/IR2Vec_Wheels/Python38/IR2Vec_pkg-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/Python39/IR2Vec_pkg-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl b/IR2Vec_Wheels/Python39/IR2Vec_pkg-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl deleted file mode 100644 index 50cb3bc3..00000000 Binary files a/IR2Vec_Wheels/Python39/IR2Vec_pkg-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl and /dev/null differ diff --git a/IR2Vec_Wheels/examples/test1.py b/IR2Vec_Wheels/examples/test1.py deleted file mode 100755 index 854d0717..00000000 --- a/IR2Vec_Wheels/examples/test1.py +++ /dev/null @@ -1,13 +0,0 @@ -import IR2Vec_pkg as pp - -d = pp.generateEmbeddings("/home/shikhar_jain/simpleBranch.ll", "fa", "f") - -if d["Status"]: - # print(d.keys()) - for x in d["Function_Dict"]: - print("key: ", x) - print("Value: ", d["Function_Dict"][x]) - print("\n\n") - print(d["Instruction_Dict"].keys()) -else: - print(d["Message"]) diff --git a/IR2Vec_Wheels/readme.md b/IR2Vec_Wheels/readme.md deleted file mode 100644 index 3d82b7f7..00000000 --- a/IR2Vec_Wheels/readme.md +++ /dev/null @@ -1,16 +0,0 @@ -AUTHORS : SHIKHAR JAIN (IITH COMPILERS) & ANILAVA KUNDU (IITH COMPILERS) -- Each manylinux wheel is specific to Python ABI version, hence install them accodingly -- Dependecies - - `patchelf` (sudo apt / as python package) -- Input: - - .ll/.bc file for which embeddings are needed - - `fa` -> Flow-Aware Encoddings; `sym` -> Symbolic Encoddings - - `p` -> program level; `f` -> function level -- Output: - - A dictionary containing: - - `Instruction_Dict`: Key: Instruction[String]; Value : Embedding Vector - - `Function_Dict`: Key: Function name[String]; Value : Embedding Vector - - `Program_List`: Program/Module Name[String]; Embedding Vector - - `Message`: [String] Appropriate debug message . - - `Status`: [Bool] True if everything went fine else False -- `test1.py` contains an example to demonstrate the usage of the package. diff --git a/LICENSE b/LICENSE index 090f06b6..ebe8f261 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,8 @@ BSD 4-Clause License -Copyright (c) 2021, S. VenkataKeerthy, Rohit Aggarwal -Department of Computer Science and Engineering, IIT Hyderabad +Copyright (c) 2018-2023, S. VenkataKeerthy, Rohit Aggarwal +Department of Computer Science and Engineering, IIT Hyderabad. +All rights reserved. IR2Vec uses the following third party software packages: 1. Modified version of OpenKE with MIT license which can be found under: diff --git a/Manylinux2014_Compliant_Source/.gitignore b/Manylinux2014_Compliant_Source/.gitignore new file mode 100644 index 00000000..8a7b2a56 --- /dev/null +++ b/Manylinux2014_Compliant_Source/.gitignore @@ -0,0 +1 @@ +manylinux2014-LLVM/ diff --git a/Manylinux2014_Compliant_Source/manylinux-llvm/Dockerfile b/Manylinux2014_Compliant_Source/manylinux-llvm/Dockerfile new file mode 100644 index 00000000..1f9de480 --- /dev/null +++ b/Manylinux2014_Compliant_Source/manylinux-llvm/Dockerfile @@ -0,0 +1,62 @@ +FROM quay.io/pypa/manylinux2014_x86_64 as builder +LABEL maintainer="Shamil K (noteness@riseup.net)" + +RUN yum -y install cmake wget openssl-devel + +RUN mkdir /root/destdir + +WORKDIR /root/cmake +ARG CMAKE_VERSION="3.26.4" +RUN wget -q "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz" \ + && tar -xf "cmake-${CMAKE_VERSION}.tar.gz" +WORKDIR "/root/cmake/cmake-${CMAKE_VERSION}" +RUN cmake -DCMAKE_BUILD_TYPE=Release . \ + && make -j "$(nproc)" \ + && cmake --install . \ + && cmake --install . --prefix /root/destdir + +WORKDIR /root/ninja +ARG NINJA_VERSION="1.11.1" +RUN wget -q "https://github.com/ninja-build/ninja/archive/refs/tags/v${NINJA_VERSION}.tar.gz" \ + && tar -xf "v${NINJA_VERSION}.tar.gz" +WORKDIR "/root/ninja/ninja-${NINJA_VERSION}" +RUN cmake -DCMAKE_BUILD_TYPE=Release -B build \ + && cmake --build build -j"$(nproc)" \ + && cmake --install build \ + && cmake --install build --prefix /root/destdir + +WORKDIR /root/mold +ARG MOLD_VERSION="1.11.0" +RUN wget -q "https://github.com/rui314/mold/archive/refs/tags/v${MOLD_VERSION}.tar.gz" \ + && tar -xf "v${MOLD_VERSION}.tar.gz" +WORKDIR "/root/mold/mold-${MOLD_VERSION}" +RUN cmake -DCMAKE_BUILD_TYPE=Release -G Ninja -B build \ + && cmake --build build -j"$(nproc)" \ + && cmake --install build \ + && cmake --install build --prefix /root/destdir + +WORKDIR /root/llvm +ARG LLVM_VERSION="12.0.1" +RUN wget -q "https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}/llvm-project-${LLVM_VERSION}.src.tar.xz" \ + && tar -xf "llvm-project-${LLVM_VERSION}.src.tar.xz" +WORKDIR /root/llvm/llvm-project-${LLVM_VERSION}.src/build +RUN cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS='clang' \ + -DCLANG_ENABLE_BOOTSTRAP=On \ + -DLLVM_LINK_LLVM_DYLIB=On \ + -DLLVM_USE_SPLIT_DWARF=On \ + -DBOOTSTRAP_LLVM_LINK_LLVM_DYLIB=On \ + -DBOOTSTRAP_LLVM_USE_SPLIT_DWARF=On \ + -DBOOTSTRAP_LLVM_USE_LINKER=mold \ + -DCMAKE_INSTALL_PREFIX=/root/destdir \ + ../llvm \ + && ninja stage2 \ + && ninja stage2-install + +FROM quay.io/pypa/manylinux2014_x86_64 +COPY --from=builder /root/destdir /usr/local/ + +COPY entrypoint /usr/local/bin/entrypoint + +ENTRYPOINT ["entrypoint"] +CMD ["/bin/bash"] diff --git a/Manylinux2014_Compliant_Source/manylinux-llvm/entrypoint b/Manylinux2014_Compliant_Source/manylinux-llvm/entrypoint new file mode 100755 index 00000000..2faa7592 --- /dev/null +++ b/Manylinux2014_Compliant_Source/manylinux-llvm/entrypoint @@ -0,0 +1,5 @@ +#!/bin/bash + +set -eu + +exec "$@" diff --git a/Manylinux2014_Compliant_Source/pkg/IR2Vec/.gitignore b/Manylinux2014_Compliant_Source/pkg/IR2Vec/.gitignore new file mode 100644 index 00000000..f1e5012b --- /dev/null +++ b/Manylinux2014_Compliant_Source/pkg/IR2Vec/.gitignore @@ -0,0 +1 @@ +utils.h diff --git a/Manylinux2014_Compliant_Source/pkg/IR2Vec/README.md b/Manylinux2014_Compliant_Source/pkg/IR2Vec/README.md new file mode 100644 index 00000000..5bb2c0ac --- /dev/null +++ b/Manylinux2014_Compliant_Source/pkg/IR2Vec/README.md @@ -0,0 +1 @@ +given input .ll/.bc generates corresponding IR2Vec embeddings in a file or on stdout diff --git a/Manylinux2014_Compliant_Source/pkg/IR2Vec/__init__.py b/Manylinux2014_Compliant_Source/pkg/IR2Vec/__init__.py new file mode 100755 index 00000000..68187309 --- /dev/null +++ b/Manylinux2014_Compliant_Source/pkg/IR2Vec/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023, The Contributors of IR2Vec. +# +# Part of the IR2Vec project. This software is available under the BSD 4-Clause +# License. Please see LICENSE file in the top-level directory for more details. +# + +### we have to find package before importing it else export LD_Lib path and LIB_path will not work. +from . import preparation +from IR2Vec.core import * + +import pathlib as pl +import os, io + +__version__ = getVersion() +__copyright__ = "Copyright The Contributors of IR2Vec" +__license__ = "BSD 4-Clause License" + +setSeedEmbdPath(preparation.install_loc_pkg) diff --git a/Manylinux2014_Compliant_Source/pkg/IR2Vec_pkg/core.cpp b/Manylinux2014_Compliant_Source/pkg/IR2Vec/core.cpp similarity index 93% rename from Manylinux2014_Compliant_Source/pkg/IR2Vec_pkg/core.cpp rename to Manylinux2014_Compliant_Source/pkg/IR2Vec/core.cpp index 22b6dd43..1f63651c 100644 --- a/Manylinux2014_Compliant_Source/pkg/IR2Vec_pkg/core.cpp +++ b/Manylinux2014_Compliant_Source/pkg/IR2Vec/core.cpp @@ -1,6 +1,15 @@ +// Copyright(c) 2023, The Contributors of IR2Vec. +// +// Part of the IR2Vec project.This software is available under the BSD 4-Clause +// License. Please see LICENSE file in the top - level directory for more +// details. +// + #define PY_SSIZE_T_CLEAN #include "IR2Vec.h" #include "utils.h" +#include "version.h" + #include #include #include @@ -38,9 +47,9 @@ #include -//#include "_dl_x86_cpu_features.c" +// #include "_dl_x86_cpu_features.c" -//#include "boost/python.hpp" +// #include "boost/python.hpp" // utils.h is included because it provides with a function for conversion using namespace std; @@ -68,6 +77,13 @@ string seed_emb_path = ""; // cout<<"fail2"< temp2; - const char *readable_name = ""; + char *readable_name; string demangledName; - size_t sz; - int status; + size_t sz = 17; + int status = 0; // coying llvm samll map vector data into c++ map // for InstVecMap @@ -266,7 +276,7 @@ PyObject *IR2Vec_generateEmbeddings(PyObject *self, PyObject *args) { readable_name = __cxxabiv1::__cxa_demangle(instName.c_str(), 0, &sz, &status); demangledName = status == 0 ? std::string(readable_name) : instName; - cout << demangledName << endl; + free(readable_name); InstVecMap[demangledName] = temp2; } else // if Value does not has a name { @@ -293,13 +303,8 @@ PyObject *IR2Vec_generateEmbeddings(PyObject *self, PyObject *args) { for (auto &Vec_it : Func_it.second) temp2.push_back(Vec_it); - auto temp1 = Func_it.first->getName(); // getName returns StringRef // apply __cxx::demangle just to be cautious - auto funcName = temp1.str(); - readable_name = - __cxxabiv1::__cxa_demangle(funcName.c_str(), 0, &sz, &status); - demangledName = status == 0 ? std::string(readable_name) : funcName; - cout << demangledName << endl; + demangledName = IR2Vec::getDemagledName(Func_it.first); FuncVecMap[demangledName] = temp2; temp2.clear(); @@ -319,10 +324,8 @@ PyObject *IR2Vec_generateEmbeddings(PyObject *self, PyObject *args) { for (auto Map_it1 : InstVecMap) { PyObject *temp3 = PyList_New(0); - cout << Map_it1.first << endl; for (auto &List_it1 : Map_it1.second) PyList_Append(temp3, PyFloat_FromDouble(List_it1)); - // cout</dev/null + ${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA}/ir2vec.txt -level f ${d} &>/dev/null + ${IR2VEC_PATH} -sym -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_SYM_P}/ir2vec.txt -level p ${d} >/dev/null + ${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA_P}/ir2vec.txt -level p ${d} >/dev/null +done [^)]+)\)$") +llvm_libs_regex = re.compile( + r"^llvm_map_components_to_libnames\(llvm_libs (?P[^)]+)\)$" +) + +LLVM_LIBS = [] +VERSION = "" +DESCRIPTION = "" + +with (pl.Path(__file__).resolve().parents[2] / "src" / "CMakeLists.txt").open() as f: + for line in f: + if not VERSION: + vmatch = version_regex.match(line) # Not using walrus because Python3.6 + if vmatch: + VERSION = vmatch.group("version") + continue + if not LLVM_LIBS: + libmatch = llvm_libs_regex.match(line) + if libmatch: + LLVM_LIBS = libmatch.group("libs").split() + continue + if VERSION and LLVM_LIBS: + break + +with (pl.Path(__file__).parent / "IR2Vec" / "README.md").open() as f: + DESCRIPTION = f.read() + + +def get_llvm_files(): + out = sp.run( + ["llvm-config", "--link-static", "--libfiles"] + LLVM_LIBS, stdout=sp.PIPE + ) + files = out.stdout.decode("utf8").split() + return files + IR2Vec_core = Extension( - "IR2Vec_pkg.core", - sources=["IR2Vec_pkg/core.cpp"], - include_dirs=[ - "./IR2Vec_pkg", - "./IR2Vec_pkg/IR2Vec_include", - ], # list of directories to search for C/C++ header files (in Unix form for portability) + "IR2Vec.core", + sources=["IR2Vec/core.cpp"], + include_dirs=["./IR2Vec"], libraries=["z"], - extra_objects=["./libIR2Vec.a", "./libLLVMMother.a"], + extra_objects=["/usr/local/lib/libIR2Vec.a"] + get_llvm_files(), extra_compile_args=["-v"], ) setup( - name="IR2Vec_pkg", - author="Shikhar Jain", - author_email="cs22mtech02002@iith.ac.in", - version="1.0.0", - description="given input .ll/.bc generates corresponding IR2Vec embeddings in a file or on stdout", - ext_modules=[IR2Vec_core], # A list of Python extensions to be built - packages=["IR2Vec_pkg"], # A list of Python packages that distutils will manipulate - # package_data={'':['*.so*','./IR2Vec_include/*','./llvm/*','./llvm-c/*','seedEmbeddingVocab-300-llvm12.txt']}, - package_data={ - "": [ - "./IR2Vec_include/*", - "./llvm/*", - "./llvm-c/*", - "seedEmbeddingVocab-300-llvm12.txt", - ] - }, + name="IR2Vec", + author="IR2Vec Developers", + version=VERSION, + description="LLVM IR based Program Embeddings for Compiler Optimizations and Program Comprehension", + long_description=DESCRIPTION, + long_description_content_type="text/markdown", + url="https://github.com/IITH-Compilers/IR2Vec", + license="BSD 4-Clause", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Programming Language :: C++", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Compilers", + ], + ext_modules=[IR2Vec_core], + packages=["IR2Vec"], + package_data={"": ["seedEmbeddingVocab-*.txt"]}, include_package_data=True, ) diff --git a/Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py b/Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py new file mode 100644 index 00000000..33ec77e3 --- /dev/null +++ b/Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py @@ -0,0 +1,110 @@ +# Copyright (c) 2023, The Contributors of IR2Vec. +# +# Part of the IR2Vec project. This software is available under the BSD 4-Clause +# License. Please see LICENSE file in the top-level directory for more details. +# + +import pathlib as pl +import IR2Vec +import pytest + +from collections import defaultdict + +ABS_ACCURACY = 1e-4 + +TEST_SUITE_DIR = pl.Path(__file__).resolve().parents[3] / "src" / "test-suite" +SEED_VERSION = ( + list(IR2Vec.preparation.install_loc.glob("seed*.txt"))[0] + .name.split(".")[0] + .split("-")[-1] +) + +ll_files = [] +path = TEST_SUITE_DIR / f"index-{SEED_VERSION}.files" +with (TEST_SUITE_DIR / f"index-{SEED_VERSION}.files").open() as f: + for ll_file in f: + ll_files.append(ll_file) + + +def read_f_file(path): + f_map = defaultdict(dict) + with path.open() as f: + for line in f: + fouts = line.split("=") + vec = fouts[-1] + fname = "=".join(fouts[:-1]) + fname = fname.strip() + path, only_fname = fname.split(".cpp", 1) + filename = "".join([path.split("/")[1], ".ll"]) + only_fname = only_fname.strip()[2:] + vec = list(map(float, vec.strip().split())) + f_map[filename][only_fname] = vec + return f_map + + +def read_p_file(path): + p_vectors = [] + with path.open() as f: + for line in f: + p_vectors.append(list(map(float, line.strip().split()))) + return p_vectors + + +def test_fa_p(): + p_vectors = [] + for file in ll_files: + full_path = str((TEST_SUITE_DIR / file).resolve()).strip() + output = IR2Vec.generateEmbeddings(full_path, "fa", "p") + p_vectors.append(output["Program_List"]) + p_vectors_oracle = read_p_file( + TEST_SUITE_DIR / "oracle" / f"FA_{SEED_VERSION}" / "ir2vec.txt" + ) + for idx, v in enumerate(p_vectors): + assert v == pytest.approx(p_vectors_oracle[idx], abs=ABS_ACCURACY) + + +def test_sym_p(): + p_vectors = [] + for file in ll_files: + full_path = str((TEST_SUITE_DIR / file).resolve()).strip() + output = IR2Vec.generateEmbeddings(full_path, "sym", "p") + p_vectors.append(output["Program_List"]) + p_vectors_oracle = read_p_file( + TEST_SUITE_DIR / "oracle" / f"SYM_{SEED_VERSION}" / "ir2vec.txt" + ) + for idx, v in enumerate(p_vectors): + assert v == pytest.approx(p_vectors_oracle[idx], abs=ABS_ACCURACY) + + +def test_fa_f(): + f_vecs = defaultdict(dict) + for file in ll_files: + full_path = (TEST_SUITE_DIR / file).resolve() + output = IR2Vec.generateEmbeddings(str(full_path).strip(), "fa", "f") + for fun, vec in output["Function_Dict"].items(): + f_vecs[full_path.name.strip()][fun.strip()] = vec + f_vecs_oracle = read_f_file( + TEST_SUITE_DIR / "oracle" / f"FA_{SEED_VERSION}_f" / "ir2vec.txt" + ) + for pname, funs in f_vecs_oracle.items(): + for fname, vec in funs.items(): + assert vec == pytest.approx( + f_vecs[pname][fname], abs=ABS_ACCURACY + ), f"Checking {pname}: {fname}" + + +def test_sym_f(): + f_vecs = defaultdict(dict) + for file in ll_files: + full_path = (TEST_SUITE_DIR / file).resolve() + output = IR2Vec.generateEmbeddings(str(full_path).strip(), "sym", "f") + for fun, vec in output["Function_Dict"].items(): + f_vecs[full_path.name.strip()][fun.strip()] = vec + f_vecs_oracle = read_f_file( + TEST_SUITE_DIR / "oracle" / f"SYM_{SEED_VERSION}_f" / "ir2vec.txt" + ) + for pname, funs in f_vecs_oracle.items(): + for fname, vec in funs.items(): + assert vec == pytest.approx( + f_vecs[pname][fname], abs=ABS_ACCURACY + ), f"Checking {pname}: {fname}" diff --git a/README.md b/README.md index 0eff9583..f259fe60 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Please see [here](https://compilers.cse.iith.ac.in/projects/ir2vec/) for more de > IR2Vec: LLVM IR Based Scalable Program Embeddings, S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar, Ramakrishna Upadrasta, and Y. N. Srikant ![LLVM](https://img.shields.io/badge/LLVM-v12.0.0-blue) +![PyPI Version](https://img.shields.io/pypi/v/your-package-name) ![Tests](https://github.com/IITH-Compilers/IR2Vec/workflows/Tests/badge.svg) ![Publish](https://github.com/IITH-Compilers/IR2Vec/workflows/Publish/badge.svg) ![pre-commit checks](https://github.com/IITH-Compilers/IR2Vec/workflows/pre-commit%20checks/badge.svg) @@ -14,17 +15,40 @@ Please see [here](https://compilers.cse.iith.ac.in/projects/ir2vec/) for more de ![Image](images/ir2vec.jpg) ## Table Of Contents -* [Requirements](#requirements) -* [Binaries and Libraries - Artifacts](#binaries-and-libraries---artifacts) -* [Building from Source](#building-from-source) +* [Installation](#installation) + * [Python](#python) + * [C++](#cpp) + * [Requirements](#requirements) + * [Building from Source](#building-from-source) * [Generating program representations](#generating-program-representations) * [Using Binary](#using-binary) * [Using Libraries](#using-libraries) + * [Using Python package (IR2Vec-Wheels)](#using-python-package-ir2vec-wheels) +* [Binaries, Libraries and Wheels - Artifacts](#binaries-libraries-and-wheels---artifacts) * [Experiments](#experiments) * [Citation](#citation) * [Contributions](#contributions) * [License](#license) +## Installation + +`IR2Vec` can be installed in different ways to accommodate individual preferences and requirements effectively. You may select to install via a user-friendly Python wheel setup if you are a Python user, or opt for a C++ based installation if you are looking to integrate with a compiler pass or necessitate advanced control and enhanced integration capabilities. The detailed setup steps are mentioned in the following sections. + +## Python + +If you prefer working with Python, you can easily install `IR2Vec` using `pip`. + +``` +pip install -U ir2vec +``` +Now, you can import and use IR2Vec in your Python projects. Make sure you have a good understanding of Python and its package management system. + +We are actively working on improving the Python interfaces and providing better support. If you find any good-to-have interfaces that you may need for your use case missing, please feel free to raise a request. + +## Cpp + +If you're a C++ developer and require low-level control, optimization, or integration with C++ projects, you can build `IR2Vec` from source. First, ensure the below requirements are satisfied, then follow the steps mentioned in the [Building from source](#building-from-source) section. + ## Requirements * cmake (>= 3.13.4) * GNU Make (4.2.1) @@ -39,8 +63,6 @@ Please see [here](https://compilers.cse.iith.ac.in/projects/ir2vec/) for more de (Experiments are done on an Ubuntu 18.04 machine) -## Binaries and Libraries - Artifacts -Binaries and Libraries (.a and .so) are autogenerated for every relevant checkin using GitHub Actions. Such generated artifacts are tagged along with the successful runs of `Publish` workflow and can be found [here](https://github.com/IITH-Compilers/IR2Vec/actions?query=workflow%3APublish). ## Building from source 1. `mkdir build && cd build` @@ -58,8 +80,10 @@ This process would generate `ir2vec` binary under `build/bin` directory, `libIR2 To ensure the correctness, run `make verify-all` + + ## Generating program representations -`IR2Vec` can be used either as a stand alone tool using binary, or can be integrated with any third party tools using libraries. Please see below for the usage +`IR2Vec` can be used either as a stand-alone tool using binary or can be integrated with any third-party tools using libraries. Please see below for the usage instructions. ### Using Binary @@ -91,15 +115,16 @@ Please use `--help` for further details. > = #### Flow-Aware Embeddings +For all functions * `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab-300-llvm12.txt -o -level -class `` -#### Symbolic Embeddings - * `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab-300-llvm12.txt -o -level -class `` - -#### On-demand Flow-Aware Embeddings +For a specific function * `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab-300-llvm12.txt -o -level f -class -funcName=\`` -#### On-demand Symbolic Embeddings +#### Symbolic Embeddings +For all functions + * `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab-300-llvm12.txt -o -level -class `` +For a specific function * `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab-300-llvm12.txt -o -level f -class -funcName=\ `` ## Using Libraries @@ -159,6 +184,38 @@ for (auto val : pgmVec) outs() << val << "\t"; ``` +## Using Python package (IR2Vec-Wheels) +- Input: + - .ll/.bc file for which embeddings are needed + - `fa` -> Flow-Aware Encoddings; `sym` -> Symbolic Encodings + - `p` -> program level; `f` -> function level +- Output: + - A dictionary containing: + - `Instruction_Dict`: Key: Instruction[String]; Value : Embedding Vector + - `Function_Dict`: Key: Function name[String]; Value : Embedding Vector + - `Program_List`: Program/Module Name[String]; Embedding Vector + - `Message`: [String] Appropriate debug message. + - `Status`: [Bool] True if everything went fine else False +- The following code snippet contains an example to demonstrate the usage of the package. + +```python +import IR2Vec as i2v + +emb = i2v.generateEmbeddings("/path/to/file.ll", "fa", "f") + +if emb["Status"]: + for x in emb["Function_Dict"]: + print("key: ", x) + print("Value: ", emb["Function_Dict"][x]) + print("\n\n") + print(emb["Instruction_Dict"].keys()) +else: + print(emb["Message"]) + +``` +## Binaries, Libraries and Wheels - Artifacts +Binaries, Libraries (.a and .so), and whl files are autogenerated for every relevant check-in using GitHub Actions. Such generated artifacts are tagged along with the successful runs of [`Publish`](https://github.com/IITH-Compilers/IR2Vec/actions?query=workflow%3APublish) and [`Build Wheels`](https://github.com/IITH-Compilers/IR2Vec/actions/workflows/wheel.yml) actions. + ## Experiments ### Note @@ -191,7 +248,7 @@ keywords = {heterogeneous systems, representation learning, compiler optimizatio } ``` ## Contributions -Please feel free to raise issues to file a bug, to pose a question, or to initiate any related discussions. Pull requests are welcome :) +Please feel free to raise issues to file a bug, pose a question, or initiate any related discussions. Pull requests are welcome :) ## License IR2Vec is released under a BSD 4-Clause License. See the LICENSE file for more details. diff --git a/src/include/utils.h b/src/include/utils.h index f3745628..5e6b744b 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -49,7 +49,7 @@ std::unique_ptr getLLVMIR(); void collectDataFromVocab(std::map &opcMap); void scaleVector(Vector &vec, float factor); // newly added -auto getDemagledName(llvm::Function *function); +std::string getDemagledName(const llvm::Function *function); char *getActualName(llvm::Function *function); std::string updatedRes(IR2Vec::Vector tmp, llvm::Function *f, llvm::Module *M); } // namespace IR2Vec diff --git a/src/utils.cpp b/src/utils.cpp index e5bf1050..d959ff88 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -7,6 +7,7 @@ #include "utils.h" #include "IR2Vec.h" #include +#include using namespace llvm; using namespace IR2Vec; @@ -69,13 +70,15 @@ void IR2Vec::scaleVector(Vector &vec, float factor) { } // Function to get demangled function name -auto IR2Vec::getDemagledName(llvm::Function *function) { +std::string IR2Vec::getDemagledName(const llvm::Function *function) { auto functionName = function->getName().str(); std::size_t sz = 17; int status; char *const readable_name = __cxa_demangle(functionName.c_str(), 0, &sz, &status); - auto demangledName = status == 0 ? std::string(readable_name) : functionName; + auto demangledName = + status == 0 ? std::string(readable_name) : std::string(functionName); + free(readable_name); return demangledName; }