From 5306201214f92d802c82714fead13ef71142de0a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 12 Aug 2023 23:52:45 -0400 Subject: [PATCH 1/2] Automate deployment to `PyPI` and `Test PyPI` (#937) * add deploy workflow * linting * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * hopefully fix * fix * test * test * fix * fix * fixes * fix * fix * fix dependencies * fix * fix * maybe fix dependencies? * fix * fix * maybe fix * fix * fix * fix * fix * . * use python3 by default * fix * update setup.py * added docstring, enable deployment to both pypi and test.pypi (not at the same time) * update docstring * newline * test * . * maybe fix * maybe fix * maybe fix * fix? * fix * test * test * test * fix * final commit * updated contributing docs --- .github/workflows/pip-deploy.yml | 37 ++++--- .gitignore | 3 + CMakeLists.txt | 38 +++++--- CONTRIBUTING.md | 71 +++++++++++++- MANIFEST.in | 4 + cmake/pip_install/CMakeLists.txt | 10 ++ conda/environment.yml | 1 + conda/flexflow-cpu.yml | 1 + jupyter_notebook/flexflow_kernel_nocr.py | 2 +- pyproject.toml | 10 ++ python/flexflow/core/__init__.py | 33 +------ python/flexflow/core/flexflow_top.py | 2 +- python/flexflow/core/flexflowlib.py | 12 ++- python/flexflow/findpylib.py | 2 +- python/flexflow_cffi_build.py | 2 +- python/flexflow_cffi_header.py.in | 2 +- python/flexflow_python_build.py | 2 +- requirements.txt | 3 +- setup.py | 117 +++++++++++++++++++---- 19 files changed, 268 insertions(+), 84 deletions(-) create mode 100644 MANIFEST.in create mode 100644 cmake/pip_install/CMakeLists.txt create mode 100644 pyproject.toml diff --git a/.github/workflows/pip-deploy.yml b/.github/workflows/pip-deploy.yml index c3840a6671..66fdf00c9a 100644 --- a/.github/workflows/pip-deploy.yml +++ b/.github/workflows/pip-deploy.yml @@ -1,10 +1,5 @@ name: "pip-deploy" -on: - push: - tags: - - '*' - branches: - - inference # only trigger on push to inference branch for now +on: workflow_dispatch: 
concurrency: @@ -15,6 +10,9 @@ jobs: build-n-publish: name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI runs-on: ubuntu-20.04 + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write steps: - name: Checkout Git Repository @@ -38,6 +36,8 @@ jobs: --user - name: Build a source tarball + env: + DEPLOY_TO_TEST_PYPI: ${{ vars.DEPLOY_TO_TEST_PYPI }} run: >- python3 -m build @@ -46,14 +46,27 @@ jobs: . - name: Publish distribution 📦 to Test PyPI + if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'true' }} uses: pypa/gh-action-pypi-publish@release/v1 with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository-url: https://test.pypi.org/legacy/ - # - name: Publish distribution 📦 to PyPI - # if: startsWith(github.ref, 'refs/tags') - # uses: pypa/gh-action-pypi-publish@release/v1 - # with: - # password: ${{ secrets.PYPI_API_TOKEN }} + - name: Publish distribution 📦 to PyPI + if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }} + uses: pypa/gh-action-pypi-publish@release/v1 + + - name: Get package version + if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }} + run: | + # when running setup.py outside of pip install, we need to manually install the modules that are imported in the script + pip install setuptools requests cmake-build-extension + version=$(python setup.py --version) + echo "PY_VERSION=${version}" >> $GITHUB_ENV + + - name: Create Git tag + if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }} + uses: mathieudutour/github-tag-action@v6.1 + with: + github_token: ${{ secrets.FLEXFLOW_TOKEN }} + custom_tag: ${{ env.PY_VERSION }} diff --git a/.gitignore b/.gitignore index b2e3c59ced..20d3979b08 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,6 @@ train-labels-idx1-ubyte # Logs logs/ + +# pip version +python/flexflow/version.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index e7504d7026..8ad3b81f9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -433,14 +433,14 @@ if (FF_USE_PYTHON) WORKING_DIRECTORY 
${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_cffi_header.py..." ) - # generate the Legion Python bindings library - add_custom_command(TARGET flexflow - POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python - ) - # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. if (NOT FF_BUILD_FROM_PYPI) + # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library + add_custom_command(TARGET flexflow + POST_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python + ) + # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. 
add_custom_command(TARGET flexflow PRE_BUILD COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} @@ -546,10 +546,22 @@ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) - install( - DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ - DESTINATION ${PY_DEST}/flexflow - FILES_MATCHING - PATTERN "*.py") + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT FF_BUILD_FROM_PYPI) + install( + DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ + DESTINATION ${PY_DEST}/flexflow + FILES_MATCHING + PATTERN "*.py") + else() + # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. + install( + PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py + DESTINATION ${PY_DEST}/flexflow/core + ) + # Use setup.py script to re-install the Python bindings library with the right library paths. + # Need to put the instructions in a subfolder because of issue below: + # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake + add_subdirectory(cmake/pip_install) + endif() endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ff77cb4612..e607fddb1a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,8 +131,9 @@ We currently implement CI testing using Github Workflows. Each workflow is defin 4. `gpu-ci.yml`: runs all the tests that require a GPU to run. 5. 
`gpu-ci-daemon.yml`: an helper workflow that turns on/off the GPU instance used by the test above 6. `multinode-test.yml`: runs the same GPU tests from the `gpu-ci.yml` workflow, but using multiple (simulated) nodes. The test currently simulates two nodes, each with 2 GPUs. To run FlexFlow on multiple nodes, we compile Legion with GASNET enabled, and choose MPI as the GASNET conduit. Compared to the single-node version, this test is much more time-consuming (about 4h instead 40mins at the time of writing), so we only run the test on the FlexFlow `master` branch every other day. -7. `pip-install.yml`: checks the build & installation of FlexFlow using `pip` -8. `shell-check.yml`: runs shellcheck on all bash scripts in the repo +7. `pip-deploy.yml`: builds the `flexflow` pip package and publishes it on `TestPyPI` (if the repository environment variable `DEPLOY_TO_TEST_PYPI` is set to `true`) or `PyPI` (if `DEPLOY_TO_TEST_PYPI` is unset, or set to `false`). When deploying to `PyPI`, a new git tag (with the pip package version) will also be created, and associated with the commit hash that triggered the workflow. The `pip-deploy.yml` can only be launched manually via workflow dispatch. More on the pip packaging in the [section below](#pip-packages). +8. `pip-install.yml`: checks the build & installation of FlexFlow using `pip` +9. `shell-check.yml`: runs shellcheck on all bash scripts in the repo We also have three placeholder workflows: `build-skip.yml`, `docker-build-skip.yml`, `gpu-ci-skip` and `pip-install-skip.yml`. These always pass and are used only in the case of skipped workflows whose status is required to merge a PR; we implement the "hack" officially recommended by Github ([see here](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/defining-the-mergeability-of-pull-requests/troubleshooting-required-status-checks#handling-skipped-but-required-checks)). 
@@ -208,6 +209,72 @@ Finally, we define the jobs that will run when the workflow is triggered. Each j Each step in a job will be executed sequentially, and if it fails, the remaining steps will be cancelled and the job will be marked as `failed`. Each step is specified by either reusing a Github action or running a shell command (or a script file). For instance, in the example above, the first step uses the Github Action `actions/checkout@v3` to check out the repository, the second step uses the `Jimver/cuda-toolkit@v0.2.11` action to install CUDA, whereas the third step runs a bash script stored in the repo at the path `.github/workflows/helpers/install_dependencies.sh`. +## Pip packages +This section illustrates how we support the automatic deployment of FlexFlow to the `PyPI` and `Test PyPI` repositories. Publishing FlexFlow on `PyPI` makes it possible for users to install FlexFlow on their machine by simply running: + +```bash +pip install flexflow +``` + +To install from `Test PyPI`, on the other hand, one can use: + +```bash +pip install flexflow --extra-index-url https://test.pypi.org/simple/ +``` + +The installation process currently takes approximately the same time as installing from source by running the command `pip install .` from `FF_HOME` after having cloned the repo. However, installing directly from PyPI allows the user to automatically install the Python dependencies, and removes the step of having to manually clone the repo with all its submodules. + +Below, we discuss some important properties of PyPI. + +### Packaging +When building a `pip` package from a repository, we can decide what files from the repository will be included in the package, and which ones will be left out. To do that, we write a [MANIFEST.in](https://github.com/flexflow/FlexFlow/blob/master/MANIFEST.in) file, according to the syntax from the [official instructions](https://packaging.python.org/en/latest/guides/using-manifest-in/). 
In particular, we manually include the submodules (which would otherwise be left out by default), we remove the `.git` folders, which are not needed to build FlexFlow, as well as the `triton` folder, whose contents are not currently in use. Finally, we request that the version.txt file, whose role is described in the section below, is included in the package distribution. Because this file is generated at build time, it would be left out by default if we didn't manually include it. + +### Source VS Wheel distribution +PyPI allows you to upload a source distribution, together with one (or more) binary distributions of your package. A `pip` package's pre-compiled binary is called a Wheel (formerly, Egg). The advantage of publishing Wheel distributions instead of just the source code is that the installation of the package will be much faster for the user, who will just need to download the binary, and extract its files in the proper locations (all of this is handled automatically when running `pip install `). If only the source code is available, on the other hand, `pip install ` will first need to compile the package, and then install it. + +`PyPI` allows you to upload multiple Wheels to support different Python versions (the Wheel compatible with the version of Python installed on the user's machine is downloaded automatically when the user runs `pip install `), but unfortunately does not yet support uploading a Wheel for each CUDA version, and automatically downloading the relevant one depending on the user's machine configuration. Instead, one needs to upload a Wheel with a distinct name for each CUDA version, and the user will need to specify the name manually at download time. For this reason, to keep things simple, we only publish the source distribution at this moment, and plan to upload Wheels that are specific to each Python version and CUDA version at a later time. + +### Versioning + +PyPI imposes some strict versioning requirements. 
Among other things, versions need to follow a specific format, and once a given version of a package is published, it can never be replaced. In addition, even if the publisher deletes a version, nobody can ever upload a source distribution / Wheel with that same version number again. Finally, when multiple versions of the same package are published, the one with the highest version number (not the one that was uploaded last) will be installed by default. + +When publishing a package on PyPI, the version attached to the upload is determined by the `setup.py` script. You can check which version string will be used by running `python setup.py --version`. + +The simplest way to version a `pip` package is to hard-code the version number in the `setup.py` script, and commit a change to the repository every time the `pip` package is to be updated. This approach, however, is incompatible with having a script or workflow to automatically update the `pip` package. + +If we intend to deploy the latest code to PyPI automatically, we need a way to automatically assign a properly-formatted version string to the code we want to upload. Further, we need to ensure that the assigned version is (1) different from any version (of the same package) already published on PyPI and (2) larger than any previous version. Finally, a trickier requirement is that: (3) at any point in time, the `setup.py` script included in a given version of our package should output a version string that exactly matches the version string recorded in the metadata attached to the package's version at publication time. More about this below. + +We follow a simple approach to automatically version the latest code: use the publication's date to generate the version string. For example, on Aug 12, 2023, we can use version string 23.08.12. Assuming that we publish at most one version per day, and that we always publish from the same timezone, we will be able to meet requirements (1) and (2). 
An additional enhancement to be able to support the update of the package more than once per day (which may be needed in development phase, or if a mistake is made), instead of using the day of the month (12 for August 12, 2023) for the sub-sub-version, we use an index that starts at 0 every month, and is incremented by +1 every time we upload a new version of the package within the same calendar month. So if on Aug 12, 2023 we are updating the package for the first time in the month, we will use version string 23.08.0; if later the same day (or any time before Sept 1, 2023) we wish to upload a new version, we will use string 23.08.1, and so forth. + +Having illustrated the general versioning policy, we will need to implement it carefully in `setup.py` to ensure that we meet requirement (3). You can take a look at the `compute_version()` function to see how this is done in practice. The key realization is that we cannot simply compute today's date (using any of the Python libraries that let us do that) and transform it into a string, nor simply get from PyPI the latest available version of our package, and, if it was published on the same calendar month, increment the sub-subversion by +1 to generate the version string of the new upload. We can best illustrate why we cannot do that with an example: +- Today, Aug 12, 2023, we wish to upload a new version to PyPI. As we said above, the version string is computed by `setup.py`. A naive way to do so in `setup.py` would be to compute the date using `date.today()`, and transform the year and month into digit form to generate the version (23) and sub-version (08) parts of the version string. We could then check on PyPI what was the latest published version of the package as of today, and if we found that it was, say, 23.08.05, we would use 5+1=6 as the sub-sub-version for the new upload (so the final version string would be 23.08.06). 
+- Over the next few days, we upload 3 more versions +- A week later, on Aug 18, 2023, a user trying to install our package, runs `pip install `. To determine which version it should install, the `pip install` script downloads the most recent X versions of `` on the user's machine, and, for each version, re-computes the version string by running `python setup.py --version`. When the script attempts to recompute the version string on the package 23.08.06 (which we uploaded on Aug 12, 2023), it will reconstruct the version string by replaying the same instructions that were run on Aug. 12, and obtain a different version string, as follows. Using the current date, the user will obtain: version=23, sub-version=08, which match the metadata. Checking the latest version of the package available on PyPI, the script finds version 23.08.09 (there were three more submissions since Aug 12). This will translate to sub-sub-version=9+1=10. Noticing that the version included in the Aug 12 package's metadata (23.08.06) does not match the recomputed version (23.08.10), the script will generate unexpected and undesired behavior. + +To prevent accidentally breaking requirement (3) as illustrated in the scenario from the example above, we employ a simple hack: when computing our package's version string for the first time by running `setup.py`, we save the string to a file, `python/flexflow/version.txt`, which is added to the `.gitignore` and as such, never committed to the repo. As long as the `version.txt` exists, any subsequent run of `setup.py` will simply read the file, and output the same version string, no matter on which day and/or how many new versions of the package have been uploaded to PyPI since then. When packaging our code to upload it on PyPI, we ensure to delete the `version.txt` file, compute the version string, and then include the `version.txt` in the source distribution that we upload to `PyPI`. 
In this way, when the user attempts to install the package, `pip install` will download the most recent available versions, run `setup.py` from each distribution, and for each distribution, `setup.py` will always output the correct version string, because it will just read the string recorded in that distribution's `version.txt`. + +### Test PyPI +Given all the complexities and restrictions of PyPI, Test PyPI was created as a "copy" of PyPI to be used for testing and for being able to make mistakes without affecting the user, or forever losing the opportunity to use a given package name and/or version. We take advantage of Test PyPI as follows. If we intend to deploy to PyPI, we can first deploy to Test PyPI, check the results, fix any issue, and only later deploy to PyPI. All our `pip` related scripts in the repo have been designed to support both Test PyPI and PyPI. In order to let `setup.py` know that it should package a distribution for Test PyPI, one can simply export the following environment variable: + +```bash +export DEPLOY_TO_TEST_PYPI=true +``` + +Conversely, to upload to PyPI, one can either leave `DEPLOY_TO_TEST_PYPI` unset, or export + +```bash +export DEPLOY_TO_TEST_PYPI=false +``` + +WARNING!!! More likely than not, the latest version of the `flexflow` package on Test PyPI and PyPI will be out of sync. This is to be expected, because one may need to upload a few drafts on Test PyPI to detect and correct some bugs, before publishing the definitive version on PyPI. Having different latest versions on the two repositories should not cause any issue. However, after uploading to Test PyPI and before uploading to PyPI (or vice versa), **it is EXTREMELY IMPORTANT** to delete the `python/flexflow/version.txt` file. + +An easy way to avoid forgetting this is to only deploy on Test PyPI/PyPI using the `pip-deploy.yml`, which is designed to only upload to one of the two repositories at a given time. 
+ +### Build vs install dependencies + +FlexFlow requires some other Python packages in order to run. In addition, even building FlexFlow requires some packages, and you cannot run `setup.py` without those build requirements. There is a way for us to specify these _install_ and _build_ requirements in such a way that `pip` will detect if they are missing, and install them. We record the build requirements in the `pyproject.toml` file, whereas we specify the installation requirements by passing a list with each package's name to the `install_requires` key of the `setup()` function in `setup.py`. The installation requirements are automatically read from the `requirements.txt` file. + + ## Contributing to FlexFlow We want to make contributing to this project as easy and transparent as possible. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..64f20c1890 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +graft deps +recursive-exclude . .git +prune triton +include python/flexflow/version.txt diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt new file mode 100644 index 0000000000..b7795daf71 --- /dev/null +++ b/cmake/pip_install/CMakeLists.txt @@ -0,0 +1,10 @@ +# Use setup.py script to re-install the Python bindings library with the right library paths +if (FF_USE_PYTHON) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + if(FF_BUILD_FROM_PYPI) + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") + # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install + # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion + install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir 
${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + endif() +endif() diff --git a/conda/environment.yml b/conda/environment.yml index 05992a8bf7..2069acccdf 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -13,3 +13,4 @@ dependencies: - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - numpy>=1.16.0 + - requests diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml index ced02b9db4..cc6fcf4667 100644 --- a/conda/flexflow-cpu.yml +++ b/conda/flexflow-cpu.yml @@ -17,3 +17,4 @@ dependencies: - torch --index-url https://download.pytorch.org/whl/cpu - torchaudio --index-url https://download.pytorch.org/whl/cpu - torchvision --index-url https://download.pytorch.org/whl/cpu + - requests diff --git a/jupyter_notebook/flexflow_kernel_nocr.py b/jupyter_notebook/flexflow_kernel_nocr.py index 203a416d70..8441db5d3a 100644 --- a/jupyter_notebook/flexflow_kernel_nocr.py +++ b/jupyter_notebook/flexflow_kernel_nocr.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) # diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..4b8214f3fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[build-system] +requires = [ + "wheel", + "setuptools>=45", + "setuptools_scm[toml]>=6.0", + "cmake-build-extension", + "ninja", + "requests" +] +build-backend = "setuptools.build_meta" diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b0177be6fa..1ad4746cca 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -24,35 +24,6 @@ from flexflow.config import * from flexflow.jupyter import * -def rerun_if_needed(): - def update_ld_library_path_if_needed(path): - ld_lib_path = os.environ.get("LD_LIBRARY_PATH") or "" - if path not in ld_lib_path.split(":"): - 
os.environ["LD_LIBRARY_PATH"] = path + ":" + ld_lib_path - return True - return False - from distutils import sysconfig - # When installing FlexFlow with pip, the library files are installed within - # the pip package folder, instead of at /usr/local/lib - packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False) - ff_lib_path = os.path.join(packages_dir, "flexflow", "lib") - # If the library exists at the ff_lib_path, rerun with the ff_lib_path in the LD_LIBRARY_PATH - rerun=False - if os.path.isdir(ff_lib_path): - rerun = update_ld_library_path_if_needed(ff_lib_path) - if rerun: - run_from_python_c = ((sys.argv or [''])[0] == '-c') - # re-running with os.execv only works with 'python -c' for python >= 3.10 - # (see https://bugs.python.org/issue23427) - if not run_from_python_c: - os.execv(sys.executable, ["python"] + sys.argv) - else: - if hasattr(sys, 'orig_argv'): - assert(len(sys.orig_argv) >= 3) - os.execv(sys.executable, ["python"] + sys.orig_argv[1:]) - else: - print(f'Error: Please export LD_LIBRARY_PATH={os.environ.get("LD_LIBRARY_PATH")} and rerun') - sys.exit(1) if flexflow_init_import(): os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" @@ -61,11 +32,11 @@ def update_ld_library_path_if_needed(path): # Default python mode if is_legion_python == False: - os.environ["REALM_DEFAULT_ARGS"] = "-ll:gpu 1" - rerun_if_needed() print("Using Default Python") _FF_BUILD_DOCS = bool(os.environ.get('READTHEDOCS') or os.environ.get("FF_BUILD_DOCS")) _CPU_ONLY = bool(os.environ.get('CPU_ONLY_TEST')) + if not _CPU_ONLY and not "-ll:gpu" in sys.argv: + os.environ["REALM_DEFAULT_ARGS"] = "-ll:gpu 1" if not _FF_BUILD_DOCS and not _CPU_ONLY: from legion_top import ( legion_canonical_python_main, diff --git a/python/flexflow/core/flexflow_top.py b/python/flexflow/core/flexflow_top.py index 61070f39f3..8ffe9a39c9 100644 --- a/python/flexflow/core/flexflow_top.py +++ b/python/flexflow/core/flexflow_top.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env 
python3 # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) # diff --git a/python/flexflow/core/flexflowlib.py b/python/flexflow/core/flexflowlib.py index 3d21864193..6fc8e52cf7 100644 --- a/python/flexflow/core/flexflowlib.py +++ b/python/flexflow/core/flexflowlib.py @@ -18,6 +18,7 @@ from .flexflow_cffi_header import flexflow_header from legion_cffi import ffi +from distutils import sysconfig class FlexFlowLib(object): __slots__ = ['_lib', '_header'] @@ -44,7 +45,16 @@ def get_library_extension(self) -> str: def get_shared_library(self) -> str: libname = "libflexflow" + self.get_library_extension() - return os.path.join(libname) + + # If we installed with pip, use the full path instead of just the library name, because the library will not be in the LD_LIBRARY_PATH + packages_dir = sysconfig.get_python_lib(plat_specific=False, standard_lib=False) + ff_lib_path = os.path.join(packages_dir, "flexflow", "lib", libname) + installed_with_pip = os.path.exists(ff_lib_path) + + if installed_with_pip: + return ff_lib_path + else: + return libname def get_c_header(self) -> str: return self._header diff --git a/python/flexflow/findpylib.py b/python/flexflow/findpylib.py index 2ac9dcbbb9..c2ce002996 100755 --- a/python/flexflow/findpylib.py +++ b/python/flexflow/findpylib.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Locate libpython associated with this Python executable. 
diff --git a/python/flexflow_cffi_build.py b/python/flexflow_cffi_build.py index c4cf8e9e09..b89fba2f06 100755 --- a/python/flexflow_cffi_build.py +++ b/python/flexflow_cffi_build.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) # diff --git a/python/flexflow_cffi_header.py.in b/python/flexflow_cffi_header.py.in index fdd03315ee..f9f5b01b20 100644 --- a/python/flexflow_cffi_header.py.in +++ b/python/flexflow_cffi_header.py.in @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) # diff --git a/python/flexflow_python_build.py b/python/flexflow_python_build.py index 0e58193ef7..4ca26d8ab3 100755 --- a/python/flexflow_python_build.py +++ b/python/flexflow_python_build.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) # diff --git a/requirements.txt b/requirements.txt index 13d7c67588..4ac0a8a047 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -python>=3.6 cffi>=1.11.0 numpy>=1.16.0 qualname>=0.1.0 @@ -6,3 +5,5 @@ keras_preprocessing>=1.1.2 Pillow pybind11 cmake-build-extension +ninja +requests diff --git a/setup.py b/setup.py index 4564657d46..5cc4d72b20 100644 --- a/setup.py +++ b/setup.py @@ -1,44 +1,120 @@ from setuptools import setup, find_packages from pathlib import Path from cmake_build_extension import BuildExtension, CMakeExtension -import os -import subprocess +import os, subprocess, requests, re +from datetime import date datadir = Path(__file__).parent / "python/flexflow" files = [str(p.relative_to(datadir)) for p in datadir.rglob("*.py")] -# Load CMake configs from config/config.linux file +# Load CMake configs from config/config.linux file, parsing any custom settings from environment variables configs_path = os.path.join( 
os.path.dirname(os.path.abspath(__file__)), "config", "config.linux" ) - cmake_configure_options = subprocess.check_output([configs_path, "CMAKE_FLAGS"]).decode( "utf-8" ).strip().split() + ["-DFF_BUILD_FROM_PYPI=ON"] cuda_path = subprocess.check_output([configs_path, "CUDA_PATH"]).decode("utf-8").strip() - +# CUDA PATH should be passed to CMAKE via an environment variable os.environ["CUDA_PATH"] = cuda_path +# set up make flags to parallelize build of subcomponents that do not use ninja +os.environ["MAKEFLAGS"] = (os.environ.get("MAKEFLAGS", "")) + f" -j{max(os.cpu_count()-1, 1)}" + +def compute_version() -> str: + """This function generates the flexflow package version according to the following rules: + 1. If the python/flexflow/version.txt file exists, return the version from the file. + 2. If the version.txt file does not exist, the version will be YY.MM., + where the YY are the last two digits of the year, MM is the month number, + and is a counter that is reset at the beginning of every month, + and it is incremented every time we publish a new version on pypi (or test.pypi, + if the DEPLOY_TO_TEST_PYPI env is defined and set to true). + Using this index (instead of the day of the month) for the sub-subversion, allows + us to release more than once per day when needed. + + Warning! If the latest flexflow package version in test.pypi goes out of sync with pypi, this + script will publish the wrong version if it is used to deploy to both test.pypi and pypi without + deleting the version.txt file in-between the two uploads. 
+ + :raises ValueError: if the python/flexflow/version.txt file exists, but contains a version in the wrong format + :raises ValueError: if the DEPLOY_TO_TEST_PYPI env is set to a value that cannot be converted to a Python boolean + :raises ValueError: if a flexflow release exists on pypi (or test.pypi) whose last two digits of the year are + larger than the last two digits of the current year (e.g., if it's year '23, + and we find a release from year '24) + :return: The version in YY.MM. format, as a string + :rtype: str + """ + # Check if the version has already been determined before, in which case we don't recompute it + version_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "python", "flexflow", "version.txt" + ) + if os.path.isfile(version_file): + with open(version_file) as f: + version = f.read() + # Version is YY.mm. + match = re.fullmatch(r'\d+\.\d+\.\d+', version) + if not match: + raise ValueError("Version is not in the right format!") + return version + + # Get latest version of FlexFlow on pypi (default) or test.pypi (if the DEPLOY_TO_TEST_PYPI env is set to true) + deploy_to_test_pypi = os.environ.get('DEPLOY_TO_TEST_PYPI', 'false') + if deploy_to_test_pypi.lower() in ['true', 'yes', '1']: + deploy_to_test_pypi = True + pypi_url = "https://test.pypi.org/pypi/flexflow/json" + elif deploy_to_test_pypi.lower() in ['false', 'no', '0']: + deploy_to_test_pypi = False + pypi_url = "https://pypi.org/pypi/flexflow/json" + else: + raise ValueError(f'Invalid boolean value: {deploy_to_test_pypi}') + try: + pip_version = requests.get(pypi_url).json()['info']['version'] + except KeyError: + pip_version = "0.0.0" + pip_year, pip_month, pip_incremental = [int(x) for x in pip_version.split(".")] + + today = date.today() + year_two_digits = int(str(today.year)[-2:]) + + # Ensure no version from the distant past or the future :) + if pip_year > year_two_digits or (pip_year == year_two_digits and pip_month > today.month): + raise ValueError(f"A 
version from the distant past or future (year '{pip_year}, month {pip_month}) already exists!") + + subversion = 0 + if pip_year == year_two_digits and pip_month == today.month: + subversion = pip_incremental + 1 + + version = f"{year_two_digits}.{today.month}.{subversion}" + # Add version to file + with open(version_file, 'w+') as f: + f.write(version) + + return version + +# Create description from README +long_description = (Path(__file__).parent / "README.md").read_text() + +# Create requirements list from requirements.txt +with open(Path(__file__).parent / "requirements.txt", "r") as reqs_file: + requirements = reqs_file.read().strip().split("\n") + setup( name="flexflow", - version="1.0", - description="FlexFlow Python package", + version=compute_version(), + description="A distributed deep learning framework that supports flexible parallelization strategies.", + long_description=long_description, + long_description_content_type="text/markdown", url="https://github.com/flexflow/FlexFlow", + project_urls={ + "Homepage": "https://flexflow.ai/", + "Documentation": "https://flexflow.readthedocs.io/en/latest/", + }, license="Apache", packages=find_packages("python"), package_dir={"": "python"}, package_data={"flexflow": files}, zip_safe=False, - install_requires=[ - "numpy>=1.16", - "cffi>=1.11", - "qualname>=0.1", - "keras_preprocessing", - "Pillow", - "cmake-build-extension", - "pybind11", - "ninja", - ], + install_requires=requirements, scripts=['python/flexflow/flexflow_python'], ext_modules=[ CMakeExtension( @@ -50,9 +126,14 @@ cmdclass={"build_ext": BuildExtension}, classifiers=[ "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: POSIX :: Linux", - "Topic :: Software Development :: 
Libraries", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], python_requires=">=3.6", ) From d13991c58969d4b02b47ffbb393154ec1f81f4f9 Mon Sep 17 00:00:00 2001 From: vincent163 Date: Sat, 12 Aug 2023 23:54:24 -0400 Subject: [PATCH 2/2] Expand documentation for multi node (#961) Co-authored-by: Wenyang He Co-authored-by: Zhihao Jia --- MULTI-NODE.md | 68 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/MULTI-NODE.md b/MULTI-NODE.md index 78edba62c0..a8fd2fb705 100644 --- a/MULTI-NODE.md +++ b/MULTI-NODE.md @@ -1,29 +1,71 @@ -# Running FlexFlow On Multiple Nodes -To build, install, and run FlexFlow on multiple nodes, follow the instructions below. We take AWS as an example to present the instructions. +# Running FlexFlow on Multiple Nodes + +To build, install, and run FlexFlow on multiple nodes, follow the instructions below. We will use AWS as an example to present the instructions. ## 1. Spin up instances -Spin up multiple instances with GPU support. We choose p3.2xlarge with [Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04)](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-13-ubuntu-20-04/) to simplify the procedure. -Place the instances in a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) which utilizes `cluster` as strategy to achieve the low-latency network performance. +Spin up multiple instances with GPU support. For AWS, we recommend using p3.2xlarge with [Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04)](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-13-ubuntu-20-04/) to simplify the procedure. + +Place the instances in a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) that utilizes the `cluster` strategy to achieve low-latency network performance. 
-To enable the communications between instances, you should attach the same security group to all instances and add an inbound rule in the security group to enable all the incoming traffic from the same security group. An example inbound rule is as follows: +To enable communication between instances, attach the same security group to all instances and add an inbound rule in the security group to allow all incoming traffic from the same security group. An example inbound rule is as follows: ``` Type: Custom TCP Port range: 1 - 65535 Source: Custom (use the security group ID) ``` +You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network. + ## 2. Configure and build FlexFlow -Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance**. You can skip the step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. -## 3. Test FlexFlow -Follow the step 6 in [INSTALL.md](INSTALL.md) to set environment variables. +Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. + +You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance. + +For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional. + +## 3. 
Configure MPI + +MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them. + +To use MPI, enable non-interactive `ssh` logins between instances. This can be done by referring to the [Open MPI documentation](https://docs.open-mpi.org/en/v5.0.0rc9/running-apps/ssh.html). Here are the detailed steps: + +1. Choose one of the nodes as the main instance and create a public/private key pair on the instance. This will be the instance from which you launch MPI commands. Run the following command: + +``` +ssh-keygen -t ed25519 +``` + +This will create a public key at `~/.ssh/id_ed25519.pub` and a private key at `~/.ssh/id_ed25519`. -A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh` and you can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) or [`srun`](https://slurm.schedmd.com/srun.html). For example, to run the script with MPI, you need to first enable non-interactive `ssh` logins (refer to [Open MPI doc](https://docs.open-mpi.org/en/v5.0.0rc9/running-apps/ssh.html)) between instances and then run: +2. Append the contents of the **public key** to `~/.ssh/authorized_keys` on all machines (if the file does not exist, create one). Execute the following command on **all instances**: + +``` +mkdir -p ~/.ssh +echo '<public key>' >> ~/.ssh/authorized_keys +``` + +Replace `<public key>` with the public key from `~/.ssh/id_ed25519.pub` on the main instance. It should be a single line containing a string like: ``` -mpirun --host <ip address 1>:<slots 1>,<ip address 2>:<slots 2> -np <number of processes> ./scripts/mnist_mlp_run.sh +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su instance +``` + +3. Create a hostfile at `~/hostfile`, with one line for each instance (add more lines if you have more instances): + ``` +<ip address 1> slots=<slots 1> +<ip address 2> slots=<slots 2> +``` + +`<slots 1>` and `<slots 2>` refer to the number of slots available for each instance, respectively. Set it to one if you have a GPU on each instance. + +4. 
SSH into each host and make sure you can log into them. It may ask you to verify the public key. Make sure to trust the public key so that it doesn't ask you again. + +5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)). + +## 4. Test FlexFlow + +Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables. -If you encounter some errors like `WARNING: Open MPI accepted a TCP connection from what appears to be a -another Open MPI process but cannot find a corresponding process -entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command. (refer to [stack overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)) \ No newline at end of file +A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html). \ No newline at end of file