Optimum amd support (#464)
* add optimum-amd support

* update documentation

* add commit with onnxruntime

* update template

* remove source build pipeline

* docker build and push v3

* complete and tested on radeon 7900 xtx

* add migraphx compilation

* add migraphx support to mi300x build

* finalize the amd docker setup command

* update the dockerfile.amd

* update docker image

* add to readme

---------

Co-authored-by: tjtanaa <[email protected]>
Co-authored-by: TJian <[email protected]>
Co-authored-by: Hot Aisle Customer <tjtanaa>
3 people authored Nov 16, 2024
1 parent d9050a7 commit f59df4f
Showing 8 changed files with 141 additions and 18 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -90,7 +90,7 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path at inside the docker container is set by the environment variable `HF_HOME`.
The cache path inside the docker container is set by the environment variable `HF_HOME`.

#### Specialized docker images
<details>
32 changes: 32 additions & 0 deletions docs/docs/deploy.md
@@ -19,8 +19,40 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path inside the docker container is set by the environment variable `HF_HOME`.


### AMD Docker: Deploy on AMD Platform (MI200 Series and MI300 Series)
#### Launch the CLI using a pre-built docker container (recommended)

```bash
port=7997
model1=michaelfeil/bge-small-en-v1.5
model2=mixedbread-ai/mxbai-rerank-xsmall-v1
volume=$PWD/data

docker run -it \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--network host \
-v $volume:/app/.cache \
-p $port:$port \
michaelf34/infinity:latest-rocm \
v2 \
--model-id $model1 \
--model-id $model2 \
--port $port \
--engine torch \
--compile \
--no-bettertransformer
```
The cache path inside the docker container is set by the environment variable `HF_HOME`.
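Once the container is up, a quick smoke test is to query the embeddings route. This is a minimal sketch, assuming the server exposes Infinity's OpenAI-compatible `/embeddings` endpoint on the port and first model id from the command above:

```bash
# Assumes port 7997 and the first model id from the docker run above;
# /embeddings is Infinity's OpenAI-compatible embedding route.
curl -X POST "http://localhost:7997/embeddings" \
  -H "Content-Type: application/json" \
  -d '{"model": "michaelfeil/bge-small-en-v1.5", "input": ["Hello from ROCm"]}'
```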



## Modal Labs

A deployment example for Modal Labs is located in the repo, including a GitHub Actions pipeline.
2 changes: 1 addition & 1 deletion docs/docs/index.md
@@ -49,7 +49,7 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path at inside the docker container is set by the environment variable `HF_HOME`.
The cache path inside the docker container is set by the environment variable `HF_HOME`.

### or launch the cli after the pip install
After your pip install, with your venv activate, you can run the CLI directly.
37 changes: 36 additions & 1 deletion libs/infinity_emb/Docker.template.yaml
@@ -33,7 +33,42 @@ amd:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/rocm6.2"
poetry_extras: "all onnxruntime-gpu"
extra_installs_main: |
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi && python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi
poetry_extras: "all"
python_version: python3.10
extra_env_variables: |
# RUN conda init --reverse --all
40 changes: 37 additions & 3 deletions libs/infinity_emb/Dockerfile.amd_auto
@@ -17,12 +17,12 @@ ENV PYTHONUNBUFFERED=1 \
POETRY_VIRTUALENVS_IN_PROJECT="true" \
# do not ask any interactive question
POETRY_NO_INTERACTION=1 \
EXTRAS="all onnxruntime-gpu" \
EXTRAS="all" \
PYTHON="python3.10"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
# RUN conda init --reverse --all
# RUN rm -rf /opt/conda && rm -rf /var/lib/jenkins
# Sets default to onnx
# Bettertransformer is not supported on AMD
ENV INFINITY_BETTERTRANSFORMER="0"

WORKDIR /app
@@ -52,7 +52,41 @@ COPY infinity_emb infinity_emb
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/rocm6.2"

#
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi && python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi



FROM builder as testing
8 changes: 4 additions & 4 deletions libs/infinity_emb/Makefile
@@ -42,10 +42,10 @@ format format_diff:
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)

template_docker:
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto

# Add new targets
build-amd:
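The regenerated `Dockerfile.amd_auto` consumes the `GPU_ARCH` build argument introduced above. A manual build for a specific card might look like the following sketch; the image tag is illustrative, and `make template_docker` assumes `jinja2-cli` is available in the poetry environment:

```bash
cd libs/infinity_emb
# Re-render Dockerfile.amd_auto (and the other Dockerfiles) from Docker.template.yaml
make template_docker
# Illustrative tag; GPU_ARCH must be one of gfx90a (MI200 series),
# gfx942 (MI300 series), or gfx1100 (e.g. Radeon 7900 XTX) per the Dockerfile
docker build -f Dockerfile.amd_auto --build-arg GPU_ARCH=gfx942 -t infinity:rocm-gfx942 .
```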
1 change: 1 addition & 0 deletions libs/infinity_emb/infinity_emb/_optional_imports.py
@@ -64,6 +64,7 @@ def _raise_error(self) -> None:
CHECK_FASTAPI = OptionalImports("fastapi", "server")
CHECK_ONNXRUNTIME = OptionalImports("optimum.onnxruntime", "optimum")
CHECK_OPTIMUM = OptionalImports("optimum", "optimum")
CHECK_OPTIMUM_AMD = OptionalImports("optimum.amd", "optimum")
CHECK_OPTIMUM_NEURON = OptionalImports(
"optimum.neuron",
"<neuronx not available as extra, only runs on AMI image, no pip install possible.>",
37 changes: 29 additions & 8 deletions libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -8,7 +8,8 @@
from huggingface_hub import HfApi, HfFolder # type: ignore
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME
from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_OPTIMUM_AMD

from infinity_emb.log_handler import logger
from infinity_emb.primitives import Device

@@ -57,6 +58,8 @@ def device_to_onnx(device: Device) -> str:
elif device == Device.cuda:
if "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "MIGraphXExecutionProvider" in available:
return "MIGraphXExecutionProvider"
return "CUDAExecutionProvider"
elif device == Device.mps:
return "CoreMLExecutionProvider"
@@ -67,6 +70,8 @@
return "TensorrtExecutionProvider"
elif "CUDAExecutionProvider" in available:
return "CUDAExecutionProvider"
elif "MIGraphXExecutionProvider" in available:
return "MIGraphXExecutionProvider" # swapped order of ROCM and MIGraphX
elif "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "CoreMLExecutionProvider" in available:
@@ -100,12 +105,8 @@ def optimize_model(
revision (Optional[str], optional): The revision to use. Defaults to None.
trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True.
"""
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

## If there is no need for optimization
if execution_provider == "TensorrtExecutionProvider":
return model_class.from_pretrained(
model_name_or_path,
@@ -123,8 +124,28 @@
# "trt_int8_enable": "quantize" in file_name,
},
)

elif execution_provider in ["ROCMExecutionProvider", "MIGraphXExecutionProvider"]:
CHECK_OPTIMUM_AMD.mark_required()
return model_class.from_pretrained(
model_name_or_path,
revision=revision,
trust_remote_code=trust_remote_code,
provider=execution_provider,
file_name=file_name,
)

## path to find if model has been optimized
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

logger.info(f"files_optimized: {files_optimized}")
if files_optimized:
file_optimized = files_optimized[0]
file_optimized = files_optimized[-1]
logger.info(f"Optimized model found at {file_optimized}, skipping optimization")
return model_class.from_pretrained(
file_optimized.parent.as_posix(),
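With these changes, `device_to_onnx` prefers `MIGraphXExecutionProvider` over `ROCMExecutionProvider` when both are available. Which providers a given onnxruntime build actually exposes can be checked directly inside the container; this is a quick sanity check using onnxruntime's standard `get_available_providers` API:

```bash
# Lists the execution providers compiled into the installed onnxruntime,
# e.g. MIGraphXExecutionProvider / ROCMExecutionProvider on the ROCm builds above.
python -c "import onnxruntime; print(onnxruntime.get_available_providers())"
```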
