Optimum amd support (#464)
* add optimum-amd support

* update documentation

* add commit with onnxruntime

* update template

* remove source build pipeline

* docker build and push v3

* complete and tested on radeon 7900 xtx

* add migraphx compilation

* add migraphx support to mi300x build

* finalize the amd docker setup command

* update the dockerfile.amd

* update docker image

* add to readme

---------

Co-authored-by: tjtanaa <[email protected]>
Co-authored-by: TJian <[email protected]>
Co-authored-by: Hot Aisle Customer <tjtanaa>
3 people authored Nov 16, 2024
1 parent d9050a7 commit f59df4f
Showing 8 changed files with 141 additions and 18 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -90,7 +90,7 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path at inside the docker container is set by the environment variable `HF_HOME`.
The cache path inside the docker container is set by the environment variable `HF_HOME`.

#### Specialized docker images
<details>
32 changes: 32 additions & 0 deletions docs/docs/deploy.md
@@ -19,8 +19,40 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path inside the docker container is set by the environment variable `HF_HOME`.


### AMD Docker: Deploy on AMD Platform (MI200 Series and MI300 Series)
#### Launch the CLI using a pre-built docker container (recommended)

```bash
port=7997
model1=michaelfeil/bge-small-en-v1.5
model2=mixedbread-ai/mxbai-rerank-xsmall-v1
volume=$PWD/data

docker run -it \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--network host \
-v $volume:/app/.cache \
-p $port:$port \
michaelf34/infinity:latest-rocm \
v2 \
--model-id $model1 \
--model-id $model2 \
--port $port \
--engine torch \
--compile \
--no-bettertransformer
```
The cache path inside the docker container is set by the environment variable `HF_HOME`.
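Once the container is up, a quick smoke test is to query the embeddings route. This is a minimal sketch, assuming the server exposes Infinity's OpenAI-compatible `/embeddings` endpoint on the port and first model id from the command above:

```bash
# Assumes port 7997 and the first model id from the docker run above;
# /embeddings is Infinity's OpenAI-compatible embedding route.
curl -X POST "http://localhost:7997/embeddings" \
  -H "Content-Type: application/json" \
  -d '{"model": "michaelfeil/bge-small-en-v1.5", "input": ["Hello from ROCm"]}'
```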



## Modal Labs

A deployment example for Modal Labs is located in the repo, including a GitHub Actions pipeline.
2 changes: 1 addition & 1 deletion docs/docs/index.md
@@ -49,7 +49,7 @@ docker run -it --gpus all \
--model-id $model2 \
--port $port
```
The cache path at inside the docker container is set by the environment variable `HF_HOME`.
The cache path inside the docker container is set by the environment variable `HF_HOME`.

### or launch the cli after the pip install
After your pip install, with your venv activate, you can run the CLI directly.
37 changes: 36 additions & 1 deletion libs/infinity_emb/Docker.template.yaml
@@ -33,7 +33,42 @@ amd:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/rocm6.2"
poetry_extras: "all onnxruntime-gpu"
extra_installs_main: |
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi && python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi
poetry_extras: "all"
python_version: python3.10
extra_env_variables: |
# RUN conda init --reverse --all
40 changes: 37 additions & 3 deletions libs/infinity_emb/Dockerfile.amd_auto
@@ -17,12 +17,12 @@ ENV PYTHONUNBUFFERED=1 \
POETRY_VIRTUALENVS_IN_PROJECT="true" \
# do not ask any interactive question
POETRY_NO_INTERACTION=1 \
EXTRAS="all onnxruntime-gpu" \
EXTRAS="all" \
PYTHON="python3.10"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
# RUN conda init --reverse --all
# RUN rm -rf /opt/conda && rm -rf /var/lib/jenkins
# Sets default to onnx
# Bettertransformer is not supported on AMD
ENV INFINITY_BETTERTRANSFORMER="0"

WORKDIR /app
@@ -52,7 +52,41 @@ COPY infinity_emb infinity_emb
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/rocm6.2"

#
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi && python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi



FROM builder as testing
8 changes: 4 additions & 4 deletions libs/infinity_emb/Makefile
@@ -42,10 +42,10 @@ format format_diff:
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)

template_docker:
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto

# Add new targets
build-amd:
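The regenerated `Dockerfile.amd_auto` consumes the `GPU_ARCH` build argument introduced above. A manual build for a specific card might look like the following sketch; the image tag is illustrative, and `make template_docker` assumes `jinja2-cli` is available in the poetry environment:

```bash
cd libs/infinity_emb
# Re-render Dockerfile.amd_auto (and the other Dockerfiles) from Docker.template.yaml
make template_docker
# Illustrative tag; GPU_ARCH must be one of gfx90a (MI200 series),
# gfx942 (MI300 series), or gfx1100 (e.g. Radeon 7900 XTX) per the Dockerfile
docker build -f Dockerfile.amd_auto --build-arg GPU_ARCH=gfx942 -t infinity:rocm-gfx942 .
```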
1 change: 1 addition & 0 deletions libs/infinity_emb/infinity_emb/_optional_imports.py
@@ -64,6 +64,7 @@ def _raise_error(self) -> None:
CHECK_FASTAPI = OptionalImports("fastapi", "server")
CHECK_ONNXRUNTIME = OptionalImports("optimum.onnxruntime", "optimum")
CHECK_OPTIMUM = OptionalImports("optimum", "optimum")
CHECK_OPTIMUM_AMD = OptionalImports("optimum.amd", "optimum")
CHECK_OPTIMUM_NEURON = OptionalImports(
"optimum.neuron",
"<neuronx not available as extra, only runs on AMI image, no pip install possible.>",
37 changes: 29 additions & 8 deletions libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -8,7 +8,8 @@
from huggingface_hub import HfApi, HfFolder # type: ignore
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME
from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_OPTIMUM_AMD

from infinity_emb.log_handler import logger
from infinity_emb.primitives import Device

@@ -57,6 +58,8 @@ def device_to_onnx(device: Device) -> str:
elif device == Device.cuda:
if "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "MIGraphXExecutionProvider" in available:
return "MIGraphXExecutionProvider"
return "CUDAExecutionProvider"
elif device == Device.mps:
return "CoreMLExecutionProvider"
@@ -67,6 +70,8 @@
return "TensorrtExecutionProvider"
elif "CUDAExecutionProvider" in available:
return "CUDAExecutionProvider"
elif "MIGraphXExecutionProvider" in available:
return "MIGraphXExecutionProvider" # swapped order of ROCM and MIGraphX
elif "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "CoreMLExecutionProvider" in available:
@@ -100,12 +105,8 @@ def optimize_model(
revision (Optional[str], optional): The revision to use. Defaults to None.
trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True.
"""
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

## If there is no need for optimization
if execution_provider == "TensorrtExecutionProvider":
return model_class.from_pretrained(
model_name_or_path,
@@ -123,8 +124,28 @@
# "trt_int8_enable": "quantize" in file_name,
},
)

elif execution_provider in ["ROCMExecutionProvider", "MIGraphXExecutionProvider"]:
CHECK_OPTIMUM_AMD.mark_required()
return model_class.from_pretrained(
model_name_or_path,
revision=revision,
trust_remote_code=trust_remote_code,
provider=execution_provider,
file_name=file_name,
)

## path to find if model has been optimized
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

logger.info(f"files_optimized: {files_optimized}")
if files_optimized:
file_optimized = files_optimized[0]
file_optimized = files_optimized[-1]
logger.info(f"Optimized model found at {file_optimized}, skipping optimization")
return model_class.from_pretrained(
file_optimized.parent.as_posix(),
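With these changes, `device_to_onnx` prefers `MIGraphXExecutionProvider` over `ROCMExecutionProvider` when both are available. Which providers a given onnxruntime build actually exposes can be checked directly inside the container; this is a quick sanity check using onnxruntime's standard `get_available_providers` API:

```bash
# Lists the execution providers compiled into the installed onnxruntime,
# e.g. MIGraphXExecutionProvider / ROCMExecutionProvider on the ROCm builds above.
python -c "import onnxruntime; print(onnxruntime.get_available_providers())"
```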
