## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
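# A sketch of a typical build invocation (the target name matches the stages
# below; the tag and build-arg values are illustrative only):
#   docker build -f Dockerfile.ubi --target vllm-openai \
#     --build-arg max_jobs=8 --build-arg nvcc_threads=4 \
#     -t vllm-openai:ubi .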
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y \
python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
&& microdnf clean all
WORKDIR /workspace
ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8
# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
which procps findutils tar vim git \
&& microdnf clean all
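# Example of the kind of workflow this enables (pod name and paths are
# illustrative): kubectl cp my-pod:/workspace/server.log ./server.log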
## Python Installer ############################################################
FROM base AS python-install
ARG PYTHON_VERSION
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y \
python${PYTHON_VERSION}-devel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install --no-cache -U pip wheel uv && \
    microdnf clean all
## CUDA Base ###################################################################
FROM python-install AS cuda-base
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
RUN microdnf install -y \
cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
microdnf clean all
ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# install CUDA and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
uv pip install \
-r requirements-cuda.txt
## Development #################################################################
FROM python-cuda-base AS dev
# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt
## Builder #####################################################################
FROM dev AS build
# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt
# install ccache (compiler cache) to speed up compilation, leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    rpm -ql epel-release && \
    microdnf install -y git ccache && \
    microdnf clean all
COPY . .
ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Make sure the CUDA environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist
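# The wheel lands in /workspace/dist; the release stages below install it from
# there via a bind mount instead of copying it into the image.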
#################### libsodium Build IMAGE ####################
FROM base AS libsodium-builder
RUN microdnf install -y gcc gzip \
&& microdnf clean all
WORKDIR /usr/src/libsodium
ARG LIBSODIUM_VERSION=1.0.20
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
&& mv libsodium*/* ./
RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
./configure --prefix="/usr/" && make -j $MAX_JOBS && make check
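# NOTE: MAX_JOBS is not set in this stage (the max_jobs ARG belongs to the
# build stage), so `make -j` runs with unlimited parallelism. `make install`
# is deferred to the release stage, which bind-mounts this build tree.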
## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION
WORKDIR /workspace
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH
# force using the python venv's cuda runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
# Triton needs a CC compiler
RUN microdnf install -y gcc \
rsync \
&& microdnf clean all
# install the vllm wheel first, so that torch etc. are installed from it
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose
# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
cd /usr/src/libsodium \
&& make install
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl"
ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
# Allow requested max length to exceed what is extracted from the
# config.json
# see: https://github.com/vllm-project/vllm/pull/7080
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
VLLM_USAGE_SOURCE=production-docker-image \
VLLM_WORKER_MULTIPROC_METHOD=fork \
OUTLINES_CACHE_DIR=/tmp/outlines \
NUMBA_CACHE_DIR=/tmp/numba \
TRITON_CACHE_DIR=/tmp/triton \
VLLM_NO_USAGE_STATS=1
# set up a non-root user for OpenShift
RUN umask 002 \
&& useradd --uid 2000 --gid 0 vllm \
&& chmod g+rwx $HOME /usr/src /workspace
COPY LICENSE /licenses/vllm.md
# Copy only the .jinja files from the examples directory into the template directory
COPY examples/*.jinja /app/data/template/
USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
FROM vllm-openai AS vllm-grpc-adapter
USER root
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
uv pip install $(echo dist/*.whl)'[tensorizer]' vllm-tgis-adapter==0.5.3
ENV GRPC_PORT=8033 \
PORT=8000 \
# As an optimization, vLLM disables logprobs when using spec decoding by
# default, but this would be unexpected for users of a hosted model that
# happens to use spec decoding
# see: https://github.com/vllm-project/vllm/pull/6485
DISABLE_LOGPROBS_DURING_SPEC_DECODING=false
USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]