# The vLLM Dockerfile is used to construct a vLLM image against torch nightly that can be used directly for testing

# torch nightly requires CUDA >= 12.6;
# we use 12.8 due to a FlashAttention issue with CUDA 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628)
ARG CUDA_VERSION=12.8.0
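# CUDA_VERSION can be overridden with docker's standard --build-arg flag;
# a hypothetical invocation (version value and file path are illustrative only):
#   docker build --build-arg CUDA_VERSION=12.8.1 -f <this Dockerfile> .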
#
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version \
    && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the installation is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
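# (passing the compat directory to ldconfig adds the CUDA forward-compatibility
# libraries to the dynamic linker cache so they are found at runtime)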

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml

# strip the pinned stable torch entries from the requirements so the nightly torch installed below is used instead
RUN python3 use_existing_torch.py

# install torch nightly
ARG PINNED_TORCH_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -n "$PINNED_TORCH_VERSION" ]; then \
        pkgs="$PINNED_TORCH_VERSION"; \
    else \
        pkgs="torch torchaudio torchvision"; \
    fi && \
    uv pip install --system $pkgs --index-url https://download.pytorch.org/whl/nightly/cu128
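# PINNED_TORCH_VERSION takes a space-separated list of pip specs; a hypothetical
# pin (dates/versions are illustrative, not a known-good combination):
#   --build-arg PINNED_TORCH_VERSION="torch==2.8.0.dev20250601+cu128 torchvision==0.23.0.dev20250601+cu128 torchaudio==2.8.0.dev20250601+cu128"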

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system numba==0.61.2

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/common.txt

# must be set before building xformers, so the xformers wheel is compiled for the correct CUDA architectures
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
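# for reference, these compute capabilities correspond to e.g. 8.0 = A100,
# 8.6 = A10/RTX 30 series, 8.9 = L4/RTX 40 series, 9.0 = H100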

# Build xformers with CUDA and torch nightly,
# following the official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache xformers build result for faster build
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo 'git clone xformers...' \
    && git clone https://github.com/facebookresearch/xformers.git --recursive \
    && cd xformers \
    && git checkout ${XFORMERS_COMMIT} \
    && git submodule update --init --recursive \
    && echo 'finish git clone xformers...' \
    && rm -rf build \
    && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
    && cd .. \
    && rm -rf xformers

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system xformers-dist/*.whl --verbose

# the build can take a long time, and the torch nightly version fetched from the URL can differ in the next docker stage;
# track the nightly torch versions used in this build so the runtime environment can install exactly the same ones
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
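# torch_build_versions.txt then holds one pinned spec per line; a hypothetical
# example (versions illustrative):
#   torch==2.8.0.dev20250601+cu128
#   torchaudio==2.8.0.dev20250601+cu128
#   torchvision==0.23.0.dev20250601+cu128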

# cuda arch list used by torch
# can be useful for `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243

# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the installation is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY . .

RUN python3 use_existing_torch.py

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt

ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
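# sizing note (a rule of thumb, not a vLLM requirement): peak concurrent compiler
# processes scale roughly with MAX_JOBS * NVCC_THREADS, so reduce both on
# memory-constrained builders, e.g.:
#   --build-arg max_jobs=8 --build-arg nvcc_threads=1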

ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi
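# the two `sccache --show-stats` calls above bracket the wheel build, so comparing
# their output shows how many compilations were served from the S3 cache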

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

#################### WHEEL BUILD IMAGE ####################

################### VLLM INSTALLED IMAGE ####################
# Set up a clean environment for vLLM and its dependencies (test and api server) using ubuntu22.04 with AOT flashinfer
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
# environment preparation starts here
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the installation is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# reuse the nightly torch versions recorded during the build to make sure the runtime versions match exactly
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128

# install the vllm wheel
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system vllm-dist/*.whl --verbose

# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose

ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'

# install packages required to build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1

# build flashinfer from source for torch nightly (takes around 10 minutes)
# release version: v0.2.2.post1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo "git clone flashinfer..." \
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
    && git checkout v0.2.2.post1 \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \
    && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
    && cd .. \
    && rm -rf flashinfer

# install flashinfer
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-dist/*.whl --verbose

# install common packages
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml

COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/common.txt

################### VLLM INSTALLED IMAGE ####################

#################### UNITTEST IMAGE #############################
FROM vllm-base AS test
COPY tests/ tests/

# copy the test requirements, which are not pinned to a stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt

# This timeout (in seconds) is necessary when installing some dependencies via uv, since the installation is likely to time out otherwise
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/nightly_torch_test.txt

#################### UNITTEST IMAGE #############################
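# To build only the test image, a hypothetical invocation (stage names are the
# `AS ...` targets defined above; the tag and file path are illustrative):
#   docker build --target test -t vllm-nightly-test -f <this Dockerfile> .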