
Commit 003c484
Merge branch 'main' into fix_triton
2 parents: b720817 + 70e500c


48 files changed (+702, -388 lines)

.buildkite/run-xpu-test.sh

Lines changed: 5 additions & 4 deletions
@@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .
 
 # Setup cleanup
 remove_docker_container() {
-  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
+  docker rm -f "${container_name}" || true;
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
 }
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run \
@@ -25,6 +26,6 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
-  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -295,6 +295,7 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/test_pass_manager.py
 
 - label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:

Dockerfile.xpu

Lines changed: 3 additions & 9 deletions
@@ -1,11 +1,7 @@
-FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
+# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
+FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
 
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+RUN rm /etc/apt/sources.list.d/intel-graphics.list
 
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -21,8 +17,6 @@ RUN apt-get update -y && \
     python3 \
     python3-dev \
     python3-pip \
-    libze-intel-gpu-dev \
-    libze-intel-gpu1 \
     wget
 
 WORKDIR /workspace/vllm

README.md

Lines changed: 9 additions & 1 deletion
@@ -13,9 +13,17 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
+---
+
+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
+
+[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet vLLM's Cyrus Leung and Chen Zhang, and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
+
+---
+
 *Latest News* 🔥
 
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

benchmarks/benchmark_serving_structured_output.py

Lines changed: 12 additions & 2 deletions
@@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
     api_url = f"http://{args.host}:{args.port}{args.endpoint}"
     base_url = f"http://{args.host}:{args.port}"
 
-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )
 
     if args.dataset == 'grammar':
         args.structure_type = 'guided_grammar'
@@ -876,6 +879,13 @@ def main(args: argparse.Namespace):
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
     parser.add_argument(
         "--num-prompts",
         type=int,
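
The new flag simply threads through to vLLM's tokenizer helper. A minimal sketch of the resulting call, assuming `get_tokenizer` comes from `vllm.transformers_utils.tokenizer` (the import sits outside this hunk) and using hypothetical argument values in place of the parsed CLI options; `tokenizer_mode` accepts values such as `"auto"`, `"slow"`, or `"mistral"`:

```python
from vllm.transformers_utils.tokenizer import get_tokenizer

# Hypothetical values standing in for args.tokenizer / args.tokenizer_mode.
tokenizer = get_tokenizer(
    "mistralai/Mistral-7B-Instruct-v0.3",
    trust_remote_code=False,
    tokenizer_mode="mistral",  # "auto" is the default, matching the new argparse flag
)
print(type(tokenizer).__name__)
```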

benchmarks/run_structured_output_benchmark.sh

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
   python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
     --request-rate $qps \
     --result-filename "$FILENAME" \
+    --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
     --port ${PORT:-8000}
 
   echo "Completed benchmark with QPS: $qps"

docs/source/deployment/docker.md

Lines changed: 3 additions & 3 deletions
@@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:
 
 ```Dockerfile
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.8.0
 
 # e.g. install the `audio` and `video` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.7.3
+RUN uv pip install vllm[audio,video]==0.8.0
 ```
 
 :::
@@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
 ```Dockerfile
 FROM vllm/vllm-openai:latest
 
-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+RUN uv pip install git+https://github.com/huggingface/transformers.git
 ```
 
 :::

docs/source/models/supported_models.md

Lines changed: 2 additions & 5 deletions
@@ -768,7 +768,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
   * ✅︎
   * ✅︎
-  *
+  * ⚠️
 - * `GLM4VForCausalLM`<sup>^</sup>
   * GLM-4V
   * T + I
@@ -951,13 +951,10 @@ V0 correctly implements the model's attention pattern:
 
 V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
-- Generates reasonable outputs but does not match the original model's attention for text + image inputs
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
 - Will be updated in the future to support the correct behavior
-- Does not support `"do_pan_and_scan": True`
 
 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-
-For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
 :::
 
 :::{note}

docs/source/serving/distributed_serving.md

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should
 
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
-After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
 
 ```console
 vllm serve /path/to/the/model/in/the/container \
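
The same parallel layout can also be expressed with the offline Python API; a minimal sketch assuming the placeholder model path from the doc and a Ray cluster that is already up (the `vllm serve` command above remains the documented entry point):

```python
from vllm import LLM, SamplingParams

# 16 GPUs across 2 nodes (8 per node): tensor parallelism spans the GPUs
# within a node, pipeline parallelism spans the two nodes.
llm = LLM(
    model="/path/to/the/model/in/the/container",  # placeholder path from the doc
    tensor_parallel_size=8,
    pipeline_parallel_size=2,
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```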

requirements/test.in

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.4 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
-transformers==4.48.2
+transformers==4.48.2
 # quantization
 bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9

requirements/tpu.txt

Lines changed: 6 additions & 6 deletions
@@ -17,9 +17,9 @@ ray[data]
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

tests/compile/test_pass_manager.py

Lines changed: 17 additions & 13 deletions
@@ -4,34 +4,38 @@
 
 import pytest
 import torch
-from torch._inductor.codecache import BypassFxGraphCache
 
-from vllm.compilation.config import CompilationConfig
-from vllm.compilation.inductor_pass import (CallableInductorPass,
-                                            as_inductor_pass)
+from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
+from vllm.config import CompilationConfig
 
 
 def simple_callable(graph: torch.fx.Graph):
     pass
 
 
-@as_inductor_pass(files=(__file__, ))
-def callable_decorated(graph: torch.fx.Graph):
-    pass
+callable_uuid = CallableInductorPass(simple_callable,
+                                     InductorPass.hash_source(__file__))
 
 
 @pytest.mark.parametrize(
     "works, callable",
-    [(False, simple_callable), (True, callable_decorated),
-     (True, CallableInductorPass(simple_callable, "simple_callable"))])
+    [
+        (False, simple_callable),
+        (True, callable_uuid),
+        (True, CallableInductorPass(simple_callable)),
+    ],
+)
 def test_pass_manager(works: bool, callable):
     config = CompilationConfig().pass_config
-    pass_manager = PostGradPassManager([callable])
-    pass_manager.configure(config)  # Adds default passes
 
+    pass_manager = PostGradPassManager()
+    pass_manager.configure(config)
+
+    # Try to add the callable to the pass manager
     if works:
+        pass_manager.add(callable)
         pickle.dumps(pass_manager)
     else:
-        with pytest.raises(BypassFxGraphCache):
-            pickle.dumps(pass_manager)
+        with pytest.raises(AssertionError):
+            pass_manager.add(callable)
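
The rewritten test captures the reworked pass-manager flow: construct, configure, then add custom passes, with invalid callables rejected at `add()` time via `AssertionError` rather than at pickling. A condensed sketch of that flow, using only the calls that appear in the updated test (a sketch, not an official usage guide):

```python
import pickle

import torch

from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import CompilationConfig


def noop_pass(graph: torch.fx.Graph):
    # A trivial custom pass; a real pass would mutate the FX graph.
    pass


config = CompilationConfig().pass_config

pass_manager = PostGradPassManager()
pass_manager.configure(config)

# Wrapping the callable gives it a stable UUID so the pass manager can hash it;
# per the test, adding a bare function instead raises AssertionError.
pass_manager.add(CallableInductorPass(noop_pass,
                                      InductorPass.hash_source(__file__)))

pickle.dumps(pass_manager)  # the configured manager must stay picklable
```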

tests/multimodal/test_processing.py

Lines changed: 100 additions & 3 deletions
@@ -7,19 +7,25 @@
 
 import numpy as np
 import pytest
+import torch
 from transformers import ProcessorMixin
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem,
+                                    MultiModalSharedField)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
-                                        PromptIndexTargets, PromptInsertion,
-                                        PromptReplacement, apply_text_matches,
+                                        ProcessingCache, PromptIndexTargets,
+                                        PromptInsertion, PromptReplacement,
+                                        apply_text_matches,
                                         apply_token_matches,
                                         find_mm_placeholders,
                                         find_text_matches, find_token_matches,
-                                        iter_token_matches)
+                                        iter_token_matches,
+                                        replace_token_matches)
 # yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
 from vllm.transformers_utils.tokenizer import (AnyTokenizer,
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
     assert all(match_len == len(match_ids) for match_len in match_lens)
 
 
+# yapf: disable
+@pytest.mark.parametrize(
+    ("token_ids", "match_ids", "new_ids", "expected"),
+    [
+        ([], [], [-1], []),
+        ([], [32000], [-1], []),
+        (
+            [32000, 32000, 32000],
+            [32000],
+            [-1],
+            [-1, -1, -1],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000],
+            [-1],
+            [-1, 32000],
+        ),
+        (
+            [32000, 32000, 32000],
+            [32000, 32000, 32000],
+            [-1],
+            [-1],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000],
+            [-1],
+            [9833, -1, 32000, 32000, 9833, -1, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 32000, 32000, 32000],
+            [-1],
+            [9833, -1, 9833, 28747, 32000, 32000, 918],
+        ),
+        (
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+            [28747, 0, 32000],
+            [-1],
+            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
+        ),
+    ],
+)
+# yapf: enable
+def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
+    result = replace_token_matches(token_ids, match_ids, new_ids)
+
+    # Manually constructed results
+    assert result == expected
+
+
 # yapf: disable
 @pytest.mark.parametrize(
     ("prompt", "target_by_key", "expected_by_key"),
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
     assert result == expected
 
 
+def _dummy_elem(modality: str, key: str, size: int):
+    return MultiModalFieldElem(
+        modality=modality,
+        key=key,
+        data=torch.empty((size, ), dtype=torch.int8),
+        field=MultiModalSharedField(1),
+    )
+
+
+def _dummy_item(modality: str, size_by_key: dict[str, int]):
+    return MultiModalKwargsItem.from_elems([
+        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
+    ])
+
+
+def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
+    return MultiModalKwargs.from_items([
+        _dummy_item(modality, size_by_key)
+        for modality, size_by_key in size_by_key_modality.items()
+    ])
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("item", "expected_size"),
+    [
+        (_dummy_item("a", {"a1": 100}), 100),
+        (_dummy_item("a", {"a1": 100, "a2": 110}), 210),
+        (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460),  # noqa: E501
+    ],
+)
+# yapf: enable
+def test_cache_item_size(item, expected_size):
+    cache = ProcessingCache.get_lru_cache(2048, type(item))
+    cache[""] = item
+
+    assert cache.currsize == expected_size
+
+
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("limit", "num_supported", "is_valid"),
