[CI/Build] Reorganize models tests #17459

Merged: 10 commits, May 1, 2025

86 changes: 38 additions & 48 deletions .buildkite/test-pipeline.yaml
@@ -390,12 +390,15 @@ steps:
commands:
- pytest -v -s benchmarks/

- label: Quantization Test # 33min
- label: Quantization Test
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- tests/models/quantization
commands:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- pytest -v -s models/quantization

- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -441,82 +444,70 @@ steps:
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'

- label: Language Models Test (Standard) # 32min
- label: Language Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pytest -v -s models/language -m core_model

- label: Language Models Test (Extended) # 1h10min
- label: Language Models Test (Extended)
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 40min
- label: Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Models Test (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py

- label: Multi-Modal Models Test (Extended) 1 # 48m
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2 # 38m
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
@@ -586,9 +577,8 @@ steps:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
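
Note on how the reorganized steps above select tests: instead of listing individual files, they lean on pytest markers, e.g. `pytest -m core_model` for the "Standard" lanes and `pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'` for the multi-GPU step. The sketch below is illustrative only — the test names and bodies are invented — and it assumes the markers registered in pyproject.toml (next file) plus whatever -m argument-matching support the repo's pytest setup already relies on; in the real tree the distributed marker is typically applied through a helper such as `multi_gpu_test`, visible further down in this diff.

    # sketch_marker_selection.py -- illustrative only, not part of this PR
    import pytest

    @pytest.mark.core_model
    def test_small_model_smoke():
        # Selected by the "Standard" steps via `pytest -m core_model`;
        # the "Extended" steps run the complement with `-m 'not core_model'`.
        assert True

    @pytest.mark.distributed(num_gpus=2)
    def test_tensor_parallel_generation():
        # Selected only by the 2-GPU step via `-m 'distributed(num_gpus=2)'`
        # and skipped everywhere else.
        assert True
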
1 change: 0 additions & 1 deletion pyproject.toml
@@ -158,7 +158,6 @@ markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"quant_model: run this model test under Quantized category",
"split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
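
One marker that survives this cleanup is `split`, which the Extended multi-modal steps combine with a group number (`-m 'split(group=0) and not core_model'`, `-m 'split(group=1) and not core_model'`). A rough sketch of how a large parametrized module could be sharded that way — the model list and group assignment are made up for illustration, not taken from the repo:

    # sketch_split_marker.py -- illustrative only
    import pytest

    MODELS = ["model-a", "model-b", "model-c", "model-d"]  # hypothetical

    @pytest.mark.parametrize(
        "model",
        [
            # Alternate cases between two CI shards: shard 0 runs
            # `pytest -m 'split(group=0) and not core_model'`, shard 1 uses group=1.
            pytest.param(name, marks=pytest.mark.split(group=idx % 2))
            for idx, name in enumerate(MODELS)
        ],
    )
    def test_model(model):
        assert isinstance(model, str)
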
12 changes: 6 additions & 6 deletions tests/entrypoints/openai/test_embedding.py
@@ -11,7 +11,7 @@
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer

from ...models.embedding.utils import correctness_test
from ...models.utils import run_embedding_correctness_test
from ...utils import RemoteOpenAIServer

MODEL_NAME = "intfloat/multilingual-e5-small"
@@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 11

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs)
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
@@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 33

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs)
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

# test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
@@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
model=model_name,
encoding_format="float")
float_data = [d.embedding for d in responses_float.data]
correctness_test(hf_model, input_texts, float_data)
run_embedding_correctness_test(hf_model, input_texts, float_data)

responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
@@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
np.frombuffer(base64.b64decode(data.embedding),
dtype="float32").tolist())

correctness_test(hf_model, input_texts, base64_data)
run_embedding_correctness_test(hf_model, input_texts, base64_data)

# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
default_data = [d.embedding for d in responses_default.data]
correctness_test(hf_model, input_texts, default_data)
run_embedding_correctness_test(hf_model, input_texts, default_data)


@pytest.mark.asyncio
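
These tests now import `run_embedding_correctness_test` from `tests/models/utils.py` instead of the deleted `tests/models/embedding/utils.py`. As a mental model only — the real helper lives in the repo and may differ — a check matching the call sites above (`(hf_model, prompts, vllm_outputs)` plus an optional `dimensions`) could look like this, assuming `hf_model` exposes a SentenceTransformers-style `encode()` and using an illustrative similarity threshold:

    # sketch only: approximates what an embedding correctness check might do
    from typing import Optional
    import numpy as np

    def run_embedding_correctness_test_sketch(
        hf_model,                          # assumed: .encode(texts) -> list of vectors
        prompts: list[str],
        vllm_outputs: list[list[float]],
        dimensions: Optional[int] = None,
    ) -> None:
        hf_outputs = hf_model.encode(prompts)
        if dimensions is not None:
            # Matryoshka-style truncation to the requested size, then re-normalize.
            hf_outputs = [e[:dimensions] / np.linalg.norm(e[:dimensions]) for e in hf_outputs]
        for hf_emb, vllm_emb in zip(hf_outputs, vllm_outputs):
            hf_emb = np.asarray(hf_emb, dtype=np.float32)
            vllm_emb = np.asarray(vllm_emb, dtype=np.float32)
            cos = float(hf_emb @ vllm_emb /
                        (np.linalg.norm(hf_emb) * np.linalg.norm(vllm_emb)))
            assert cos > 0.99, f"cosine similarity too low: {cos:.4f}"
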
5 changes: 3 additions & 2 deletions tests/entrypoints/openai/test_embedding_dimensions.py
@@ -11,7 +11,7 @@
from vllm.entrypoints.openai.protocol import EmbeddingResponse

from ...conftest import HfRunner
from ...models.embedding.utils import EmbedModelInfo, correctness_test
from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
from ...utils import RemoteOpenAIServer

MODELS = [
@@ -95,7 +95,8 @@ async def make_request_and_correctness_test(dimensions):
assert len(embeddings.data[0].embedding) == dimensions

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, prompts, vllm_outputs, dimensions)
run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
dimensions)

if model_info.is_matryoshka:
valid_dimensions: list[Optional[int]] = [None]
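
Background on the `dimensions` handling in this test: for Matryoshka-style embedding models (`model_info.is_matryoshka`), a reduced-dimension embedding is conventionally the truncated-and-renormalized prefix of the full vector, which is how the OpenAI-style `dimensions` parameter is generally understood. A small client-side check along those lines — the server URL, model name, and tolerance are placeholders, not values from this test:

    # illustrative check against a locally running vLLM OpenAI server (assumed)
    import numpy as np
    import openai

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    MODEL = "some/matryoshka-embedding-model"  # placeholder name

    full = np.array(client.embeddings.create(model=MODEL, input="hello world").data[0].embedding)
    short = np.array(client.embeddings.create(model=MODEL, input="hello world", dimensions=256).data[0].embedding)

    # Truncate the full embedding, re-normalize, and compare with the server's answer.
    prefix = full[:256] / np.linalg.norm(full[:256])
    print(np.allclose(short, prefix, atol=1e-3))  # expected True for a Matryoshka-capable model
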
66 changes: 0 additions & 66 deletions tests/models/embedding/utils.py

This file was deleted.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
37 changes: 0 additions & 37 deletions tests/models/encoder_decoder/vision_language/test_broadcast.py

This file was deleted.

@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.

Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import Optional

import pytest
Expand Down
@@ -289,23 +289,25 @@ def test_multistep_correctness(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
def test_hybrid_distributed_produces_identical_generation(
@pytest.mark.parametrize("num_logprobs", [5])
def test_distributed_correctness(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=2,
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
max_tokens)
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

with vllm_runner(model, tensor_parallel_size=1,
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
max_tokens)
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

check_outputs_equal(
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
outputs_1_lst=vllm_outputs_tp_2,
name_0="vllm_tp_1",
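
The hybrid-model distributed test now compares logprobs (`check_logprobs_close`) instead of exact greedy strings (`check_outputs_equal`), which tolerates benign tie-breaks between the TP=1 and TP=2 runs. A rough sketch of that idea — the repo's helper in `tests/models/utils.py` is the authority, and the output shape below is an assumption:

    # sketch only: tolerant comparison of two generation runs
    def check_logprobs_close_sketch(outputs_0, outputs_1, name_0="tp1", name_1="tp2"):
        # Each output is assumed to be (token_ids, text, logprobs), where `logprobs`
        # holds one {token_id: logprob} dict per generated position.
        for i, ((ids_0, _, lp_0), (ids_1, _, lp_1)) in enumerate(zip(outputs_0, outputs_1)):
            for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
                if tok_0 == tok_1:
                    continue
                # A divergence is acceptable as long as each run's token was among
                # the other run's top candidates at that position.
                assert tok_0 in lp_1[pos], f"prompt {i} pos {pos}: {name_0} token missing from {name_1} logprobs"
                assert tok_1 in lp_0[pos], f"prompt {i} pos {pos}: {name_1} token missing from {name_0} logprobs"
                break  # sequences are expected to differ after the first divergence
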
@@ -8,7 +8,7 @@
from vllm.config import PoolerConfig
from vllm.platforms import current_platform

from ..utils import check_embeddings_close
from ...utils import check_embeddings_close


@pytest.mark.parametrize(