[CI/Build] Reorganize models tests #17459

Merged: 10 commits, May 1, 2025

86 changes: 38 additions & 48 deletions .buildkite/test-pipeline.yaml
@@ -390,12 +390,15 @@ steps:
commands:
- pytest -v -s benchmarks/

- label: Quantization Test # 33min
- label: Quantization Test
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- tests/models/quantization
commands:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- pytest -v -s models/quantization

- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -441,82 +444,70 @@ steps:
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'

- label: Language Models Test (Standard) # 32min
- label: Language Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pytest -v -s models/language -m core_model

- label: Language Models Test (Extended) # 1h10min
- label: Language Models Test (Extended)
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 40min
- label: Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Models Test (Extended) 1
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py

- label: Multi-Modal Models Test (Extended) 1 # 48m
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2 # 38m
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/vision_language
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
@@ -586,9 +577,8 @@ steps:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
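
Note on how the reorganized steps above select tests: instead of listing individual files, they lean on pytest markers, e.g. `pytest -m core_model` for the "Standard" lanes and `pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'` for the multi-GPU step. The sketch below is illustrative only — the test names and bodies are invented — and it assumes the markers registered in pyproject.toml (next file) plus whatever -m argument-matching support the repo's pytest setup already relies on; in the real tree the distributed marker is typically applied through a helper such as `multi_gpu_test`, visible further down in this diff.

    # sketch_marker_selection.py -- illustrative only, not part of this PR
    import pytest

    @pytest.mark.core_model
    def test_small_model_smoke():
        # Selected by the "Standard" steps via `pytest -m core_model`;
        # the "Extended" steps run the complement with `-m 'not core_model'`.
        assert True

    @pytest.mark.distributed(num_gpus=2)
    def test_tensor_parallel_generation():
        # Selected only by the 2-GPU step via `-m 'distributed(num_gpus=2)'`
        # and skipped everywhere else.
        assert True
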
1 change: 0 additions & 1 deletion pyproject.toml
@@ -158,7 +158,6 @@ markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"quant_model: run this model test under Quantized category",
"split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
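
One marker that survives this cleanup is `split`, which the Extended multi-modal steps combine with a group number (`-m 'split(group=0) and not core_model'`, `-m 'split(group=1) and not core_model'`). A rough sketch of how a large parametrized module could be sharded that way — the model list and group assignment are made up for illustration, not taken from the repo:

    # sketch_split_marker.py -- illustrative only
    import pytest

    MODELS = ["model-a", "model-b", "model-c", "model-d"]  # hypothetical

    @pytest.mark.parametrize(
        "model",
        [
            # Alternate cases between two CI shards: shard 0 runs
            # `pytest -m 'split(group=0) and not core_model'`, shard 1 uses group=1.
            pytest.param(name, marks=pytest.mark.split(group=idx % 2))
            for idx, name in enumerate(MODELS)
        ],
    )
    def test_model(model):
        assert isinstance(model, str)
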
12 changes: 6 additions & 6 deletions tests/entrypoints/openai/test_embedding.py
@@ -11,7 +11,7 @@
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer

from ...models.embedding.utils import correctness_test
from ...models.utils import run_embedding_correctness_test
from ...utils import RemoteOpenAIServer

MODEL_NAME = "intfloat/multilingual-e5-small"
@@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 11

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs)
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
@@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 33

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs)
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

# test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
@@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
model=model_name,
encoding_format="float")
float_data = [d.embedding for d in responses_float.data]
correctness_test(hf_model, input_texts, float_data)
run_embedding_correctness_test(hf_model, input_texts, float_data)

responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
@@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
np.frombuffer(base64.b64decode(data.embedding),
dtype="float32").tolist())

correctness_test(hf_model, input_texts, base64_data)
run_embedding_correctness_test(hf_model, input_texts, base64_data)

# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
default_data = [d.embedding for d in responses_default.data]
correctness_test(hf_model, input_texts, default_data)
run_embedding_correctness_test(hf_model, input_texts, default_data)


@pytest.mark.asyncio
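
These tests now import `run_embedding_correctness_test` from `tests/models/utils.py` instead of the deleted `tests/models/embedding/utils.py`. As a mental model only — the real helper lives in the repo and may differ — a check matching the call sites above (`(hf_model, prompts, vllm_outputs)` plus an optional `dimensions`) could look like this, assuming `hf_model` exposes a SentenceTransformers-style `encode()` and using an illustrative similarity threshold:

    # sketch only: approximates what an embedding correctness check might do
    from typing import Optional
    import numpy as np

    def run_embedding_correctness_test_sketch(
        hf_model,                          # assumed: .encode(texts) -> list of vectors
        prompts: list[str],
        vllm_outputs: list[list[float]],
        dimensions: Optional[int] = None,
    ) -> None:
        hf_outputs = hf_model.encode(prompts)
        if dimensions is not None:
            # Matryoshka-style truncation to the requested size, then re-normalize.
            hf_outputs = [e[:dimensions] / np.linalg.norm(e[:dimensions]) for e in hf_outputs]
        for hf_emb, vllm_emb in zip(hf_outputs, vllm_outputs):
            hf_emb = np.asarray(hf_emb, dtype=np.float32)
            vllm_emb = np.asarray(vllm_emb, dtype=np.float32)
            cos = float(hf_emb @ vllm_emb /
                        (np.linalg.norm(hf_emb) * np.linalg.norm(vllm_emb)))
            assert cos > 0.99, f"cosine similarity too low: {cos:.4f}"
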
5 changes: 3 additions & 2 deletions tests/entrypoints/openai/test_embedding_dimensions.py
@@ -11,7 +11,7 @@
from vllm.entrypoints.openai.protocol import EmbeddingResponse

from ...conftest import HfRunner
from ...models.embedding.utils import EmbedModelInfo, correctness_test
from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
from ...utils import RemoteOpenAIServer

MODELS = [
@@ -95,7 +95,8 @@ async def make_request_and_correctness_test(dimensions):
assert len(embeddings.data[0].embedding) == dimensions

vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, prompts, vllm_outputs, dimensions)
run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
dimensions)

if model_info.is_matryoshka:
valid_dimensions: list[Optional[int]] = [None]
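
Background on the `dimensions` handling in this test: for Matryoshka-style embedding models (`model_info.is_matryoshka`), a reduced-dimension embedding is conventionally the truncated-and-renormalized prefix of the full vector, which is how the OpenAI-style `dimensions` parameter is generally understood. A small client-side check along those lines — the server URL, model name, and tolerance are placeholders, not values from this test:

    # illustrative check against a locally running vLLM OpenAI server (assumed)
    import numpy as np
    import openai

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    MODEL = "some/matryoshka-embedding-model"  # placeholder name

    full = np.array(client.embeddings.create(model=MODEL, input="hello world").data[0].embedding)
    short = np.array(client.embeddings.create(model=MODEL, input="hello world", dimensions=256).data[0].embedding)

    # Truncate the full embedding, re-normalize, and compare with the server's answer.
    prefix = full[:256] / np.linalg.norm(full[:256])
    print(np.allclose(short, prefix, atol=1e-3))  # expected True for a Matryoshka-capable model
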
66 changes: 0 additions & 66 deletions tests/models/embedding/utils.py

This file was deleted.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
37 changes: 0 additions & 37 deletions tests/models/encoder_decoder/vision_language/test_broadcast.py

This file was deleted.

@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.

Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import Optional

import pytest
Expand Down
@@ -289,23 +289,25 @@ def test_multistep_correctness(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
def test_hybrid_distributed_produces_identical_generation(
@pytest.mark.parametrize("num_logprobs", [5])
def test_distributed_correctness(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=2,
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
max_tokens)
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

with vllm_runner(model, tensor_parallel_size=1,
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
max_tokens)
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

check_outputs_equal(
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
outputs_1_lst=vllm_outputs_tp_2,
name_0="vllm_tp_1",
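
The hybrid-model distributed test now compares logprobs (`check_logprobs_close`) instead of exact greedy strings (`check_outputs_equal`), which tolerates benign tie-breaks between the TP=1 and TP=2 runs. A rough sketch of that idea — the repo's helper in `tests/models/utils.py` is the authority, and the output shape below is an assumption:

    # sketch only: tolerant comparison of two generation runs
    def check_logprobs_close_sketch(outputs_0, outputs_1, name_0="tp1", name_1="tp2"):
        # Each output is assumed to be (token_ids, text, logprobs), where `logprobs`
        # holds one {token_id: logprob} dict per generated position.
        for i, ((ids_0, _, lp_0), (ids_1, _, lp_1)) in enumerate(zip(outputs_0, outputs_1)):
            for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
                if tok_0 == tok_1:
                    continue
                # A divergence is acceptable as long as each run's token was among
                # the other run's top candidates at that position.
                assert tok_0 in lp_1[pos], f"prompt {i} pos {pos}: {name_0} token missing from {name_1} logprobs"
                assert tok_1 in lp_0[pos], f"prompt {i} pos {pos}: {name_1} token missing from {name_0} logprobs"
                break  # sequences are expected to differ after the first divergence
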
@@ -8,7 +8,7 @@
from vllm.config import PoolerConfig
from vllm.platforms import current_platform

from ..utils import check_embeddings_close
from ...utils import check_embeddings_close


@pytest.mark.parametrize(