diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8da43322c5c..13ed64ed00f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -390,12 +390,15 @@ steps:
   commands:
   - pytest -v -s benchmarks/
 
-- label: Quantization Test # 33min
+- label: Quantization Test
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - tests/models/quantization
+  commands:
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - pytest -v -s models/quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -441,82 +444,70 @@ steps:
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
+  - pytest -v -s models/test_utils.py
+  - pytest -v -s models/test_vision.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
-- label: Language Models Test (Standard) # 32min
+- label: Language Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
   - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/language -m core_model
+  - pytest -v -s models/language -m core_model
 
-- label: Language Models Test (Extended) # 1h10min
+- label: Language Models Test (Extended)
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install causal-conv1d
-  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/language -m 'not core_model'
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 40min
+- label: Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/processing
+  - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal
-  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/vision_language -m core_model
-  - pytest -v -s models/encoder_decoder/audio_language -m core_model
-  - pytest -v -s models/encoder_decoder/language -m core_model
-  - pytest -v -s models/encoder_decoder/vision_language -m core_model
-  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
-
-- label: Multi-Modal Models Test (Extended) 1 # 48m
+  - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
-
-- label: Multi-Modal Models Test (Extended) 2 # 38m
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
@@ -586,9 +577,8 @@ steps:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
   # test sequence parallel
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
diff --git a/pyproject.toml b/pyproject.toml
index c85e85b0c82..e51d4c9a4bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -158,7 +158,6 @@ markers = [
     "skip_global_cleanup",
     "core_model: enable this model test in each PR instead of only nightly",
     "cpu_model: enable this model test in CPU tests",
-    "quant_model: run this model test under Quantized category",
     "split: run this test as part of a split",
     "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 50b20e78c4c..1019bfd5893 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -11,7 +11,7 @@
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.embedding.utils import correctness_test
+from ...models.utils import run_embedding_correctness_test
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
@@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 11
 
     vllm_outputs = [d.embedding for d in embeddings.data]
-    correctness_test(hf_model, input_texts, vllm_outputs)
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
 
     # test using token IDs
     input_tokens = [1, 1, 1, 1, 1]
@@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 33
 
     vllm_outputs = [d.embedding for d in embeddings.data]
-    correctness_test(hf_model, input_texts, vllm_outputs)
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
 
     # test list[list[int]]
     input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
@@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                                  model=model_name,
                                                  encoding_format="float")
     float_data = [d.embedding for d in responses_float.data]
-    correctness_test(hf_model, input_texts, float_data)
+    run_embedding_correctness_test(hf_model, input_texts, float_data)
 
     responses_base64 = await client.embeddings.create(input=input_texts,
                                                       model=model_name,
@@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
             np.frombuffer(base64.b64decode(data.embedding),
                           dtype="float32").tolist())
 
-    correctness_test(hf_model, input_texts, base64_data)
+    run_embedding_correctness_test(hf_model, input_texts, base64_data)
 
     # Default response is float32 decoded from base64 by OpenAI Client
     responses_default = await client.embeddings.create(input=input_texts,
                                                        model=model_name)
     default_data = [d.embedding for d in responses_default.data]
-    correctness_test(hf_model, input_texts, default_data)
+    run_embedding_correctness_test(hf_model, input_texts, default_data)
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py
index 9f5a8c6839b..332fa332a4a 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -11,7 +11,7 @@
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 from ...conftest import HfRunner
-from ...models.embedding.utils import EmbedModelInfo, correctness_test
+from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
 from ...utils import RemoteOpenAIServer
 
 MODELS = [
@@ -95,7 +95,8 @@ async def make_request_and_correctness_test(dimensions):
             assert len(embeddings.data[0].embedding) == dimensions
 
             vllm_outputs = [d.embedding for d in embeddings.data]
-            correctness_test(hf_model, prompts, vllm_outputs, dimensions)
+            run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
+                                           dimensions)
 
         if model_info.is_matryoshka:
             valid_dimensions: list[Optional[int]] = [None]
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py
deleted file mode 100644
index 6d4df2c265c..00000000000
--- a/tests/models/embedding/utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from collections.abc import Sequence
-from typing import NamedTuple, Optional
-
-import torch
-import torch.nn.functional as F
-
-
-def check_embeddings_close(
-    *,
-    embeddings_0_lst: Sequence[list[float]],
-    embeddings_1_lst: Sequence[list[float]],
-    name_0: str,
-    name_1: str,
-    tol: float = 1e-3,
-) -> None:
-    assert len(embeddings_0_lst) == len(embeddings_1_lst)
-
-    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
-            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1), (
-            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
-
-        sim = F.cosine_similarity(torch.tensor(embeddings_0),
-                                  torch.tensor(embeddings_1),
-                                  dim=0)
-
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
-                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
-
-        assert sim >= 1 - tol, fail_msg
-
-
-def matryoshka_fy(tensor, dimensions):
-    tensor = torch.tensor(tensor)
-    tensor = tensor[..., :dimensions]
-    tensor = F.normalize(tensor, p=2, dim=1)
-    return tensor
-
-
-class EmbedModelInfo(NamedTuple):
-    name: str
-    is_matryoshka: bool
-    matryoshka_dimensions: Optional[list[int]] = None
-    architecture: str = ""
-    enable_test: bool = True
-
-
-def correctness_test(hf_model,
-                     inputs,
-                     vllm_outputs: Sequence[list[float]],
-                     dimensions: Optional[int] = None):
-
-    hf_outputs = hf_model.encode(inputs)
-    if dimensions:
-        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
-
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
diff --git a/tests/models/embedding/vision_language/__init__.py b/tests/models/embedding/vision_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/vision_language/__init__.py b/tests/models/encoder_decoder/vision_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py
deleted file mode 100644
index 8d986414eec..00000000000
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from ....utils import multi_gpu_test
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model", [
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
-])
-def test_models(hf_runner, vllm_runner, image_assets,
-                distributed_executor_backend, model) -> None:
-
-    dtype = "half"
-    max_tokens = 5
-    num_logprobs = 5
-    tensor_parallel_size = 2
-
-    if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
-        from .test_mllama import models, run_test
-    else:
-        raise NotImplementedError(f"Unsupported model: {model}")
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model=models[0],
-        size_factors=[0.25, 0.5, 1.0],
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )
diff --git a/tests/models/decoder_only/__init__.py b/tests/models/language/__init__.py
similarity index 100%
rename from tests/models/decoder_only/__init__.py
rename to tests/models/language/__init__.py
diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/language/generation/__init__.py
similarity index 100%
rename from tests/models/decoder_only/audio_language/__init__.py
rename to tests/models/language/generation/__init__.py
diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/language/generation/test_bart.py
similarity index 98%
rename from tests/models/encoder_decoder/language/test_bart.py
rename to tests/models/language/generation/test_bart.py
index e8070d28bef..8ab0167dc77 100644
--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
-
-Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
-""" from typing import Optional import pytest diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/language/generation/test_granite.py similarity index 100% rename from tests/models/decoder_only/language/test_granite.py rename to tests/models/language/generation/test_granite.py diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/language/generation/test_hybrid.py similarity index 96% rename from tests/models/decoder_only/language/test_hybrid.py rename to tests/models/language/generation/test_hybrid.py index e5e0c28ae2d..880967b4aed 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -289,23 +289,25 @@ def test_multistep_correctness( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) -def test_hybrid_distributed_produces_identical_generation( +@pytest.mark.parametrize("num_logprobs", [5]) +def test_distributed_correctness( vllm_runner, example_prompts, model: str, max_tokens: int, + num_logprobs: int, ) -> None: - with vllm_runner(model, tensor_parallel_size=2, + with vllm_runner(model, tensor_parallel_size=1, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, tensor_parallel_size=1, + with vllm_runner(model, tensor_parallel_size=2, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=vllm_outputs_tp_1, outputs_1_lst=vllm_outputs_tp_2, name_0="vllm_tp_1", diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/language/generation/test_mistral.py similarity index 100% rename from tests/models/decoder_only/language/test_mistral.py rename to tests/models/language/generation/test_mistral.py diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/language/generation/test_models.py similarity index 100% rename from tests/models/decoder_only/language/test_models.py rename to tests/models/language/generation/test_models.py diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/language/generation/test_phimoe.py similarity index 100% rename from tests/models/decoder_only/language/test_phimoe.py rename to tests/models/language/generation/test_phimoe.py diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/language/pooling/__init__.py similarity index 100% rename from tests/models/decoder_only/language/__init__.py rename to tests/models/language/pooling/__init__.py diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/language/pooling/test_cls_models.py similarity index 100% rename from tests/models/embedding/language/test_cls_models.py rename to tests/models/language/pooling/test_cls_models.py diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/language/pooling/test_embedding.py similarity index 98% rename from tests/models/embedding/language/test_embedding.py rename to tests/models/language/pooling/test_embedding.py index 5deb35fa321..2a90f47af54 100644 --- a/tests/models/embedding/language/test_embedding.py +++ 
b/tests/models/language/pooling/test_embedding.py @@ -8,7 +8,7 @@ from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ..utils import check_embeddings_close +from ...utils import check_embeddings_close @pytest.mark.parametrize( diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py similarity index 64% rename from tests/models/embedding/language/test_gritlm.py rename to tests/models/language/pooling/test_gritlm.py index 87a1dde9381..3ad6e719094 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -7,11 +7,10 @@ import openai import pytest -import pytest_asyncio from scipy.spatial.distance import cosine -import vllm -import vllm.config +from vllm import LLM, SamplingParams +from vllm.config import ModelConfig from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -31,73 +30,45 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch: pytest.MonkeyPatch): - # GritLM embedding implementation is only supported by XFormers backend. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - - from vllm.model_executor.models.gritlm import GritLMPooler - - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) - - -@pytest.fixture(scope="module") -def server_embedding(): - # GritLM embedding implementation is only supported by XFormers backend. 
- args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="module") -def server_generate(): - args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server +def test_find_array(): + from vllm.model_executor.models.gritlm import GritLMPooler + model_config = ModelConfig( + MODEL_NAME, + task="embed", + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="bfloat16", + seed=0, + ) + pooler = GritLMPooler(model_config=model_config) -@pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 -@pytest_asyncio.fixture -async def client_generate(server_generate: RemoteOpenAIServer): - async with server_generate.get_async_client() as async_client: - yield async_client + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) def run_llm_encode( - llm: vllm.LLM, + llm: LLM, queries: list[str], instruction: str, -) -> list[float]: - outputs = llm.encode([instruction + q for q in queries], ) +) -> list[list[float]]: + outputs = llm.embed([instruction + q for q in queries]) return [output.outputs.embedding for output in outputs] async def run_client_embeddings( - client: vllm.LLM, + client: openai.AsyncOpenAI, queries: list[str], instruction: str, -) -> list[float]: +) -> list[list[float]]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -132,7 +103,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: list[float], d_rep: list[float]): +def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) @@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) + assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch, + vllm_runner): # GritLM embedding implementation is only supported by XFormers backend. 
     with monkeypatch.context() as m:
         m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
 
         queries, q_instruction, documents, d_instruction = get_test_data()
 
-        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+        with vllm_runner(
+            MODEL_NAME,
+            task="embed",
+            max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
+
+            d_rep = run_llm_encode(
+                llm,
+                documents,
+                d_instruction,
+            )
+            q_rep = run_llm_encode(
+                llm,
+                queries,
+                q_instruction,
+            )
+
+            validate_embed_output(q_rep, d_rep)
+
+
+@pytest.mark.asyncio
+async def test_gritlm_api_server_embedding():
+    queries, q_instruction, documents, d_instruction = get_test_data()
+
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_embedding = server.get_async_client()
 
-        d_rep = run_llm_encode(
-            llm,
+        d_rep = await run_client_embeddings(
+            client_embedding,
             documents,
             d_instruction,
         )
-        q_rep = run_llm_encode(
-            llm,
+        q_rep = await run_client_embeddings(
+            client_embedding,
             queries,
             q_instruction,
         )
 
-        validate_embed_output(q_rep, d_rep)
-
-
-@pytest.mark.asyncio
-async def test_gritlm_api_server_embedding(
-        client_embedding: openai.AsyncOpenAI, ):
-    queries, q_instruction, documents, d_instruction = get_test_data()
+        validate_embed_output(q_rep, d_rep)
 
-    d_rep = await run_client_embeddings(
-        client_embedding,
-        documents,
-        d_instruction,
-    )
-    q_rep = await run_client_embeddings(
-        client_embedding,
-        queries,
-        q_instruction,
-    )
-
-    validate_embed_output(q_rep, d_rep)
+def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
+    # GritLM embedding implementation is only supported by XFormers backend.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
 
+        input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
-def test_gritlm_offline_gen():
-    input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
+        with vllm_runner(
+            MODEL_NAME,
+            task="generate",
+            max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
 
-    llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN)
-    sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
-    outputs = llm.generate(input, sampling_params=sampling_params)
+            sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
+            outputs = llm.generate(input, sampling_params=sampling_params)
 
-    assert outputs[0].outputs[0].text == "The capital of France is Paris."
+            assert outputs[0].outputs[0].text == "The capital of France is Paris."
 
 
 @pytest.mark.asyncio
-async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI):
+async def test_gritlm_api_server_generate():
     input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
-    outputs = await client_generate.completions.create(
-        model=MODEL_NAME,
-        prompt=input,
-        max_tokens=256,
-        temperature=0.0,
-    )
+    # GritLM embedding implementation is only supported by XFormers backend.
+ args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_generate = server.get_async_client() + + outputs = await client_generate.completions.create( + model=MODEL_NAME, + prompt=input, + max_tokens=256, + temperature=0.0, + ) assert outputs.choices[0].text == "The capital of France is Paris." diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/language/pooling/test_jina.py similarity index 98% rename from tests/models/embedding/language/test_jina.py rename to tests/models/language/pooling/test_jina.py index 1e234368f3b..154aefe594a 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -8,9 +8,10 @@ import pytest -from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy from vllm import PoolingParams +from ...utils import check_embeddings_close, matryoshka_fy + SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta ] diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/language/pooling/test_scoring.py similarity index 100% rename from tests/models/embedding/language/test_scoring.py rename to tests/models/language/pooling/test_scoring.py diff --git a/tests/models/embedding/language/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py similarity index 97% rename from tests/models/embedding/language/test_snowflake_arctic_embed.py rename to tests/models/language/pooling/test_snowflake_arctic_embed.py index 2b884fceec8..81abc0e9e93 100644 --- a/tests/models/embedding/language/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -5,9 +5,7 @@ """ import pytest -from tests.models.embedding.utils import EmbedModelInfo - -from ..utils import check_embeddings_close +from ...utils import EmbedModelInfo, check_embeddings_close EMBEDDING_PROMPTS = [ 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', diff --git a/tests/models/embedding/language/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py similarity index 100% rename from tests/models/embedding/language/test_truncation_control.py rename to tests/models/language/pooling/test_truncation_control.py diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/multimodal/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/__init__.py rename to tests/models/multimodal/generation/__init__.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/multimodal/generation/test_common.py similarity index 98% rename from tests/models/decoder_only/vision_language/test_models.py rename to tests/models/multimodal/generation/test_common.py index 3dd82b93fae..b21c80bef92 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/multimodal/generation/test_common.py @@ -267,6 +267,7 @@ multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, + dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, @@ -423,6 +424,8 @@ get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], 
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minicpmo_26": VLMTestInfo(
         models=["openbmb/MiniCPM-o-2_6"],
@@ -434,6 +437,8 @@
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minicpmv_26": VLMTestInfo(
         models=["openbmb/MiniCPM-V-2_6"],
@@ -445,6 +450,8 @@
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minimax_vl_01": VLMTestInfo(
         models=["MiniMaxAI/MiniMax-VL-01"],
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py
similarity index 100%
rename from tests/models/encoder_decoder/vision_language/test_florence2.py
rename to tests/models/multimodal/generation/test_florence2.py
diff --git a/tests/models/decoder_only/audio_language/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
similarity index 100%
rename from tests/models/decoder_only/audio_language/test_granite_speech.py
rename to tests/models/multimodal/generation/test_granite_speech.py
diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/test_interleaved.py
rename to tests/models/multimodal/generation/test_interleaved.py
index 8804497ae61..92c8155fe1e 100644
--- a/tests/models/decoder_only/vision_language/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -16,6 +16,7 @@ def base_prompt(modalities_str: str) -> str:
 
 
 NONINTERLEAVED_PROMPT = base_prompt("