diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8da43322c5c..13ed64ed00f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -390,12 +390,15 @@ steps:
   commands:
   - pytest -v -s benchmarks/
 
-- label: Quantization Test # 33min
+- label: Quantization Test
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - tests/models/quantization
+  commands:
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - pytest -v -s models/quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -441,82 +444,70 @@ steps:
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
+  - pytest -v -s models/test_utils.py
+  - pytest -v -s models/test_vision.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
-- label: Language Models Test (Standard) # 32min
+- label: Language Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
   - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/language -m core_model
+  - pytest -v -s models/language -m core_model
 
-- label: Language Models Test (Extended) # 1h10min
+- label: Language Models Test (Extended)
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install causal-conv1d
-  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/language -m 'not core_model'
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 40min
+- label: Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/processing
+  - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal
-  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/vision_language -m core_model
-  - pytest -v -s models/encoder_decoder/audio_language -m core_model
-  - pytest -v -s models/encoder_decoder/language -m core_model
-  - pytest -v -s models/encoder_decoder/vision_language -m core_model
-  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
-
-- label: Multi-Modal Models Test (Extended) 1 # 48m
+  - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
-
-- label: Multi-Modal Models Test (Extended) 2 # 38m
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
@@ -586,9 +577,8 @@ steps:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
   # test sequence parallel
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
diff --git a/pyproject.toml b/pyproject.toml
index c85e85b0c82..e51d4c9a4bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -158,7 +158,6 @@ markers = [
     "skip_global_cleanup",
     "core_model: enable this model test in each PR instead of only nightly",
     "cpu_model: enable this model test in CPU tests",
-    "quant_model: run this model test under Quantized category",
     "split: run this test as part of a split",
     "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index 50b20e78c4c..1019bfd5893 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -11,7 +11,7 @@
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.embedding.utils import correctness_test
+from ...models.utils import run_embedding_correctness_test
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
@@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 11
 
     vllm_outputs = [d.embedding for d in embeddings.data]
-    correctness_test(hf_model, input_texts, vllm_outputs)
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
 
     # test using token IDs
     input_tokens = [1, 1, 1, 1, 1]
@@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 33
 
     vllm_outputs = [d.embedding for d in embeddings.data]
-    correctness_test(hf_model, input_texts, vllm_outputs)
+    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
 
     # test list[list[int]]
     input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
@@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                                  model=model_name,
                                                  encoding_format="float")
     float_data = [d.embedding for d in responses_float.data]
-    correctness_test(hf_model, input_texts, float_data)
+    run_embedding_correctness_test(hf_model, input_texts, float_data)
 
     responses_base64 = await client.embeddings.create(input=input_texts,
                                                       model=model_name,
@@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
             np.frombuffer(base64.b64decode(data.embedding),
                           dtype="float32").tolist())
 
-    correctness_test(hf_model, input_texts, base64_data)
+    run_embedding_correctness_test(hf_model, input_texts, base64_data)
 
     # Default response is float32 decoded from base64 by OpenAI Client
     responses_default = await client.embeddings.create(input=input_texts,
                                                        model=model_name)
     default_data = [d.embedding for d in responses_default.data]
-    correctness_test(hf_model, input_texts, default_data)
+    run_embedding_correctness_test(hf_model, input_texts, default_data)
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py
index 9f5a8c6839b..332fa332a4a 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -11,7 +11,7 @@
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 from ...conftest import HfRunner
-from ...models.embedding.utils import EmbedModelInfo, correctness_test
+from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
 from ...utils import RemoteOpenAIServer
 
 MODELS = [
@@ -95,7 +95,8 @@ async def make_request_and_correctness_test(dimensions):
             assert len(embeddings.data[0].embedding) == dimensions
 
             vllm_outputs = [d.embedding for d in embeddings.data]
-            correctness_test(hf_model, prompts, vllm_outputs, dimensions)
+            run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
+                                           dimensions)
 
         if model_info.is_matryoshka:
             valid_dimensions: list[Optional[int]] = [None]
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py
deleted file mode 100644
index 6d4df2c265c..00000000000
--- a/tests/models/embedding/utils.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from collections.abc import Sequence
-from typing import NamedTuple, Optional
-
-import torch
-import torch.nn.functional as F
-
-
-def check_embeddings_close(
-    *,
-    embeddings_0_lst: Sequence[list[float]],
-    embeddings_1_lst: Sequence[list[float]],
-    name_0: str,
-    name_1: str,
-    tol: float = 1e-3,
-) -> None:
-    assert len(embeddings_0_lst) == len(embeddings_1_lst)
-
-    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
-            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1), (
-            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
-
-        sim = F.cosine_similarity(torch.tensor(embeddings_0),
-                                  torch.tensor(embeddings_1),
-                                  dim=0)
-
-        fail_msg = (f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
-                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
-
-        assert sim >= 1 - tol, fail_msg
-
-
-def matryoshka_fy(tensor, dimensions):
-    tensor = torch.tensor(tensor)
-    tensor = tensor[..., :dimensions]
-    tensor = F.normalize(tensor, p=2, dim=1)
-    return tensor
-
-
-class EmbedModelInfo(NamedTuple):
-    name: str
-    is_matryoshka: bool
-    matryoshka_dimensions: Optional[list[int]] = None
-    architecture: str = ""
-    enable_test: bool = True
-
-
-def correctness_test(hf_model,
-                     inputs,
-                     vllm_outputs: Sequence[list[float]],
-                     dimensions: Optional[int] = None):
-
-    hf_outputs = hf_model.encode(inputs)
-    if dimensions:
-        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
-
-    check_embeddings_close(
-        embeddings_0_lst=hf_outputs,
-        embeddings_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-        tol=1e-2,
-    )
diff --git a/tests/models/embedding/vision_language/__init__.py b/tests/models/embedding/vision_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/vision_language/__init__.py b/tests/models/encoder_decoder/vision_language/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py
deleted file mode 100644
index 8d986414eec..00000000000
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-from ....utils import multi_gpu_test
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model", [
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
-])
-def test_models(hf_runner, vllm_runner, image_assets,
-                distributed_executor_backend, model) -> None:
-
-    dtype = "half"
-    max_tokens = 5
-    num_logprobs = 5
-    tensor_parallel_size = 2
-
-    if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
-        from .test_mllama import models, run_test
-    else:
-        raise NotImplementedError(f"Unsupported model: {model}")
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        image_assets,
-        model=models[0],
-        size_factors=[0.25, 0.5, 1.0],
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        tensor_parallel_size=tensor_parallel_size,
-        distributed_executor_backend=distributed_executor_backend,
-    )
diff --git a/tests/models/decoder_only/__init__.py b/tests/models/language/__init__.py
similarity index 100%
rename from tests/models/decoder_only/__init__.py
rename to tests/models/language/__init__.py
diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/language/generation/__init__.py
similarity index 100%
rename from tests/models/decoder_only/audio_language/__init__.py
rename to tests/models/language/generation/__init__.py
diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/language/generation/test_bart.py
similarity index 98%
rename from tests/models/encoder_decoder/language/test_bart.py
rename to tests/models/language/generation/test_bart.py
index e8070d28bef..8ab0167dc77 100644
--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
-
-Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
-""" from typing import Optional import pytest diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/language/generation/test_granite.py similarity index 100% rename from tests/models/decoder_only/language/test_granite.py rename to tests/models/language/generation/test_granite.py diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/language/generation/test_hybrid.py similarity index 96% rename from tests/models/decoder_only/language/test_hybrid.py rename to tests/models/language/generation/test_hybrid.py index e5e0c28ae2d..880967b4aed 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -289,23 +289,25 @@ def test_multistep_correctness( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) -def test_hybrid_distributed_produces_identical_generation( +@pytest.mark.parametrize("num_logprobs", [5]) +def test_distributed_correctness( vllm_runner, example_prompts, model: str, max_tokens: int, + num_logprobs: int, ) -> None: - with vllm_runner(model, tensor_parallel_size=2, + with vllm_runner(model, tensor_parallel_size=1, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, tensor_parallel_size=1, + with vllm_runner(model, tensor_parallel_size=2, max_num_seqs=2) as vllm_model: - vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, - max_tokens) + vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=vllm_outputs_tp_1, outputs_1_lst=vllm_outputs_tp_2, name_0="vllm_tp_1", diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/language/generation/test_mistral.py similarity index 100% rename from tests/models/decoder_only/language/test_mistral.py rename to tests/models/language/generation/test_mistral.py diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/language/generation/test_models.py similarity index 100% rename from tests/models/decoder_only/language/test_models.py rename to tests/models/language/generation/test_models.py diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/language/generation/test_phimoe.py similarity index 100% rename from tests/models/decoder_only/language/test_phimoe.py rename to tests/models/language/generation/test_phimoe.py diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/language/pooling/__init__.py similarity index 100% rename from tests/models/decoder_only/language/__init__.py rename to tests/models/language/pooling/__init__.py diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/language/pooling/test_cls_models.py similarity index 100% rename from tests/models/embedding/language/test_cls_models.py rename to tests/models/language/pooling/test_cls_models.py diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/language/pooling/test_embedding.py similarity index 98% rename from tests/models/embedding/language/test_embedding.py rename to tests/models/language/pooling/test_embedding.py index 5deb35fa321..2a90f47af54 100644 --- a/tests/models/embedding/language/test_embedding.py +++ 
b/tests/models/language/pooling/test_embedding.py @@ -8,7 +8,7 @@ from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ..utils import check_embeddings_close +from ...utils import check_embeddings_close @pytest.mark.parametrize( diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py similarity index 64% rename from tests/models/embedding/language/test_gritlm.py rename to tests/models/language/pooling/test_gritlm.py index 87a1dde9381..3ad6e719094 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -7,11 +7,10 @@ import openai import pytest -import pytest_asyncio from scipy.spatial.distance import cosine -import vllm -import vllm.config +from vllm import LLM, SamplingParams +from vllm.config import ModelConfig from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -31,73 +30,45 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch: pytest.MonkeyPatch): - # GritLM embedding implementation is only supported by XFormers backend. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - - from vllm.model_executor.models.gritlm import GritLMPooler - - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) - - -@pytest.fixture(scope="module") -def server_embedding(): - # GritLM embedding implementation is only supported by XFormers backend. 
- args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="module") -def server_generate(): - args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server +def test_find_array(): + from vllm.model_executor.models.gritlm import GritLMPooler + model_config = ModelConfig( + MODEL_NAME, + task="embed", + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="bfloat16", + seed=0, + ) + pooler = GritLMPooler(model_config=model_config) -@pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 -@pytest_asyncio.fixture -async def client_generate(server_generate: RemoteOpenAIServer): - async with server_generate.get_async_client() as async_client: - yield async_client + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) def run_llm_encode( - llm: vllm.LLM, + llm: LLM, queries: list[str], instruction: str, -) -> list[float]: - outputs = llm.encode([instruction + q for q in queries], ) +) -> list[list[float]]: + outputs = llm.embed([instruction + q for q in queries]) return [output.outputs.embedding for output in outputs] async def run_client_embeddings( - client: vllm.LLM, + client: openai.AsyncOpenAI, queries: list[str], instruction: str, -) -> list[float]: +) -> list[list[float]]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -132,7 +103,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: list[float], d_rep: list[float]): +def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) @@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) + assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch, + vllm_runner): # GritLM embedding implementation is only supported by XFormers backend. 
     with monkeypatch.context() as m:
         m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
 
         queries, q_instruction, documents, d_instruction = get_test_data()
 
-        llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+        with vllm_runner(
+            MODEL_NAME,
+            task="embed",
+            max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
+
+            d_rep = run_llm_encode(
+                llm,
+                documents,
+                d_instruction,
+            )
+            q_rep = run_llm_encode(
+                llm,
+                queries,
+                q_instruction,
+            )
+
+            validate_embed_output(q_rep, d_rep)
+
+
+@pytest.mark.asyncio
+async def test_gritlm_api_server_embedding():
+    queries, q_instruction, documents, d_instruction = get_test_data()
+
+    # GritLM embedding implementation is only supported by XFormers backend.
+    args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+    env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
+        client_embedding = server.get_async_client()
 
-        d_rep = run_llm_encode(
-            llm,
+        d_rep = await run_client_embeddings(
+            client_embedding,
             documents,
             d_instruction,
         )
-        q_rep = run_llm_encode(
-            llm,
+        q_rep = await run_client_embeddings(
+            client_embedding,
             queries,
             q_instruction,
         )
 
-        validate_embed_output(q_rep, d_rep)
-
-
-@pytest.mark.asyncio
-async def test_gritlm_api_server_embedding(
-        client_embedding: openai.AsyncOpenAI, ):
-    queries, q_instruction, documents, d_instruction = get_test_data()
+        validate_embed_output(q_rep, d_rep)
 
-    d_rep = await run_client_embeddings(
-        client_embedding,
-        documents,
-        d_instruction,
-    )
-    q_rep = await run_client_embeddings(
-        client_embedding,
-        queries,
-        q_instruction,
-    )
-
-    validate_embed_output(q_rep, d_rep)
+def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
+    # GritLM embedding implementation is only supported by XFormers backend.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "0")
+        m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
 
+        input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
-def test_gritlm_offline_gen():
-    input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
+        with vllm_runner(
+            MODEL_NAME,
+            task="generate",
+            max_model_len=MAX_MODEL_LEN,
+        ) as vllm_model:
+            llm = vllm_model.model
 
-    llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN)
-    sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
-    outputs = llm.generate(input, sampling_params=sampling_params)
+            sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
+            outputs = llm.generate(input, sampling_params=sampling_params)
 
-    assert outputs[0].outputs[0].text == "The capital of France is Paris."
+            assert outputs[0].outputs[0].text == "The capital of France is Paris."
 
 
 @pytest.mark.asyncio
-async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI):
+async def test_gritlm_api_server_generate():
     input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
 
-    outputs = await client_generate.completions.create(
-        model=MODEL_NAME,
-        prompt=input,
-        max_tokens=256,
-        temperature=0.0,
-    )
+    # GritLM embedding implementation is only supported by XFormers backend.
+ args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_generate = server.get_async_client() + + outputs = await client_generate.completions.create( + model=MODEL_NAME, + prompt=input, + max_tokens=256, + temperature=0.0, + ) assert outputs.choices[0].text == "The capital of France is Paris." diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/language/pooling/test_jina.py similarity index 98% rename from tests/models/embedding/language/test_jina.py rename to tests/models/language/pooling/test_jina.py index 1e234368f3b..154aefe594a 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -8,9 +8,10 @@ import pytest -from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy from vllm import PoolingParams +from ...utils import check_embeddings_close, matryoshka_fy + SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta ] diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/language/pooling/test_scoring.py similarity index 100% rename from tests/models/embedding/language/test_scoring.py rename to tests/models/language/pooling/test_scoring.py diff --git a/tests/models/embedding/language/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py similarity index 97% rename from tests/models/embedding/language/test_snowflake_arctic_embed.py rename to tests/models/language/pooling/test_snowflake_arctic_embed.py index 2b884fceec8..81abc0e9e93 100644 --- a/tests/models/embedding/language/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -5,9 +5,7 @@ """ import pytest -from tests.models.embedding.utils import EmbedModelInfo - -from ..utils import check_embeddings_close +from ...utils import EmbedModelInfo, check_embeddings_close EMBEDDING_PROMPTS = [ 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', diff --git a/tests/models/embedding/language/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py similarity index 100% rename from tests/models/embedding/language/test_truncation_control.py rename to tests/models/language/pooling/test_truncation_control.py diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/multimodal/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/__init__.py rename to tests/models/multimodal/generation/__init__.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/multimodal/generation/test_common.py similarity index 98% rename from tests/models/decoder_only/vision_language/test_models.py rename to tests/models/multimodal/generation/test_common.py index 3dd82b93fae..b21c80bef92 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/multimodal/generation/test_common.py @@ -267,6 +267,7 @@ multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, + dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, @@ -423,6 +424,8 @@ get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], 
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minicpmo_26": VLMTestInfo(
         models=["openbmb/MiniCPM-o-2_6"],
@@ -434,6 +437,8 @@
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minicpmv_26": VLMTestInfo(
         models=["openbmb/MiniCPM-V-2_6"],
@@ -445,6 +450,8 @@
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minimax_vl_01": VLMTestInfo(
         models=["MiniMaxAI/MiniMax-VL-01"],
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py
similarity index 100%
rename from tests/models/encoder_decoder/vision_language/test_florence2.py
rename to tests/models/multimodal/generation/test_florence2.py
diff --git a/tests/models/decoder_only/audio_language/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
similarity index 100%
rename from tests/models/decoder_only/audio_language/test_granite_speech.py
rename to tests/models/multimodal/generation/test_granite_speech.py
diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
similarity index 99%
rename from tests/models/decoder_only/vision_language/test_interleaved.py
rename to tests/models/multimodal/generation/test_interleaved.py
index 8804497ae61..92c8155fe1e 100644
--- a/tests/models/decoder_only/vision_language/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -16,6 +16,7 @@ def base_prompt(modalities_str: str) -> str:
 
 
 NONINTERLEAVED_PROMPT = base_prompt("