From d502e50f44491d24189433f30698635fa5a07bcf Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 23 Apr 2025 17:12:05 -0700
Subject: [PATCH 01/10] Remove sampler from model code

Signed-off-by: Woosuk Kwon
---
 vllm/model_executor/models/arctic.py          | 10 ----------
 vllm/model_executor/models/aria.py            | 12 +-----------
 vllm/model_executor/models/aya_vision.py      | 16 ----------------
 vllm/model_executor/models/baichuan.py        | 10 ----------
 vllm/model_executor/models/bamba.py           | 10 ----------
 vllm/model_executor/models/bart.py            | 10 ----------
 vllm/model_executor/models/blip2.py           | 18 +-----------------
 vllm/model_executor/models/bloom.py           | 10 ----------
 vllm/model_executor/models/chameleon.py       | 10 ----------
 vllm/model_executor/models/chatglm.py         | 10 ----------
 vllm/model_executor/models/commandr.py        | 10 ----------
 vllm/model_executor/models/dbrx.py            | 10 ----------
 vllm/model_executor/models/deepseek.py        | 10 ----------
 vllm/model_executor/models/deepseek_mtp.py    | 11 -----------
 vllm/model_executor/models/deepseek_v2.py     | 10 ----------
 vllm/model_executor/models/deepseek_vl2.py    | 16 ----------------
 vllm/model_executor/models/eagle.py           | 13 -------------
 vllm/model_executor/models/exaone.py          | 11 -----------
 vllm/model_executor/models/falcon.py          | 10 ----------
 vllm/model_executor/models/florence2.py       | 21 ---------------------
 vllm/model_executor/models/fuyu.py            | 13 -------------
 vllm/model_executor/models/gemma.py           | 10 ----------
 vllm/model_executor/models/gemma2.py          | 10 ----------
 vllm/model_executor/models/gemma3.py          | 10 ----------
 vllm/model_executor/models/gemma3_mm.py       | 16 ++--------------
 vllm/model_executor/models/glm4.py            | 10 ----------
 vllm/model_executor/models/gpt2.py            | 10 ----------
 vllm/model_executor/models/gpt_bigcode.py     | 10 ----------
 vllm/model_executor/models/gpt_j.py           | 10 ----------
 vllm/model_executor/models/gpt_neox.py        | 10 ----------
 vllm/model_executor/models/granite.py         | 11 -----------
 vllm/model_executor/models/grok1.py           | 10 ----------
 vllm/model_executor/models/interfaces_base.py |  9 ---------
 vllm/model_executor/models/llama.py           |  8 --------
 vllm/model_executor/models/mixtral.py         | 10 ----------
 vllm/model_executor/models/mixtral_quant.py   | 10 ----------
 vllm/model_executor/models/molmo.py           | 12 +-----------
 vllm/model_executor/models/opt.py             | 10 ----------
 vllm/model_executor/models/paligemma.py       | 14 +-------------
 vllm/model_executor/models/phi.py             | 10 ----------
 vllm/model_executor/models/phi4mm.py          | 10 ----------
 vllm/model_executor/models/phimoe.py          | 10 ----------
 vllm/model_executor/models/pixtral.py         | 15 ---------------
 vllm/model_executor/models/qwen.py            | 10 ----------
 vllm/model_executor/models/qwen2.py           | 10 ----------
 .../models/qwen2_5_omni_thinker.py            | 17 +----------------
 vllm/model_executor/models/qwen2_5_vl.py      | 17 +----------------
 vllm/model_executor/models/qwen2_audio.py     | 16 ----------------
 vllm/model_executor/models/qwen2_moe.py       | 10 ----------
 vllm/model_executor/models/qwen2_vl.py        | 17 +----------------
 vllm/model_executor/models/qwen3.py           | 10 ----------
 vllm/model_executor/models/qwen3_moe.py       | 10 ----------
 vllm/model_executor/models/solar.py           | 11 -----------
 vllm/model_executor/models/stablelm.py        | 10 ----------
 vllm/model_executor/models/starcoder2.py      | 10 ----------
 vllm/model_executor/models/ultravox.py        | 16 ----------------
 vllm/model_executor/models/whisper.py         | 10 ----------
 vllm/model_executor/models/zamba2.py          | 19 -------------------
 58 files changed, 9 insertions(+), 670 deletions(-)
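Note for reviewers (annotation area after the "---" separator; `git am` does
not apply text here): every file below follows the same mechanical pattern,
summarized once so the per-file hunks are easier to skim. Before this patch,
each model built its own sampler in `__init__` and exposed a `sample()` method
that merely delegated to it. Roughly, as an illustrative sketch assembled from
the hunks below rather than a quote of any single file:

    # removed from each model's __init__
    self.sampler = get_sampler()

    # removed delegating method (sampling_metadata is a SamplingMetadata)
    def sample(self, logits, sampling_metadata):
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

Composite multimodal models (blip2, paligemma, pixtral, and others) also drop
a `sampler` property that forwarded to `self.language_model.sampler`, and
interfaces_base.py removes `sample()` from the text-generation interface, so
implementing it is no longer part of the model contract. A model's
responsibility now ends at `compute_logits()`; token sampling is presumably
performed by the engine/model-runner layer instead (this patch shows only the
removal side).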
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 065715cbde4..dfe8f20c70d 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -24,7 +24,6 @@
 from vllm.model_executor.layers.quantization.deepspeedfp import (
     DeepSpeedFPConfig, DeepSpeedFPParameter)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -435,7 +434,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.unpadded_vocab_size = config.vocab_size
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -462,14 +460,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index edf67c860e9..7c716efab8e 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -15,11 +15,10 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import (SamplerOutput,
-                                                SamplingMetadata, get_sampler)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargs)
@@ -527,7 +526,6 @@ def __init__(
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 self.vocab_size, logit_scale)
-        self.sampler = get_sampler()
 
     def _validate_image_sizes(
             self, images: List[torch.Tensor]) -> List[torch.Tensor]:
@@ -653,14 +651,6 @@ def compute_logits(self, hidden_states: torch.Tensor,
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 8700f24d2bd..d152287e8fa 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from
 # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
-from functools import cached_property
 from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set,
                     Tuple, TypedDict, Union, cast)
@@ -17,7 +16,6 @@
 from vllm.config import VllmConfig
 from vllm.jsontree import json_map_leaves
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
 
@@ -461,17 +459,3 @@ def compute_logits(
     ) -> Optional[torch.Tensor]:
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
-
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-
-        return get_sampler()
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 6a3112b5f76..444ed38d05c 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -39,7 +39,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -396,7 +395,6 @@ def __init__(
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -423,14 +421,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index dfb8f49cc01..16dac6123d6 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -24,7 +24,6 @@
     MambaMixer2, extra_groups_for_head_shards)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -462,7 +461,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -538,14 +536,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 04d6cde555e..bcfbe92c3a1 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -37,7 +37,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -791,7 +790,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
 
     def forward(
         self,
@@ -828,14 +826,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     stacked_params_mapping = {
         "q_proj": {
             "param_name": "qkv_proj",
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index a6f00f99977..17c857ce096 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
@@ -12,7 +11,6 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -530,13 +528,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-
-        return get_sampler()
-
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
         h = w = self.config.vision_config.image_size
         expected_dims = (3, h, w)
@@ -649,7 +640,7 @@ def forward(
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
-    ) -> Union[SamplerOutput, IntermediateTensors]:
+    ) -> IntermediateTensors:
         """Run forward pass for BLIP-2.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -707,13 +698,6 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index f960075b98b..74d401b295c 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -35,7 +35,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -297,7 +296,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 self.config.hidden_size)
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -324,14 +322,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 0ad5e89df2e..e2c275300f8 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -22,7 +22,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -950,7 +949,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size, logit_scale)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -1054,14 +1052,6 @@ def compute_logits(
 
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 1b1738f882b..233e9ee0a25 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -21,7 +21,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -429,7 +428,6 @@ def __init__(
                 self.transformer.embedding.weight)
         self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config.padded_vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -442,14 +440,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index bb8d9bf8a03..090c6e46439 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -38,7 +38,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -365,7 +364,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                                 scale=config.logit_scale)
         self.model = CohereModel(vllm_config=vllm_config,
                                  prefix=maybe_prefix(prefix, "model"))
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -399,14 +397,6 @@ def compute_logits(
 
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index b66529860bc..40c0a73f52d 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -16,7 +16,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -390,7 +389,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -417,14 +415,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         expert_params_mapping = [(
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 5e036d049a8..c6421143dd6 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -43,7 +43,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -453,7 +452,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -480,14 +478,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index e7fde76cd0b..b50175cf764 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -10,7 +10,6 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -154,8 +153,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                              prefix=maybe_prefix(
                                                  prefix, "model"))
 
-        self.sampler = get_sampler()
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -179,14 +176,6 @@ def compute_logits(
         return self.model.compute_logits(hidden_states, sampling_metadata,
                                          spec_step_idx)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 23b450aedda..ffa5840b460 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -44,7 +44,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -686,7 +685,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -713,14 +711,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def make_empty_intermediate_tensors(
             self, batch_size: int, dtype: torch.dtype,
             device: torch.device) -> IntermediateTensors:
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index c3dbadb2927..ac136698ee1 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -4,7 +4,6 @@
 """Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
@@ -16,7 +15,6 @@
 from vllm.config import VllmConfig
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -393,13 +391,6 @@ def _init_vision_module(
         model = model.to(dtype=torch.get_default_dtype())
         return model
 
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-
-        return get_sampler()
-
     def _validate_pixel_values(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
@@ -647,13 +638,6 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
 
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index 3e4a5040b7c..4ff1e785494 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -9,7 +9,6 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -131,10 +130,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # checkpoint file has token_map tensor.
         self.token_map = None
 
-    @property
-    def sampler(self):
-        return self.model.sampler
-
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.model.get_input_embeddings(input_ids)
 
@@ -188,14 +183,6 @@ def compute_logits(self, hidden_states: torch.Tensor,
 
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
         # due to missing lm_head weights and its config being that of a
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 553c524ebc3..4a6490cd127 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -41,7 +41,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -510,8 +509,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         else:
             self.lm_head = PPMissingLayer()
 
-        self.sampler = get_sampler()
-
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -538,14 +535,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 0e67b1ec94f..e7e03fc0997 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -40,7 +40,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -473,7 +472,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -500,14 +498,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 359cc7f3773..d1a36c3f481 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -3,7 +3,6 @@
 import math
 from collections import OrderedDict
 from collections.abc import Iterable, Mapping, Sequence
-from functools import cached_property
 from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
 
 import torch
@@ -14,7 +13,6 @@
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
                                              BartParallelLMHead,
@@ -673,7 +671,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.logits_processor = LogitsProcessor(self.vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
 
     def forward(
         self,
@@ -716,11 +713,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(self, logits: torch.Tensor,
-               sampling_metadata: SamplingMetadata) -> SamplerOutput:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
@@ -929,12 +921,6 @@ def _build_image_projection_layers(self, config: PretrainedConfig):
             raise NotImplementedError(
                 'Florence2 only supports COSINE as temporal embedding.')
 
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-        return get_sampler()
-
     def _validate_pixel_values(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
@@ -1110,13 +1096,6 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> SamplerOutput:
-        return self.language_model.sample(logits, sampling_metadata)
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 27cd8d0986a..d6bd6155a44 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -27,7 +27,6 @@
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -270,10 +269,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    @property
-    def sampler(self):
-        return self.language_model.sampler
-
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
 
         h = w = self.config.patch_size
@@ -387,14 +382,6 @@ def compute_logits(
             self.language_model.lm_head, hidden_states, sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.language_model.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 92d99883c77..c1cc0df1117 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -35,7 +35,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -388,7 +387,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = GemmaModel(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -415,14 +413,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index d125c666f3c..35e698fca41 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -34,7 +34,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -388,7 +387,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                 prefix=maybe_prefix(prefix, "model"))
         self.logits_processor = LogitsProcessor(
             config.vocab_size, soft_cap=config.final_logit_softcapping)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -415,14 +413,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index fb8eccc5507..34957300ff9 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -34,7 +34,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -493,7 +492,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                 prefix=maybe_prefix(prefix, "model"))
         self.logits_processor = LogitsProcessor(
             config.vocab_size, soft_cap=config.final_logit_softcapping)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -521,14 +519,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index e5a3d6762ff..6d3e6e389c4 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
+from typing import Any, Literal, Optional, Set, Tuple, TypedDict
 
 import torch
 from torch import nn
@@ -12,7 +12,6 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -503,10 +502,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
     def dtype(self):
         return next(self.parameters()).dtype
 
-    @property
-    def sampler(self):
-        return self.language_model.sampler
-
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
         h = w = self.config.vision_config.image_size
         expected_dims = (3, h, w)
@@ -607,7 +602,7 @@ def forward(self,
                 positions: torch.Tensor,
                 intermediate_tensors: Optional[IntermediateTensors] = None,
                 inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]:
+                **kwargs: object) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -704,13 +699,6 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 28cebfbd7ba..290be968cb5 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -37,7 +37,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
@@ -267,7 +266,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = PPMissingLayer()
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -295,14 +293,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 776c03f652b..e3219333915 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -35,7 +35,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -255,7 +254,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = self.lm_head.tie_weights(self.transformer.wte)
 
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -282,14 +280,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         params_dict = dict(self.named_parameters(remove_duplicate=False))
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 40d01a2ecfc..def6b1544d8 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -35,7 +35,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -302,7 +301,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -329,14 +327,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 1f73d2ab0f1..3db96fb8e18 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -34,7 +34,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -306,7 +305,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             quant_config=quant_config,
         )
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
@@ -333,14 +331,6 @@ def compute_logits(
                                        sampling_metadata, self.lm_head.bias)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 582b2ff7e75..620ee66f57e 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -34,7 +34,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -299,7 +298,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if self.config.tie_word_embeddings:
             self.embed_out.weight = self.gpt_neox.embed_in.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.gpt_neox.make_empty_intermediate_tensors)
 
@@ -326,14 +324,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 3bd6332c11c..0696a7245c2 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -41,7 +41,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -441,8 +440,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         else:
             self.lm_head = PPMissingLayer()
 
-        self.sampler = get_sampler()
-
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
@@ -464,14 +461,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def make_empty_intermediate_tensors(
             self, batch_size: int, dtype: torch.dtype,
             device: torch.device) -> IntermediateTensors:
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index ef96257ba4b..c48cb157084 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -39,7 +39,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -521,7 +520,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                                 config.vocab_size,
                                                 self.output_multiplier_scale)
 
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -551,14 +549,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         skip_prefixes = ["rotary_emb.inv_freq"]
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 22c9287509e..f141dcf3cd4 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -13,7 +13,6 @@
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.layers.pooler import PoolerOutput
-    from vllm.model_executor.layers.sampler import SamplerOutput
     from vllm.model_executor.pooling_metadata import PoolingMetadata
     from vllm.model_executor.sampling_metadata import SamplingMetadata
 
@@ -103,14 +102,6 @@ def compute_logits(
         """Return `None` if TP rank > 0."""
         ...
 
-    def sample(
-        self,
-        logits: T,
-        sampling_metadata: "SamplingMetadata",
-    ) -> "SamplerOutput":
-        """Only called on TP rank 0."""
-        ...
-
 
 @overload
 def is_text_generation_model(
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index caa4a5108a9..31ffa4e1e63 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -40,7 +40,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -515,8 +514,6 @@ def __init__(self,
         else:
             self.lm_head = PPMissingLayer()
 
-        self.sampler = get_sampler()
-
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -551,11 +548,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(self, logits: torch.Tensor,
-               sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index b0ac99f21ea..1513c8dad09 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -40,7 +40,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -454,7 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -481,14 +479,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"])
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 96eb925cf89..7c022a5b8f6 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -42,7 +42,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -372,7 +371,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -399,14 +397,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index d75845b45e7..46147a333b0 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -35,7 +35,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -1394,7 +1393,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.logits_processor = LogitsProcessor(config.embedding_size
                                                 or config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -1506,7 +1504,7 @@ def forward(
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -1532,14 +1530,6 @@ def compute_logits(self, hidden_states: torch.Tensor,
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
 
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 14aae2fbf81..d258eddae25 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -35,7 +35,6 @@
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -375,7 +374,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head = ParallelLMHead(config.vocab_size,
                                           config.word_embed_proj_dim)
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -402,14 +400,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 6c1bd499f63..8699ae52622 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -8,7 +8,6 @@
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
@@ -260,10 +259,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    @property
-    def sampler(self):
-        return self.language_model.sampler
-
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
         h = w = self.config.vision_config.image_size
         expected_dims = (3, h, w)
@@ -369,7 +364,7 @@ def forward(self,
                 positions: torch.Tensor,
                 intermediate_tensors: Optional[IntermediateTensors] = None,
                 inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]:
+                **kwargs: object) -> IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -396,13 +391,6 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states,
                                                   sampling_metadata)
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        return self.language_model.sample(logits, sampling_metadata)
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index fdf7734595a..fc2b108bad9 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -53,7 +53,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -322,7 +321,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                       bias=True,
                                       quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -350,14 +348,6 @@ def compute_logits(
                                        sampling_metadata, self.lm_head.bias)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 03ca143f9c0..6035994f433 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -13,7 +13,6 @@
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
 from vllm.model_executor.models.llama import LlamaModel
@@ -968,7 +967,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size, logit_scale)
-        self.sampler = get_sampler()
 
     def _parse_and_validate_audio_input(
             self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
@@ -1244,14 +1242,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> None:
         weights = ((name, data) for name, data in weights
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 381a33d98b9..2dc55e4c352 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -40,7 +40,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -634,7 +633,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
-        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
@@ -659,14 +657,6 @@ def compute_logits(self, hidden_states: torch.Tensor,
                                        sampling_metadata)
         return logits
 
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 38e140a91ec..58323d639d5 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -28,7 +28,6 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
@@ -331,13 +330,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    @cached_property
-    def sampler(self):
-        if hasattr(self.language_model, "sampler"):
-            return self.language_model.sampler
-
-        return get_sampler()
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) ->
Optional[PixtralImagePixelInputs]: images = kwargs.pop("images", None) @@ -441,13 +433,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index a33739a8eef..e75294bc6cb 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -273,7 +272,6 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -286,14 +284,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2831a5a1233..f76f31c9fc8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -450,7 +449,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -478,14 +476,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 19bdae1de70..93c21fc55c5 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -22,7 +22,7 @@ """Inference-only Qwen2.5-Omni model (thinker 
part).""" from copy import copy -from functools import cached_property, partial +from functools import partial from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union) @@ -40,7 +40,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -790,13 +789,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} @@ -937,13 +929,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0ab55411bad..76175d79fb5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,7 +24,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" -from functools import cached_property, partial +from functools import partial from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -51,7 +51,6 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -833,13 +832,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. 
@@ -1127,13 +1119,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 280cda0f68f..0cb541c6cbb 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -22,7 +22,6 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, Optional, Set, Tuple, TypedDict, Union import torch @@ -34,7 +33,6 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -267,13 +265,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): @@ -405,13 +396,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2700c706b97..62696678b7a 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -497,7 +496,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -524,14 +522,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, 
weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8c24b8f7df5..8c2895ce9d1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -24,7 +24,7 @@ # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property, partial +from functools import partial from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, Union) @@ -51,7 +51,6 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -1112,13 +1111,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. @@ -1400,13 +1392,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 9c14038e611..73d2838f461 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -283,7 +282,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -311,14 +309,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f0ef79dfdfe..70f9956e3ef 100644 --- 
a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -494,7 +493,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -521,14 +519,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 1cae0a7fe0d..f86aff7ba7e 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -418,8 +417,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -440,14 +437,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 53f520304ab..13460d1dfd4 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -310,7 +309,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight 
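
The context lines around these hunks show what each model keeps: an `lm_head`, frequently weight-tied to the input embeddings (the `self.lm_head.weight = self.model.embed_tokens.weight` assignments above), and the `LogitsProcessor` that `compute_logits()` drives. A runnable toy of the tying relationship, with plain `nn.Embedding`/`nn.Linear` standing in for vLLM's `VocabParallelEmbedding`/`ParallelLMHead`:

    import torch
    import torch.nn as nn

    vocab, hidden = 32, 8
    embed_tokens = nn.Embedding(vocab, hidden)   # input embedding matrix E
    lm_head = nn.Linear(hidden, vocab, bias=False)
    lm_head.weight = embed_tokens.weight         # tie: logits = h @ E^T

    h = torch.randn(3, hidden)                   # three hidden states
    logits = lm_head(h)                          # what compute_logits() yields
    assert logits.shape == (3, vocab)

Tying means the sampler removal touches nothing about the weights themselves; only the post-logits step changes owner.
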
self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -337,14 +335,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8b9fb7cb7bc..6eebe4c4d61 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -317,7 +316,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -344,14 +342,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index cb5ff4ed636..bfa48099b74 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,6 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -18,7 +17,6 @@ from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -438,13 +436,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def get_mm_mapping(self) -> MultiModelKeys: """ Get the module prefix in multimodal models @@ -628,13 +619,6 @@ def compute_logits(self, hidden_states: torch.Tensor, return 
self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 63e71f26880..908cd7885aa 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -669,7 +668,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() def forward( self, @@ -724,14 +722,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index ea21fffaede..d34033e3ac9 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -31,7 +31,6 @@ MambaMixer2, extra_groups_for_head_shards) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -870,7 +869,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # Initialize logits processing and sampling self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: """Convert input token IDs to embeddings. @@ -1004,23 +1002,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - """Sample next tokens from computed logits. 
-
-        Args:
-            logits: Computed logits for next token prediction
-            sampling_metadata: Metadata for sampling process
-
-        Returns:
-            Sampled tokens and related sampling information
-        """
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
     def load_weights(self,
                      weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)

From 7c60c99612f5c6980d8b46813731418ceeff90b8 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 23 Apr 2025 17:25:30 -0700
Subject: [PATCH 02/10] Remove sampler from model code

Signed-off-by: Woosuk Kwon
---
 vllm/model_executor/models/granitemoe.py       | 11 -----------
 vllm/model_executor/models/granitemoeshared.py | 11 -----------
 vllm/model_executor/models/idefics3.py         | 10 ----------
 vllm/model_executor/models/internlm2.py        | 12 +-----------
 vllm/model_executor/models/internvl.py         | 18 +-----------------
 vllm/model_executor/models/jais.py             | 10 ----------
 vllm/model_executor/models/jamba.py            | 10 ----------
 vllm/model_executor/models/kimi_vl.py          | 12 +-----------
 vllm/model_executor/models/llava.py            | 16 ----------------
 vllm/model_executor/models/llava_next.py       | 16 ----------------
 vllm/model_executor/models/llava_next_video.py | 16 ----------------
 vllm/model_executor/models/llava_onevision.py  | 16 ----------------
 vllm/model_executor/models/mamba.py            | 10 ----------
 vllm/model_executor/models/mamba2.py           | 10 ----------
 vllm/model_executor/models/minicpm.py          | 10 ----------
 vllm/model_executor/models/minicpmv.py         | 18 +-----------------
 vllm/model_executor/models/minimax_text_01.py  | 12 ------------
 vllm/model_executor/models/mistral3.py         | 16 ----------------
 vllm/model_executor/models/mllama.py           | 10 ----------
 vllm/model_executor/models/mllama4.py          | 13 -------------
 vllm/model_executor/models/mpt.py              | 10 ----------
 vllm/model_executor/models/nemotron.py         | 11 -----------
 vllm/model_executor/models/nemotron_nas.py     |  8 --------
 vllm/model_executor/models/olmo.py             | 10 ----------
 vllm/model_executor/models/olmo2.py            | 10 ----------
 vllm/model_executor/models/olmoe.py            | 10 ----------
 vllm/model_executor/models/orion.py            | 10 ----------
 vllm/model_executor/models/persimmon.py        | 10 ----------
 vllm/model_executor/models/phi3_small.py       | 13 +------------
 vllm/model_executor/models/phi3v.py            | 16 ----------------
 vllm/model_executor/models/plamo2.py           | 10 ----------
 vllm/model_executor/models/skyworkr1v.py       | 18 +-----------------
 vllm/model_executor/models/transformers.py     |  9 ---------
 33 files changed, 6 insertions(+), 396 deletions(-)

diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 367722126e5..7fff14cb9f1 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -41,7 +41,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -391,8 +390,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             scale=1 /
             self.config.logits_scaling)
 
-        self.sampler = get_sampler()
-
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
@@ -428,14 +425,6 @@ def make_empty_intermediate_tensors(
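
The second patch applies the same sweep to the multimodal and state-space models listed in the table above. The vision-language wrappers (internvl.py, the llava family, minicpmv.py, mistral3.py, and others below) carried one extra piece: a `cached_property` that deferred to the wrapped language model's sampler and fell back to `get_sampler()` when none existed. Both the property and its `sample()` forwarder are deleted. A runnable toy of that delegation chain (`MMWrapper` and `_FallbackSampler` are illustrative stand-ins, not the vLLM classes):

    from functools import cached_property

    class _FallbackSampler:
        # Stands in for get_sampler(); greedy over a list of floats.
        def __call__(self, logits):
            return max(range(len(logits)), key=logits.__getitem__)

    class MMWrapper:
        """The pattern removed from each multimodal model below."""

        def __init__(self, language_model):
            self.language_model = language_model

        @cached_property
        def sampler(self):
            if hasattr(self.language_model, "sampler"):
                return self.language_model.sampler
            return _FallbackSampler()

    class BareLM:   # inner LM without a .sampler attribute
        pass

    wrapper = MMWrapper(BareLM())
    print(wrapper.sampler([0.1, 0.9, 0.2]))   # 1, via the fallback path

With sampling owned by the engine, the fallback logic has nothing left to arbitrate, which is why these deletions need no replacement code in the models.
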
device=device), }) - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index cf8c969e118..4e660cbf667 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -295,8 +294,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): scale=1 / self.config.logits_scaling) - self.sampler = get_sampler() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -332,14 +329,6 @@ def make_empty_intermediate_tensors( device=device), }) - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index c31870461b4..961954c2b58 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -603,7 +602,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.text_config.tie_word_embeddings: self.lm_head.weight = self.model.text_model.wte.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) - self.sampler = get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size @@ -754,14 +752,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 520b85c0cdf..c3d7cbfcddb 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.pooler 
import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -336,7 +335,6 @@ def __init__(self, if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -363,14 +361,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ @@ -423,7 +413,7 @@ def __init__( prefix=prefix, model_type=model_type) - for attr in ("output", "logits_processor", "sampler"): + for attr in ("output", "logits_processor"): delattr(self, attr) config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8f5f454cbf6..23b92ad2bbf 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,7 +8,6 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union import torch @@ -20,7 +19,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -698,13 +696,6 @@ def _patch_quant_config(self, config: PretrainedConfig, (llm_quant_config is not None): quant_config.modules_to_not_convert.append("vision_model") - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _init_vision_model( self, config: PretrainedConfig, @@ -903,7 +894,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> Union[SamplerOutput, IntermediateTensors]: + ) -> IntermediateTensors: if intermediate_tensors is not None: input_ids = None @@ -941,13 +932,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 78fe6588edd..e1e3f0f199c 100644 --- a/vllm/model_executor/models/jais.py 
+++ b/vllm/model_executor/models/jais.py @@ -36,7 +36,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -308,7 +307,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.mup_width_scale) self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size, scale=self.output_logits_scale) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -335,14 +333,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6fabc8228e1..46335c2b393 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -409,7 +408,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -466,14 +464,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 1520f6992f0..5c39907d79e 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -58,7 +58,6 @@ get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import ( @@ -298,7 +297,6 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() self.media_placeholder: int = 
self.config.media_placeholder_token_id self.tp_rank = get_tensor_model_parallel_rank() self.tp_world_size = get_tensor_model_parallel_world_size() @@ -409,7 +407,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> SamplerOutput: + ) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None # NOTE: In v1, inputs_embeds is always generated at model runner from @@ -447,14 +445,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata, **kwargs) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): config = self.config.text_config _KEYS_TO_MODIFY_MAPPING = { diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index fbd212d1700..8862b2679f9 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,7 +2,6 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union, cast) @@ -23,7 +22,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -546,13 +544,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -763,13 +754,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 9c4d0e1fc27..c646c0f03d1 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union) @@ -13,7 +12,6 @@ from typing_extensions import NotRequired from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig @@ -250,13 +248,6 @@ 
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -585,13 +576,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 0221c6b237c..a5ff189cfdb 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -2,7 +2,6 @@ import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -12,7 +11,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -301,13 +299,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_video_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: @@ -469,13 +460,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index ab2bf881ad6..5c2b388e403 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -2,7 +2,6 @@ import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -16,7 +15,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -455,13 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( 
self.language_model.model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -957,13 +948,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac95b65fd03..af78ece66bb 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -213,7 +212,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.backbone.make_empty_intermediate_tensors) @@ -267,14 +265,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 526dec46ff2..78303733f6b 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -19,7 +19,6 @@ MambaMixer2, extra_groups_for_head_shards) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -208,7 +207,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.backbone.make_empty_intermediate_tensors) @@ -282,14 +280,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/minicpm.py 
b/vllm/model_executor/models/minicpm.py index cf03396a9ca..866dc3f466e 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -553,7 +552,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -584,14 +582,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1a91cf9bab4..65a26eadd5c 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -25,7 +25,7 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property, partial +from functools import partial from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, Union) @@ -40,7 +40,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.minicpm import MiniCPMForCausalLM @@ -758,13 +757,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.llm, "sampler"): - return self.llm.sampler - - return get_sampler() - def _parse_and_validate_vision_input( self, modality: str, @@ -946,14 +938,6 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.llm.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 7562aa678d5..74be08159cd 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -33,7 +33,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from 
vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -994,7 +993,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = Sampler() else: self.lm_head = PPMissingLayer() @@ -1030,16 +1028,6 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ): - - next_tokens = self.sampler(logits, sampling_metadata) - - return next_tokens - def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device) -> IntermediateTensors: diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 8b1a1d68fc3..94cbb20cee5 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -2,7 +2,6 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union) @@ -19,7 +18,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -435,13 +433,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -598,13 +589,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 7bfb3ada6bb..0c1d61c01f9 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -47,7 +47,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -1211,7 +1210,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) - 
self.sampler = get_sampler() def compute_logits( self, @@ -1222,14 +1220,6 @@ def compute_logits( hidden_states, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def unpack_data(self, image_data: Union[List[torch.Tensor], torch.Tensor], padding_value=0) -> torch.Tensor: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 69e3ea8bd06..56a7f02c415 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -17,7 +17,6 @@ # limitations under the License. import math from collections.abc import Iterable, Mapping -from functools import cached_property from itertools import tee from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union @@ -38,7 +37,6 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import _initialize_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -682,13 +680,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]: # num_images, 1, num_chunks, channel, image_size, image_size @@ -785,10 +776,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def separate_weights( self, weights: Iterable[Tuple[str, torch.Tensor]], diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index b30f3ee3799..77bd794058c 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -18,7 +18,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -298,7 +297,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "transformer")) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -325,14 +323,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> 
Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 0ea296b2f93..5208c0796c8 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -416,8 +415,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -444,14 +441,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 5c9b04cab18..26499949687 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -408,8 +407,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -439,11 +436,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4a341c97d6c..0781ca168f8 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -39,7 +39,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -309,7 +308,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) self.logits_processor = 
LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -340,14 +338,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index f9427cdadf7..e7a5f91a3da 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -42,7 +42,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -339,7 +338,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -367,14 +365,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 296bac51d01..9bed29d0132 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -417,7 +416,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -442,14 +440,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 5d35234544c..8d9c000750d 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -22,7 +22,6 @@ from 
vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -316,7 +315,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -343,14 +341,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 15afea82f3a..6e26658c926 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -306,7 +305,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, bias=False) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -337,14 +335,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 7b02c9edfad..338e87b4285 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -17,7 +17,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -396,7 +395,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = 
LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -437,6 +435,7 @@ def compute_logits( sampling_metadata) if self.dummy_token_indices is not None and logits is not None: logits.index_fill_(-1, self.dummy_token_indices, -torch.inf) + logits = logits / self.mup_width_multiplier return logits def forward( @@ -455,16 +454,6 @@ def forward( output_hidden_states = output_hidden_states return output_hidden_states - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits / self.mup_width_multiplier, - sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5b43871b759..a1442251b99 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,6 @@ # limitations under the License. import re from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -27,7 +26,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -555,13 +553,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -716,13 +707,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index fb1442526c6..790c48ccd21 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -24,7 +24,6 @@ selective_scan_fn, selective_state_update) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -628,7 +627,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = get_sampler() # Initialize weights and apply final processing self.post_init() @@ -684,14 +682,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - 
self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 19a23162aa8..e78c37b65f8 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -8,7 +8,6 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union import torch @@ -21,7 +20,6 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -699,13 +697,6 @@ def _patch_quant_config(self, config: PretrainedConfig, (llm_quant_config is not None): quant_config.modules_to_not_convert.append("vision_model") - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _init_vision_model( self, config: PretrainedConfig, @@ -908,7 +899,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> Union[SamplerOutput, IntermediateTensors]: + ) -> IntermediateTensors: if intermediate_tensors is not None: input_ids = None @@ -946,13 +937,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: skip_prefixes = [ diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 212e35e79d4..a37e88a387f 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -396,8 +395,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -435,12 +432,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits, sampling_metadata) - return 
next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( From 19965153dd148c650bfb00668bd15f8dad43ab85 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 20:11:14 -0700 Subject: [PATCH 03/10] sampler Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c12e4fd555d..2e61a6b4f3a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -38,6 +38,7 @@ ModelRunnerOutput) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler +from vllm.v1.sample.sampler import Sampler from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer @@ -153,6 +154,9 @@ def __init__( self.max_num_encoder_input_tokens = encoder_compute_budget self.encoder_cache_size = encoder_cache_size + # Sampler + self.sampler = Sampler() + # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: list[torch.Tensor] = [] @@ -1096,7 +1100,7 @@ def execute_model( # Sample the next token and get logprobs if needed. sampling_metadata = self.input_batch.sampling_metadata if spec_decode_metadata is None: - sampler_output = self.model.sample( + sampler_output = self.sampler( logits=logits, sampling_metadata=sampling_metadata, ) @@ -1106,7 +1110,7 @@ def execute_model( # logits tensor. This means any in-place operations on bonus_logits # won't affect the original logits tensor. bonus_logits = logits[spec_decode_metadata.bonus_logits_indices] - sampler_output = self.model.sample( + sampler_output = self.sampler( logits=bonus_logits, sampling_metadata=sampling_metadata, ) @@ -1502,8 +1506,8 @@ def _dummy_sampler_run( bad_words_token_ids={}, ) try: - sampler_output = self.model.sample( - logits=logits, sampling_metadata=dummy_metadata) + sampler_output = self.sampler(logits=logits, + sampling_metadata=dummy_metadata) except RuntimeError as e: if 'out of memory' in str(e): raise RuntimeError( From 47c438d42b343fcd1babd70f04c3a990f710342f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:02:54 -0700 Subject: [PATCH 04/10] Fix V0 Signed-off-by: Woosuk Kwon --- vllm/spec_decode/draft_model_runner.py | 2 +- vllm/worker/cpu_enc_dec_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 10 +++------- vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/hpu_model_runner.py | 10 ++++++---- vllm/worker/model_runner.py | 5 +++-- vllm/worker/xpu_model_runner.py | 2 +- 7 files changed, 16 insertions(+), 17 deletions(-) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 3ad9b499332..24095ef2a56 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -295,7 +295,7 @@ def execute_model( if not self.is_driver_worker: return [] # Sample the next token. 
- output = self.model.sample( + output = self.model_runner.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index ac7c93e4839..c2120c03517 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -316,7 +316,7 @@ def execute_model( return [] # Sample the next token. - output = self.model.sample( + output = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f22d45ed828..29fbfbf0d37 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -19,7 +19,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, @@ -490,6 +490,7 @@ def __init__( self.model: nn.Module # Set after init_Model # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + self.sampler = get_sampler() if hasattr(self, "_builder_cls"): # multi-step model runner does not have `_builder_cls` @@ -545,11 +546,6 @@ def _prepare_model_input_tensors( return self.builder.build() # type: ignore - # sampler property will be used by spec_decode_worker - @property - def sampler(self): - return self.model.sampler - @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -677,7 +673,7 @@ def execute_model( return [] # Sample the next token. - output = self.model.sample( + output = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 72ff9d66a68..dee59041ec6 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -205,7 +205,7 @@ def execute_model( model_input.async_callback() # Sample the next token. 
- output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2d31024b47d..6fa8fccb086 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -41,7 +41,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader import get_model @@ -454,7 +454,7 @@ def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) def sample(self, *args, **kwargs): - return self.model.sample(*args, **kwargs) + return self.sampler(*args, **kwargs) class PreparePromptMetadata(NamedTuple): @@ -657,6 +657,7 @@ def __init__( self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None self.inc_initialized_successfully = False + self.sampler = None # Profiler stats self.profiler = HabanaHighLevelProfiler() @@ -771,6 +772,7 @@ def load_model(self) -> None: self.model, vllm_config=self.vllm_config) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) + self.sampler = get_sampler() self.model_memory_usage = m.consumed_device_memory msg = f"Loading model weights took in total {m.get_summary_string()}" @@ -2167,7 +2169,7 @@ def execute_model( # in case of multi-step scheduling # we only want to pythonize in the last step sampling_metadata.skip_sampler_cpu_output = True - self.model.model.sampler.include_gpu_probs_tensor = True + self.sampler.include_gpu_probs_tensor = True cache_orig_output_tokens_len: List[Dict] = [] def try_revert_dummy_output_tokens(): @@ -2231,7 +2233,7 @@ def try_revert_dummy_output_tokens(): f'{"prompt" if is_prompt else "decode"}_' f'bs{batch_size}_' f'seq{seq_len}')): - output = self.model.sample( + output = self.sampler( logits=logits, sampling_metadata=sampling_metadata, ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2ebada343d0..3b09c92ae15 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -35,7 +35,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import supports_lora, supports_multimodal @@ -1094,6 +1094,7 @@ def __init__( # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None + self.sampler = get_sampler() set_cpu_offload_max_bytes( int(self.cache_config.cpu_offload_gb * 1024**3)) @@ -1832,7 +1833,7 @@ def execute_model( model_input.async_callback() # Sample the next token. 
- output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 9d49b4385dc..f70d71428f1 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -596,7 +596,7 @@ def execute_model( model_input.async_callback() # Sample the next token. - output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) From ba6d5c343f5cb2a933b262c28608f304c769e46e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:06:55 -0700 Subject: [PATCH 05/10] Fix xpu Signed-off-by: Woosuk Kwon --- vllm/worker/xpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f70d71428f1..fc9461a24b9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -18,7 +18,7 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadataCache -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, @@ -410,6 +410,7 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model + self.sampler = get_sampler() self.sampling_metadata_cache: SamplingMetadataCache = \ SamplingMetadataCache() \ From c1c9c0444376fb2fa500caf3d6e89e3358a2f75d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:15:01 -0700 Subject: [PATCH 06/10] Fix hpu Signed-off-by: Woosuk Kwon --- vllm/worker/hpu_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 6fa8fccb086..17baf11bdb9 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -316,6 +316,7 @@ class HpuModelAdapter: def __init__(self, model, vllm_config): self.model = model + self.sampler = get_sampler() self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.vllm_config = vllm_config @@ -657,9 +658,8 @@ def __init__( self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None self.inc_initialized_successfully = False - self.sampler = None - # Profiler stats + # Profiler tats self.profiler = HabanaHighLevelProfiler() self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() @@ -772,7 +772,6 @@ def load_model(self) -> None: self.model, vllm_config=self.vllm_config) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) - self.sampler = get_sampler() self.model_memory_usage = m.consumed_device_memory msg = f"Loading model weights took in total {m.get_summary_string()}" @@ -2169,7 +2168,7 @@ def execute_model( # in case of multi-step scheduling # we only want to pythonize in the last step sampling_metadata.skip_sampler_cpu_output = True - self.sampler.include_gpu_probs_tensor = True + self.model.sampler.include_gpu_probs_tensor = True cache_orig_output_tokens_len: List[Dict] = [] def try_revert_dummy_output_tokens(): @@ -2233,7 +2232,7 @@ def 
try_revert_dummy_output_tokens(): f'{"prompt" if is_prompt else "decode"}_' f'bs{batch_size}_' f'seq{seq_len}')): - output = self.sampler( + output = self.sample( logits=logits, sampling_metadata=sampling_metadata, ) From 7b334f84eb7b8bfb28eb396065472ab9bfa8f1b8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:15:48 -0700 Subject: [PATCH 07/10] typo Signed-off-by: Woosuk Kwon --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 17baf11bdb9..910483b3268 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -659,7 +659,7 @@ def __init__( self.model: torch.nn.Module = None self.inc_initialized_successfully = False - # Profiler tats + # Profiler stat self.profiler = HabanaHighLevelProfiler() self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() From 435aaeb5f6ca24406d7555b9005ec9d149de7fe7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:16:18 -0700 Subject: [PATCH 08/10] typo Signed-off-by: Woosuk Kwon --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 910483b3268..ec9fab11358 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -659,7 +659,7 @@ def __init__( self.model: torch.nn.Module = None self.inc_initialized_successfully = False - # Profiler stat + # Profiler stats self.profiler = HabanaHighLevelProfiler() self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() From b9aaa2ce30050ffada40ce306aa47f4ae1468031 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 21:17:29 -0700 Subject: [PATCH 09/10] fix hpu Signed-off-by: Woosuk Kwon --- vllm/worker/hpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index ec9fab11358..1bcef841b06 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2232,7 +2232,7 @@ def try_revert_dummy_output_tokens(): f'{"prompt" if is_prompt else "decode"}_' f'bs{batch_size}_' f'seq{seq_len}')): - output = self.sample( + output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) From a3ac80702015f5d5003c2a1bd82ec8d57bb9c9f2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 22:55:06 -0700 Subject: [PATCH 10/10] Fix model.sampler Signed-off-by: Woosuk Kwon --- tests/spec_decode/test_scorer.py | 5 ++--- vllm/spec_decode/multi_step_worker.py | 5 ++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 4 ++-- vllm/worker/multi_step_model_runner.py | 3 +-- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 161cc9fbf55..f73cf4b345f 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -62,9 +62,8 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer - scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True - scorer_worker.model_runner.model.sampler.\ - should_modify_greedy_probs_inplace = True + 
scorer_worker.model_runner.sampler.include_gpu_probs_tensor = True + scorer_worker.model_runner.sampler.should_modify_greedy_probs_inplace = True vocab_size = scorer_worker.vocab_size diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d8d54918fa9..6473740ae51 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -50,11 +50,10 @@ def init_device(self) -> None: def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for MultiStepWorker - self.model_runner.model.sampler.include_gpu_probs_tensor = True + self.model_runner.sampler.include_gpu_probs_tensor = True def set_should_modify_greedy_probs_inplace(self) -> None: - self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( - True) + self.model_runner.sampler.should_modify_greedy_probs_inplace = True @torch.inference_mode() def sampler_output( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a724beade12..4e79003de39 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -410,9 +410,9 @@ def _configure_model_sampler_for_spec_decode(self): NOTE(cade): This will require a special check if the proposer worker does not have a sampler (e.g. ngram speculation). """ - (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor + (self.scorer_worker.model_runner.sampler.include_gpu_probs_tensor ) = True - (self.scorer_worker.model_runner.model.sampler. + (self.scorer_worker.model_runner.sampler. should_modify_greedy_probs_inplace) = True self.proposer_worker.set_include_gpu_probs_tensor() self.proposer_worker.set_should_modify_greedy_probs_inplace() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2e61a6b4f3a..86f6a301fbb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1387,8 +1387,8 @@ def _get_prompt_logprobs_dict( tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits] # Compute prompt logprobs. - logprobs = self.model.sampler.compute_logprobs(logits) - token_ids, logprobs, ranks = self.model.sampler.gather_logprobs( + logprobs = self.sampler.compute_logprobs(logits) + token_ids, logprobs, ranks = self.sampler.gather_logprobs( logprobs, num_prompt_logprobs, tgt_token_ids) # Transfer GPU->CPU async. diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 7ddf382079c..a6f5ec82563 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -488,8 +488,7 @@ def execute_model( device="cpu", pin_memory=True) - self._base_model_runner.model.sampler.include_gpu_probs_tensor = ( - True) + self._base_model_runner.sampler.include_gpu_probs_tensor = True if frozen_model_input.sampling_metadata: frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( True)
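
Recap of the migration pattern across this series, with a hedged sketch of the resulting control flow. Every per-model diff above follows the same two-step move: model classes drop their `sample()` method and `self.sampler` attribute so that model code ends at `compute_logits()`, and the sampler is hoisted into the model runners. The V1 GPU runner constructs `Sampler()` directly (PATCH 03); the V0 CPU/GPU/XPU runners construct `get_sampler()` (PATCH 04, with follow-ups in PATCH 05); the HPU adapter keeps a thin `sample()` wrapper that forwards to its own `get_sampler()` instance (PATCH 06-09); and speculative decoding plus the multi-step runner flip their probability flags on `model_runner.sampler` instead of `model_runner.model.sampler` (PATCH 10). The one model whose `sample()` did real work, phi3_small, has its `mup_width_multiplier` division folded into `compute_logits()` instead (see the phi3_small hunk in PATCH 02), so sampling behavior is unchanged.

Below is a minimal, self-contained sketch of the post-series ownership structure. `TinyModel`, `TinyRunner`, and `TinySampler` are illustrative stand-ins, not vLLM classes; the real counterparts in this series are the model's `compute_logits()`, the runner's `self.sampler`, and `Sampler()`/`get_sampler()`:

    import torch

    class TinySampler:
        # Stand-in for the runner-owned sampler. Spec decode (PATCH 10)
        # now toggles flags like this one on the runner's sampler rather
        # than reaching through the model.
        def __init__(self) -> None:
            self.include_gpu_probs_tensor = False

        def __call__(self, logits: torch.Tensor, sampling_metadata=None):
            # Greedy pick standing in for the real sampling logic.
            return torch.argmax(logits, dim=-1)

    class TinyModel(torch.nn.Module):
        # After this series, model code stops at logits.
        def __init__(self, hidden: int, vocab: int) -> None:
            super().__init__()
            self.lm_head = torch.nn.Linear(hidden, vocab, bias=False)

        def compute_logits(self, hidden_states: torch.Tensor,
                           sampling_metadata=None) -> torch.Tensor:
            return self.lm_head(hidden_states)

    class TinyRunner:
        # The runner, not the model, owns the sampler.
        def __init__(self, model: TinyModel) -> None:
            self.model = model
            self.sampler = TinySampler()

        def execute_model(self, hidden_states: torch.Tensor) -> torch.Tensor:
            logits = self.model.compute_logits(hidden_states)
            # Previously: self.model.sample(logits=..., sampling_metadata=...)
            return self.sampler(logits=logits)

    runner = TinyRunner(TinyModel(hidden=8, vocab=32))
    print(runner.execute_model(torch.randn(2, 8)).shape)  # torch.Size([2])

Because the removed per-model `sample()` methods were pure pass-throughs to a shared sampler, hoisting the sampler into the runner deletes duplicated boilerplate across dozens of model files without changing what gets sampled; callers that previously went through `self.model.sample(...)` now call the runner's sampler with the same `logits` and `sampling_metadata` arguments.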