diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 16500ab23e0..794de4c383b 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -929,6 +929,15 @@ def weight_loader_v2(self,
         shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
         shard_size = self._get_shard_size_mapping(loaded_shard_id)
 
+        # Note(simon): This is needed for Qwen3's fp8 quantization.
+        if isinstance(param, BlockQuantScaleParameter):
+            assert self.quant_method is not None
+            assert hasattr(self.quant_method, "quant_config")
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            block_n, _ = weight_block_size[0], weight_block_size[1]
+            shard_offset = (shard_offset + block_n - 1) // block_n
+            shard_size = (shard_size + block_n - 1) // block_n
+
         param.load_qkv_weight(loaded_weight=loaded_weight,
                               num_heads=self.num_kv_head_replicas,
                               shard_id=loaded_shard_id,
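
For context on the new hunk: with block-wise FP8 quantization, each entry of the scale tensor covers block_n output rows of the weight, so shard offsets and sizes expressed in weight rows must be converted to scale rows via ceiling division before loading a BlockQuantScaleParameter. A minimal sketch of that mapping follows; the helper name and example values are hypothetical and not part of the diff, and it assumes a weight_block_size of [128, 128] as commonly used for FP8 block quantization.

    # Hypothetical illustration of the ceiling-division mapping used in the hunk above.
    def to_scale_coords(shard_offset: int, shard_size: int, block_n: int):
        """Convert a weight-row offset/size into a scale-row offset/size.

        Each scale value covers `block_n` consecutive output rows, so the
        scale tensor has ceil(rows / block_n) entries along that dimension.
        """
        scale_offset = (shard_offset + block_n - 1) // block_n
        scale_size = (shard_size + block_n - 1) // block_n
        return scale_offset, scale_size

    if __name__ == "__main__":
        # e.g. a shard of a fused QKV weight starting at row 4096 with
        # 1024 rows, quantized in 128-row blocks:
        print(to_scale_coords(4096, 1024, 128))  # -> (32, 8)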