
Commit 7cadbe2

feat: Support for Mistral Small 3.1 24B VLM
Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
1 parent 290649b commit 7cadbe2

File tree: 8 files changed, +255 -7 lines changed

examples/models/core/multimodal/run.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def print_result(model, input_text, output_text, args):
             0][0].lower()
     elif model.model_type in [
             'blip2', 'neva', 'phi-3-vision', 'llava_next',
-            'phi-4-multimodal'
+            'phi-4-multimodal', 'pixtral'
     ]:
         assert 'singapore' in output_text[0][0].lower()
     elif model.model_type == 'video-neva':

tensorrt_llm/models/llama/config.py

Lines changed: 5 additions & 0 deletions
@@ -137,6 +137,11 @@ def from_hugging_face(
             # InternLM-XComposer2 has a mask for partial lora
             # Therefore we need an additional flag for this mask
             has_partial_lora_mask = True
+        if hf_config.model_type == 'mistral3':
+            from transformers import Mistral3Config
+            hf_config = Mistral3Config.from_pretrained(
+                hf_config_dir).text_config
+            hf_config.architectures = ["MistralForCausalLM"]
 
         num_key_value_heads = getattr(hf_config, "num_key_value_heads",
                                       hf_config.num_attention_heads)
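
For context, Mistral Small 3.1 ships as a composite mistral3 checkpoint whose language model sits under text_config, which is what the branch above unwraps before handing the config to the existing Mistral/LLaMA converter. A minimal sketch of that relationship, assuming a placeholder local checkpoint path and a transformers release that ships the mistral3 model type:

from transformers import AutoConfig

# Placeholder path; point it at a Mistral-Small-3.1-24B-Instruct-2503 checkout.
model_dir = "/path/to/Mistral-Small-3.1-24B-Instruct-2503"

hf_config = AutoConfig.from_pretrained(model_dir)
assert hf_config.model_type == "mistral3"

# The decoder half of the VLM is a plain Mistral config under text_config.
text_config = hf_config.text_config
print(text_config.num_attention_heads, text_config.num_key_value_heads,
      text_config.vocab_size)
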

tensorrt_llm/runtime/multimodal_model_runner.py

Lines changed: 78 additions & 2 deletions
@@ -413,6 +413,13 @@ def __init__(self, args):
         if self.num_frames is None:
             self.num_frames = 8
         assert self.args.video_path is None or self.args.image_path is None
+        if self.model_type == "pixtral":
+            hf_config = AutoConfig.from_pretrained(self.args.hf_model_dir)
+            self.image_size = hf_config.vision_config.image_size
+            self.patch_size = hf_config.vision_config.patch_size
+            self.vocab_size = hf_config.text_config.vocab_size
+            self.image_token_index = hf_config.image_token_index
+            self.spatial_merge_size = hf_config.spatial_merge_size
 
         self.audio_input_names = self.audio_output_names = None
         if self.model_type == "mllama":
@@ -617,6 +624,10 @@ def init_processor(self):
             self.processor = AutoProcessor.from_pretrained(
                 self.args.hf_model_dir, trust_remote_code=True, num_crops=16)
 
+        elif 'pixtral' in self.model_type:
+            self.processor = AutoProcessor.from_pretrained(
+                self.args.hf_model_dir)
+
         elif 'internlm' in self.model_type:
             image_size = 490
             self.processor = transforms.Compose([
@@ -895,6 +906,33 @@ def preprocess(self, pre_prompt, post_prompt, image, other_vision_inputs,
             audio_mask = audio.new_ones(*audio.shape[:2])
             audio_mask[-1, -pad:] = 0
             other_audio_inputs['attention_mask'] = audio_mask.bool()
+        elif self.model_type == 'pixtral':
+            # Hold on to pixel_values and input_ids.
+            dtype = str_dtype_to_torch(self.vision_precision)
+            pixel_values = image["pixel_values"].to(device="cuda", dtype=dtype)
+            input_ids = image["input_ids"].to(device="cuda")
+
+            # Shape of pixel values from the processor varies with the raw image.
+            # So we create a new tensor with a fixed shape as expected by the vision
+            # encoder and create a corresponding attention mask.
+            image_size = self.image_size
+            patch_size = self.patch_size
+            d_min = torch.finfo(dtype).min
+            num_patches = (image_size // patch_size)
+            image = torch.full((1, 3, image_size, image_size),
+                               fill_value=0,
+                               dtype=dtype,
+                               device="cuda")
+            attention_mask = torch.full((1, num_patches, num_patches),
+                                        fill_value=d_min,
+                                        dtype=dtype,
+                                        device="cuda")
+            h, w = pixel_values.shape[-2:]
+            image[..., :h, :w] = pixel_values
+            attention_mask[..., :h // patch_size, :w // patch_size] = 0
+            other_vision_inputs = {
+                "attention_mask": attention_mask,
+            }
         elif self.model_type == 'llava_next':
             input = image
             image = input['pixel_values']
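
The additive mask built above assigns the dtype's minimum value to padded patch positions so attention effectively ignores them, while real patches get 0. A minimal standalone sketch of the same padding scheme, with toy sizes instead of the model's real image_size and patch_size:

import torch

# Toy sizes; the runner takes the real values from the HF vision config.
image_size, patch_size = 8, 2
num_patches = image_size // patch_size
dtype = torch.float32
d_min = torch.finfo(dtype).min

# Pretend the processor produced a 6x4 crop of the 8x8 canvas.
pixel_values = torch.randn(1, 3, 6, 4)

canvas = torch.zeros(1, 3, image_size, image_size, dtype=dtype)
mask = torch.full((1, num_patches, num_patches), d_min, dtype=dtype)

h, w = pixel_values.shape[-2:]
canvas[..., :h, :w] = pixel_values
mask[..., :h // patch_size, :w // patch_size] = 0  # real patches attend normally

print(mask[0])  # 0 where a patch carries image content, a large negative value elsewhere
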
@@ -1108,6 +1146,17 @@ def preprocess(self, pre_prompt, post_prompt, image, other_vision_inputs,
             audio_features = audio_features.unsqueeze(0).repeat(
                 self.args.batch_size, 1, 1)
             length = input_ids.shape[1]
+
+        elif self.model_type == 'pixtral':
+            relevant_patch_size = self.patch_size * self.spatial_merge_size
+            output_img_size = self.image_size // relevant_patch_size
+            visual_features = visual_features.reshape(
+                output_img_size, output_img_size,
+                -1)[:h // relevant_patch_size, :w //
+                    relevant_patch_size].flatten(0, 1)
+            input_ids = self.ptuning_setup_pixtral(input_ids=input_ids)
+            length = input_ids.shape[1]
+
         elif self.model_type == 'llava_next':
             visual_features = LlavaNextUtils.rearrange_image_features(
                 visual_features, self.image_newlines["image_newline"],
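
The reshape above keeps only the grid cells that contain real image content after the patch merger shrinks each spatial dimension by spatial_merge_size. A toy recomputation of the shapes involved (illustrative numbers, not the checkpoint's actual config):

import torch

# Illustrative values only; the real ones come from the HF config.
image_size, patch_size, spatial_merge_size = 64, 8, 2
h, w = 48, 32  # height/width of the unpadded, processed image

relevant_patch_size = patch_size * spatial_merge_size  # 16
output_img_size = image_size // relevant_patch_size    # 4

# One feature vector per merged patch of the full padded canvas.
visual_features = torch.randn(output_img_size * output_img_size, 10)

kept = visual_features.reshape(output_img_size, output_img_size, -1)[
    :h // relevant_patch_size, :w // relevant_patch_size].flatten(0, 1)
print(kept.shape)  # torch.Size([6, 10]): 3 x 2 merged patches of real content
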
@@ -1208,7 +1257,7 @@ def preprocess(self, pre_prompt, post_prompt, image, other_vision_inputs,
             torch.int32)
 
         if self.model_type in [
-                'fuyu', 'kosmos-2', 'phi-3-vision', 'llava_next'
+                'fuyu', 'kosmos-2', 'phi-3-vision', 'llava_next', 'pixtral'
         ]:
             return input_ids, input_lengths, [
                 visual_features
@@ -1976,6 +2025,20 @@ def ptuning_setup_fuyu(self, input_ids, image_patches_indices):
             res_input_ids.append(cur_input_ids)
         return res_input_ids
 
+    def ptuning_setup_pixtral(self, input_ids):
+        # input_ids obtained from the processor hold token ids for the text as
+        # well as the image tokens, where every image token is represented by
+        # the same image_token_index (10 for this model).
+        image_token_index = self.image_token_index
+        vocab_size = self.vocab_size
+        # Replace each image token with a unique token_id > text vocab_size.
+        # These ids are used to look up the prompt table.
+        replacer = vocab_size
+        for i in range(len(input_ids[0])):
+            if input_ids[0][i] == image_token_index:
+                input_ids[0][i] = replacer
+                replacer += 1
+        return input_ids
+
     def ptuning_setup_llava_next(self, visual_features, pre_prompt,
                                  post_prompt):
         input_ids = []
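
The remapping in ptuning_setup_pixtral gives every image token an id at or above the text vocabulary size; during generation those out-of-vocabulary ids select rows of the prompt table holding the projected visual features. A small numeric sketch with toy values (the real image_token_index and vocab_size come from the HF config):

import torch

image_token_index, vocab_size = 10, 32  # toy values
input_ids = torch.tensor([[1, 5, 10, 10, 10, 7, 2]])

replacer = vocab_size
for i in range(len(input_ids[0])):
    if input_ids[0][i] == image_token_index:
        input_ids[0][i] = replacer
        replacer += 1

print(input_ids)  # tensor([[ 1,  5, 32, 33, 34,  7,  2]])
# Ids 32, 33, 34 index rows 0, 1, 2 of the prompt table of visual features.
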
@@ -2342,6 +2405,18 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
                 audios=[raw_audio],
                 return_tensors="pt")
 
+        elif 'pixtral' in self.model_type:
+            # Send image and text prompt to processor.
+            pre_prompt = "<s>[INST][IMG]"
+            if input_text is None:
+                input_text = "What is in the image?"
+            post_prompt = "[/INST]"
+            prompt = pre_prompt + input_text + post_prompt
+            dtype = str_dtype_to_torch(self.vision_precision)
+            image = self.processor(text=prompt,
+                                   images=[raw_image],
+                                   return_tensors="pt").to(dtype)
+
         elif 'internvl' in self.model_type:
             pre_prompt = "<|system|>\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|end|><|user|>\n<image>\n"
             if input_text is None:
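
The pixtral branch above wraps the user text in the Mistral instruct template with a single [IMG] placeholder; the processor then expands that placeholder into the per-patch image tokens that ptuning_setup_pixtral later remaps. A trivial sketch of the assembled prompt string, using the defaults from the branch above:

pre_prompt = "<s>[INST][IMG]"
post_prompt = "[/INST]"
input_text = "What is in the image?"

prompt = pre_prompt + input_text + post_prompt
print(prompt)  # <s>[INST][IMG]What is in the image?[/INST]
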
@@ -2526,7 +2601,8 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
         post_prompt = [post_prompt] * self.args.batch_size
         if self.model_type not in [
                 'fuyu', 'pix2struct', 'kosmos-2', 'vila', 'phi-3-vision',
-                'phi-4-multimodal', 'llava_next', 'internvl', 'llava_onevision'
+                'phi-4-multimodal', 'llava_next', 'internvl', 'llava_onevision',
+                'pixtral'
         ]:
             if image is not None:
                 if image.dim() == 5:

tensorrt_llm/tools/multimodal_builder.py

Lines changed: 158 additions & 1 deletion
@@ -41,7 +41,7 @@ def add_multimodal_arguments(parser):
                             'fuyu', 'pix2struct', 'neva', 'kosmos-2',
                             'video-neva', 'phi-3-vision', 'phi-4-multimodal',
                             'mllama', 'internvl', 'qwen2_vl',
-                            'internlm-xcomposer2', 'qwen2_audio'
+                            'internlm-xcomposer2', 'qwen2_audio', 'pixtral'
                         ],
                         help="Model type")
     parser.add_argument(
@@ -142,6 +142,8 @@ def build(self):
             build_qwen2_vl_engine(args)
         elif args.model_type == 'qwen2_audio':
             build_qwen2_audio_engine(args)
+        elif args.model_type == "pixtral":
+            build_pixtral_engine(args)
         else:
             raise RuntimeError(f"Invalid model type {args.model_type}")
 
@@ -1577,3 +1579,158 @@ def forward(self, x, mask):
         'num_mul_bins': args.num_mul_bins,
         'max_mel_seq_len': args.max_mel_seq_len
     })
+
+
+def build_pixtral_engine(args):
+    processor = AutoProcessor.from_pretrained(args.model_path)
+    hf_config = AutoConfig.from_pretrained(args.model_path)
+    vision_config = hf_config.vision_config
+    raw_image = Image.new(
+        'RGB',
+        [vision_config.image_size, vision_config.image_size])  # dummy image
+
+    inputs = processor(text="dummy", images=[raw_image], return_tensors="pt")
+    pixel_values = inputs["pixel_values"].to(args.device, torch.bfloat16)
+    attention_mask = torch.zeros(
+        1, vision_config.image_size // vision_config.patch_size,
+        vision_config.image_size // vision_config.patch_size).to(
+            args.device, torch.bfloat16)
+
+    # isort: off
+    from transformers.models.pixtral.modeling_pixtral import \
+        apply_rotary_pos_emb
+    from transformers import Mistral3ForConditionalGeneration
+    from transformers.models.pixtral.modeling_pixtral import (PixtralAttention,
+                                                              PixtralVisionModel)
+    from transformers.models.mistral3.modeling_mistral3 import (
+        Mistral3MultiModalProjector, Mistral3PatchMerger)
+    # isort: on
+    @torch.no_grad
+    def attn_forward(self,
+                     hidden_states,
+                     attention_mask,
+                     position_embeddings,
+                     output_attentions=False):
+        batch, patches, _ = hidden_states.size()
+
+        q = self.q_proj(hidden_states)
+        k = self.k_proj(hidden_states)
+        v = self.v_proj(hidden_states)
+
+        q = q.view(batch, patches, self.num_heads,
+                   self.head_dim).transpose(1, 2)
+        k = k.view(batch, patches, self.num_heads,
+                   self.head_dim).transpose(1, 2)
+        v = v.view(batch, patches, self.num_heads,
+                   self.head_dim).transpose(1, 2)
+        cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0)
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            q, k, v, attn_mask=attention_mask).transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch, patches, -1)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None
+
+    @torch.no_grad
+    def vision_tower_forward(self, pixel_values, attention_mask):
+        patch_embeds = self.patch_conv(pixel_values)  # (bs, c, h, w)
+
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # (bs, h*w, c)
+        attention_mask = attention_mask.flatten(1)  # (bs, h*w)
+
+        patch_embeds = self.ln_pre(patch_embeds)
+        position_ids = self.position_ids.flatten()  # (h*w, )
+        position_embeddings = self.patch_positional_embedding(
+            patch_embeds, position_ids)
+
+        out = self.transformer(patch_embeds,
+                               attention_mask=attention_mask,
+                               position_embeddings=position_embeddings,
+                               output_hidden_states=False,
+                               output_attentions=False,
+                               return_dict=False)[0]
+        return out
+
+    @torch.no_grad
+    def patch_merger_forward(self, image_features, attention_mask):
+        h, w = attention_mask.shape[-2:]
+        bs, n, d = image_features.shape
+        image_grid = image_features.view(bs, h, w, d).permute(0, 3, 1, 2)
+        image_features = torch.nn.functional.unfold(image_grid, 2,
+                                                    stride=2).transpose(1, 2)
+        image_features = self.merging_layer(image_features)
+        return image_features
+
+    @torch.no_grad
+    def mm_projector_forward(self, image_features, attention_mask):
+        image_features = self.norm(image_features)
+        image_features = self.patch_merger(image_features, attention_mask)
+        hidden_states = self.linear_2(self.act(self.linear_1(image_features)))
+        return hidden_states
+
+    class PixtralVisionWrapper(torch.nn.Module):
+
+        def __init__(self, vision_tower, mm_projector):
+            super().__init__()
+            self.vision_tower = vision_tower
+            self.mm_projector = mm_projector
+
+        @torch.no_grad
+        def forward(self, pixel_values, attention_mask):
+            features = self.vision_tower(pixel_values, attention_mask)
+            out = self.mm_projector(features, attention_mask)
+            return out
+
+    model = Mistral3ForConditionalGeneration.from_pretrained(args.model_path,
+                                                             torch_dtype="auto")
+    vision_tower = model.vision_tower
+    mm_projector = model.multi_modal_projector
+
+    height = width = vision_config.image_size // vision_config.patch_size
+    mesh = torch.meshgrid(torch.arange(height),
+                          torch.arange(width),
+                          indexing="ij")
+    h_grid, v_grid = torch.stack(mesh, dim=-1).chunk(2, -1)
+    ids = h_grid[..., 0] * width + v_grid[..., 0]
+    vision_tower.register_buffer("position_ids", ids)
+
+    PixtralAttention.forward = attn_forward
+    PixtralVisionModel.forward = vision_tower_forward
+
+    Mistral3PatchMerger.forward = patch_merger_forward
+    Mistral3MultiModalProjector.forward = mm_projector_forward
+
+    vision_tower = vision_tower.to(args.device, torch.bfloat16)
+    mm_projector = mm_projector.to(args.device, torch.bfloat16)
+    vision_tower.eval()
+    mm_projector.eval()
+    wrapper = PixtralVisionWrapper(vision_tower, mm_projector)
+
+    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+    part_name = 'vision'
+    onnx_dir = f"{args.output_dir}/{part_name}/onnx"
+
+    export_onnx(wrapper,
+                input=(pixel_values, attention_mask),
+                onnx_dir=onnx_dir,
+                input_names=['input', 'attention_mask'],
+                dynamic_axes={
+                    'input': {
+                        0: "batch"
+                    },
+                    'attention_mask': {
+                        0: "batch"
+                    }
+                })
+    build_trt_engine(
+        args.model_type,
+        input_sizes=[[list(pixel_values.shape[1:]) for _ in range(3)],
+                     [list(attention_mask.shape[1:]) for _ in range(3)]],
+        onnx_dir=onnx_dir,
+        engine_dir=args.output_dir,
+        max_batch_size=args.max_batch_size,
+        engine_name=f"model.engine",
+        dtype=torch.bfloat16)
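
The position_ids buffer registered above is just the row-major index of every patch in the (image_size // patch_size) square grid; the patched vision_tower_forward flattens it because the dummy export input always covers the full canvas. A quick standalone check with a toy grid size:

import torch

height = width = 4  # toy grid; really image_size // patch_size

mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
h_grid, v_grid = torch.stack(mesh, dim=-1).chunk(2, -1)
ids = h_grid[..., 0] * width + v_grid[..., 0]

# Row-major patch indices: identical to a plain arange over the grid.
assert torch.equal(ids, torch.arange(height * width).reshape(height, width))
print(ids.flatten())  # prints 0..15 in row-major order
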

tests/integration/defs/conftest.py

Lines changed: 2 additions & 0 deletions
@@ -794,6 +794,8 @@ def multimodal_model_root(request, llm_venv):
         tllm_model_name = tllm_model_name + ".nemo"
     elif 'Llama-3.2' in tllm_model_name:
         models_root = os.path.join(llm_models_root(), 'llama-3.2-models')
+    elif 'Mistral-Small' in tllm_model_name:
+        models_root = llm_models_root()
 
     multimodal_model_root = os.path.join(models_root, tllm_model_name)

tests/integration/defs/examples/test_multimodal.py

Lines changed: 9 additions & 3 deletions
@@ -115,7 +115,7 @@ def _test_llm_multimodal_general(llm_venv,
     mllama_model = 'Llama-3.2' in model_name
     qwen2_vl_model = 'Qwen2-VL' in model_name
     internlm_model = 'internlm-xcomposer2' in model_name
-
+    mistral_model = 'Mistral-Small' in model_name
     if enc_dec_model:
         builder_root = enc_dec_example_root
         if nougat_model:
@@ -134,6 +134,8 @@ def _test_llm_multimodal_general(llm_venv,
         builder_root, model_type = internlm_example_root, "internlm"
     elif llava_model or vila_model:
         builder_root, model_type = llama_example_root, "llama"
+    elif mistral_model:
+        builder_root, model_type = llama_example_root, "llama"
     elif cogvlm_model:
         builder_root, model_type = cogvlm_example_root, "cogvlm"
     elif nemotron_model:
@@ -214,7 +216,7 @@ def _test_llm_multimodal_general(llm_venv,
     print("Build LLM engines...")
     model_name = model_name.split('/')[-1]  # Remove HF directory name
     llm_engine_dir = f"{engine_dir}/{model_name}/{world_size}-gpu"
-    if "opt" in model_name or llava_model or vila_model or gpt_model or nemotron_model or phi3_model or phi4_model or qwen2_vl_model:
+    if "opt" in model_name or llava_model or vila_model or gpt_model or nemotron_model or phi3_model or phi4_model or qwen2_vl_model or mistral_model:
         max_input_len_text = 1024
         max_output_len = 200
         if llava_next_model:
@@ -227,7 +229,9 @@ def _test_llm_multimodal_general(llm_venv,
             multimodal_len = 196
         elif phi3_model:
             multimodal_len = 5120
-        elif phi4_model:  # @B: Confirm this.
+        elif phi4_model:
+            multimodal_len = 5120
+        elif mistral_model:
             multimodal_len = 5120
         elif "fuyu" in model_name:
             multimodal_len = 2640
@@ -386,6 +390,7 @@ def _test_llm_multimodal_general(llm_venv,
     elif 'Llama-3.2' in model_name: vision_model_type = 'mllama'
     elif "Qwen2-VL" in model_name: vision_model_type = 'qwen2_vl'
     elif 'internlm' in model_name: vision_model_type = 'internlm-xcomposer2'
+    elif 'Mistral-Small' in model_name: vision_model_type = 'pixtral'
 
     vit_batch_size = batch_size
     if vision_model_type == "llava_next":
@@ -623,6 +628,7 @@ def _test_llm_multimodal_general(llm_venv,
         'Llama-3.2-11B-Vision',
         'Qwen2-VL-7B-Instruct',
         'internlm-xcomposer2-vl-7b',
+        'Mistral-Small-3.1-24B-Instruct-2503',
     ],
     indirect=True)
 def test_llm_multimodal_general(llm_venv, llm_root, llm_datasets_root,
