From 6e9ebda9eacd1c9519439b26eabf0a595b3059c8 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Mon, 13 Jan 2025 22:00:39 +0800
Subject: [PATCH 1/9] init

---
 Makefile                                      |  6 +--
 convert_hf_to_gguf.py                         |  1 +
 examples/llava/README-minicpmo2.6.md          | 44 +++++++++++++++++++
 examples/llava/clip.cpp                       | 17 +++++++
 examples/llava/llava.cpp                      | 10 +----
 examples/llava/minicpmv-cli.cpp               | 10 ++++-
 .../minicpmv-convert-image-encoder-to-gguf.py | 19 +++++---
 examples/llava/minicpmv-surgery.py            |  2 +-
 8 files changed, 91 insertions(+), 18 deletions(-)
 create mode 100644 examples/llava/README-minicpmo2.6.md

diff --git a/Makefile b/Makefile
index 19ae0d5f1c87b..18d3815b88b94 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
-ifndef LLAMA_MAKEFILE
-$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
-endif
+# ifndef LLAMA_MAKEFILE
+# $(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+# endif
 
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4dc9837ab2d04..63151346945f8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -579,6 +579,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
+        res = "qwen2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"

diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md
new file mode 100644
index 0000000000000..fc65f622551e6
--- /dev/null
+++ b/examples/llava/README-minicpmo2.6.md
@@ -0,0 +1,44 @@
+## MiniCPM-o 2.6
+
+### Prepare models and code
+
+Download the [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from Hugging Face into the "MiniCPM-o-2_6" folder.
+
+Clone llama.cpp:
+```bash
+git clone git@github.com:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpm-omni
+```
+
+### Usage of MiniCPM-o 2.6
+
+Convert the PyTorch model to gguf files (you can also download our converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmo.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 7a8a3156bfdef..076d07cee8259 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -718,6 +718,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 3) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 4) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -1053,6 +1056,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             n_head = hidden_size/d_head;
             num_query = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            hidden_size = 3584;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
 
         struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
         Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -2335,6 +2343,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
     } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         int patch_size = params.patch_size * 2;
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2543,6 +2554,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 3) {
                 embed_dim = 3584;
             }
+            else if (ctx->minicpmv_version == 4) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2786,6 +2800,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             return 3584;
         }
+        else if (ctx->minicpmv_version == 4) {
+            return 3584;
+        }
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         return ctx->vision_model.mm_1_b->ne[0];

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c598caf3dd1eb..f8f16ab585032 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     return true;
 }
 
-static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
     int width = image->nx;
     int height = image->ny;
     int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,13 +277,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
         }
         else {
-            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-            if (has_minicpmv_projector == 2) {
-                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-            }
-            else if (has_minicpmv_projector == 3) {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }
+            encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
         }
 
         if (!encoded) {

diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 38c44e130e15c..53d902d616e85 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -140,6 +140,9 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     else if (has_minicpmv_projector == 3) {
         system_prompt = "<|im_start|>user\n";
     }
+    else if (has_minicpmv_projector == 4) {
+        system_prompt = "<|im_start|>user\n";
+    }
     LOG_INF("%s: image token past: %d\n", __func__, n_past);
     eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -227,6 +230,9 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
         else if (has_minicpmv_projector == 3) {
             user_prompt = "<|im_start|>user\n" + prompt;
         }
+        else if (has_minicpmv_projector == 4) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
     }
 
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -236,6 +242,9 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
     else if (has_minicpmv_projector == 3) {
         eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
     }
+    else if (has_minicpmv_projector == 4) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
 
     // generate the response
 
@@ -308,7 +317,6 @@ int main(int argc, char ** argv) {
             const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
             response += tmp;
             if (strcmp(tmp, "</s>") == 0) break;
-            if (strstr(tmp, "###")) break; // Yi-VL behavior
             printf("%s", tmp);// mistral llava-1.6
             if (strstr(response.c_str(), "<user>")) break; // minicpm-v
             fflush(stdout);

diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
index ea773742a832b..ade04fd9c5c00 100644
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -501,7 +501,7 @@ def bytes_to_unicode():
 default_image_std = [0.26862954, 0.26130258, 0.27577711]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
-ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
 
 # with proper
 args = ap.parse_args()
@@ -545,13 +545,20 @@ def bytes_to_unicode():
 
 minicpmv_version = args.minicpmv_version
 emb_dim = 4096
+block_count = 26
 if minicpmv_version == 1:
     emb_dim = 2304
+    block_count = 26
 elif minicpmv_version == 2:
     emb_dim = 4096
+    block_count = 27
 elif minicpmv_version == 3:
     emb_dim = 3584
-
+    block_count = 27
+elif minicpmv_version == 4:
+    emb_dim = 3584
+    block_count = 27
+    
 default_vision_config = {
         "hidden_size": 1152,
        "image_size": 980,
@@ -567,7 +574,10 @@ def bytes_to_unicode():
 if minicpmv_version == 3:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
-
+elif minicpmv_version == 4:
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
+    
 processor = None
 # if model.attn_pool is not None:
 #     model.attn_pool = torch.nn.Identity()
@@ -587,7 +597,7 @@ def bytes_to_unicode():
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_minicpmv_projector = True
-    minicpmv_version = 3
+    minicpmv_version = 4
 elif args.vision_only:
     fname_middle = "vision-"
     has_text_encoder = False
@@ -625,7 +635,6 @@ def bytes_to_unicode():
     fout.add_uint32("clip.vision.projection_dim", 0)
     fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
     fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
-    block_count = 26
     fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
 
 if processor is not None:

diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py
index 748ff5c57824e..ba82116582b1f 100644
--- a/examples/llava/minicpmv-surgery.py
+++ b/examples/llava/minicpmv-surgery.py
@@ -8,7 +8,7 @@
 args = ap.parse_args()
 
 # find the model part that includes the the multimodal projector weights
-model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
 checkpoint = model.state_dict()
 
 # get a list of mm tensor names

From ee3fae86d5bf08c1d8f20e6c64315f6f359415c2 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Wed, 15 Jan 2025 12:20:31 +0800
Subject: [PATCH 2/9] add readme

---
 examples/llava/README-minicpmo2.6.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md
index fc65f622551e6..ea220224ac8aa 100644
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -1,4 +1,5 @@
 ## MiniCPM-o 2.6
+Currently, this readme only covers minicpm-omni's image capabilities; full-mode support will be added as soon as possible.
 
 ### Prepare models and code
 

From ed02c70ce49a5616ddfa7848b5a91d8b61efa554 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Thu, 16 Jan 2025 13:57:36 +0800
Subject: [PATCH 3/9] update readme

---
 examples/llava/README-minicpmo2.6.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md
index ea220224ac8aa..10b637eb75bc9 100644
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -18,7 +18,7 @@ Convert the PyTorch model to gguf files (you can also download our converted [gguf](
 
 ```bash
 python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
-python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmo.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 
 # quantize int4 version

From 52c6abe0df092b531546564a79c9f79f6561c5b1 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Thu, 16 Jan 2025 14:11:41 +0800
Subject: [PATCH 4/9] no use make

---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 18d3815b88b94..19ae0d5f1c87b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
-# ifndef LLAMA_MAKEFILE
-# $(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
-# endif
+ifndef LLAMA_MAKEFILE
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+endif
 
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \

From c69f9baf2b0fd692fb974496a2573eb12c379c16 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Thu, 16 Jan 2025 14:16:38 +0800
Subject: [PATCH 5/9] update readme

---
 examples/llava/README-minicpmo2.6.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md
index 10b637eb75bc9..8713a43d64fd8 100644
--- a/examples/llava/README-minicpmo2.6.md
+++ b/examples/llava/README-minicpmo2.6.md
@@ -25,11 +25,12 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 ./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
 ```
 
-Build for Linux or Mac
+Build llama.cpp using `CMake`:
+https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
 
 ```bash
-make
-make llama-minicpmv-cli
+cmake -B build
+cmake --build build --config Release
 ```
 
 Inference on Linux or Mac

From 27d0ec81eae603a42a34a7185ba483920b33f539 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Sat, 18 Jan 2025 22:12:13 +0800
Subject: [PATCH 6/9] update fix code

---
 examples/llava/clip.cpp  | 12 ++++++++++--
 examples/llava/llava.cpp |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 076d07cee8259..4251f884e3537 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2049,6 +2049,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                 images[images.size()-1].push_back(patch);
             }
         }
+        delete refine_image;
     }
    return images;
 }
@@ -2087,6 +2088,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
                 clip_image_f32_free(res);
             }
         }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    delete imgs[i][j];
+                }
+            }
+        }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
@@ -2525,8 +2533,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         //  -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
         int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        int bucket_coords_h[70];
-        int bucket_coords_w[70];
+        int bucket_coords_h[1024];
+        int bucket_coords_w[1024];
         for (int i = 0; i < pos_h; i++){
             bucket_coords_h[i] = std::floor(70.0*i/pos_h);
         }

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index f8f16ab585032..2cac7933d2f2a 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -307,6 +307,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         load_image_size->height = img->ny;
         clip_add_load_image_size(ctx_clip, load_image_size);
         LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding

From 136644489660c6a268c808dee95b3adb1e954c4f Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Sat, 18 Jan 2025 23:49:48 +0800
Subject: [PATCH 7/9] fix editorconfig-checker

---
 examples/llava/minicpmv-convert-image-encoder-to-gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
index ade04fd9c5c00..9b196757f07c9 100644
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -558,7 +558,7 @@ def bytes_to_unicode():
 elif minicpmv_version == 4:
     emb_dim = 3584
     block_count = 27
-    
+
 default_vision_config = {
         "hidden_size": 1152,
         "image_size": 980,
@@ -577,7 +577,7 @@ def bytes_to_unicode():
 elif minicpmv_version == 4:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
-    
+
 processor = None
 # if model.attn_pool is not None:
 #     model.attn_pool = torch.nn.Identity()

From fee2aa687a8cb56b09143968be809a91a93f5093 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Mon, 20 Jan 2025 10:40:16 +0800
Subject: [PATCH 8/9] no change convert py

---
 convert_hf_to_gguf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 63151346945f8..4dc9837ab2d04 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -579,7 +579,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
         # or pull the latest version of the model from Huggingface
         # don't edit the hashes manually!
-        res = "qwen2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"

From 27f5e8a10a3c8916f18b786e8d24db4c672b9891 Mon Sep 17 00:00:00 2001
From: caitianchi
Date: Tue, 21 Jan 2025 17:08:45 +0800
Subject: [PATCH 9/9] use clip_image_u8_free

---
 examples/llava/clip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 4251f884e3537..24073c5a9b15f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2049,7 +2049,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                 images[images.size()-1].push_back(patch);
             }
         }
-        delete refine_image;
+        clip_image_u8_free(refine_image);
     }
    return images;
 }
@@ -2091,7 +2091,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
                 if (imgs[i][j] != nullptr) {
-                    delete imgs[i][j];
+                    clip_image_u8_free(imgs[i][j]);
                 }
             }
         }
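
Note on patches 6 and 9: the leak fixes work because every slice image allocated inside `uhd_slice_image()` is now released exactly once, and patch 9 further swaps the raw `delete` for the library's own `clip_image_u8_free()`, so allocation and deallocation stay behind the same clip.cpp API. Below is a minimal sketch of that ownership pattern, assuming only the `clip_image_u8_init()`/`clip_image_u8_free()` pair declared in `examples/llava/clip.h`; the `make_slices()` helper and the slice count are hypothetical stand-ins for what `uhd_slice_image()` does, not the actual implementation.

```cpp
// Sketch only: mirrors the cleanup pattern introduced by patches 6 and 9.
#include "clip.h"   // assumed API: clip_image_u8_init(), clip_image_u8_free()

#include <vector>

// Hypothetical stand-in for uhd_slice_image(): allocates slices that the
// caller must release with clip_image_u8_free(), never with a raw delete,
// since clip_image_u8 objects are created by clip.cpp's allocator.
static std::vector<clip_image_u8 *> make_slices(int n) {
    std::vector<clip_image_u8 *> slices;
    for (int i = 0; i < n; i++) {
        slices.push_back(clip_image_u8_init());
    }
    return slices;
}

int main() {
    std::vector<clip_image_u8 *> slices = make_slices(4);
    // ... encode or preprocess each slice here, as clip_image_preprocess() does ...
    for (clip_image_u8 * slice : slices) {
        clip_image_u8_free(slice);  // matching deallocator, as in patch 9
    }
    return 0;
}
```

Pairing each allocation with the exported free function keeps allocation and deallocation on the same side of the library boundary, which is why patch 9 replaces the `delete` statements that patch 6 introduced.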