@@ -718,6 +718,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 3) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 4) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -1053,6 +1056,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             n_head = hidden_size/d_head;
             num_query = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            hidden_size = 3584;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
 
         struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
         Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -2041,6 +2049,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                 images[images.size()-1].push_back(patch);
             }
         }
+        clip_image_u8_free(refine_image);
     }
     return images;
 }
@@ -2079,6 +2088,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
                 clip_image_f32_free(res);
             }
         }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    clip_image_u8_free(imgs[i][j]);
+                }
+            }
+        }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
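
The two additions above plug memory leaks in the MiniCPM-V slicing path: uhd_slice_image now frees the intermediate refine_image once its patches have been copied out, and clip_image_preprocess frees every clip_image_u8 slice once it has been converted into the clip_image_f32 batch. Below is a small sketch (not part of the patch) of the same ownership expressed with RAII; it assumes the existing clip_image_u8_init/clip_image_u8_free API from clip.h and linking against clip.cpp, and the alias u8_image_ptr is made up here.

```cpp
#include <memory>
#include "clip.h"   // clip_image_u8_init(), clip_image_u8_free()

// unique_ptr with the library's own free function as deleter
using u8_image_ptr = std::unique_ptr<clip_image_u8, decltype(&clip_image_u8_free)>;

int main() {
    // equivalent to what the new free calls do by hand: every image allocated
    // with clip_image_u8_init() is released exactly once
    u8_image_ptr refine_image(clip_image_u8_init(), &clip_image_u8_free);
    // ... use refine_image.get() for resizing/slicing ...
    return 0;  // deleter runs here, no explicit clip_image_u8_free needed
}
```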
@@ -2335,6 +2351,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
     } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         int patch_size = params.patch_size * 2;
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2514,8 +2533,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int * positions_data = (int *)malloc(ggml_nbytes(positions));
-            int bucket_coords_h[70];
-            int bucket_coords_w[70];
+            int bucket_coords_h[1024];
+            int bucket_coords_w[1024];
             for (int i = 0; i < pos_h; i++){
                 bucket_coords_h[i] = std::floor(70.0*i/pos_h);
             }
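
This hunk only grows the two stack buffers from 70 to 1024 entries. The loops that fill them write pos_h (and, for the width buffer, pos_w) entries, so a slice whose patch grid exceeds 70 on a side would previously have written past the end of the arrays; the bucketing formula itself still maps positions onto 70 buckets. A standalone illustration is below; the value of pos_h is hypothetical, chosen only to show the old bound being exceeded.

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int pos_h = 96;       // a slice 96 patches tall: more than the old capacity of 70
    int bucket_coords_h[1024];  // with the previous size of 70, the loop below overflowed
    for (int i = 0; i < pos_h; i++) {
        // same formula as clip.cpp: map row i of pos_h rows onto 70 position buckets
        bucket_coords_h[i] = (int) std::floor(70.0 * i / pos_h);
    }
    printf("rows written: %d, last bucket: %d\n", pos_h, bucket_coords_h[pos_h - 1]);  // 96, 69
    return 0;
}
```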
@@ -2543,6 +2562,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 3) {
                 embed_dim = 3584;
             }
+            else if (ctx->minicpmv_version == 4) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2786,6 +2808,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             return 3584;
         }
+        else if (ctx->minicpmv_version == 4) {
+            return 3584;
+        }
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         return ctx->vision_model.mm_1_b->ne[0];
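
Taken together, the minicpmv_version == 4 branches added across these hunks all reuse the 3584-dimensional resampler embedding and 64-query setup of version 3. One possible consolidation, sketched below and not part of the patch, would fold the repeated embedding-size branches into a single helper; the name minicpmv_embed_dim is hypothetical.

```cpp
// Sketch only: a helper that would centralize the projector embedding width.
// clip.cpp currently keeps these branches inline at each call site touched by this patch.
static int minicpmv_embed_dim(int minicpmv_version) {
    switch (minicpmv_version) {
        case 2:  return 4096;  // existing MiniCPM-V 2 path in clip.cpp
        case 3:
        case 4:  return 3584;  // versions 3 and 4 share the 3584-dim embedding
        default: return 0;     // unknown version
    }
}
```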