@@ -196,6 +196,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
     }
     compile_count_cond.notify_all();
 }
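Note on the hunk above: the dot is printed while compile_count_mutex is held, so output from concurrent compile threads cannot interleave, and total_compile_count is a function-local static, so it keeps counting across all pipeline compiles. A minimal standalone sketch of the same pattern, with illustrative names not taken from the patch:

    #include <cstdint>
    #include <iostream>
    #include <mutex>

    // One '.' per ten completed tasks; the mutex keeps the counter update
    // and the stream write consistent across worker threads.
    static std::mutex progress_mutex;

    void report_task_done() {
        std::lock_guard<std::mutex> guard(progress_mutex);
        static uint32_t total = 0;
        if ((total++ % 10) == 0) {
            std::cerr << ".";
        }
    }

    int main() {
        for (int i = 0; i < 100; i++) {
            report_task_done(); // prints 10 dots in total
        }
        std::cerr << " Done!" << std::endl;
    }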
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
+    std::cerr << "ggml_vulkan: Compiling shaders";
+
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+    std::cerr << "Done!" << std::endl;
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
     }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
     }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
     }
 
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
     GGML_ABORT("fatal error");
 }
 
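The point of the new contig variants: when both tensors are contiguous, the copy degenerates to a flat 1-D transfer, and the shader can skip the per-element stride arithmetic the generic cpy shader must perform. The `!dst` case covers call sites that copy into a freshly allocated temporary buffer, which is contiguous by construction (those sites pass nullptr, as seen below). A CPU-side analogue of the two paths, purely for illustration; the real work happens in the GLSL shaders, which this sketch does not reproduce:

    #include <cstdint>
    #include <cstring>

    // Contiguous case: one flat copy, no index math per element.
    void copy_contig(const float * src, float * dst, uint32_t ne) {
        std::memcpy(dst, src, ne * sizeof(float));
    }

    // Generic case: rebuild the 4-D coordinate for every element and apply
    // the source/destination byte strides (ggml's nb[]) individually.
    void copy_strided(const char * src, char * dst,
                      const uint32_t ne[4],
                      const uint32_t nb_src[4], const uint32_t nb_dst[4]) {
        for (uint32_t i3 = 0; i3 < ne[3]; i3++)
        for (uint32_t i2 = 0; i2 < ne[2]; i2++)
        for (uint32_t i1 = 0; i1 < ne[1]; i1++)
        for (uint32_t i0 = 0; i0 < ne[0]; i0++) {
            const char * s = src + i0*nb_src[0] + i1*nb_src[1] + i2*nb_src[2] + i3*nb_src[3];
            char       * d = dst + i0*nb_dst[0] + i1*nb_dst[1] + i2*nb_dst[2] + i3*nb_dst[3];
            std::memcpy(d, s, sizeof(float));
        }
    }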
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
 
     const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0.0f, 0.0f,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
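Context for the sizing logic above: Vulkan bounds the workgroup count per dispatch dimension (maxComputeWorkGroupCount; the spec guarantees only 65535 per axis), so a flat { ne, 1, 1 } dispatch can exceed the x-axis limit on very large tensors even after the dispatcher divides by the pipeline's {512, 1, 1} workgroup granularity. Folding the count into y and z (262144 = 512 * 512) keeps every axis small. A sketch of the rule, assuming CEIL_DIV is the usual round-up division helper used in ggml:

    #include <array>
    #include <cstdint>

    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

    // Fold a 1-D element count into a 3-D dispatch grid whose axes stay
    // within per-dimension limits. E.g. split_elements(1000000) yields
    // {512, 512, 4}, and 512 * 512 * 4 = 1048576 >= 1000000.
    std::array<uint32_t, 3> split_elements(uint32_t ne) {
        if (ne > 262144) {
            return { 512, 512, CEIL_DIV(ne, 262144) };
        } else if (ne > 512) {
            return { 512, CEIL_DIV(ne, 512), 1 };
        } else {
            return { ne, 1, 1 };
        }
    }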
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_COS: