
Commit c262bed

CUDA: Prefer vector flash decoding kernel for Gemma models (#12738)
* Prefer vector flash decoding kernel for Gemma models

  The vector flash decoding kernel was not being picked for models with head dimension 256. Gemma models are in this category. Removing this limit improves end-to-end performance by up to 12% in generation-phase throughput for Gemma models.

* Update ggml/src/ggml-cuda/fattn.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
1 parent 5dd5d1a commit c262bed

1 file changed: +1 −1 lines changed

ggml/src/ggml-cuda/fattn.cu (+1 −1)
@@ -299,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
     const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
     const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
-    const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128);
+    const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
     if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
         if (prec == GGML_PREC_DEFAULT) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
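For context, a minimal standalone sketch of how the old and new gates evaluate for a Gemma-like case. The constants here (head dimension 256, 32-wide warps, non-default attention precision requested) are assumptions for illustration, not values read from ggml:

// Standalone sketch (not part of ggml): evaluates the old and new
// can_use_vector_kernel gates for an assumed Gemma-like configuration.
#include <cstdio>

int main() {
    const int  head_dim        = 256;   // assumed Q->ne[0] for Gemma-family models
    const int  warp_size       = 32;    // typical CUDA warp size
    const bool prec_is_default = false; // assume higher (non-default) attention precision is requested

    // Old gate: also required default precision for head dimensions above 128.
    const bool old_gate = (head_dim % (2*warp_size) == 0) && (prec_is_default || head_dim <= 128);
    // New gate: only requires the head dimension to be a multiple of 2*warp_size.
    const bool new_gate = head_dim % (2*warp_size) == 0;

    printf("old gate: %d, new gate: %d\n", old_gate, new_gate); // prints "old gate: 0, new gate: 1"
    return 0;
}

Under these assumptions the old gate rejects the vector kernel while the new gate accepts it, which matches the behavior change described in the commit message.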

0 commit comments
