@@ -8462,28 +8462,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8462
8462
new_type = GGML_TYPE_Q8_0;
8463
8463
}
8464
8464
} else if (name.find("ffn_down") != std::string::npos) {
8465
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8466
+ const int i_layer = qs.i_feed_forward_w2 / n_expert;
8467
+ const int n_layer = qs.n_feed_forward_w2 / n_expert;
8465
8468
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8466
8469
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8467
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2 /8) new_type = GGML_TYPE_Q4_K;
8470
+ if (i_layer < n_layer /8) new_type = GGML_TYPE_Q4_K;
8468
8471
}
8469
8472
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8470
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2 /16 ? GGML_TYPE_Q5_K
8471
- : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2 ) ? GGML_TYPE_Q4_K
8473
+ new_type = i_layer < n_layer /16 ? GGML_TYPE_Q5_K
8474
+ : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer ) ? GGML_TYPE_Q4_K
8472
8475
: GGML_TYPE_Q3_K;
8473
8476
}
8474
8477
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
8475
8478
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
8476
8479
}
8477
8480
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8478
8481
if (arch == LLM_ARCH_FALCON) {
8479
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2 /16 ? GGML_TYPE_Q6_K :
8480
- use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2 ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8482
+ new_type = i_layer < n_layer /16 ? GGML_TYPE_Q6_K :
8483
+ use_more_bits(i_layer, n_layer ) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8481
8484
} else {
8482
- if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
8485
+ if (use_more_bits(i_layer, n_layer )) new_type = GGML_TYPE_Q6_K;
8483
8486
}
8484
8487
}
8485
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2 )) new_type = GGML_TYPE_Q6_K;
8486
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2 /8) {
8488
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer )) new_type = GGML_TYPE_Q6_K;
8489
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer /8) {
8487
8490
new_type = GGML_TYPE_Q5_K;
8488
8491
}
8489
8492
++qs.i_feed_forward_w2;
0 commit comments