
Commit 998b635

Fix ffn_down quantization mix for MoE models
In #4872 I did not consider the part where every third tensor is quantized with more bits. For MoE models this leads to tensors of the same layer being quantized with a different number of bits, which the inference implementation does not handle (it assumes that all experts use the same quantization).
1 parent 76484fb commit 998b635

File tree

1 file changed: +11 -8 lines

llama.cpp (+11 -8)

@@ -8462,28 +8462,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        const int i_layer = qs.i_feed_forward_w2 / n_expert;
+        const int n_layer = qs.n_feed_forward_w2 / n_expert;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
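For reference, below is a minimal, self-contained sketch (not part of the commit) of why dividing the running ffn_down counter by the number of experts keeps all experts of a layer on the same quantization type. The expert and layer counts and the use_more_bits() stand-in are illustrative assumptions, not copied from llama.cpp.

// Sketch: per-tensor counter vs. per-layer index for ffn_down quantization in an MoE model.
#include <algorithm>
#include <cstdio>

// Rough stand-in for llama.cpp's use_more_bits(): the first and last eighth of the
// indices, plus every third one in between, get the higher-bit quantization.
static bool use_more_bits(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8) % 3 == 2;
}

int main() {
    const int n_expert = std::max(1, 8);         // hypothetical MoE model: 8 experts per layer
    const int n_layers = 16;                     // hypothetical layer count
    const int n_ffn_down = n_layers * n_expert;  // one ffn_down tensor per expert per layer

    for (int counter = 0; counter < n_ffn_down; ++counter) {
        // Old behaviour: the raw per-tensor counter is compared against the thresholds,
        // so experts of the same layer can straddle a threshold and get different types.
        const bool old_more_bits = use_more_bits(counter, n_ffn_down);
        // New behaviour: all experts of a layer collapse onto the same i_layer,
        // so they always receive the same quantization type.
        const bool new_more_bits = use_more_bits(counter / n_expert, n_ffn_down / n_expert);
        std::printf("tensor %3d (layer %2d, expert %d): old=%d new=%d\n",
                    counter, counter / n_expert, counter % n_expert,
                    (int) old_more_bits, (int) new_more_bits);
    }
    return 0;
}

With these illustrative numbers, tensors 42 and 47 both belong to layer 5 but fall on different sides of the "every third" rule under the old per-tensor indexing; with the new indexing both map to i_layer = 5 and get the same type.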
