From 998b635a17adc33dfccf8083417efc721351f0f3 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sun, 14 Jan 2024 08:39:10 +0200
Subject: [PATCH 1/3] Fix ffn_down quantization mix for MoE models

In #4872 I did not consider the part where every third tensor is
quantized with more bits. For MoE this leads to tensors of the same
layer being quantized with a different number of bits, which is not
considered a possibility in the inference implementation (it is
assumed all experts use the same quantization).

---
 llama.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 66494974abb6f..28de3c7416293 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8462,13 +8462,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
+        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+        const int i_layer = qs.i_feed_forward_w2 / n_expert;
+        const int n_layer = qs.i_feed_forward_w2 / n_expert;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
-                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8476,14 +8479,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
-                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
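
For context, the "every third tensor is quantized with more bits" rule from the
commit message above is the use_more_bits() helper that the hunks call. A
minimal sketch of that kind of selector, keyed off the layer position the same
way the diff is (an illustration, not necessarily the verbatim llama.cpp
source):

static bool use_more_bits(int i_layer, int n_layer) {
    // Up-quantize the first and last eighth of the layers, plus every
    // third layer in between.
    return i_layer < n_layer/8 || i_layer >= 7*n_layer/8 || (i_layer - n_layer/8)%3 == 2;
}

Before this patch, qs.i_feed_forward_w2 advanced once per expert tensor, so the
experts of a single MoE layer could straddle the %3 boundary and receive
different quantization types; dividing the counter by n_expert makes all
experts of a layer evaluate use_more_bits() with the same i_layer.
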
From 121eb0664049dc78dd392655c9fe39bd9382bdde Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sun, 14 Jan 2024 09:39:56 +0200
Subject: [PATCH 2/3] Fix the fix

---
 llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 28de3c7416293..71a0b8e89909e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8463,8 +8463,18 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
-        const int i_layer = qs.i_feed_forward_w2 / n_expert;
-        const int n_layer = qs.i_feed_forward_w2 / n_expert;
+        int i_layer, n_layer;
+        if (n_expert == 1) {
+            i_layer = qs.i_feed_forward_w2;
+            n_layer = qs.n_feed_forward_w2;
+        } else {
+            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
+            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // for getting the current layer as I initially thought, and we need to resort to parsing the
+            // tensor name.
+            n_layer = qs.n_feed_forward_w2 / n_expert;
+            sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer);
+        }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;

From 00cc67e2e4b85ed452f7dde605eb63b50ac21f81 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sun, 14 Jan 2024 10:52:55 +0200
Subject: [PATCH 3/3] Review suggestion

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 71a0b8e89909e..e2cb447ad7f68 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8473,7 +8473,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
             n_layer = qs.n_feed_forward_w2 / n_expert;
-            sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer);
+            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
+            }
+            if (i_layer < 0 || i_layer >= n_layer) {
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
+            }
         }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
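
As a self-contained demonstration of the name parsing and validation that
patches 2 and 3 introduce, the sketch below mirrors the same logic outside
llama.cpp; the helper name layer_from_name() and the main() driver are
hypothetical, added only for illustration:

#include <cstdio>
#include <stdexcept>
#include <string>

// Extract the layer index from a tensor name such as "blk.17.ffn_down.weight".
// std::sscanf returns the number of conversions assigned, so a return value of
// 1 means the %d was filled in; the range check catches out-of-range indices.
static int layer_from_name(const std::string & name, int n_layer) {
    int i_layer = -1;
    if (std::sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("failed to determine layer for tensor " + name);
    }
    if (i_layer < 0 || i_layer >= n_layer) {
        throw std::runtime_error("bad layer " + std::to_string(i_layer) + " for tensor " + name);
    }
    return i_layer;
}

int main() {
    std::printf("%d\n", layer_from_name("blk.17.ffn_down.weight", 32)); // prints 17
}

One caveat worth noting: sscanf counts assigned conversions only, so it cannot
detect a mismatched ".ffn_down" suffix once the number has been read. In the
patch this is harmless, because the branch is already guarded by
name.find("ffn_down") != std::string::npos.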