Skip to content

Commit faa3526

Browse files
ikawrakow (Kawrakow)
and co-author authored
Fix Q3_K_XS for MoE models (#5113)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent ddc5a50 commit faa3526

File tree

1 file changed

+25
-20
lines changed

1 file changed

+25
-20
lines changed

llama.cpp

+25-20
Original file line numberDiff line numberDiff line change
@@ -8829,6 +8829,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
88298829
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
88308830
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
88318831
};
8832+
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8833+
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
8834+
if (n_expert > 1) {
8835+
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8836+
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8837+
// for getting the current layer as I initially thought, and we need to resort to parsing the
8838+
// tensor name.
8839+
n_layer /= n_expert;
8840+
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
8841+
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
8842+
}
8843+
if (i_layer < 0 || i_layer >= n_layer) {
8844+
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
8845+
}
8846+
}
8847+
return std::make_pair(i_layer, n_layer);
8848+
};
88328849

88338850
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
88348851
int nx = tensor->ne[0];
@@ -8890,24 +8907,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
88908907
new_type = GGML_TYPE_Q2_K;
88918908
}
88928909
} else if (name.find("ffn_down") != std::string::npos) {
8893-
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8894-
int i_layer, n_layer;
8895-
if (n_expert == 1) {
8896-
i_layer = qs.i_ffn_down;
8897-
n_layer = qs.n_ffn_down;
8898-
} else {
8899-
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
8900-
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
8901-
// for getting the current layer as I initially thought, and we need to resort to parsing the
8902-
// tensor name.
8903-
n_layer = qs.n_ffn_down / n_expert;
8904-
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
8905-
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
8906-
}
8907-
if (i_layer < 0 || i_layer >= n_layer) {
8908-
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
8909-
}
8910-
}
8910+
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
8911+
int i_layer = info.first, n_layer = info.second;
89118912
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
89128913
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
89138914
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
@@ -8963,13 +8964,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
89638964
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
89648965
}
89658966
else if (name.find("ffn_gate") != std::string::npos) {
8966-
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
8967+
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
8968+
int i_layer = info.first, n_layer = info.second;
8969+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
89678970
new_type = GGML_TYPE_Q2_K;
89688971
}
89698972
++qs.i_ffn_gate;
89708973
}
89718974
else if (name.find("ffn_up") != std::string::npos) {
8972-
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
8975+
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
8976+
int i_layer = info.first, n_layer = info.second;
8977+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) {
89738978
new_type = GGML_TYPE_Q2_K;
89748979
}
89758980
++qs.i_ffn_up;

0 commit comments

Comments
 (0)