Commit 03562f3

llama : support attention bias on LLaMA architecture (#4283)

* Support attention_bias on LLaMA architecture: QKVO bias, should fix InternLM (#3133) and works for LLaMAfied Qwen models (#3743 (comment)).
* Check existence of QKVO bias while loading llama models; tested on LLaMA2, CUDA and CPU.
* Update llama.cpp

1 parent 37c746d commit 03562f3
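The whole change follows one pattern: try to load each attention bias tensor, treat the loader's "not found" error as "this checkpoint has no such bias", and add the bias in the compute graph only when it was actually loaded. Below is a minimal, self-contained C++ sketch of that pattern; `load_tensor` and `load_or_null` are hypothetical stand-ins, not llama.cpp functions.

```cpp
// Sketch of the commit's "optional tensor" pattern (hypothetical helpers,
// not the llama.cpp API): the loader throws "not found" for missing
// tensors; the caller treats that as "no bias" and rethrows anything else.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

using tensor = std::vector<float>;

// Stand-in for a checkpoint: attn_q has a bias, attn_k does not.
static std::map<std::string, tensor> g_checkpoint = {
    {"attn_q.bias", {0.5f, -0.5f}},
};

// Hypothetical loader mimicking ml.create_tensor's failure mode.
static tensor load_tensor(const std::string & name) {
    auto it = g_checkpoint.find(name);
    if (it == g_checkpoint.end()) {
        throw std::runtime_error("tensor '" + name + "' not found");
    }
    return it->second;
}

// Load a tensor if present, otherwise leave it NULL, like the diff does.
static tensor * load_or_null(const std::string & name, tensor & storage) {
    try {
        storage = load_tensor(name);
        return &storage;
    } catch (const std::runtime_error & e) {
        if (std::string(e.what()).find("not found") != std::string::npos) {
            return nullptr;  // missing bias is fine: architecture has none
        }
        throw;               // any other loader error is a real failure
    }
}

int main() {
    tensor bq_store, bk_store;
    tensor * bq = load_or_null("attn_q.bias", bq_store);
    tensor * bk = load_or_null("attn_k.bias", bk_store);

    tensor Qcur = {3.0f, 4.0f};  // pretend result of wq * cur
    if (bq) {                    // bias applied only when it exists
        for (size_t i = 0; i < Qcur.size(); ++i) Qcur[i] += (*bq)[i];
    }
    std::printf("bq %s, bk %s, Qcur = {%.1f, %.1f}\n",
                bq ? "loaded" : "NULL", bk ? "loaded" : "NULL",
                (double) Qcur[0], (double) Qcur[1]);
    return 0;
}
```

The same fallback keeps plain LLaMA checkpoints, which carry no QKVO biases, loading exactly as before: the bias pointers stay NULL and the graph builder skips the adds.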

File tree: 1 file changed (+48 -4 lines)

llama.cpp (+48 -4)
@@ -1266,6 +1266,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
@@ -2809,6 +2812,30 @@ static void llm_load_tensors(
             layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
             layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
 
+            try {
+                layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+            } catch (const std::runtime_error& e) {
+                if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+            }
+
+            try {
+                layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+            } catch (const std::runtime_error& e) {
+                if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+            }
+
+            try {
+                layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+            } catch (const std::runtime_error& e) {
+                if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+            }
+
+            try {
+                layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+            } catch (const std::runtime_error& e) {
+                if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+            }
+
             layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
             layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
@@ -2817,9 +2844,14 @@ static void llm_load_tensors(
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
-                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                    ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                    (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                    (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                    (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                    (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                    ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                    ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
             }
         }
     } break;
@@ -3983,12 +4015,24 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -4007,7 +4051,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
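For reference, the projections this enables are the standard affine forms (notation is mine, not from the diff); when the bias tensors are absent they reduce to the pure matrix multiplies the code used before:

```latex
Q = W_q x + b_q, \qquad
K = W_k x + b_k, \qquad
V = W_v x + b_v, \qquad
\text{out} = W_o \, \mathrm{Attn}(Q, K, V) + b_o
```

Each `ggml_mul_mat` in the graph builder computes the $W x$ term, and the new conditional `ggml_add` supplies the $+\,b$ term; passing `bo` into `llm_build_kqv` handles the output projection's bias the same way.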
