@@ -1266,6 +1266,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
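Note on the struct change: the new pointers sit beside the existing bias members so that checkpoints without attention bias simply leave them unset. A minimal sketch of the convention, assuming the layer objects are value-initialized (as they are when the layers vector is resized):

    // Assumption for illustration: value-initialization zeroes every member,
    // so a bias tensor that is never loaded stays NULL.
    struct llama_layer layer = {};
    // After loading, layer.bq is either a real tensor or still NULL,
    // so graph-building code can branch on its presence: if (layer.bq) { ... }
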
@@ -2809,6 +2812,30 @@ static void llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
 
+                        try {
+                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+                        }
+
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
                         layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
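The four added blocks repeat one pattern: attempt to load an optional bias tensor and fall back to NULL when the loader reports it as missing, while re-throwing any other error. A hypothetical refactor (not part of this patch) could factor that into a single helper; the name create_tensor_optional is invented here, and the signature is inferred from the call sites above:

    // Hypothetical helper (illustrative, not in the patch). Assumes
    // llama_model_loader::create_tensor matches its call sites in this file
    // and that ggml_backend_type is the backend enum used by GGML_BACKEND_GPU.
    static struct ggml_tensor * create_tensor_optional(
            llama_model_loader & ml, struct ggml_context * ctx,
            const std::string & name, const std::vector<int64_t> & ne,
            ggml_backend_type backend) {
        try {
            return ml.create_tensor(ctx, name, ne, backend);
        } catch (const std::runtime_error & e) {
            // A missing tensor means "this model has no such bias";
            // any other loader failure is a real error.
            if (std::string(e.what()).find("not found") != std::string::npos) {
                return NULL;
            }
            throw;
        }
    }

Each load would then collapse to a one-liner, e.g. layer.bq = create_tensor_optional(ml, ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
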
@@ -2817,9 +2844,14 @@ static void llm_load_tensors(
 
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)       + ggml_nbytes(layer.wk)       +
-                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo)       + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) +
+                                (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                                (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                                (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                                (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
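The null guards keep the VRAM tally correct for both biased and bias-free models: ggml_nbytes returns a tensor's size in bytes, and a bias that was never loaded contributes zero. The repeated ternaries could equally be written with a small helper, sketched here for illustration only:

    // Illustrative helper: count a tensor's bytes, treating NULL as empty.
    static size_t nbytes_or_zero(const struct ggml_tensor * t) {
        return t ? ggml_nbytes(t) : 0;
    }

Either way the bias terms are small. Assuming F32 biases and no GQA (n_embd = n_embd_gqa = 4096, as in a 7B-class model), each of bq/bk/bv/bo is 4096 * 4 bytes, about 16 KiB per layer.
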
@@ -3983,12 +4015,24 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
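Each of the three new blocks has the same shape: a matmul, then, only when the bias tensor exists, a ggml_add that broadcasts the bias row across all token columns, with cb(...) repeated so the updated node keeps its name for the layer callback. A hypothetical one-line wrapper (not in the patch) makes the pattern explicit:

    // Hypothetical wrapper, illustrative only: apply an optional bias.
    // ggml_add broadcasts the smaller operand, so a bias of shape [n_embd]
    // is added to every column of a [n_embd, n_tokens] activation.
    static struct ggml_tensor * add_bias_if_present(
            struct ggml_context * ctx,
            struct ggml_tensor * cur,
            struct ggml_tensor * bias) {   // bias may be NULL
        return bias ? ggml_add(ctx, cur, bias) : cur;
    }
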
@@ -4007,7 +4051,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
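The final hunk stops hard-coding NULL for the output-projection bias: llm_build_kqv already takes an optional bias argument alongside wo, and applies it after the output projection when it is non-NULL. A hedged paraphrase of that tail, inferred from the call signature here rather than quoted verbatim:

    // Paraphrase (not verbatim llama.cpp): end of llm_build_kqv.
    cur = ggml_mul_mat(ctx, wo, cur);    // output projection
    if (wo_b) {
        cur = ggml_add(ctx, cur, wo_b);  // optional output bias (layer.bo)
    }

Passing model.layers[il].bo therefore activates the output bias for models that ship one, and is a no-op (still NULL) for everything else.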