@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
        // Vcur shape [n_embd, N, 1, 1]
        struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

-        // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-        // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+        // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+        // kv_self.v shape [n_embd * kv_size * n_layer, 1]
        // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
        // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

        /* {
-            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                    (   n_ctx)*ggml_element_size(kv_self.v),
-                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    (   kv_size)*ggml_element_size(kv_self.v),
+                    (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

            // important: storing RoPE-ed version of K in the KV cache!
            ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
                    Qcur,
                    0, 2, 1, 3);

-        // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+        // kv_self.k shape [n_embd * kv_size * n_layer, 1]
        // K shape [n_embd/n_head, n_past + N, n_head, 1]
        struct ggml_tensor * K =
            ggml_permute(ctx0,
@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(

        assert_shape_3d(Vcur, N, n_embd, n_batch);

-        // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-        // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+        // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
+        // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
        // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
        // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

        /* {
-            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                    (   n_ctx)*ggml_element_size(kv_self.v),
-                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    (   kv_size)*ggml_element_size(kv_self.v),
+                    (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

            // important: storing RoPE-ed version of K in the KV cache!
            ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
                    0, 2, 1, 3);
        assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

-        // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+        // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
        // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
        struct ggml_tensor * K =
            ggml_permute(ctx0,
@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
        assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

        // split cached V into n_head heads
-        // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+        // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
        // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
        struct ggml_tensor * V =
            ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
                        cur)),
                    n_embd, N)));

-        // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-        // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+        // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+        // kv_self.v shape [n_embd * kv_size * n_layer, 1]
        // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
        // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

        /* {
-            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
            struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                    (   n_ctx)*ggml_element_size(kv_self.v),
-                    (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    (   kv_size)*ggml_element_size(kv_self.v),
+                    (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

            // important: storing RoPE-ed version of K in the KV cache!
            ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
                    Qcur,
                    0, 2, 1, 3);

-        // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+        // kv_self.k shape [n_embd * kv_size * n_layer, 1]
        // K shape [n_embd/n_head, n_past + N, n_head, 1]
        struct ggml_tensor * K =
            ggml_permute(ctx0,
@@ -1470,15 +1470,15 @@ int main(int argc, char ** argv) {
    /*
    struct llama_model_lora model_lora;
    // model.hparams.n_vocab = 6;
-    // model.hparams.n_ctx = 64;
+    // model.hparams.kv_size = 64;
    // model.hparams.n_embd = 128;
    // model.hparams.n_mult = 2;
    // model.hparams.n_head = 8;
    // model.hparams.n_layer = 6;
    // model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;

    model_lora.hparams.n_vocab = 16;
-    model_lora.hparams.n_ctx = 32;
+    model_lora.hparams.kv_size = 32;
    model_lora.hparams.n_embd = 256;
    model_lora.hparams.n_mult = 2;
    model_lora.hparams.n_head = 16;
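
For readers following the index arithmetic in the hunks above, here is a minimal, standalone sketch (not part of the patch; the sizes are hypothetical) of how the ggml_view_1d/ggml_view_2d offsets resolve once n_ctx is replaced by kv_size. It assumes kv_self.k holds n_embd * kv_size * n_layer elements and kv_self.v holds kv_size * n_embd * n_layer elements, as the shape comments state.

#include <stdio.h>

// Element offset of the K slots for tokens [n_past, n_past + N) of layer il,
// mirroring ggml_view_1d(..., (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past)).
static size_t k_view_offset(size_t n_embd, size_t kv_size, size_t il, size_t n_past) {
    return n_embd * (il * kv_size + n_past);
}

// Element offset of row `row` of the V view for layer il starting at n_past.
// The 2d view above uses a row stride of kv_size elements and a base offset of
// il*kv_size*n_embd + n_past elements.
static size_t v_view_offset(size_t n_embd, size_t kv_size, size_t il, size_t n_past, size_t row) {
    return il * kv_size * n_embd + row * kv_size + n_past;
}

int main(void) {
    // hypothetical sizes, for illustration only
    const size_t n_embd = 8, kv_size = 32, il = 2, n_past = 5;

    printf("k view starts at element %zu\n", k_view_offset(n_embd, kv_size, il, n_past));
    printf("v view row 0 starts at element %zu\n", v_view_offset(n_embd, kv_size, il, n_past, 0));
    printf("v view row 1 starts at element %zu\n", v_view_offset(n_embd, kv_size, il, n_past, 1));
    return 0;
}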