server: rename legacy --ctx-size to --kv-size option #5546

Status: Closed
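
Taken together, the diff renames the user-facing context size into a KV cache size: the CLI flag -c/--ctx-size becomes -kv/--kv-size (the old spelling stays as a deprecated alias), the gpt_params/llama_context_params field n_ctx becomes kv_size, and the accessor llama_n_ctx becomes llama_kv_size. A minimal sketch of what calling code looks like after the rename; the model path and sizes are placeholders, backend init and most error handling are trimmed, and the exact signatures should be checked against the headers in this PR:

#include <cstdio>
#include "llama.h"

int main() {
    // Assumed model path; point this at a local GGUF file.
    const char * model_path = "models/7B/ggml-model-q4_0.gguf";

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.kv_size = 2048;  // previously: cparams.n_ctx = 2048;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // The accessor is renamed as well (previously llama_n_ctx).
    printf("kv_size = %d\n", (int) llama_kv_size(ctx));

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}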

README.md: 4 changes (2 additions, 2 deletions)

@@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: kv_size = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
@@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB

system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0


Building a website can be done in 10 simple steps:

common/common.cpp: 18 changes (13 additions, 5 deletions)

@@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
sparams.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.kv_size = std::stoi(argv[i]);
fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
} else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
@@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
@@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();

cparams.n_ctx = params.n_ctx;
cparams.kv_size = params.kv_size;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
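
The parsing change above keeps -c/--ctx-size working as a deprecated alias that writes to the new kv_size field and warns on stderr, while -kv/--kv-size/--kv_size becomes the canonical spelling. A stripped-down sketch of that alias pattern, written as a standalone toy parser rather than the repository's actual gpt_params_parse_ex:

#include <cstdio>
#include <string>

struct toy_params {
    int kv_size = 512; // same default as gpt_params above
};

// Toy parser illustrating the deprecated-alias pattern from common.cpp:
// the legacy -c/--ctx-size spelling still works, but it writes to the new
// kv_size field and prints a deprecation warning; -kv/--kv-size is canonical.
static bool parse_args(int argc, char ** argv, toy_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];
        if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) return false;
            params.kv_size = std::stoi(argv[i]);
            fprintf(stderr, "warning: -c,--ctx-size is deprecated, use --kv-size instead\n");
        } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
            if (++i >= argc) return false;
            params.kv_size = std::stoi(argv[i]);
        }
    }
    return true;
}

int main(int argc, char ** argv) {
    toy_params params;
    if (!parse_args(argc, argv, params)) {
        fprintf(stderr, "error: missing value for option\n");
        return 1;
    }
    printf("kv_size = %d\n", params.kv_size);
    return 0;
}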

common/common.h: 2 changes (1 addition, 1 deletion)

@@ -50,7 +50,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t kv_size = 512; // KV Cache size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding

examples/Miku.sh: 4 changes (2 additions, 2 deletions)

@@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"

# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
KV_SIZE="${KV_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"

GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--kv_size "$KV_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647

examples/alpaca.sh: 2 changes (1 addition, 1 deletion)

@@ -10,7 +10,7 @@ cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

examples/baby-llama/baby-llama.cpp: 42 changes (21 additions, 21 deletions)

@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
// Vcur shape [n_embd, N, 1, 1]
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(

assert_shape_3d(Vcur, N, n_embd, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
// v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
0, 2, 1, 3);
assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// K shape [n_embd/n_head, n_past + N, n_head, n_batch]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

// split cached V into n_head heads
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
struct ggml_tensor * V =
ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
cur)),
n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -1470,15 +1470,15 @@ int main(int argc, char ** argv) {
/*
struct llama_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.kv_size = 64;
// model.hparams.n_embd = 128;
// model.hparams.n_mult = 2;
// model.hparams.n_head = 8;
// model.hparams.n_layer = 6;
// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;

model_lora.hparams.n_vocab = 16;
model_lora.hparams.n_ctx = 32;
model_lora.hparams.kv_size = 32;
model_lora.hparams.n_embd = 256;
model_lora.hparams.n_mult = 2;
model_lora.hparams.n_head = 16;

examples/batched-bench/batched-bench.cpp: 2 changes (1 addition, 1 deletion)

@@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.kv_size = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;

examples/batched.swift/Sources/main.swift: 10 changes (5 additions, 5 deletions)

@@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par

var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.kv_size = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
@@ -53,12 +53,12 @@ defer {
llama_free(context)
}

let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)

print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
if n_kv_req > kv_size {
print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}

examples/batched/README.md: 2 changes (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt

...

main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113

Hello my name is

examples/batched/batched.cpp: 12 changes (6 additions, 6 deletions)

@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.kv_size = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
return 1;
}

const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);

LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
return 1;
}

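The check above depends on how the batched example sizes its cache up front: the prompt is evaluated once and its KV cells are shared by every sequence, and each of the n_parallel sequences then needs room only for the tokens it generates itself, giving n_kv_req = n_prompt + (n_len - n_prompt) * n_parallel. A small standalone sketch of that arithmetic, using the numbers from the batched README above; the prompt length of 5 tokens is an assumption chosen so the total matches the printed n_kv_req:

#include <cstdio>

int main() {
    // Numbers taken from the batched README output above: n_len = 32,
    // n_parallel = 4, n_kv_req = 113, kv_size = 2048. The prompt length
    // of 5 tokens is an assumption.
    const int n_prompt   = 5;
    const int n_len      = 32;
    const int n_parallel = 4;
    const int kv_size    = 2048;

    // Prompt cells are shared by all sequences; each sequence then needs
    // its own room for the tokens it generates.
    const int n_kv_req = n_prompt + (n_len - n_prompt) * n_parallel;

    printf("n_kv_req = %d, kv_size = %d\n", n_kv_req, kv_size);
    if (n_kv_req > kv_size) {
        printf("KV cache too small: reduce n_parallel or increase kv_size\n");
        return 1;
    }
    return 0;
}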

examples/beam-search/beam-search.cpp: 4 changes (2 additions, 2 deletions)

@@ -139,8 +139,8 @@ int main(int argc, char ** argv)

std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
const size_t max_kv_size = llama_kv_size(ctx);
const size_t max_tokens_list_size = max_kv_size - 4 ;

if (tokens_list.size() > max_tokens_list_size)
{

examples/benchmark/benchmark-matmult.cpp: 24 changes (12 additions, 12 deletions)

@@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;

size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;

printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
size_t kv_size = 0;
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
kv_size += ggml_row_size(qtype, sizex*sizey);
kv_size += ggml_row_size(qtype, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
kv_size += 1024*1024*16;

printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size/1024/1024));

struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};

examples/chat-13B.bat: 2 changes (1 addition, 1 deletion)

@@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"

rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"