server: rename legacy --ctx-size to --kv-size option #5546

Status: Closed
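
Taken together, the diff renames the user-facing context size into a KV cache size: the CLI flag -c/--ctx-size becomes -kv/--kv-size (the old spelling stays as a deprecated alias), the gpt_params/llama_context_params field n_ctx becomes kv_size, and the accessor llama_n_ctx becomes llama_kv_size. A minimal sketch of what calling code looks like after the rename; the model path and sizes are placeholders, backend init and most error handling are trimmed, and the exact signatures should be checked against the headers in this PR:

#include <cstdio>
#include "llama.h"

int main() {
    // Assumed model path; point this at a local GGUF file.
    const char * model_path = "models/7B/ggml-model-q4_0.gguf";

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.kv_size = 2048;  // previously: cparams.n_ctx = 2048;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // The accessor is renamed as well (previously llama_n_ctx).
    printf("kv_size = %d\n", (int) llama_kv_size(ctx));

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}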

README.md: 4 changes (2 additions, 2 deletions)

@@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: kv_size = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
@@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB

system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0


Building a website can be done in 10 simple steps:

common/common.cpp: 18 changes (13 additions, 5 deletions)

@@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
sparams.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.kv_size = std::stoi(argv[i]);
fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
} else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
@@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
@@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();

cparams.n_ctx = params.n_ctx;
cparams.kv_size = params.kv_size;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
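
The parsing change above keeps -c/--ctx-size working as a deprecated alias that writes to the new kv_size field and warns on stderr, while -kv/--kv-size/--kv_size becomes the canonical spelling. A stripped-down sketch of that alias pattern, written as a standalone toy parser rather than the repository's actual gpt_params_parse_ex:

#include <cstdio>
#include <string>

struct toy_params {
    int kv_size = 512; // same default as gpt_params above
};

// Toy parser illustrating the deprecated-alias pattern from common.cpp:
// the legacy -c/--ctx-size spelling still works, but it writes to the new
// kv_size field and prints a deprecation warning; -kv/--kv-size is canonical.
static bool parse_args(int argc, char ** argv, toy_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];
        if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) return false;
            params.kv_size = std::stoi(argv[i]);
            fprintf(stderr, "warning: -c,--ctx-size is deprecated, use --kv-size instead\n");
        } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
            if (++i >= argc) return false;
            params.kv_size = std::stoi(argv[i]);
        }
    }
    return true;
}

int main(int argc, char ** argv) {
    toy_params params;
    if (!parse_args(argc, argv, params)) {
        fprintf(stderr, "error: missing value for option\n");
        return 1;
    }
    printf("kv_size = %d\n", params.kv_size);
    return 0;
}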

common/common.h: 2 changes (1 addition, 1 deletion)

@@ -50,7 +50,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t kv_size = 512; // KV Cache size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding

examples/Miku.sh: 4 changes (2 additions, 2 deletions)

@@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"

# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
KV_SIZE="${KV_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"

GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--kv_size "$KV_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647

examples/alpaca.sh: 2 changes (1 addition, 1 deletion)

@@ -10,7 +10,7 @@ cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

examples/baby-llama/baby-llama.cpp: 42 changes (21 additions, 21 deletions)

@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
// Vcur shape [n_embd, N, 1, 1]
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(

assert_shape_3d(Vcur, N, n_embd, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
// v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
0, 2, 1, 3);
assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// K shape [n_embd/n_head, n_past + N, n_head, n_batch]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

// split cached V into n_head heads
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
struct ggml_tensor * V =
ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
cur)),
n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -1470,15 +1470,15 @@ int main(int argc, char ** argv) {
/*
struct llama_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.kv_size = 64;
// model.hparams.n_embd = 128;
// model.hparams.n_mult = 2;
// model.hparams.n_head = 8;
// model.hparams.n_layer = 6;
// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;

model_lora.hparams.n_vocab = 16;
model_lora.hparams.n_ctx = 32;
model_lora.hparams.kv_size = 32;
model_lora.hparams.n_embd = 256;
model_lora.hparams.n_mult = 2;
model_lora.hparams.n_head = 16;

examples/batched-bench/batched-bench.cpp: 2 changes (1 addition, 1 deletion)

@@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.kv_size = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;

examples/batched.swift/Sources/main.swift: 10 changes (5 additions, 5 deletions)

@@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par

var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.kv_size = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
@@ -53,12 +53,12 @@ defer {
llama_free(context)
}

let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)

print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
if n_kv_req > kv_size {
print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}

examples/batched/README.md: 2 changes (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt

...

main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113

Hello my name is

examples/batched/batched.cpp: 12 changes (6 additions, 6 deletions)

@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.kv_size = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
return 1;
}

const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);

LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
return 1;
}

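The check above depends on how the batched example sizes its cache up front: the prompt is evaluated once and its KV cells are shared by every sequence, and each of the n_parallel sequences then needs room only for the tokens it generates itself, giving n_kv_req = n_prompt + (n_len - n_prompt) * n_parallel. A small standalone sketch of that arithmetic, using the numbers from the batched README above; the prompt length of 5 tokens is an assumption chosen so the total matches the printed n_kv_req:

#include <cstdio>

int main() {
    // Numbers taken from the batched README output above: n_len = 32,
    // n_parallel = 4, n_kv_req = 113, kv_size = 2048. The prompt length
    // of 5 tokens is an assumption.
    const int n_prompt   = 5;
    const int n_len      = 32;
    const int n_parallel = 4;
    const int kv_size    = 2048;

    // Prompt cells are shared by all sequences; each sequence then needs
    // its own room for the tokens it generates.
    const int n_kv_req = n_prompt + (n_len - n_prompt) * n_parallel;

    printf("n_kv_req = %d, kv_size = %d\n", n_kv_req, kv_size);
    if (n_kv_req > kv_size) {
        printf("KV cache too small: reduce n_parallel or increase kv_size\n");
        return 1;
    }
    return 0;
}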

examples/beam-search/beam-search.cpp: 4 changes (2 additions, 2 deletions)

@@ -139,8 +139,8 @@ int main(int argc, char ** argv)

std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
const size_t max_kv_size = llama_kv_size(ctx);
const size_t max_tokens_list_size = max_kv_size - 4 ;

if (tokens_list.size() > max_tokens_list_size)
{

examples/benchmark/benchmark-matmult.cpp: 24 changes (12 additions, 12 deletions)

@@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;

size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;

printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
size_t kv_size = 0;
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
kv_size += ggml_row_size(qtype, sizex*sizey);
kv_size += ggml_row_size(qtype, sizex*sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
kv_size += 1024*1024*16;

printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size/1024/1024));

struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};

examples/chat-13B.bat: 2 changes (1 addition, 1 deletion)

@@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"

rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"