
Commit c8e172a

rename n_ctx to kv_size

1 parent 12addf2 commit c8e172a


48 files changed: +404, -394 lines

README.md (+2, -2)

@@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
 llm_load_print_meta: n_vocab = 32000
 llm_load_print_meta: n_merges = 0
 llm_load_print_meta: n_ctx_train = 4096
-llm_load_print_meta: n_ctx = 512
+llm_load_print_meta: kv_size = 512
 llm_load_print_meta: n_embd = 5120
 llm_load_print_meta: n_head = 40
 llm_load_print_meta: n_head_kv = 40
@@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB
 
 system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
 sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
+generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0
 
 
 Building a website can be done in 10 simple steps:

common/common.cpp (+13, -5)

@@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
             sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.kv_size = std::stoi(argv[i]);
+            fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
+        } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_ctx = std::stoi(argv[i]);
+            params.kv_size = std::stoi(argv[i]);
         } else if (arg == "--grp-attn-n" || arg == "-gan") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -bf FNAME, --binary-file FNAME\n");
     printf(" binary file containing multiple choice tasks.\n");
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
+    printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
     printf(" (default: %s)\n", sampler_type_names.c_str());
@@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
+    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
     printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
     printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
     printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto cparams = llama_context_default_params();
 
-    cparams.n_ctx = params.n_ctx;
+    cparams.kv_size = params.kv_size;
     cparams.n_batch = params.n_batch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
     fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
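
Taken together, the common.cpp changes route both the deprecated -c/--ctx-size flag and the new -kv/--kv-size flag into the same renamed field, which llama_context_params_from_gpt_params() then copies into the context parameters. A minimal sketch of that flow, assuming the post-rename API from this commit (gpt_params::kv_size, llama_context_params::kv_size); the helper name make_ctx_params is hypothetical:

    // Sketch only: condenses what llama_context_params_from_gpt_params() does after this commit.
    #include "common.h"
    #include "llama.h"

    static llama_context_params make_ctx_params(const gpt_params & params) {
        llama_context_params cparams = llama_context_default_params();
        cparams.kv_size   = params.kv_size;   // was: cparams.n_ctx = params.n_ctx
        cparams.n_batch   = params.n_batch;
        cparams.n_threads = params.n_threads;
        return cparams;
    }

Both command-line spellings land in the same place: -c 2048 still sets params.kv_size = 2048 (with a deprecation warning on stderr), while -kv 2048 sets it without the warning.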

common/common.h (+1, -1)

@@ -50,7 +50,7 @@ struct gpt_params {
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
+    int32_t kv_size = 512; // KV Cache size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 8; // number of tokens to draft during speculative decoding

examples/Miku.sh (+2, -2)

@@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"
 
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
-CTX_SIZE="${CTX_SIZE:-4096}"
+KV_SIZE="${KV_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 
 GEN_OPTIONS=(--batch_size 1024
---ctx_size "$CTX_SIZE"
+--kv_size "$KV_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647

examples/alpaca.sh (+1, -1)

@@ -10,7 +10,7 @@ cd ..
 ./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
     --color \
     -f ./prompts/alpaca.txt \
-    --ctx_size 2048 \
+    --kv_size 2048 \
     -n -1 \
     -ins -b 256 \
     --top_k 10000 \

examples/baby-llama/baby-llama.cpp (+21, -21)

@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
     // Vcur shape [n_embd, N, 1, 1]
     struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
 
-    // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-    // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+    // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+    // kv_self.v shape [n_embd * kv_size * n_layer, 1]
     // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
     // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
 
     /* {
-        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
         struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-            ( n_ctx)*ggml_element_size(kv_self.v),
-            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            ( kv_size)*ggml_element_size(kv_self.v),
+            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
         // important: storing RoPE-ed version of K in the KV cache!
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
         Qcur,
         0, 2, 1, 3);
 
-    // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+    // kv_self.k shape [n_embd * kv_size * n_layer, 1]
     // K shape [n_embd/n_head, n_past + N, n_head, 1]
     struct ggml_tensor * K =
         ggml_permute(ctx0,
@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(
 
     assert_shape_3d(Vcur, N, n_embd, n_batch);
 
-    // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-    // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+    // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
+    // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
     // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
     // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
 
     /* {
-        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
         struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-            ( n_ctx)*ggml_element_size(kv_self.v),
-            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            ( kv_size)*ggml_element_size(kv_self.v),
+            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
         // important: storing RoPE-ed version of K in the KV cache!
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
         0, 2, 1, 3);
     assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
 
-    // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+    // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
     // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
     struct ggml_tensor * K =
         ggml_permute(ctx0,
@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
     assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
 
     // split cached V into n_head heads
-    // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+    // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
     // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
     struct ggml_tensor * V =
         ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
         cur)),
         n_embd, N)));
 
-    // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-    // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+    // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+    // kv_self.v shape [n_embd * kv_size * n_layer, 1]
     // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
     // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
 
     /* {
-        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+        struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
         struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-            ( n_ctx)*ggml_element_size(kv_self.v),
-            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            ( kv_size)*ggml_element_size(kv_self.v),
+            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
         // important: storing RoPE-ed version of K in the KV cache!
         ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
         Qcur,
         0, 2, 1, 3);
 
-    // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+    // kv_self.k shape [n_embd * kv_size * n_layer, 1]
     // K shape [n_embd/n_head, n_past + N, n_head, 1]
     struct ggml_tensor * K =
         ggml_permute(ctx0,
@@ -1470,15 +1470,15 @@ int main(int argc, char ** argv) {
     /*
     struct llama_model_lora model_lora;
     // model.hparams.n_vocab = 6;
-    // model.hparams.n_ctx = 64;
+    // model.hparams.kv_size = 64;
     // model.hparams.n_embd = 128;
     // model.hparams.n_mult = 2;
     // model.hparams.n_head = 8;
     // model.hparams.n_layer = 6;
     // model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
 
     model_lora.hparams.n_vocab = 16;
-    model_lora.hparams.n_ctx = 32;
+    model_lora.hparams.kv_size = 32;
     model_lora.hparams.n_embd = 256;
     model_lora.hparams.n_mult = 2;
     model_lora.hparams.n_head = 16;

examples/batched-bench/batched-bench.cpp (+1, -1)

@@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
 
     ctx_params.seed = 1234;
-    ctx_params.n_ctx = n_kv_max;
+    ctx_params.kv_size = n_kv_max;
     ctx_params.n_batch = 512;
     ctx_params.mul_mat_q = mmq;
 
examples/batched.swift/Sources/main.swift (+5, -5)

@@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par
 
 var context_params = llama_context_default_params()
 context_params.seed = 1234
-context_params.n_ctx = n_kv_req
+context_params.kv_size = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
@@ -53,12 +53,12 @@ defer {
     llama_free(context)
 }
 
-let n_ctx = llama_n_ctx(context)
+let kv_size = llama_kv_size(context)
 
-print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
+print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
 
-if n_kv_req > n_ctx {
-    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
+if n_kv_req > kv_size {
+    print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
     exit(1)
 }
 
examples/batched/README.md (+1, -1)

@@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt
 
 ...
 
-main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
+main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113
 
 Hello my name is
 
examples/batched/batched.cpp (+6, -6)

@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
 
     ctx_params.seed = 1234;
-    ctx_params.n_ctx = n_kv_req;
+    ctx_params.kv_size = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);
 
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
-        LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+    if (n_kv_req > kv_size) {
+        LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
+        LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
         return 1;
     }
 
examples/beam-search/beam-search.cpp (+2, -2)

@@ -139,8 +139,8 @@ int main(int argc, char ** argv)
 
     std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
 
-    const size_t max_context_size = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
+    const size_t max_kv_size = llama_kv_size(ctx);
+    const size_t max_tokens_list_size = max_kv_size - 4 ;
 
     if (tokens_list.size() > max_tokens_list_size)
     {
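
The beam-search change shows the guard pattern that recurs across the examples: query the renamed accessor and refuse to run anything that cannot fit in the cache. A condensed sketch, assuming llama_kv_size() as introduced by this commit and an already-initialized ctx and params:

    // Sketch: reject prompts that cannot fit in the KV cache (names follow this commit).
    const size_t max_kv_size          = llama_kv_size(ctx);   // was llama_n_ctx(ctx)
    const size_t max_tokens_list_size = max_kv_size - 4;      // leave a little headroom, as beam-search does
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
    if (tokens_list.size() > max_tokens_list_size) {
        fprintf(stderr, "error: prompt too long (%zu tokens, max %zu)\n",
                tokens_list.size(), max_tokens_list_size);
        return 1;
    }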

examples/benchmark/benchmark-matmult.cpp (+12, -12)

@@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
     // TODO: perform the bench for all types or for a user specified type
     const ggml_type qtype = GGML_TYPE_Q4_1;
 
-    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype, sizex*sizey);
-    ctx_size += ggml_row_size(qtype, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += 1024*1024*16;
-
-    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
+    size_t kv_size = 0;
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizez);
+    kv_size += ggml_row_size(qtype, sizex * sizey);
+    kv_size += ggml_row_size(qtype, sizex * sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
+    kv_size += 1024 * 1024 * 16;
+
+    printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size / 1024 / 1024));
 
     struct ggml_init_params params = {
-        /*.mem_size =*/ ctx_size,
+        /*.mem_size =*/ kv_size,
         /*.mem_buffer =*/ NULL,
        /* no_alloc =*/ 0
    };
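
In benchmark-matmult the renamed variable is the ggml allocation budget for the benchmark's tensors rather than a per-token cache, so the arithmetic is unchanged by the rename: sum ggml_row_size() over the tensors to be created, add some slack, and pass the total to ggml_init(). A minimal sketch under those assumptions (the dimensions are illustrative):

    // Sketch: size a ggml context the way benchmark-matmult does after the rename.
    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t sizex = 4096, sizey = 4096;     // illustrative matrix dimensions

        size_t kv_size = 0;                           // renamed from ctx_size in this commit
        kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);  // one F32 matrix
        kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);  // a second F32 matrix
        kv_size += 1024 * 1024 * 16;                  // slack for ggml object bookkeeping

        printf("Allocating %zu bytes (%zu MB)\n", kv_size, kv_size / 1024 / 1024);

        struct ggml_init_params params = {
            /*.mem_size   =*/ kv_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        // ... create tensors and build graphs against ctx here ...
        ggml_free(ctx);
        return 0;
    }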

examples/chat-13B.bat (+1, -1)

@@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
 rem if not defined N_THREAD set "N_THREAD=8"
 rem Number of tokens to predict (made it larger than default because we want a long interaction)
 if not defined N_PREDICTS set "N_PREDICTS=2048"
-if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
 
 rem Default main script paths
 set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
