@@ -55,8 +55,6 @@ struct llama_server_context
 
     size_t num_tokens_predicted = 0;
     size_t n_past = 0;
-    size_t n_consumed = 0;
-    size_t n_session_consumed = 0;
     size_t n_remain = 0;
 
     std::vector<llama_token> embd;
@@ -87,7 +85,6 @@ struct llama_server_context
 
         n_remain = 0;
         n_past = 0;
-        n_consumed = 0;
     }
 
     bool loadModel(const gpt_params &params_)
@@ -105,7 +102,7 @@ struct llama_server_context
         return true;
     }
 
-    bool loadPrompt() {
+    void loadPrompt() {
         params.prompt.insert(0, 1, ' '); // always add a first space
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -135,14 +132,11 @@ struct llama_server_context
             n_past--;
         }
         has_next_token = true;
-        return true;
     }
 
     void beginCompletion()
    {
        // number of tokens to keep when resetting context
-
-
        n_remain = params.n_predict;
        llama_set_rng_seed(ctx, params.seed);
    }
@@ -196,9 +190,8 @@ struct llama_server_context
             auto n_vocab = llama_n_vocab(ctx);
 
             // Apply params.logit_bias map
-            for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++)
-            {
-                logits[it->first] += it->second;
+            for (const auto &it : params.logit_bias) {
+                logits[it.first] += it.second;
             }
 
             std::vector<llama_token_data> candidates;
@@ -275,7 +268,7 @@ struct llama_server_context
             return result;
         }
 
-        has_next_token = params.n_predict == -1 ? true : n_remain != 0;
+        has_next_token = params.n_predict == -1 || n_remain != 0;
         return result;
     }
 
@@ -334,7 +327,7 @@ struct llama_server_context
     std::vector<float> embedding(std::string content, int threads) {
         content.insert(0, 1, ' ');
         std::vector<llama_token> tokens = ::llama_tokenize(ctx, content, true);
-        if (tokens.size() > 0)
+        if (!tokens.empty())
         {
             if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads))
             {
@@ -344,7 +337,7 @@ struct llama_server_context
             }
         }
         const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
+        auto *const embeddings = llama_get_embeddings(ctx);
         std::vector<float> embeddings_(embeddings, embeddings + n_embd);
         return embeddings_;
     }
@@ -392,7 +385,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
     fprintf(stderr, "\n");
 }
 
-bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
+void server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
 {
     gpt_params default_params;
     server_params default_sparams;
@@ -534,7 +527,6 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         server_print_usage(argc, argv, default_params, default_sparams);
         exit(1);
     }
-    return true;
 }
 
 json format_generation_settings(llama_server_context &llama) {
@@ -575,12 +567,12 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         llama.stream = false;
     }
     if (!body["n_predict"].is_null()) {
-        llama.params.n_predict = body["n_predict"].get<int>();
+        llama.params.n_predict = body["n_predict"].get<int32_t>();
     } else {
         llama.params.n_predict = default_params.n_predict;
     }
     if (!body["top_k"].is_null()) {
-        llama.params.top_k = body["top_k"].get<int>();
+        llama.params.top_k = body["top_k"].get<int32_t>();
     } else {
         llama.params.top_k = default_params.top_k;
     }
@@ -600,7 +592,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         llama.params.typical_p = default_params.typical_p;
     }
     if (!body["repeat_last_n"].is_null()) {
-        llama.params.repeat_last_n = body["repeat_last_n"].get<int>();
+        llama.params.repeat_last_n = body["repeat_last_n"].get<int32_t>();
     } else {
         llama.params.repeat_last_n = default_params.repeat_last_n;
     }
@@ -625,7 +617,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         llama.params.frequency_penalty = default_params.frequency_penalty;
     }
     if (!body["mirostat"].is_null()) {
-        llama.params.mirostat = body["mirostat"].get<float>();
+        llama.params.mirostat = body["mirostat"].get<int>();
     } else {
         llama.params.mirostat = default_params.mirostat;
     }
@@ -640,17 +632,17 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         llama.params.mirostat_eta = default_params.mirostat_eta;
     }
     if (!body["penalize_nl"].is_null()) {
-        llama.params.penalize_nl = body["penalize_nl"].get<float>();
+        llama.params.penalize_nl = body["penalize_nl"].get<bool>();
     } else {
         llama.params.penalize_nl = default_params.penalize_nl;
     }
     if (!body["n_keep"].is_null()) {
-        llama.params.n_keep = body["n_keep"].get<int>();
+        llama.params.n_keep = body["n_keep"].get<int32_t>();
     } else {
         llama.params.n_keep = default_params.n_keep;
     }
     if (!body["seed"].is_null()) {
-        llama.params.seed = body["seed"].get<int>();
+        llama.params.seed = body["seed"].get<int32_t>();
     } else {
         llama.params.seed = time(NULL);
     }
@@ -717,10 +709,7 @@ int main(int argc, char **argv)
     llama_server_context llama;
     params.model = "ggml-model.bin";
 
-    if (server_params_parse(argc, argv, sparams, params) == false)
-    {
-        return 1;
-    }
+    server_params_parse(argc, argv, sparams, params);
 
     llama.verbose = sparams.verbose;
     llama.json_indent = sparams.verbose ? 4 : -1;
@@ -768,15 +757,7 @@ int main(int argc, char **argv)
             return;
         }
 
-        if (!llama.loadPrompt()) {
-            json data = {{"status", "error"}, {"reason", "Context too long."}};
-            res.set_content(
-                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
-                "application/json");
-            res.status = 400;
-            return;
-        }
-
+        llama.loadPrompt();
         llama.beginCompletion();
 
         if (!llama.stream) {