@@ -2325,6 +2325,11 @@ struct llama_context {
     // control vectors
     struct llama_control_vector cvec;
 
+    // caching token pieces & their decoded codepoints.
+    std::vector<std::string> token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>,
+                          llama_partial_utf8>> token_codepoints;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -13051,15 +13056,15 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -13552,14 +13557,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    if (grammar->token_codepoints.empty()) {
+    if (ctx->token_codepoints.empty()) {
         auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-        grammar->token_codepoints.resize(n_vocab);
-        grammar->token_pieces.resize(n_vocab);
+        ctx->token_codepoints.resize(n_vocab);
+        ctx->token_pieces.resize(n_vocab);
         for (llama_token id = 0; id < n_vocab; ++id) {
             const std::string piece = llama_token_to_piece(ctx, id, false);
-            grammar->token_pieces[id] = piece;
-            grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
+            ctx->token_pieces[id] = piece;
+            ctx->token_codepoints[id] = decode_utf8(piece, {0, 0});
         }
     }
 
@@ -13572,15 +13577,15 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const auto & piece = grammar->token_pieces[id];
+        const auto & piece = ctx->token_pieces[id];
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else if (grammar->partial_utf8.n_remain == 0) {
-            const auto & decoded = grammar->token_codepoints.at(id);
+            const auto & decoded = ctx->token_codepoints.at(id);
             candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
@@ -13778,11 +13783,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const auto & piece = grammar->token_pieces.at(token);
+    const auto & piece = ctx->token_pieces.at(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = grammar->partial_utf8.n_remain == 0
-        ? grammar->token_codepoints[token]
+        ? ctx->token_codepoints[token]
         : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
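Taken together, the hunks move the token-piece/codepoint cache out of `llama_grammar` and onto `llama_context`: the per-vocabulary `llama_token_to_piece` + `decode_utf8` work is paid once per context, lazily on the first grammar-constrained sample, instead of being stored, copied, and rebuilt per grammar instance. That is also why the brace-initializers in `llama_grammar_init` and `llama_grammar_copy` shrink back to three members. Below is a minimal standalone sketch of the lazy-cache pattern; `Context`, `fake_token_to_piece`, and `decode_piece` are hypothetical stand-ins for `llama_context`, `llama_token_to_piece`, and `decode_utf8`, not the llama.cpp API.

```cpp
// Standalone sketch of the lazy per-context token cache from the diff.
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct PartialUtf8 {
    uint32_t value    = 0; // bits of an unfinished UTF-8 sequence
    int      n_remain = 0; // continuation bytes still expected
};

// Hypothetical decoder (ASCII-only): returns codepoints plus carry state,
// appending the terminating 0 that the grammar code relies on.
static std::pair<std::vector<uint32_t>, PartialUtf8>
decode_piece(const std::string & piece) {
    std::vector<uint32_t> cps;
    for (unsigned char c : piece) {
        cps.push_back(c);
    }
    cps.push_back(0);
    return { cps, {} };
}

// Hypothetical stand-in for detokenization of a single token id.
static std::string fake_token_to_piece(int id) {
    return "tok" + std::to_string(id);
}

struct Context {
    // Built once per context, on the first grammar-constrained sample;
    // every grammar created or copied afterwards reuses it.
    std::vector<std::string>                                   token_pieces;
    std::vector<std::pair<std::vector<uint32_t>, PartialUtf8>> token_codepoints;
};

static void ensure_token_cache(Context & ctx, int n_vocab) {
    if (!ctx.token_codepoints.empty()) {
        return; // already populated; later calls are no-ops
    }
    ctx.token_pieces.resize(n_vocab);
    ctx.token_codepoints.resize(n_vocab);
    for (int id = 0; id < n_vocab; ++id) {
        ctx.token_pieces[id]     = fake_token_to_piece(id);
        ctx.token_codepoints[id] = decode_piece(ctx.token_pieces[id]);
    }
}

int main() {
    Context ctx;
    ensure_token_cache(ctx, 4); // pays the decode cost once
    ensure_token_cache(ctx, 4); // no-op on subsequent samples
    std::cout << ctx.token_pieces[2] << " -> "
              << ctx.token_codepoints[2].first.size() - 1 << " codepoints\n";
}
```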
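One subtlety the hunks in `llama_sample_grammar` and `llama_grammar_accept_token` preserve: the cached entries are decoded with a fresh `{0, 0}` state, so they are only valid while `grammar->partial_utf8.n_remain == 0`. When the previous token ended mid-UTF-8 sequence, both call sites still fall back to `decode_utf8(piece, grammar->partial_utf8)` so the carried-over bytes are stitched to the new piece.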