
Commit cc1fd2f

llama : add remove_space_prefix to llama_detokenize
This commit adds a new parameter to llama_detokenize that controls whether the leading space is removed from tokens that begin with a word boundary character. The motivation for this change is that when llama_server returns completion_probabilities, the tokens are detokenized and the leading space of boundary tokens is currently removed. With this change, llama_server can set remove_space_prefix to false so that the leading space is preserved. Resolves: #11728
1 parent d7b31a9 commit cc1fd2f
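For context, a minimal usage sketch of the new behavior, assuming a SentencePiece-style vocab with add_space_prefix enabled; ctx and tok_world are illustrative placeholders, not part of this commit:

    // A boundary token such as "▁world" previously always detokenized to
    // "world"; passing remove_space_prefix = false preserves the space.
    std::string stripped  = common_detokenize(ctx, { tok_world }, /* special */ true);  // "world"
    std::string preserved = common_detokenize(ctx, { tok_world }, /* special */ true,
                                              /* remove_space_prefix */ false);         // " world"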

File tree

7 files changed (+35 -24)


common/common.cpp (+5 -5)

@@ -1746,19 +1746,19 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
     return piece;
 }
 
-std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    return common_detokenize(vocab, tokens, special);
+    return common_detokenize(vocab, tokens, special, remove_space_prefix);
 }
 
-std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
common/common.h (+4 -2)

@@ -601,12 +601,14 @@ std::string common_token_to_piece(
 std::string common_detokenize(
         const struct llama_context * ctx,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 std::string common_detokenize(
         const struct llama_vocab * vocab,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 //
 // Chat template utils

common/llguidance.cpp (+2 -2)

@@ -176,12 +176,12 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
 
         llama_token token = i;
         auto dp = (char *) token_bytes + offset;
-        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false, true);
         if (size < 0) {
             GGML_ABORT("llama_detokenize failed\n");
         }
         if (size == 0) {
-            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true, true);
             if (size < 0) {
                 GGML_ABORT("llama_detokenize failed\n");
             }

examples/server/server.cpp (+1 -1)

@@ -2297,7 +2297,7 @@ struct server_context {
         for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
             result.probs.push_back({
                 cur[i].id,
-                common_detokenize(ctx, {cur[i].id}, special),
+                common_detokenize(ctx, {cur[i].id}, special, /* remove_space_prefix */ false),
                 cur[i].p
             });
         }
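The practical effect on the server output (an illustration, assuming a vocab that adds a space prefix): for a completion like "Hello world", the per-token strings in completion_probabilities previously came back as "Hello" and "world"; with remove_space_prefix set to false the boundary token is reported as " world", so concatenating the reported strings reproduces the generated text exactly.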

include/llama.h (+3 -1)

@@ -1025,14 +1025,16 @@ extern "C" {
     /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
     /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
     /// @param unparse_special If true, special tokens are rendered in the output.
+    /// @param remove_space_prefix If true, removes the leading space before tokens if they have a word boundary character.
     LLAMA_API int32_t llama_detokenize(
         const struct llama_vocab * vocab,
                const llama_token * tokens,
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
                             bool   remove_special,
-                            bool   unparse_special);
+                            bool   unparse_special,
+                            bool   remove_space_prefix);
 
     //
     // Chat templates
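A minimal sketch of calling the extended C API directly (the vocab handle and token buffer are assumed to come from the usual tokenization path; the grow-and-retry pattern on a negative return mirrors common_detokenize above):

    std::vector<char> buf(64);
    int32_t n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                                 buf.data(), (int32_t) buf.size(),
                                 /* remove_special      */ false,
                                 /* unparse_special     */ true,
                                 /* remove_space_prefix */ false);
    if (n < 0) {
        buf.resize(-n); // a negative result is the required buffer size
        n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                             buf.data(), (int32_t) buf.size(), false, true, false);
    }
    std::string text(buf.data(), n);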

src/llama-vocab.cpp (+16 -11)

@@ -1322,11 +1322,13 @@ struct llama_vocab::impl {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
 
@@ -2581,7 +2583,8 @@ int32_t llama_vocab::impl::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
+        bool unparse_special,
+        bool remove_space_prefix) const {
     if (type == LLAMA_VOCAB_TYPE_NONE) {
         return 0;
     }
@@ -2592,7 +2595,7 @@ int32_t llama_vocab::impl::detokenize(
     int32_t total = 0;
 
     // remove the leading space
-    bool remove_space = add_space_prefix;
+    bool remove_space = add_space_prefix && remove_space_prefix;
 
     if (remove_special && add_bos) {
         if (n_tokens > 0 && tokens[0] == special_bos_id) {
@@ -2991,17 +2994,18 @@ int32_t llama_vocab::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
-    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
 
-std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) const {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
@@ -3246,7 +3250,8 @@ int32_t llama_detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) {
-    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
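The behavioral core of the change is the gate on remove_space above; a sketch of its effective truth table:

    // remove_space = add_space_prefix && remove_space_prefix
    //
    //   add_space_prefix | remove_space_prefix | leading space stripped?
    //   -----------------+---------------------+------------------------
    //   true             | true                | yes (previous default)
    //   true             | false               | no  (new opt-out)
    //   false            | either              | no  (vocab adds no prefix)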

src/llama-vocab.h (+4 -2)

@@ -111,11 +111,13 @@ struct llama_vocab {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
