
Commit cc1fd2f

llama : add remove_space_prefix to llama_detokenize
This commit adds a new parameter to llama_detokenize that controls whether the leading space is removed from tokens that begin with a word boundary character. The motivation for this change is that when llama_server returns completion_probabilities, the tokens are detokenized and the leading space of boundary tokens is currently removed. With this change, llama_server can set remove_space_prefix to false so that the leading space is preserved. Resolves: #11728
1 parent d7b31a9 commit cc1fd2f
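For context, a minimal usage sketch of the new behavior, assuming a SentencePiece-style vocab with add_space_prefix enabled; ctx and tok_world are illustrative placeholders, not part of this commit:

    // A boundary token such as "▁world" previously always detokenized to
    // "world"; passing remove_space_prefix = false preserves the space.
    std::string stripped  = common_detokenize(ctx, { tok_world }, /* special */ true);  // "world"
    std::string preserved = common_detokenize(ctx, { tok_world }, /* special */ true,
                                              /* remove_space_prefix */ false);         // " world"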

File tree

7 files changed (+35 -24)


common/common.cpp (+5 -5)

@@ -1746,19 +1746,19 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
     return piece;
 }
 
-std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    return common_detokenize(vocab, tokens, special);
+    return common_detokenize(vocab, tokens, special, remove_space_prefix);
 }
 
-std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
common/common.h (+4 -2)

@@ -601,12 +601,14 @@ std::string common_token_to_piece(
 std::string common_detokenize(
         const struct llama_context * ctx,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 std::string common_detokenize(
         const struct llama_vocab * vocab,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 //
 // Chat template utils

common/llguidance.cpp (+2 -2)

@@ -176,12 +176,12 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
 
         llama_token token = i;
         auto dp = (char *) token_bytes + offset;
-        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false, true);
         if (size < 0) {
             GGML_ABORT("llama_detokenize failed\n");
         }
         if (size == 0) {
-            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true, true);
             if (size < 0) {
                 GGML_ABORT("llama_detokenize failed\n");
             }

examples/server/server.cpp (+1 -1)

@@ -2297,7 +2297,7 @@ struct server_context {
         for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
             result.probs.push_back({
                 cur[i].id,
-                common_detokenize(ctx, {cur[i].id}, special),
+                common_detokenize(ctx, {cur[i].id}, special, /* remove_space_prefix */ false),
                 cur[i].p
             });
         }
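The practical effect on the server output (an illustration, assuming a vocab that adds a space prefix): for a completion like "Hello world", the per-token strings in completion_probabilities previously came back as "Hello" and "world"; with remove_space_prefix set to false the boundary token is reported as " world", so concatenating the reported strings reproduces the generated text exactly.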

include/llama.h (+3 -1)

@@ -1025,14 +1025,16 @@ extern "C" {
     /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
     /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
     /// @param unparse_special If true, special tokens are rendered in the output.
+    /// @param remove_space_prefix If true, removes the leading space before tokens if they have a word boundary character.
     LLAMA_API int32_t llama_detokenize(
         const struct llama_vocab * vocab,
                const llama_token * tokens,
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
                             bool   remove_special,
-                            bool   unparse_special);
+                            bool   unparse_special,
+                            bool   remove_space_prefix);
 
     //
     // Chat templates
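A minimal sketch of calling the extended C API directly (the vocab handle and token buffer are assumed to come from the usual tokenization path; the grow-and-retry pattern on a negative return mirrors common_detokenize above):

    std::vector<char> buf(64);
    int32_t n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                                 buf.data(), (int32_t) buf.size(),
                                 /* remove_special      */ false,
                                 /* unparse_special     */ true,
                                 /* remove_space_prefix */ false);
    if (n < 0) {
        buf.resize(-n); // a negative result is the required buffer size
        n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                             buf.data(), (int32_t) buf.size(), false, true, false);
    }
    std::string text(buf.data(), n);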

src/llama-vocab.cpp (+16 -11)

@@ -1322,11 +1322,13 @@ struct llama_vocab::impl {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
 
@@ -2581,7 +2583,8 @@ int32_t llama_vocab::impl::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
+        bool unparse_special,
+        bool remove_space_prefix) const {
     if (type == LLAMA_VOCAB_TYPE_NONE) {
         return 0;
     }
@@ -2592,7 +2595,7 @@ int32_t llama_vocab::impl::detokenize(
     int32_t total = 0;
 
     // remove the leading space
-    bool remove_space = add_space_prefix;
+    bool remove_space = add_space_prefix && remove_space_prefix;
 
     if (remove_special && add_bos) {
         if (n_tokens > 0 && tokens[0] == special_bos_id) {
@@ -2991,17 +2994,18 @@ int32_t llama_vocab::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
-    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
 
-std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) const {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
@@ -3246,7 +3250,8 @@ int32_t llama_detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) {
-    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
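The behavioral core of the change is the gate on remove_space above; a sketch of its effective truth table:

    // remove_space = add_space_prefix && remove_space_prefix
    //
    //   add_space_prefix | remove_space_prefix | leading space stripped?
    //   -----------------+---------------------+------------------------
    //   true             | true                | yes (previous default)
    //   true             | false               | no  (new opt-out)
    //   false            | either              | no  (vocab adds no prefix)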

src/llama-vocab.h (+4 -2)

@@ -111,11 +111,13 @@ struct llama_vocab {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
