Skip to content

Commit a6fc554

Browse files
authored
llama : restore prefix space in llama tokenizer (#4081)
1 parent 1cf2850 commit a6fc554

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

llama.cpp

+4 -1
Original file line number | Diff line number | Diff line change
@@ -6283,7 +6283,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
6283 6283
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
6284 6284
// and passing 'add space prefix' as bool argument
6285 6285
//
6286-
auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
6286+
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
6287+
if (&fragment == &fragment_buffer.front()) {
6288+
raw_text = " " + raw_text; // prefix with space if the first token is not special
6289+
}
6287 6290

6288 6291
#ifdef PRETOKENIZERDEBUG
6289 6292
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());

0 commit comments

Comments
 (0)