
Commit aeeb942

ggerganov and slaren committed
vocab : minor tokenization optimizations (#11160)
ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>
1 parent a857dc5 commit aeeb942

File tree

1 file changed: +19 -17 lines changed


src/llama-vocab.cpp

+19 -17
@@ -2345,19 +2345,21 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text;
 
                         // prefix with space if previous is special
                         if (tokenizer_add_space_prefix && is_prev_special) {
-                            raw_text = " " + raw_text;
+                            text = ' ';
                         }
 
+                        text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        llama_escape_whitespace(raw_text);
+                        llama_escape_whitespace(text);
                         llm_tokenizer_spm_session session(vocab);
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                         is_prev_special = false;
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -2387,12 +2389,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                 }
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         session.append(fragment.token, output);
                     }
@@ -2414,12 +2416,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -2440,11 +2442,11 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
@@ -2467,13 +2469,13 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                 llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
 
-                        session.tokenize(raw_text, output);
+                        session.tokenize(text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
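
Note on the change: the substantive optimization is in the first (SPM) hunk, where the optional leading space is now written into the result string before the fragment is appended, instead of extracting the fragment first and then prepending it with " " + raw_text, which builds an extra temporary string. The remaining hunks only rename raw_text to text and spell out the std::string type. Below is a minimal standalone sketch of the pattern; the Fragment struct and the two build_* helpers are hypothetical stand-ins for illustration, not llama.cpp APIs.

// sketch.cpp -- illustrates the string-building pattern from this commit.
// Fragment is a made-up stand-in for the entries in fragment_buffer.
#include <iostream>
#include <string>

struct Fragment {
    std::string raw_text; // full source buffer
    size_t      offset;   // start of this fragment within raw_text
    size_t      length;   // length of this fragment
};

// Old approach: extract the fragment first, then prepend the space.
// The concatenation " " + raw_text creates an extra temporary string
// and copies the fragment a second time.
static std::string build_old(const Fragment & f, bool add_space) {
    auto raw_text = f.raw_text.substr(f.offset, f.length);
    if (add_space) {
        raw_text = " " + raw_text;
    }
    return raw_text;
}

// New approach: seed the result with the optional space, then append the
// fragment once -- no intermediate concatenation temporary.
static std::string build_new(const Fragment & f, bool add_space) {
    std::string text;
    if (add_space) {
        text = ' ';
    }
    text += f.raw_text.substr(f.offset, f.length);
    return text;
}

int main() {
    const Fragment f{"Hello world", 6, 5}; // the fragment covers "world"
    std::cout << "old: '" << build_old(f, true) << "'\n"; // prints: old: ' world'
    std::cout << "new: '" << build_new(f, true) << "'\n"; // prints: new: ' world'
    return 0;
}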
