@@ -2345,19 +2345,21 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2345
2345
2346
2346
for (const auto & fragment : fragment_buffer) {
2347
2347
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2348
- auto raw_text = fragment. raw_text . substr (fragment. offset , fragment. length ) ;
2348
+ std::string text ;
2349
2349
2350
2350
// prefix with space if previous is special
2351
2351
if (tokenizer_add_space_prefix && is_prev_special) {
2352
- raw_text = " " + raw_text ;
2352
+ text = ' ' ;
2353
2353
}
2354
2354
2355
+ text += fragment.raw_text .substr (fragment.offset , fragment.length );
2356
+
2355
2357
#ifdef PRETOKENIZERDEBUG
2356
- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2358
+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
2357
2359
#endif
2358
- llama_escape_whitespace (raw_text );
2360
+ llama_escape_whitespace (text );
2359
2361
llm_tokenizer_spm_session session (vocab);
2360
- session.tokenize (raw_text , output);
2362
+ session.tokenize (text , output);
2361
2363
is_prev_special = false ;
2362
2364
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2363
2365
output.push_back (fragment.token );
@@ -2387,12 +2389,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2387
2389
}
2388
2390
for (const auto & fragment : fragment_buffer) {
2389
2391
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2390
- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2392
+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
2391
2393
2392
2394
#ifdef PRETOKENIZERDEBUG
2393
- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2395
+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
2394
2396
#endif
2395
- session.tokenize (raw_text , output);
2397
+ session.tokenize (text , output);
2396
2398
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2397
2399
session.append (fragment.token , output);
2398
2400
}
@@ -2414,12 +2416,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2414
2416
2415
2417
for (const auto & fragment : fragment_buffer) {
2416
2418
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2417
- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2419
+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
2418
2420
2419
2421
#ifdef PRETOKENIZERDEBUG
2420
- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2422
+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
2421
2423
#endif
2422
- session.tokenize (raw_text , output);
2424
+ session.tokenize (text , output);
2423
2425
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2424
2426
output.push_back (fragment.token );
2425
2427
}
@@ -2440,11 +2442,11 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2440
2442
2441
2443
for (const auto & fragment : fragment_buffer) {
2442
2444
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2443
- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2445
+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
2444
2446
#ifdef PRETOKENIZERDEBUG
2445
- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2447
+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
2446
2448
#endif
2447
- session.tokenize (raw_text , output);
2449
+ session.tokenize (text , output);
2448
2450
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2449
2451
output.push_back (fragment.token );
2450
2452
}
@@ -2467,13 +2469,13 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
2467
2469
llm_tokenizer_rwkv_session session (vocab, *static_cast <const llm_tokenizer_rwkv *>(tokenizer.get ()));
2468
2470
for (const auto & fragment : fragment_buffer) {
2469
2471
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2470
- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2472
+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
2471
2473
2472
2474
#ifdef PRETOKENIZERDEBUG
2473
- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2475
+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
2474
2476
#endif
2475
2477
2476
- session.tokenize (raw_text , output);
2478
+ session.tokenize (text , output);
2477
2479
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2478
2480
output.push_back (fragment.token );
2479
2481
}
0 commit comments