Skip to content

Commit 8eb15ae

Browse files
committed
examples : new program to verify gguf tokenizer parameters
This program verifies that a given gguf model file can tokenize all potentially valid characters. Since llama.cpp currently raises an exception when tokenization is not possible[1], this tool helps verify that valid ASCII and UTF-8 input will always be tokenized properly. [1] #2580
1 parent b8c1476 commit 8eb15ae

File tree

3 files changed

+85
-0
lines changed

3 files changed

+85
-0
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ else()
3131
endif()
3232
add_subdirectory(main)
3333
add_subdirectory(tokenize)
34+
add_subdirectory(tokenizer-verifier)
3435
add_subdirectory(parallel)
3536
add_subdirectory(perplexity)
3637
add_subdirectory(quantize)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Build configuration for the tokenizer-verifier example tool.
set(TARGET tokenizer-verifier)

add_executable(${TARGET} tokenizer-verifier.cpp)
install(TARGETS ${TARGET} RUNTIME)

# Links against the shared example helpers (common) and the core library;
# thread libs are needed transitively by llama.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#include "common.h"
2+
#include "llama.h"
3+
4+
#include <cstdio>
5+
#include <err.h>
6+
#include <string>
7+
#include <vector>
8+
9+
// Encode a single Unicode codepoint as UTF-8 into `dest`.
//
// Writes 1-4 bytes and does NOT null-terminate: callers must provide a
// zero-initialized (or otherwise terminated) buffer of at least 5 bytes.
// Returns 0 on success, 1 if `codepoint` is outside the encodable range
// [0, 0x10FFFF].
//
// NOTE: the UTF-16 surrogate range 0xD800-0xDFFF is deliberately encoded
// as well (invalid per RFC 3629) so that every codepoint value can be
// exercised against the tokenizer for simplicity.
int unicode_to_utf8(int codepoint, char *dest) {
    // https://stackoverflow.com/a/4609989 — who needs iconv?
    if (codepoint < 0) {
        // Reject negatives explicitly: they would otherwise satisfy
        // `codepoint < 0x80` and emit a garbage byte.
        return 1;
    }
    if (codepoint < 0x80) {
        // 1 byte: 0xxxxxxx
        *dest++ = (char)codepoint;
    } else if (codepoint < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        *dest++ = (char)(0xC0 | (codepoint >> 6));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        *dest++ = (char)(0xE0 | (codepoint >> 12));
        *dest++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x110000) {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        *dest++ = (char)(0xF0 | (codepoint >> 18));
        *dest++ = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        *dest++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else {
        return 1;
    }
    return 0;
}
27+
28+
int main(int argc, char **argv) {
29+
if (argc < 2) {
30+
printf("usage: %s MODEL_PATH\n", argv[0]);
31+
return 1;
32+
}
33+
34+
const char *model_path = argv[1];
35+
36+
llama_backend_init();
37+
38+
llama_model_params model_params = llama_model_default_params();
39+
model_params.vocab_only = true;
40+
llama_model *model = llama_load_model_from_file(model_path, model_params);
41+
42+
std::vector<llama_token> tokens;
43+
44+
int failed_ascii = 0;
45+
int ascii_max = 127;
46+
for (int c = 0; c <= ascii_max; c++) {
47+
const char prompt[] = {(char)c, '\0'};
48+
try {
49+
tokens = ::llama_tokenize(model, prompt, false, true);
50+
} catch (...) {
51+
printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c);
52+
failed_ascii += 1;
53+
continue;
54+
}
55+
}
56+
printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max);
57+
58+
int failed_unicode = 0;
59+
int utf8_max = 0x10FFFF;
60+
// Now let's do all potential codepoints
61+
for (int cp = 0; cp <= utf8_max; cp++) {
62+
char buf[5] = {};
63+
if (unicode_to_utf8(cp, buf)) {
64+
printf("Impossible to encode codepoint %#x\n", cp);
65+
continue;
66+
}
67+
try {
68+
tokens = ::llama_tokenize(model, buf, false, true);
69+
} catch (...) {
70+
// printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf);
71+
failed_unicode += 1;
72+
continue;
73+
}
74+
}
75+
printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode,
76+
utf8_max);
77+
78+
return (failed_ascii != 0 || failed_unicode != 0);
79+
}

0 commit comments

Comments
 (0)