Skip to content

Commit 8eb15ae

Browse files
committed
examples : new program to verify gguf tokenizer parameters
This program verifies that a given gguf model file can tokenize all potentially valid characters. Since llama.cpp currently raises an exception when tokenization is not possible[1], this tool helps verify that valid ASCII and UTF-8 input will always be tokenized properly. [1] #2580
1 parent b8c1476 commit 8eb15ae

File tree

3 files changed

+85
-0
lines changed

3 files changed

+85
-0
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ else()
3131
endif()
3232
add_subdirectory(main)
3333
add_subdirectory(tokenize)
34+
add_subdirectory(tokenizer-verifier)
3435
add_subdirectory(parallel)
3536
add_subdirectory(perplexity)
3637
add_subdirectory(quantize)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Build configuration for the tokenizer-verifier example tool.
set(TARGET tokenizer-verifier)

add_executable(${TARGET} tokenizer-verifier.cpp)
install(TARGETS ${TARGET} RUNTIME)

# Links against the shared example helpers (common) and the core library;
# thread libs are needed transitively by llama.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#include "common.h"
2+
#include "llama.h"
3+
4+
#include <cstdio>
5+
#include <err.h>
6+
#include <string>
7+
#include <vector>
8+
9+
// Encode a single Unicode codepoint as UTF-8 into `dest`.
//
// Writes 1-4 bytes and does NOT null-terminate: callers must provide a
// zero-initialized (or otherwise terminated) buffer of at least 5 bytes.
// Returns 0 on success, 1 if `codepoint` is outside the encodable range
// [0, 0x10FFFF].
//
// NOTE: the UTF-16 surrogate range 0xD800-0xDFFF is deliberately encoded
// as well (invalid per RFC 3629) so that every codepoint value can be
// exercised against the tokenizer for simplicity.
int unicode_to_utf8(int codepoint, char *dest) {
    // https://stackoverflow.com/a/4609989 — who needs iconv?
    if (codepoint < 0) {
        // Reject negatives explicitly: they would otherwise satisfy
        // `codepoint < 0x80` and emit a garbage byte.
        return 1;
    }
    if (codepoint < 0x80) {
        // 1 byte: 0xxxxxxx
        *dest++ = (char)codepoint;
    } else if (codepoint < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        *dest++ = (char)(0xC0 | (codepoint >> 6));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        *dest++ = (char)(0xE0 | (codepoint >> 12));
        *dest++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x110000) {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        *dest++ = (char)(0xF0 | (codepoint >> 18));
        *dest++ = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        *dest++ = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        *dest++ = (char)(0x80 | (codepoint & 0x3F));
    } else {
        return 1;
    }
    return 0;
}
27+
28+
int main(int argc, char **argv) {
29+
if (argc < 2) {
30+
printf("usage: %s MODEL_PATH\n", argv[0]);
31+
return 1;
32+
}
33+
34+
const char *model_path = argv[1];
35+
36+
llama_backend_init();
37+
38+
llama_model_params model_params = llama_model_default_params();
39+
model_params.vocab_only = true;
40+
llama_model *model = llama_load_model_from_file(model_path, model_params);
41+
42+
std::vector<llama_token> tokens;
43+
44+
int failed_ascii = 0;
45+
int ascii_max = 127;
46+
for (int c = 0; c <= ascii_max; c++) {
47+
const char prompt[] = {(char)c, '\0'};
48+
try {
49+
tokens = ::llama_tokenize(model, prompt, false, true);
50+
} catch (...) {
51+
printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c);
52+
failed_ascii += 1;
53+
continue;
54+
}
55+
}
56+
printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max);
57+
58+
int failed_unicode = 0;
59+
int utf8_max = 0x10FFFF;
60+
// Now let's do all potential codepoints
61+
for (int cp = 0; cp <= utf8_max; cp++) {
62+
char buf[5] = {};
63+
if (unicode_to_utf8(cp, buf)) {
64+
printf("Impossible to encode codepoint %#x\n", cp);
65+
continue;
66+
}
67+
try {
68+
tokens = ::llama_tokenize(model, buf, false, true);
69+
} catch (...) {
70+
// printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf);
71+
failed_unicode += 1;
72+
continue;
73+
}
74+
}
75+
printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode,
76+
utf8_max);
77+
78+
return (failed_ascii != 0 || failed_unicode != 0);
79+
}

0 commit comments

Comments
 (0)