
Commit 98dc454

Add support for loading merges.txt
Add --padvocab option to convert.py
Other minor cleanups

1 parent 96981f3

3 files changed: +62 -8 lines changed

convert.py (+29 -7)
@@ -785,20 +785,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                 break
             yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
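The padding branch only fires when pad_vocab is set and the model expects more tokens than the tokenizer metadata provides; the gap is filled with numbered placeholder tokens. A minimal standalone sketch of the naming scheme (make_pad_tokens is a hypothetical helper, not part of the commit):

    def make_pad_tokens(model_n_vocab: int, tokenizer_n_vocab: int) -> list[str]:
        # Placeholders are 1-based and zero-padded to five digits,
        # matching the <dummy{i:05}> format string used above.
        pad_count = model_n_vocab - tokenizer_n_vocab
        return [f'<dummy{i:05}>' for i in range(1, pad_count + 1)]

    print(make_pad_tokens(32032, 32000))  # ['<dummy00001>', ..., '<dummy00032>']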

@@ -875,8 +884,12 @@ def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -903,8 +916,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
             return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out  : Path, ftype: GGMLFileType, params: Params,
+        model      : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess  : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab  : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1124,6 +1143,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
     args = parser.parse_args(args_in)
     if args.dump_single:
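With the flag wired up below, a conversion that previously aborted with the vocab-size mismatch message can be retried as, for example, python convert.py models/my-model --padvocab (the model path here is hypothetical).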
@@ -1171,7 +1191,8 @@ def main(args_in: list[str] | None = None) -> None:
                                              load_merges = args.vocabtype == 'bpe',
                                              n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1194,7 +1215,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
 
 
gguf-py/gguf/gguf.py (+32 -0)
@@ -1004,6 +1004,35 @@ def __init__(
     def _load(self, path: Path) -> None:
         if not self._try_load_from_tokenizer_json(path):
             self._try_load_from_config_json(path)
+        if self.load_merges and len(self.merges) == 0:
+            self._try_load_merges_txt(path)
+
+    def _try_load_merges_txt(self, path: Path) -> bool:
+        merges_file = path / 'merges.txt'
+        if not merges_file.is_file():
+            return False
+        with open(merges_file, 'r') as fp:
+            first_line = next(fp, '').strip()
+            if not first_line.startswith('#'):
+                fp.seek(0)
+                line_num = 0
+            else:
+                line_num = 1
+            merges = []
+            for line in fp:
+                line_num += 1
+                line = line.strip()
+                if len(line) == 0:
+                    continue
+                parts = line.split(None, 3)
+                if len(parts) != 2:
+                    print(f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
+                          file = sys.stderr)
+                    continue
+                merges.append(f'{parts[0]} {parts[1]}')
+        self.merges = merges
+        return True
+
 
     def _set_special_token(self, typ: str, tid: Any):
         if not isinstance(tid, int) or tid < 0:
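For reference, merges.txt (as written by GPT-2-style BPE tokenizers) may begin with a '#version' header, followed by one whitespace-separated merge pair per line. A small runnable sketch of the parsing rules above, using made-up sample data:

    import io

    # Made-up sample: a version header plus two merge entries.
    sample = io.StringIO('#version: 0.2\nh e\nhe llo\n')

    # As in _try_load_merges_txt, only a leading '#' line is treated as a header.
    if not next(sample, '').strip().startswith('#'):
        sample.seek(0)

    merges = []
    for line in sample:
        parts = line.strip().split(None, 3)
        if len(parts) == 2:  # malformed entries are skipped (the loader warns instead)
            merges.append(f'{parts[0]} {parts[1]}')

    print(merges)  # ['h e', 'he llo']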
@@ -1064,6 +1093,9 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
             if not quiet:
                 print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
+        elif self.load_merges:
+            print('gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
+                  file = sys.stderr)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
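Together with the convert.py changes, this means a BPE model directory that ships merges.txt but no tokenizer.json can still produce a usable GGUF. A hedged sketch of driving the loader directly (the model directory is hypothetical):

    from pathlib import Path
    import gguf

    # load_merges=True tries tokenizer.json first, then falls back to merges.txt.
    special_vocab = gguf.SpecialVocab(Path('models/my-bpe-model'), load_merges = True)
    print(f'loaded {len(special_vocab.merges)} merge(s)')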

gguf-py/pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.5"
+version = "0.4.6"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
