@@ -785,20 +785,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result

-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
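For reference, the padding rule introduced above is simple: when the model header claims more tokens than the tokenizer metadata supplies and padding is enabled, the gap is filled with sequentially numbered <dummyNNNNN> placeholders instead of raising. A minimal standalone sketch of that rule (make_pad_tokens is an illustrative helper, not part of the patch):

    # Illustrative sketch of the padding rule above; not from the patch itself.
    def make_pad_tokens(n_vocab: int, tokenizer_size: int) -> list[str]:
        # One <dummyNNNNN> placeholder per missing slot, numbered from 1.
        pad_count = n_vocab - tokenizer_size
        return [f'<dummy{i:05}>' for i in range(1, pad_count + 1)]

    # Example: model header expects 32128 tokens, tokenizer provides 32000.
    pads = make_pad_tokens(32128, 32000)
    assert pads[0] == '<dummy00001>' and pads[-1] == '<dummy00128>'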
@@ -875,8 +884,12 @@ def close(self) -> None:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)
@@ -903,8 +916,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
        return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out: Path, ftype: GGMLFileType, params: Params,
+        model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)
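Both writers now take pad_vocab as a keyword and forward it to check_vocab_size. A hypothetical call site, with placeholder values for the positional arguments:

    # Hypothetical caller; fname_out, ftype, params, model, vocab and svocab
    # stand in for values built elsewhere in the script.
    OutputFile.write_all(fname_out, ftype, params, model, vocab, svocab,
                         concurrency=DEFAULT_CONCURRENCY,
                         endianess=gguf.GGUFEndian.LITTLE,
                         pad_vocab=True)  # pad rather than raise on a size mismatch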
@@ -1124,6 +1143,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")

     args = parser.parse_args(args_in)
     if args.dump_single:
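Since main() takes an argv-style args_in list that goes straight to parser.parse_args, the new flag can be exercised programmatically. A hedged example (paths are placeholders, and any other arguments the script requires are assumed):

    # Hypothetical invocation; 'padded-model.gguf' and '/path/to/model' are
    # placeholders, and other required arguments are assumed to be supplied.
    main(['--padvocab', '--outfile', 'padded-model.gguf', '/path/to/model'])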
@@ -1171,7 +1191,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges = args.vocabtype == 'bpe',
                                           n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.padvocab)
         print(f"Wrote {outfile}")
         return
@@ -1194,7 +1215,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.padvocab)
     print(f"Wrote {outfile}")