
Commit 69a6735

Update special token handling in conversion scripts for gpt2 derived tokenizers (#3746)
We still have the heads-up in `README.md` regarding `bpe` tokenizers, and this patch is needed for:

- a couple of tokenizer tests
- some more `special` and `non-special` added-token handling (as far as I understand it)

* Update special token handling
* Add mpt
1 parent 5be6c80 commit 69a6735

5 files changed: +56 -19 lines
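In short, instead of emitting every token as NORMAL with a dummy score, each converter now asks the Hugging Face tokenizer which ids belong to the added vocabulary and whether they are flagged as special, and maps them to CONTROL or USER_DEFINED accordingly. Below is a minimal, self-contained sketch of that classification logic; the `classify_tokens` helper and `model_dir` argument are illustrative names, not code from the repository, while `get_added_vocab()` and `added_tokens_decoder` are the transformers APIs the patch relies on.

# Sketch only: wraps the per-script loop from this commit into a standalone
# helper. classify_tokens and model_dir are illustrative, not from the repo.
from transformers import AutoTokenizer  # Hugging Face tokenizer
import gguf  # gguf-py package shipped with llama.cpp


def classify_tokens(model_dir: str, vocab_size: int):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    added_vocab = tokenizer.get_added_vocab()  # token -> id for added tokens
    reverse_vocab = {id: tok for tok, id in tokenizer.vocab.items()}

    tokens, toktypes = [], []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Hole in the vocab: pad it out so indices stay contiguous.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            # Added token: CONTROL if the tokenizer flags it as special,
            # USER_DEFINED otherwise.
            tokens.append(reverse_vocab[i])
            if tokenizer.added_tokens_decoder[i].special:
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                toktypes.append(gguf.TokenType.USER_DEFINED)
        else:
            # Regular vocabulary entry.
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)
    return tokens, toktypes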

convert-bloom-hf-to-gguf.py (+13 -4)

@@ -118,15 +118,24 @@ def parse_args() -> argparse.Namespace:
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0)  # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))

convert-gptneox-hf-to-gguf.py (+13 -4)

@@ -123,15 +123,24 @@ def parse_args() -> argparse.Namespace:
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0)  # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))

convert-mpt-hf-to-gguf.py (+4 -2)

@@ -136,9 +136,11 @@ def parse_args() -> argparse.Namespace:
         tokens.append(f"[PAD{i}]")
         toktypes.append(gguf.TokenType.USER_DEFINED)
     elif reverse_vocab[i] in added_vocab:
-        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
         tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.USER_DEFINED)
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
     else:
         tokens.append(reverse_vocab[i])
         toktypes.append(gguf.TokenType.NORMAL)

convert-refact-hf-to-gguf.py (+13 -4)

@@ -139,15 +139,24 @@ def parse_args() -> argparse.Namespace:
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0)  # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))

convert-starcoder-hf-to-gguf.py (+13 -5)

@@ -111,17 +111,25 @@ def parse_args() -> argparse.Namespace:
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0)  # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
-
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
 special_vocab.add_to_gguf(gguf_writer)