diff --git a/convert.py b/convert.py
index e9b08d344f5bd..96b6c6f847e1c 100755
--- a/convert.py
+++ b/convert.py
@@ -366,44 +366,55 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No
             added_tokens = {}
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        # Only entries whose ID is at or above the base vocabulary size count as new.
+        # Keep (id, piece) pairs in a list rather than a dict keyed by ID, so two pieces
+        # claiming the same ID are not silently merged and instead fail the check below.
+        new_tokens: list[tuple[int, str]] = sorted(
+            (token_id, piece) for piece, token_id in added_tokens.items() if token_id >= vocab_size
+        )
+        actual_new_ids: list[int] = [token_id for token_id, _ in new_tokens]
+        expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(actual_new_ids)))
+
+        if expected_new_ids != actual_new_ids:
+            raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.new_tokens_list: list[str] = [piece for _, piece in new_tokens]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.new_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield ``(text, score, toktype)`` for each ID in ``range(vocab_size())``; text is the UTF-8 encoded piece."""
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(token_id)
             text: bytes = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.get_score(token_id)
 
+            # The checks below are independent `if`s, so the last one that matches decides the type.
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.is_unknown(token_id):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.is_control(token_id):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+            # if tokenizer.is_user_defined(token_id): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.is_unused(token_id):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.is_byte(token_id):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield each piece in ``self.new_tokens_list`` as UTF-8 bytes with score -1000.0 and type USER_DEFINED."""
-        for text in self.added_tokens_list:
+        for text in self.new_tokens_list:
             score = -1000.0
             yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 