@@ -4076,24 +4076,26 @@ static void llm_load_vocab(
4076
4076
if (add_space_prefix_keyidx != -1) {
4077
4077
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4078
4078
} // The default value of add_space_prefix is true.
4079
+ } else if (tokenizer_name == "bert") {
4080
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
4081
+
4082
+ // default special tokens
4083
+ vocab.special_bos_id = -1;
4084
+ vocab.special_eos_id = -1;
4085
+ vocab.special_unk_id = 100;
4086
+ vocab.special_sep_id = 102;
4087
+ vocab.special_pad_id = 0;
4088
+ vocab.special_cls_id = 101;
4089
+ vocab.special_mask_id = 103;
4090
+ vocab.add_space_prefix = false;
4079
4091
} else {
4080
4092
if (tokenizer_name == "gpt2") {
4081
4093
vocab.type = LLAMA_VOCAB_TYPE_BPE;
4082
4094
} else if (tokenizer_name == "deepseek_coder") {
4083
4095
vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER;
4084
4096
} else if (tokenizer_name == "deepseek_llm") {
4085
4097
vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKLLM;
4086
- } else if (tokenizer_name == "bert") {
4087
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
4088
-
4089
- // default special tokens
4090
- vocab.special_bos_id = 101;
4091
- vocab.special_eos_id = 102;
4092
- vocab.special_unk_id = 100;
4093
- vocab.special_sep_id = -1;
4094
- vocab.special_pad_id = -1;
4095
- vocab.add_space_prefix = false;
4096
- } else {
4098
+ } else {
4097
4099
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
4098
4100
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4099
4101
vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -4125,11 +4127,13 @@ static void llm_load_vocab(
4125
4127
}
4126
4128
4127
4129
// default special tokens
4128
- vocab.special_bos_id = 11;
4129
- vocab.special_eos_id = 11;
4130
- vocab.special_unk_id = -1;
4131
- vocab.special_sep_id = -1;
4132
- vocab.special_pad_id = -1;
4130
+ vocab.special_bos_id = 11;
4131
+ vocab.special_eos_id = 11;
4132
+ vocab.special_unk_id = -1;
4133
+ vocab.special_sep_id = -1;
4134
+ vocab.special_pad_id = -1;
4135
+ vocab.special_cls_id = -1;
4136
+ vocab.special_mask_id = -1;
4133
4137
}
4134
4138
}
4135
4139
0 commit comments