@@ -497,8 +497,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        hparams.n_embd_head_v = 0;
    }

+   // for differentiating model types
    uint32_t n_vocab = 0;
-
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // arch-specific KVs
@@ -622,7 +622,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
@@ -645,7 +644,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                hparams.f_max_alibi_bias = 8.0f;
@@ -659,7 +657,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1367,7 +1364,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    const int64_t n_ff = hparams.n_ff();
    const int64_t n_embd_gqa = n_embd_v_gqa;
    const int64_t n_vocab = vocab.n_vocab();
-   const int64_t n_vocab_type = hparams.n_vocab_type;
+   const int64_t n_token_types = vocab.n_token_types();
    const int64_t n_rot = hparams.n_rot;
    const int64_t n_expert = hparams.n_expert;
    const int64_t n_expert_used = hparams.n_expert_used;
@@ -1812,7 +1809,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_NOMIC_BERT:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

                if (arch == LLM_ARCH_BERT) {
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1866,7 +1863,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V2:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings

                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias