@@ -402,9 +402,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
-    // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
-
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
         return;
@@ -500,6 +497,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }
 
+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
@@ -519,7 +520,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                         case 26: type = LLM_TYPE_3B; break;
                         case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                         // granite uses a vocab with len 49152
-                        case 32: type = hparams.n_vocab == 49152 ? LLM_TYPE_3B : (hparams.n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                         case 36: type = LLM_TYPE_8B; break; // granite
                         case 40: type = LLM_TYPE_13B; break;
                         case 48: type = LLM_TYPE_34B; break;
@@ -621,7 +622,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                 switch (hparams.n_layer) {
@@ -644,7 +644,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 hparams.f_max_alibi_bias = 8.0f;
 
@@ -658,7 +657,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1365,8 +1363,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff = hparams.n_ff();
     const int64_t n_embd_gqa = n_embd_v_gqa;
-    const int64_t n_vocab = hparams.n_vocab;
-    const int64_t n_vocab_type = hparams.n_vocab_type;
+    const int64_t n_vocab = vocab.n_vocab();
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot = hparams.n_rot;
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
@@ -1811,7 +1809,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_NOMIC_BERT:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
 
                 if (arch == LLM_ARCH_BERT) {
                     pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1865,7 +1863,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_JINA_BERT_V2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
 
                 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
@@ -3494,7 +3492,6 @@ void llama_model::print_info() const {
 
     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
-    LLAMA_LOG_INFO("%s: n_vocab (hp) = %u\n", __func__, hparams.n_vocab);
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
 
     if (!hparams.vocab_only) {