@@ -438,6 +438,11 @@ struct llama_file {
         read_raw(&ret, sizeof(ret));
         return ret;
     }
+    std::float_t read_f32() {
+        std::float_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
 
     std::string read_string(std::uint32_t len) {
         std::vector<char> chars(len);
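For context: read_f32() mirrors the existing read_u32() helper. It copies sizeof(std::float_t) raw bytes (4 on typical platforms, where std::float_t aliases float) straight into the return value, so the file is assumed to hold native-endian IEEE-754 floats. A hypothetical usage sketch (the file name and record layout below are illustrative only, not from the patch):

    // read a count followed by that many raw float scores
    llama_file file("scores.bin", "rb");   // hypothetical input file
    uint32_t n = file.read_u32();          // number of entries
    std::vector<float> scores(n);
    for (uint32_t i = 0; i < n; ++i) {
        scores[i] = file.read_f32();       // one native-endian float per entry
    }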
@@ -491,6 +496,59 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     file->write_raw(tensor->data, ggml_nbytes(tensor));
 }
 
+bool is_ggml_file(const char *filename) {
+    llama_file file(filename, "rb");
+    if (file.size < 4) {
+        return false;
+    }
+    uint32_t magic = file.read_u32();
+    return magic == LLAMA_FILE_MAGIC;
+}
+
+void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    if (is_ggml_file(filename)) {
+
+        struct llama_context_params llama_params = llama_context_default_params();
+        llama_params.vocab_only = true;
+
+        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+
+        std::vector<const char *> strings;
+        std::vector<float> scores;
+        int n_vocab = llama_n_vocab(lctx);
+        strings.resize(n_vocab, NULL);
+        scores.resize(n_vocab, 0);
+        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            std::string tok = std::string(strings[i]);
+            float score = scores[i];
+            vocab->id_to_token[i].tok = tok;
+            vocab->id_to_token[i].score = score;
+            vocab->token_to_id.emplace(tok, i);
+        }
+        llama_free(lctx);
+        llama_free_model(lmodel);
+    } else { // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
+        llama_file file(filename, "rb");
+        uint32_t n_vocab = config->vocab_size;
+        /* uint32_t max_token_length = */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (uint32_t i=0; i<n_vocab; ++i) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string tok = file.read_string(len);
+            vocab->id_to_token[i].tok = tok;
+            vocab->id_to_token[i].score = score;
+            vocab->token_to_id.emplace(tok, i);
+        }
+    }
+}
+
 void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
     int ct;
     switch (gg_weights->n_dims){
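The heuristic in load_vocab() is cheap: is_ggml_file() only reads the first four bytes and compares them to LLAMA_FILE_MAGIC; any other file is treated as a llama2.c vocabulary. The else-branch therefore expects the llama2.c layout: a uint32 max_token_length header, then config->vocab_size records of {float32 score, uint32 len, len bytes of token text}. A minimal standalone sketch of a reader for that layout, independent of llama_file (function and file names here are illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct vocab_entry { std::string tok; float score; };

    // reads a llama2.c tokenizer.bin: u32 max_token_length header, then for
    // each token a f32 score, a u32 length, and the raw token bytes
    static std::vector<vocab_entry> read_llama2c_vocab(const char * fname, uint32_t n_vocab) {
        std::vector<vocab_entry> out(n_vocab);
        std::FILE * f = std::fopen(fname, "rb");
        if (!f) { return out; }
        uint32_t max_token_length = 0;
        std::fread(&max_token_length, sizeof(max_token_length), 1, f); // header, unused here as well
        for (uint32_t i = 0; i < n_vocab; ++i) {
            uint32_t len = 0;
            std::fread(&out[i].score, sizeof(float), 1, f);
            std::fread(&len, sizeof(len), 1, f);
            out[i].tok.resize(len);
            std::fread(&out[i].tok[0], 1, len, f);
        }
        std::fclose(f);
        return out;
    }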
@@ -658,7 +716,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
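With this change, --copy-vocab-from-model accepts either a llama2.c vocabulary file or a ggml model. A hypothetical invocation (the binary name is assumed from the example's directory, and the file names are placeholders):

    ./convert-llama2c-to-ggml \
        --copy-vocab-from-model tokenizer.bin \
        --llama2c-model stories42M.bin \
        --llama2c-output-model stories42M.ggml.bin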
@@ -737,30 +795,9 @@ int main(int argc, char ** argv) {
         fclose(file);
     }
 
-    struct llama_context_params llama_params = llama_context_default_params();
-    llama_params.vocab_only = true;
-
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
     struct llama_vocab vocab;
-    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
-        vocab.id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            float score = scores[i];
-            vocab.id_to_token[i].tok = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
-        }
-    }
+    load_vocab(params.fn_vocab_model, &config, &vocab);
+
     struct my_llama_model model;
     model.hparams.n_vocab = config.vocab_size; // llama_n_vocab(lctx);
     model.hparams.n_ctx = params.n_ctx;
@@ -782,8 +819,6 @@ int main(int argc, char ** argv) {
 
     printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
 
-    llama_free(lctx);
-    llama_free_model(lmodel);
     ggml_free(model.ctx);
     free_weights(&weights);
     return 0;