@@ -368,7 +368,6 @@ struct llama_load_tensor_shard {
     std::vector<uint32_t> ne;
     size_t size;
     enum ggml_type type;
-    size_t file_idx;
     size_t file_off;

     void calc_size() {
@@ -427,13 +426,13 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;

-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
+    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
+        read_tensor_metadata(tensors_map);
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
@@ -490,7 +489,7 @@ struct llama_file_loader {
             tok_score.score = score;
         }
     }
-    void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
+    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
         while (file.tell() < file.size) {
             llama_load_tensor_shard shard;
             uint32_t n_dims = file.read_u32();
@@ -525,7 +524,7 @@ struct llama_file_loader {
             // skip to the next multiple of 32 bytes
             file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         }
-        shard.file_idx = file_idx;
+
        shard.file_off = file.tell();

        shard.calc_size();
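Aside on the alignment seek kept in the hunk above: seeking by -static_cast<ptrdiff_t>(file.tell()) & 31 advances the file position to the next multiple of 32 bytes, and by 0 if it is already aligned. A minimal standalone sketch of that arithmetic, using an illustrative offset that does not come from this diff:

    #include <cstddef>
    #include <cstdio>

    int main() {
        std::ptrdiff_t tell = 100;        // illustrative current file offset
        std::ptrdiff_t skip = -tell & 31; // two's-complement form of (32 - tell % 32) % 32, here 28
        std::printf("%td -> %td\n", tell, tell + skip); // prints "100 -> 128", the next multiple of 32
        return 0;
    }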
@@ -610,25 +609,15 @@ struct llama_file_saver {
 };

 struct llama_model_loader {
-    std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
+    std::unique_ptr<llama_file_loader> file_loader;
     llama_load_tensors_map tensors_map;
     bool use_mmap;
     size_t num_ggml_tensors_created = 0;
     struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;

     llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
-            }
-        }
+        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
         if (!llama_mmap::SUPPORTED) {
             use_mmap = false;
         }
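The loop deleted from the constructor was the legacy multi-part loader: parts after the first were looked up next to the base file under a numeric suffix, and each part's hparams had to match the first. A hedged sketch of just that old naming rule, with an illustrative base name that is not taken from this diff:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    int main() {
        std::string fname_base = "ggml-model-f16.bin"; // illustrative base name
        for (uint32_t i = 1; i < 3; i++) {
            // part i > 0 of the old split format lived at "<base>.<i>"
            std::string fname = fname_base + "." + std::to_string(i);
            std::printf("%s\n", fname.c_str()); // ggml-model-f16.bin.1, ggml-model-f16.bin.2
        }
        return 0;
    }

The replacement builds the loader with std::unique_ptr<llama_file_loader>(new ...) rather than std::make_unique, which keeps the code valid C++11 (make_unique only arrived in C++14).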
@@ -657,7 +646,7 @@ struct llama_model_loader {
             throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.first_shard.ne.at(0);
+        return file_loader->hparams.n_embd / lt.first_shard.ne.at(0);
     }

     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -723,7 +712,7 @@ struct llama_model_loader {
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -781,7 +770,7 @@ struct llama_model_loader {
         if (use_mmap) {
             lt.data = (uint8_t *) mapping->addr + lt.first_shard.file_off;
         } else {
-            llama_file & file = file_loaders.at(lt.first_shard.file_idx)->file;
+            llama_file & file = file_loader->file;
             file.seek(lt.first_shard.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
         }
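This hunk is the heart of the load path: with mmap, a tensor's data pointer is just the mapping base plus the shard's file offset (no copy); without it, the loader seeks and reads into an already-allocated buffer. A simplified sketch of the two paths, using hypothetical types rather than the loader's real API:

    #include <cstdint>
    #include <cstdio>

    struct mapping_t { uint8_t * addr; }; // hypothetical stand-in for llama_mmap

    // mmap path: the tensor data is a view into the mapping, nothing is copied
    static uint8_t * tensor_data_mmap(const mapping_t & m, size_t file_off) {
        return m.addr + file_off;
    }

    // fallback path: explicit seek + read into a caller-owned buffer
    static bool tensor_data_read(std::FILE * f, long file_off, uint8_t * dst, size_t size) {
        return std::fseek(f, file_off, SEEK_SET) == 0 && std::fread(dst, 1, size, f) == size;
    }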
@@ -986,10 +975,10 @@ static void llama_model_load_internal(

     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

-    vocab = std::move(ml->file_loaders.at(0)->vocab);
-    model.hparams = ml->file_loaders.at(0)->hparams;
+    vocab = std::move(ml->file_loader->vocab);
+    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
-    llama_file_version file_version = ml->file_loaders.at(0)->file_version;
+    llama_file_version file_version = ml->file_loader->file_version;

     auto & hparams = model.hparams;

     {
@@ -1023,7 +1012,6 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }

@@ -2370,7 +2358,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -2820,7 +2808,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
