@@ -376,18 +376,11 @@ struct llama_load_tensor_shard {
     }
 };
 
-enum llama_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
 struct llama_load_tensor {
-    std::vector<llama_load_tensor_shard> shards;
+    llama_load_tensor_shard first_shard;
 
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
-    llama_split_type split_type = SPLIT_NONE;
     std::vector<uint32_t> ne;
     size_t size;
     struct ggml_tensor * ggml_tensor = NULL;
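This first hunk removes the `llama_split_type` enum and collapses the per-tensor `shards` vector into a single `first_shard`. For orientation, here is a minimal sketch of what a `llama_load_tensor_shard` must carry, inferred only from the fields this diff touches (`type`, `ne`, `size`, `file_idx`, `file_off`); the real struct in llama.cpp may hold more:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>
#include "ggml.h" // for ggml_type

// Hypothetical reconstruction for illustration; the field set is inferred from
// the usages visible in this diff, not copied from the upstream source.
struct llama_load_tensor_shard {
    std::vector<uint32_t> ne; // per-shard dimensions
    size_t size;              // bytes of tensor data in the file
    enum ggml_type type;      // element type, e.g. GGML_TYPE_F32
    size_t file_idx;          // index of the model file holding this shard
    size_t file_off;          // byte offset of the data within that file
};
```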
@@ -397,58 +390,16 @@ struct llama_load_tensor {
 
     void calc_all() {
         calc_type();
-        calc_split_type();
         calc_ne();
         calc_size();
     }
 
     void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
-            }
-        }
         type = first_shard.type;
     }
 
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-                   name.find(".attention.wo.weight") != std::string::npos ||
-                   name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
     void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                    name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
-            }
-        }
         ne = first_shard.ne;
-        LLAMA_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
     }
 
     void calc_size() {
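The deleted `calc_split_type()`/`calc_ne()` machinery reconstructed a full tensor shape from per-shard shapes: column splits multiplied `ne[0]` by the shard count, row splits multiplied `ne[1]`, and `checked_mul` guarded that product against `uint32_t` overflow. As a worked example, a `tok_embeddings.weight` stored as two column shards of shape {2048, 32000} was merged to {4096, 32000}. A minimal sketch of an overflow-checked multiply in that spirit (the actual helper lives elsewhere in llama.cpp and may differ):

```cpp
#include <stdexcept>

// Hedged sketch of the checked_mul<uint32_t> called by the removed calc_ne();
// it throws instead of silently wrapping on overflow.
template <typename T>
static T checked_mul(T a, T b) {
    T ret = a * b;
    if (a != 0 && ret / a != b) { // a * b overflowed iff dividing back fails
        throw std::runtime_error("checked_mul: integer overflow");
    }
    return ret;
}
```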
@@ -589,7 +540,7 @@ struct llama_file_loader {
                 idx = tensors_map.tensors.size() - 1;
                 tensors_map.name_to_idx.emplace(name, idx);
             }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
+            tensors_map.tensors.at(idx).first_shard = shard;
         }
     }
 };
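Note that `first_shard = shard` assigns on every iteration of the per-tensor read loop, so if multiple files were still passed in, later shards would silently overwrite earlier ones. The simplification assumes single-file (non-sharded) models, consistent with the rest of this diff.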
@@ -693,10 +644,8 @@ struct llama_model_loader {
 
     bool alignment_prevents_mmap() {
         for (const llama_load_tensor & lt : tensors_map.tensors) {
-            for (const llama_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
+            if (lt.first_shard.file_off & 3) {
+                return true;
             }
         }
         return false;
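`file_off & 3` is a cheap 4-byte alignment test: the two low bits of the offset are nonzero exactly when the offset is not a multiple of 4, and a misaligned tensor forces the loader to fall back from mmap to buffered reads. A tiny standalone illustration:

```cpp
#include <cstdio>

// Demonstrates the (off & 3) alignment test from alignment_prevents_mmap();
// the sample offsets are made up for illustration.
int main() {
    unsigned long offsets[] = {4096, 4098};
    for (unsigned long off : offsets) {
        // (off & 3) != 0  <=>  off % 4 != 0
        std::printf("offset %lu -> %s\n", off,
                    (off & 3) ? "misaligned: mmap disabled" : "4-byte aligned");
    }
    return 0;
}
```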
@@ -708,7 +657,7 @@ struct llama_model_loader {
             throw std::runtime_error(std::string("missing tok_embeddings.weight"));
         }
         const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
+        return file_loaders.at(0)->hparams.n_embd / lt.first_shard.ne.at(0);
     }
 
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
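The quotient in this hunk is apparently the loader's guess at the number of file parts: `tok_embeddings.weight` was column-split, so each part holds only `n_embd / n_parts` entries along its first dimension. For a hypothetical model with `n_embd = 4096` stored in two parts, each shard reports `ne[0] = 2048` and the expression yields 4096 / 2048 = 2; for a single-file model it yields 1, so consulting only `first_shard` stays correct.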
@@ -830,45 +779,13 @@ struct llama_model_loader {
 
     void load_data_for(llama_load_tensor & lt) {
         if (use_mmap) {
-            LLAMA_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
+            lt.data = (uint8_t *) mapping->addr + lt.first_shard.file_off;
+        } else {
+            llama_file & file = file_loaders.at(lt.first_shard.file_idx)->file;
+            file.seek(lt.first_shard.file_off, SEEK_SET);
             file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (llama_load_tensor_shard & shard : lt.shards) {
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            LLAMA_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                llama_load_tensor_shard & shard = lt.shards.at(i);
-                llama_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (llama_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            LLAMA_ASSERT(out_offset == lt.size);
         }
+
         if (0) {
             print_checksum(lt);
         }
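The bulk of the removal is the non-mmap reassembly code: `SPLIT_BY_ROWS` concatenated shards back to back, while `SPLIT_BY_COLUMNS` staged every shard in a temporary buffer and then interleaved them row by row. A standalone toy version of that column reassembly, to document what was dropped (names and data here are local to the sketch, not from llama.cpp):

```cpp
#include <cstdio>
#include <cstring>
#include <vector>

// Toy model of the removed SPLIT_BY_COLUMNS path: each file holds the same
// rows but only a slice of the columns, so the loader staged every shard and
// then copied one row-slice from each shard in turn.
int main() {
    const size_t num_rows = 2, cols_per_shard = 3;
    // Two column shards of a 2x6 row-major matrix.
    std::vector<float> shard0 = {0, 1, 2,  6, 7, 8};   // columns 0..2
    std::vector<float> shard1 = {3, 4, 5,  9, 10, 11}; // columns 3..5
    std::vector<std::vector<float>> shards = {shard0, shard1};

    std::vector<float> out(num_rows * cols_per_shard * shards.size());
    const size_t per_shard_row_size = cols_per_shard * sizeof(float);

    size_t out_offset = 0;
    for (size_t row = 0; row < num_rows; row++) {
        for (const auto & shard : shards) {
            std::memcpy((char *) out.data() + out_offset,
                        (const char *) shard.data() + row * per_shard_row_size,
                        per_shard_row_size);
            out_offset += per_shard_row_size;
        }
    }
    for (float v : out) std::printf("%g ", v); // 0 1 2 ... 11
    std::printf("\n");
    return 0;
}
```

With sharded models gone, the buffered path collapses to a single seek plus `read_raw`, as in the `else` branch above.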