@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;
 
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {
 
     struct ggml_context * ctx_data = NULL;
 
-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
 
-        struct gguf_init_params params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &ctx_data,
-        };
+        struct gguf_init_params params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &ctx_data,
+        };
 
-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
 
-        read_hparams();
-        read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
 
@@ -637,18 +615,18 @@ struct gguf_file_loader {
 
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }
 
-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }
 
         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
 
             vocab.token_to_id[word] = i;
@@ -702,7 +680,7 @@ struct gguf_file_loader {
             tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
 
             tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
@@ -787,7 +765,7 @@ struct gguf_file_saver {
             gguf_type arr_type;
             int n_arr;
 
-            switch (vtype) {
+            switch (vtype) {
                 case GGUF_TYPE_BOOL:
                     bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                     file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -810,7 +788,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_STRING:
                     str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                    file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                    file.write_str(key, GGUF_TYPE_STRING, str_val);
                     break;
                 case GGUF_TYPE_UINT16:
                     u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -826,7 +804,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_ARRAY:
                     arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                    n_arr    = gguf_get_arr_n(fl->gguf_ctx, i);
                     if (arr_type == GGUF_TYPE_FLOAT32) {
                         write_hparam_arr_f32(key, arr_type, i, n_arr);
                     } else if (arr_type == GGUF_TYPE_STRING) {
@@ -923,20 +901,6 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                     name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -960,16 +924,41 @@ struct llama_model_loader {
         return tensor;
     }
 
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                     name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size     = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size     = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1031,31 +1020,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };
 
 //
@@ -1185,18 +1149,18 @@ int64_t llama_time_us() {
 }
 
 //
-// model loading
+// load LLaMA models
 //
 
-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
        case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-    }
+    }
 
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1207,24 +1171,26 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
         case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+
+        default: return "unknown, may not work";
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B:  return "3B";
+        case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -1605,7 +1571,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1714,7 +1679,7 @@ static struct ggml_cgraph * llama_build_graph(
 
         struct ggml_tensor * inpSA = inpL;
 
-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);
 
         // norm
         {
@@ -1853,7 +1818,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }
 
-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1909,7 +1874,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }
 
-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);
 
     // norm
     {
@@ -1927,7 +1892,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
 
-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);
 
     // logits -> probs
     // cur = ggml_soft_max_inplace(ctx0, cur);
@@ -2997,9 +2962,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
 
@@ -3726,7 +3690,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4344,8 +4308,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);
 
-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }
 
@@ -4390,7 +4353,6 @@ int llama_eval(
     return 0;
 }
 
-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,