Commit a82e3a4

llama : style formatting + remove helper methods
1 parent 2dd5d2c commit a82e3a4

File tree

3 files changed, +79 -115 lines changed

ggml.h

Lines changed: 5 additions & 5 deletions
@@ -1744,12 +1744,12 @@ extern "C" {
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
     GGML_API void * gguf_get_data (struct gguf_context * ctx);
 
-    GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
-    GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
     GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
 
     GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
     GGML_API float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
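The header change drops the untyped gguf_get_val(ctx, i, void *) accessor, so key/value reads go through gguf_find_key plus the typed getters (the saver code below already uses gguf_get_val_bool, gguf_get_val_str, gguf_get_val_u16, and so on). A minimal sketch of that pattern, assuming a gguf_get_val_u32 getter in the same family:

    // Sketch: look up a scalar key with the typed GGUF getters instead of gguf_get_val().
    // gguf_get_val_u32 is assumed here by analogy with the bool/str/u16 getters used in this commit.
    #include <stdint.h>
    #include "ggml.h"

    static uint32_t gguf_read_u32_or(struct gguf_context * ctx, const char * key, uint32_t def) {
        const int i = gguf_find_key(ctx, key); // returns -1 when the key is absent
        if (i == -1) {
            return def;
        }
        return gguf_get_val_u32(ctx, i);
    }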

gguf-llama.cpp

Lines changed: 67 additions & 105 deletions
@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;
 
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                    (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {
 
     struct ggml_context * ctx_data = NULL;
 
-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
 
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ &ctx_data,
-        };
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_data,
+        };
 
-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx     = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
 
-        read_hparams();
-        read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
 
@@ -637,18 +615,18 @@ struct gguf_file_loader {
 
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }
 
-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }
 
         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
 
             vocab.token_to_id[word] = i;
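The hunk above shows only the token-string side of the loop; the scores located via score_idx are consumed just below the shown context. A plausible continuation using the gguf_get_arr_f32 accessor from the ggml.h hunk (the vocab field names here are assumptions, not part of this diff):

    // Sketch of the remainder of the read_vocab() loop; the 'tok'/'score' member names are assumed.
    for (uint32_t i = 0; i < hparams.n_vocab; i++) {
        std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

        vocab.token_to_id[word] = i;

        auto & tok_score = vocab.id_to_token[i];
        tok_score.tok   = std::move(word);
        tok_score.score = gguf_get_arr_f32(gguf_ctx, score_idx, i);
    }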
@@ -702,7 +680,7 @@ struct gguf_file_loader {
         tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
 
         tensor.name = name;
-        tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+        tensor.size = ggml_nbytes(cur);
 
         tensors_map.tensors.push_back(tensor);
         tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
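This is the change that makes the removed helpers above redundant: the tensor's byte count now comes from ggml itself instead of a hand-rolled multiply. For reference, the removed path amounted to the following sketch (minus the overflow guard that checked_mul provided), while ggml_nbytes(cur) computes the same quantity from the ggml_tensor that read_tensor_metadata() already has in hand:

    // What llama_calc_tensor_size() computed, without the checked_mul overflow guard.
    #include <vector>
    #include "ggml.h"

    static size_t tensor_size_manual(const std::vector<uint32_t> & ne, enum ggml_type type) {
        size_t size = ggml_type_size(type);      // bytes per block of this type
        for (uint32_t dim : ne) {
            size *= dim;                         // scale by every dimension (total elements)
        }
        return size / ggml_blck_size(type);      // divide out the elements-per-block factor
    }

    // old: tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
    // new: tensor.size = ggml_nbytes(cur);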
@@ -787,7 +765,7 @@ struct gguf_file_saver {
             gguf_type arr_type;
             int n_arr;
 
-            switch(vtype) {
+            switch (vtype) {
                 case GGUF_TYPE_BOOL:
                     bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                     file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -810,7 +788,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_STRING:
                     str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                    file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                    file.write_str(key, GGUF_TYPE_STRING, str_val);
                     break;
                 case GGUF_TYPE_UINT16:
                     u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
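Strings now go through a dedicated file.write_str instead of the templated file.write_val<std::string>. The gguf_file writer itself is not part of this diff; the motivation is simply that a std::string cannot be serialized by the memcpy-style path a templated write_val implies, so a string writer has to emit a length and then the bytes. A purely hypothetical illustration of that shape (not the actual gguf_file API):

    // Hypothetical sketch only -- gguf_file::write_str is not shown in this commit.
    #include <cstdint>
    #include <cstdio>
    #include <string>

    static void write_str_sketch(FILE * fp, const std::string & key, int32_t type, const std::string & val) {
        const uint32_t klen = (uint32_t) key.size();
        const uint32_t vlen = (uint32_t) val.size();
        fwrite(&klen,      sizeof(klen), 1, fp); // key length, then key bytes
        fwrite(key.data(), 1, klen,         fp);
        fwrite(&type,      sizeof(type), 1, fp); // value type tag (GGUF_TYPE_STRING here)
        fwrite(&vlen,      sizeof(vlen), 1, fp); // value length, then value bytes
        fwrite(val.data(), 1, vlen,         fp);
    }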
@@ -826,7 +804,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_ARRAY:
                     arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                    n_arr = gguf_get_arr_n (fl->gguf_ctx, i);
                     if (arr_type == GGUF_TYPE_FLOAT32) {
                         write_hparam_arr_f32(key, arr_type, i, n_arr);
                     } else if (arr_type == GGUF_TYPE_STRING) {
@@ -923,20 +901,6 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -960,16 +924,41 @@ struct llama_model_loader {
         return tensor;
     }
 
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
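get_tensor and load_data_for are not rewritten here, only relocated so that callees precede their callers (get_tensor after get_tensor_for, load_data_for ahead of load_all_data); load_data_for also gains a const qualifier and drops its disabled print_checksum call. The calling sequence on llama_model_loader stays the same; roughly, with a placeholder tensor name and shape:

    // Sketch of the loader flow, assuming the llama_model_loader interface shown above.
    static void load_one_tensor_sketch(llama_model_loader & ml, uint32_t n_embd, uint32_t n_vocab) {
        // 1) request every tensor the model expects (creates ggml tensors, no data yet)
        struct ggml_tensor * tok_embd = ml.get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);

        // 2) verify the file does not contain tensors that were never requested
        ml.done_getting_tensors();

        // 3) read or mmap the actual data into the tensors created in step 1
        ml.load_all_data(/*progress_callback =*/ nullptr, /*user_data =*/ nullptr, /*lmlock =*/ nullptr);

        (void) tok_embd;
    }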
@@ -1031,31 +1020,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };
 
 //
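print_checksum and its disabled call site (if (0)) are deleted outright. If the same debug output is ever wanted again, an equivalent standalone helper can be reconstructed directly from the removed lines:

    // Reconstruction of the removed debug helper: sdbm hash over a tensor's raw bytes.
    #include <cstdint>
    #include <cstdio>

    static uint32_t sdbm_checksum(const uint8_t * data, size_t size) {
        uint32_t sum = 0;
        for (size_t i = 0; i < size; i++) {
            sum = data[i] + (sum << 6) + (sum << 16) - sum; // sdbm hash
        }
        return sum;
    }

    // usage: fprintf(stderr, "%s checksum: %#08x (size %zu)\n", name, sdbm_checksum(lt.data, lt.size), lt.size);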
@@ -1185,18 +1149,18 @@ int64_t llama_time_us() {
 }
 
 //
-// model loading
+// load LLaMA models
 //
 
-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-    }
+    }
 
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1207,24 +1171,26 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
         case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+
+        default: return "unknown, may not work";
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B:  return "3B";
+        case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -1605,7 +1571,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1714,7 +1679,7 @@ static struct ggml_cgraph * llama_build_graph(
 
         struct ggml_tensor * inpSA = inpL;
 
-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);
 
         // norm
         {
@@ -1853,7 +1818,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }
 
-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1909,7 +1874,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }
 
-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);
 
     // norm
     {
@@ -1927,7 +1892,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
 
-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);
 
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
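All four call sites in llama_build_graph switch from lctx.use_buf(...) to llama_context::use_buf(...). Since llama_build_graph is a free function, the class-qualified spelling presumably means use_buf is (or is becoming) a static member that only needs the ggml context argument, not lctx's state. A generic illustration with hypothetical names, not llama.cpp code:

    // Both calls invoke the same static member; the class-qualified form makes that explicit.
    struct ctx_like {
        static void use_buf(int which) { /* select scratch buffer 'which' */ }
    };

    static void demo(ctx_like & lctx) {
        lctx.use_buf(0);        // legal, but reads as if it touched lctx's state
        ctx_like::use_buf(0);   // the spelling this commit adopts
    }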
@@ -2997,9 +2962,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
 
@@ -3726,7 +3690,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4344,8 +4308,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);
 
-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }
 
@@ -4390,7 +4353,6 @@ int llama_eval(
     return 0;
 }
 
-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,
