@@ -1109,11 +1109,11 @@ static bool llama_kv_cache_init(
 // model loading and saving
 //
 
-enum llama_file_version {
+enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
 };
 
-static const char * llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
     }
@@ -1148,9 +1148,9 @@ struct llama_model_loader {
 
     bool use_mmap = false;
 
-    llama_file file;
+    llama_file  file;
     llama_ftype ftype;
-    llama_file_version fver;
+    llama_fver  fver;
 
     std::unique_ptr<llama_mmap> mapping;
 
@@ -1171,7 +1171,7 @@ struct llama_model_loader {
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-        fver = (enum llama_file_version) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver) gguf_get_version(ctx_gguf);
 
         for (int i = 0; i < n_tensors; i++) {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
@@ -1268,6 +1268,21 @@ struct llama_model_loader {
         }
     }
 
+    std::string get_arch_name() const {
+        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        std::string arch_name;
+        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+        return arch_name;
+    }
+
+    enum llm_arch get_arch() const {
+        const std::string arch_name = get_arch_name();
+
+        return llm_arch_from_string(arch_name);
+    }
+
     const char * get_tensor_name(int i) const {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
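
Note: the two accessors added in this hunk centralize reading the general.architecture key out of the GGUF metadata, so callers can ask the loader for the architecture instead of repeating the GGUF_GET_KEY boilerplate. A minimal usage sketch, reusing only names that appear in this diff (the file name is a placeholder):

    llama_model_loader ml("model.gguf", /*use_mmap*/ false);

    if (ml.get_arch() == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }

This mirrors what llm_load_arch does after the change in the next hunk.
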
@@ -1480,16 +1495,9 @@ static const char * llama_model_type_name(e_model type) {
 }
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-    std::string arch_name;
-    GGUF_GET_KEY(ctx, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_GENERAL_ARCHITECTURE));
-
-    model.arch = llm_arch_from_string(arch_name);
+    model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
-        throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
     }
 }
 
@@ -4048,22 +4056,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
+    gguf_set_kv     (ctx_out, ml->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -4097,8 +4105,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> work;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -4111,17 +4119,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(tensor);
 
         read_data.resize(ggml_nbytes(tensor));
         tensor->data = read_data.data();
-        model_loader->load_data_for(tensor);
+        ml->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, model_loader->n_tensors,
+               ++idx, ml->n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
@@ -4147,7 +4155,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
-            const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+            const auto tn = LLM_TN(ml->get_arch());
 
             if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                 int nx = tensor->ne[0];
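
Note: replacing LLM_TN(LLM_ARCH_LLAMA) with LLM_TN(ml->get_arch()) keys the tensor-name helper on the architecture declared in the input GGUF file instead of hardcoding LLaMA, so the k-quant special cases here can match the right tensor names for whichever architecture is being quantized. A short sketch of the pattern, using only identifiers from this diff (the comment body is illustrative):

    const auto tn = LLM_TN(ml->get_arch());

    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
        // architecture-aware handling of the output tensor
    }
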
@@ -4386,28 +4394,28 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     }
 
     // load base model
-    std::unique_ptr<llama_model_loader> model_loader;
+    std::unique_ptr<llama_model_loader> ml;
     ggml_context * base_ctx = NULL;
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
-        model_loader->calc_sizes(ctx_size, mmapped_size);
+        ml->calc_sizes(ctx_size, mmapped_size);
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
         base_params.mem_size   = base_buf.size();
         base_params.mem_buffer = base_buf.data();
-        base_params.no_alloc   = model_loader->use_mmap;
+        base_params.no_alloc   = ml->use_mmap;
 
         base_ctx = ggml_init(base_params);
 
         // maybe this should in llama_model_loader
-        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
+        if (ml->use_mmap) {
+            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -4511,8 +4519,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 #endif // GGML_USE_CUBLAS
 
         ggml_tensor * base_t;
-        if (model_loader) {
-            struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+        if (ml) {
+            struct gguf_context * ctx_gguf = ml->ctx_gguf;
 
             // load from base model
             if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
@@ -4522,8 +4530,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             // TODO: not tested!! maybe not working!
-            base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
-            model_loader->load_data_for(base_t);
+            base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            ml->load_data_for(base_t);
         } else {
             base_t = dest_t;
         }