
Commit 7bbbf38

llama : minor updates
ggml-ci
1 parent 0ec27ad commit 7bbbf38

File tree

1 file changed: +43 additions, -35 deletions

llama.cpp

Lines changed: 43 additions & 35 deletions
@@ -1109,11 +1109,11 @@ static bool llama_kv_cache_init(
 // model loading and saving
 //
 
-enum llama_file_version {
+enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
 };
 
-static const char * llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
     }
@@ -1148,9 +1148,9 @@ struct llama_model_loader {
 
     bool use_mmap = false;
 
-    llama_file file;
+    llama_file  file;
     llama_ftype ftype;
-    llama_file_version fver;
+    llama_fver  fver;
 
     std::unique_ptr<llama_mmap> mapping;
 
@@ -1171,7 +1171,7 @@ struct llama_model_loader {
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-        fver = (enum llama_file_version) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver       ) gguf_get_version(ctx_gguf);
 
         for (int i = 0; i < n_tensors; i++) {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
@@ -1268,6 +1268,21 @@ struct llama_model_loader {
         }
     }
 
+    std::string get_arch_name() const {
+        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        std::string arch_name;
+        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+        return arch_name;
+    }
+
+    enum llm_arch get_arch() const {
+        const std::string arch_name = get_arch_name();
+
+        return llm_arch_from_string(arch_name);
+    }
+
     const char * get_tensor_name(int i) const {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
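Note: the two accessors added above centralize the "read general.architecture from the GGUF header, fall back to unknown" pattern, so call sites no longer duplicate the key lookup. A self-contained sketch of the same pattern, using a std::map as a stand-in for the GGUF KV section (only the key string and the enum/error shape come from this diff; every other name is illustrative):

#include <map>
#include <stdexcept>
#include <string>

enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_UNKNOWN };

// Stand-in for the GGUF key/value section of a model file.
using kv_store = std::map<std::string, std::string>;

// Mirrors get_arch_name(): a non-required lookup, so a missing key
// yields an empty string instead of throwing.
std::string get_arch_name(const kv_store & kv) {
    const auto it = kv.find("general.architecture");
    return it == kv.end() ? std::string() : it->second;
}

// Mirrors get_arch(): map the string onto the enum, with UNKNOWN as the fallback.
llm_arch get_arch(const kv_store & kv) {
    return get_arch_name(kv) == "llama" ? LLM_ARCH_LLAMA : LLM_ARCH_UNKNOWN;
}

int main() {
    const kv_store kv = { { "general.architecture", "llama" } };
    if (get_arch(kv) == LLM_ARCH_UNKNOWN) {
        // Same error shape as llm_load_arch() after this change.
        throw std::runtime_error("unknown model architecture: '" + get_arch_name(kv) + "'");
    }
}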
@@ -1480,16 +1495,9 @@ static const char * llama_model_type_name(e_model type) {
 }
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-    std::string arch_name;
-    GGUF_GET_KEY(ctx, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_GENERAL_ARCHITECTURE));
-
-    model.arch = llm_arch_from_string(arch_name);
+    model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
-        throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
     }
 }
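One behavioral nuance in the simplification above: the old llm_load_arch passed true for the required flag of GGUF_GET_KEY, so a file without a general.architecture key failed inside the key lookup, while get_arch_name passes false, letting the missing key fall through to the more descriptive "unknown model architecture" error. A minimal sketch of that required/optional distinction (hypothetical helper, not the actual GGUF_GET_KEY macro):

#include <map>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for the GGUF_GET_KEY required flag:
// a required lookup throws on a missing key, an optional one leaves
// dst untouched so the caller can produce a better error later.
static void get_str_key(const std::map<std::string, std::string> & kv,
                        const std::string & key, std::string & dst, bool required) {
    const auto it = kv.find(key);
    if (it != kv.end()) {
        dst = it->second;
    } else if (required) {
        throw std::runtime_error("key not found in model: " + key);
    }
}

int main() {
    const std::map<std::string, std::string> kv; // no architecture key present
    std::string arch_name;
    get_str_key(kv, "general.architecture", arch_name, /*required*/ false);
    // arch_name stays empty -> the caller reports "unknown model architecture"
    return arch_name.empty() ? 0 : 1;
}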

@@ -4048,22 +4056,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
+    gguf_set_kv     (ctx_out, ml->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -4097,8 +4105,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> work;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -4111,17 +4119,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(tensor);
 
         read_data.resize(ggml_nbytes(tensor));
         tensor->data = read_data.data();
-        model_loader->load_data_for(tensor);
+        ml->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, model_loader->n_tensors,
+               ++idx, ml->n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
@@ -4147,7 +4155,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+        const auto tn = LLM_TN(ml->get_arch());
 
         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
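This hunk is the functional part of the change for quantization: tensor-name lookups now use the architecture read from the input file rather than assuming LLaMA, so the k-quants special cases (such as the output-tensor check here) can also match on non-LLaMA GGUF models. A self-contained sketch of an architecture-parameterized tensor namer in the spirit of LLM_TN (the table entries are illustrative, not llama.cpp's real name table):

#include <cstdio>
#include <map>
#include <string>

enum llm_arch   { LLM_ARCH_LLAMA, LLM_ARCH_FALCON };
enum llm_tensor { LLM_TENSOR_OUTPUT };

// Illustrative per-architecture tensor base names; the real mapping
// lives in a table inside llama.cpp.
static const std::map<llm_arch, std::map<llm_tensor, std::string>> names = {
    { LLM_ARCH_LLAMA,  { { LLM_TENSOR_OUTPUT, "output" } } },
    { LLM_ARCH_FALCON, { { LLM_TENSOR_OUTPUT, "output" } } },
};

// Callable that builds the canonical "<base>.<suffix>" tensor name
// for one architecture, mimicking the shape of tn(...) above.
struct tensor_namer {
    llm_arch arch;

    std::string operator()(llm_tensor t, const std::string & suffix) const {
        return names.at(arch).at(t) + "." + suffix;
    }
};

int main() {
    const tensor_namer tn = { LLM_ARCH_FALCON }; // previously hardcoded to LLAMA
    std::printf("%s\n", tn(LLM_TENSOR_OUTPUT, "weight").c_str()); // "output.weight"
}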
@@ -4386,28 +4394,28 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     }
 
     // load base model
-    std::unique_ptr<llama_model_loader> model_loader;
+    std::unique_ptr<llama_model_loader> ml;
     ggml_context * base_ctx = NULL;
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
-        model_loader->calc_sizes(ctx_size, mmapped_size);
+        ml->calc_sizes(ctx_size, mmapped_size);
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
         base_params.mem_size   = base_buf.size();
         base_params.mem_buffer = base_buf.data();
-        base_params.no_alloc   = model_loader->use_mmap;
+        base_params.no_alloc   = ml->use_mmap;
 
         base_ctx = ggml_init(base_params);
 
         // maybe this should in llama_model_loader
-        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
+        if (ml->use_mmap) {
+            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -4511,8 +4519,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 #endif // GGML_USE_CUBLAS
 
             ggml_tensor * base_t;
-            if (model_loader) {
-                struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+            if (ml) {
+                struct gguf_context * ctx_gguf = ml->ctx_gguf;
 
                 // load from base model
                 if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
@@ -4522,8 +4530,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             // TODO: not tested!! maybe not working!
-            base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
-            model_loader->load_data_for(base_t);
+            base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            ml->load_data_for(base_t);
         } else {
             base_t = dest_t;
         }
