@@ -4271,20 +4271,34 @@ struct llama_model_loader {
 
         ggml_tensor * tensor;
 
-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }
 
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;
 
+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * meta = NULL;
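A quick illustration of what the new comparator buys: with plain std::string ordering, "blk.10.*" sorts before "blk.2.*" because the comparison is lexicographic. The sketch below (standalone and hypothetical, not part of the patch) shows the numeric-by-layer order a std::map gets from weight_name_comparer:

    #include <cstdio>
    #include <map>
    #include <string>

    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

    int main() {
        std::map<std::string, int, weight_name_comparer> m;
        m["blk.10.attn_q.weight"] = 0;
        m["blk.2.attn_q.weight"]  = 0;
        m["token_embd.weight"]    = 0; // no "blk.%d." prefix -> layer stays -1, sorts first
        for (const auto & it : m) {
            printf("%s\n", it.first.c_str());
        }
        // prints: token_embd.weight, blk.2.attn_q.weight, blk.10.attn_q.weight
    }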
@@ -4326,7 +4340,14 @@ struct llama_model_loader {
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there are no duplicate tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
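One detail worth calling out: std::map::emplace never overwrites an existing key, so without the explicit find() check above a duplicated tensor name would be dropped silently instead of rejected with an error. A toy demonstration (standalone, hypothetical values, not part of the patch):

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, int> m;
        m.emplace("blk.0.attn_q.weight", 1);
        const auto res = m.emplace("blk.0.attn_q.weight", 2); // duplicate key
        printf("inserted=%d value=%d\n", res.second, m.at("blk.0.attn_q.weight"));
        // prints: inserted=0 value=1 -- the second emplace is a silent no-op
    }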
@@ -4366,7 +4387,14 @@ struct llama_model_loader {
 
                 // Save tensors data offset info of the shard.
                 for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                    weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                    std::string tensor_name = std::string(cur->name);
+                    // make sure there are no duplicate tensor names
+                    if (weights_map.find(tensor_name) != weights_map.end()) {
+                        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                    }
+                    n_elements += ggml_nelements(cur);
+                    n_bytes    += ggml_nbytes(cur);
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
                 }
 
                 gguf_free(ctx_gguf);
@@ -4376,7 +4404,7 @@ struct llama_model_loader {
 
         // sanity check
         {
-            const int n_tensors_loaded = (int) weights.size();
+            const int n_tensors_loaded = (int) weights_map.size();
             if (n_tensors != n_tensors_loaded) {
                 throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
             }
@@ -4386,23 +4414,10 @@ struct llama_model_loader {
         }
 
         n_kv      = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes    += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                 __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
@@ -4414,8 +4429,10 @@ struct llama_model_loader {
         uint32_t n_type_max = 0;
         enum ggml_type type_max = GGML_TYPE_F32;
 
-        for (int i = 0; i < n_tensors; i++) {
-            const ggml_tensor * tensor = weights.at(i).tensor;
+        for (const auto & it : weights_map) {
+            const llama_tensor_weight & w = it.second;
+            const ggml_tensor * tensor = w.tensor;
+
             enum ggml_type type = tensor->type;
 
             n_type[type]++;
@@ -4426,8 +4443,8 @@ struct llama_model_loader {
             }
 
             if (trace > 0) {
-                const uint16_t sid = weights.at(i).idx;
-                LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                const uint16_t sid = w.idx;
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
             }
         }
 
@@ -4691,21 +4708,13 @@ struct llama_model_loader {
         return llm_kv.arch;
     }
 
-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
     const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
         }
-        return nullptr;
-    }
 
-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        return nullptr;
     }
 
     const llama_tensor_weight & require_weight(const char * name) const {
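The lookup change is also a complexity fix: the removed get_weight walked the whole weights vector with strcmp, so resolving every tensor by name during model build was roughly quadratic, while weights_map.find is O(log n) per call. A minimal sketch of the same null-on-miss pattern (hypothetical names, not the patch itself):

    #include <map>
    #include <string>

    struct tensor_info { int idx; };
    using tensor_map = std::map<std::string, tensor_info>;

    // mirrors get_weight(): O(log n) lookup, nullptr when the name is absent
    const tensor_info * find_info(const tensor_map & m, const std::string & name) {
        auto pos = m.find(name);
        return pos != m.end() ? &pos->second : nullptr;
    }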
@@ -4732,10 +4741,6 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
@@ -4842,8 +4847,8 @@ struct llama_model_loader {
         }
 
         // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
         }
     }
 
@@ -18598,10 +18603,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;
 
-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos ||
@@ -18639,20 +18644,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;
 
     uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
     // Assume split index is continuous
     if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx + 1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
         }
+
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
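On the n_split fold: the shard indices stored in llama_tensor_weight::idx are zero-based, so the split count is the largest index seen plus one, with a floor of 1. A toy check (standalone, hypothetical values, not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint16_t idxs[] = { 0, 0, 1, 2, 2 }; // per-tensor shard indices
        uint16_t n_split = 1;                      // at least one output file
        for (const uint16_t idx : idxs) {
            n_split = std::max(uint16_t(idx + 1), n_split);
        }
        printf("n_split = %d\n", n_split);         // n_split = 3
    }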
@@ -18699,12 +18706,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
         }
 
         const std::string name = ggml_get_name(tensor);