
Commit 83ad0ab

kylo5aby authored and slaren committed
loader: refactor tensor weights storage (ggml-org#9935)
* loader: refactor tensor weights storage

* use sorted map, sort weights by layer

---------

Co-authored-by: slaren <[email protected]>
1 parent 963f06e commit 83ad0ab
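
The "use sorted map, sort weights by layer" part of this change is implemented by the weight_name_comparer added in the diff below. As a minimal standalone sketch of its effect (the comparer is copied verbatim from this commit; the map contents and main() are invented for illustration):

    // Orders tensor names by the numeric layer index parsed from "blk.%d.",
    // falling back to plain string comparison within a layer. Names without
    // a "blk.%d." prefix keep layer -1 and therefore sort first.
    #include <cstdio>
    #include <map>
    #include <string>

    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

    int main() {
        std::map<std::string, int, weight_name_comparer> m;
        m["blk.10.attn_q.weight"] = 0;
        m["blk.2.attn_q.weight"]  = 0;
        m["output.weight"]        = 0;
        for (const auto & it : m) {
            printf("%s\n", it.first.c_str());
        }
        // prints: output.weight, blk.2.attn_q.weight, blk.10.attn_q.weight
    }

Note that a plain std::map<std::string, ...> would iterate "blk.10" before "blk.2" (lexicographic order); parsing the layer index is what makes tensors enumerate in natural layer order, e.g. in the trace log.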

File tree

1 file changed: +65 −58 lines changed

src/llama.cpp

Lines changed: 65 additions & 58 deletions
@@ -4275,20 +4275,34 @@ struct llama_model_loader {

         ggml_tensor * tensor;

-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
             if (tensor_idx < 0) {
-                throw std::runtime_error(format("tensor '%s' not found in the model", name));
+                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
             }

             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
             if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
             }
         }
     };
-    std::vector<llama_tensor_weight> weights;

+    // custom comparator to sort weights more nicely by layer
+    struct weight_name_comparer {
+        bool operator()(const std::string & a, const std::string & b) const {
+            int a_layer = -1;
+            int b_layer = -1;
+            sscanf(a.c_str(), "blk.%d.", &a_layer);
+            sscanf(b.c_str(), "blk.%d.", &b_layer);
+            if (a_layer != b_layer) {
+                return a_layer < b_layer;
+            }
+            return a < b;
+        }
+    };
+
+    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

     struct gguf_context * meta = NULL;
@@ -4330,7 +4344,14 @@ struct llama_model_loader {
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
         }
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@@ -4370,7 +4391,14 @@ struct llama_model_loader {

             // Save tensors data offset info of the shard.
             for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                std::string tensor_name = std::string(cur->name);
+                // make sure there is no duplicated tensor names
+                if (weights_map.find(tensor_name) != weights_map.end()) {
+                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                }
+                n_elements += ggml_nelements(cur);
+                n_bytes += ggml_nbytes(cur);
+                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
             }

             gguf_free(ctx_gguf);
@@ -4380,7 +4408,7 @@ struct llama_model_loader {

         // sanity check
         {
-            const int n_tensors_loaded = (int) weights.size();
+            const int n_tensors_loaded = (int) weights_map.size();
             if (n_tensors != n_tensors_loaded) {
                 throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
             }
@@ -4390,23 +4418,10 @@ struct llama_model_loader {
         }

         n_kv = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();

         fver = (enum llama_fver) gguf_get_version(meta);

-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                 __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

@@ -4418,8 +4433,10 @@ struct llama_model_loader {
         uint32_t n_type_max = 0;
         enum ggml_type type_max = GGML_TYPE_F32;

-        for (int i = 0; i < n_tensors; i++) {
-            const ggml_tensor * tensor = weights.at(i).tensor;
+        for (const auto & it : weights_map) {
+            const llama_tensor_weight & w = it.second;
+            const ggml_tensor * tensor = w.tensor;
+
             enum ggml_type type = tensor->type;

             n_type[type]++;
@@ -4430,8 +4447,8 @@ struct llama_model_loader {
             }

             if (trace > 0) {
-                const uint16_t sid = weights.at(i).idx;
-                LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                const uint16_t sid = w.idx;
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
             }
         }

@@ -4695,21 +4712,13 @@ struct llama_model_loader {
         return llm_kv.arch;
     }

-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
     const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
+        auto pos = weights_map.find(name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
         }
-        return nullptr;
-    }

-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        return nullptr;
     }

     const llama_tensor_weight & require_weight(const char * name) const {
@@ -4736,10 +4745,6 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
     const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

@@ -4846,8 +4851,8 @@ struct llama_model_loader {
         }

         // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (const auto & it : weights_map) {
+            size_data += ggml_nbytes(it.second.tensor);
         }
     }

@@ -18607,10 +18612,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
    }

-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (const auto & it : ml.weights_map) {
+        const struct ggml_tensor * tensor = it.second.tensor;

-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
@@ -18648,20 +18653,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
    // Assume split index is continuous
    if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        for (const auto & it : weights_map) {
+            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
        }
+
    }
    std::vector<gguf_context*> ctx_outs(n_split, NULL);
    ctx_outs[0] = ctx_out;

    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (const auto & it : weights_map) {
+        uint16_t i_split = params->keep_split ? it.second.idx : 0;
+        struct ggml_tensor * tensor = it.second.tensor;
        if (ctx_outs[i_split] == NULL) {
            ctx_outs[i_split] = gguf_init_empty();
        }
@@ -18708,12 +18715,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (const auto & it : weights_map) {
+        const auto & weight = it.second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);
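
For reference, a simplified sketch of the lookup pattern the refactor settles on: get_weight() becomes a single std::map::find (O(log n)) instead of a linear strcmp scan over a std::vector. llama_tensor_weight is pared down to a stand-in here, the custom comparer is omitted for brevity, and the free function mirrors the member function from the diff; only the map mechanics match the commit.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    struct llama_tensor_weight {
        uint16_t idx; // source file index (stand-in for the full struct)
    };

    using weights_map_t = std::map<std::string, llama_tensor_weight>;

    // mirrors the refactored llama_model_loader::get_weight()
    const llama_tensor_weight * get_weight(const weights_map_t & weights_map, const char * name) {
        auto pos = weights_map.find(name);
        if (pos != weights_map.end()) {
            return &pos->second;
        }
        return nullptr;
    }

    int main() {
        weights_map_t weights_map;
        // the loader guards each insert with a find() first, so a duplicated
        // tensor name is reported instead of being silently dropped
        weights_map.emplace("blk.0.attn_q.weight", llama_tensor_weight{0});

        const llama_tensor_weight * w = get_weight(weights_map, "blk.0.attn_q.weight");
        printf("found: %s\n", w ? "yes" : "no");
    }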

0 commit comments
