@@ -4860,19 +4860,12 @@ struct llama_model_loader {
         *last  = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last  = std::max(*last, weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last  = std::max(*last, weight->offs + ggml_nbytes(tensor));
         }
     }

@@ -5049,7 +5042,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
@@ -18623,8 +18615,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }

+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;

         const std::string name = ggml_get_name(tensor);

@@ -18664,22 +18673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;

     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;

     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
@@ -18726,8 +18733,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
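
Below is a small standalone sketch of the keep_split ordering introduced above. It is not the llama.cpp code itself: the Weight struct and the sample entries are hypothetical stand-ins for llama_model_loader::llama_tensor_weight, used only to show how sorting by (idx, offs) groups tensors of the same split together and keeps them in file order within each split.

// sketch.cpp - illustrative only; Weight is a hypothetical stand-in for
// llama_model_loader::llama_tensor_weight (idx = split index, offs = file offset)
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Weight {
    uint16_t     idx;   // which split (shard) the tensor lives in
    size_t       offs;  // byte offset of the tensor data within that split
    const char * name;  // tensor name
};

int main() {
    // sample entries in arbitrary (map-iteration-like) order - made-up values
    std::vector<Weight> weights = {
        { 1, 4096, "blk.1.attn_q.weight" },
        { 0,  512, "token_embd.weight"   },
        { 1,    0, "blk.1.attn_k.weight" },
        { 0,    0, "output_norm.weight"  },
    };

    // collect pointers, as the patch does with the entries of ml.weights_map
    std::vector<const Weight *> tensors;
    tensors.reserve(weights.size());
    for (const auto & w : weights) {
        tensors.push_back(&w);
    }

    // sort by split index first, then by offset within the split, so each
    // output shard can be written in a single contiguous pass
    std::sort(tensors.begin(), tensors.end(), [](const Weight * a, const Weight * b) {
        if (a->idx == b->idx) {
            return a->offs < b->offs;
        }
        return a->idx < b->idx;
    });

    for (const auto * w : tensors) {
        printf("split %u, offs %zu: %s\n", (unsigned) w->idx, w->offs, w->name);
    }
    return 0;
}

Running the sketch prints the two split-0 tensors first, ordered by offset, followed by the split-1 tensors, which is exactly the ordering the quantize loop relies on when writing one output shard at a time.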