@@ -4864,19 +4864,12 @@ struct llama_model_loader {
4864
4864
*last = 0;
4865
4865
*addr = mapping->addr;
4866
4866
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
4867
- try {
4868
- const auto * weight = get_weight(ggml_get_name(tensor));
4869
- if (!weight) {
4870
- continue;
4871
- }
4872
- if (weight->idx != idx) {
4873
- continue;
4874
- }
4875
- *first = std::min(*first, weight->offs);
4876
- *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
4877
- } catch(...) {
4878
- // the tensor is not in the model
4867
+ const auto * weight = get_weight(ggml_get_name(tensor));
4868
+ if (!weight || weight->idx != idx) {
4869
+ continue;
4879
4870
}
4871
+ *first = std::min(*first, weight->offs);
4872
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
4880
4873
}
4881
4874
}
4882
4875
@@ -5053,7 +5046,6 @@ struct llama_model_loader {
5053
5046
ggml_backend_tensor_set(cur, data, 0, n_size);
5054
5047
}
5055
5048
} else {
5056
- GGML_ASSERT(weight->idx < files.size());
5057
5049
const auto & file = files.at(weight->idx);
5058
5050
if (ggml_backend_buffer_is_host(cur->buffer)) {
5059
5051
file->seek(weight->offs, SEEK_SET);
@@ -18632,8 +18624,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18632
18624
}
18633
18625
}
18634
18626
18627
+ // make a list of weights
18628
+ std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
18629
+ tensors.reserve(ml.weights_map.size());
18635
18630
for (const auto & it : ml.weights_map) {
18636
- const struct ggml_tensor * tensor = it.second.tensor;
18631
+ tensors.push_back(&it.second);
18632
+ }
18633
+
18634
+ // keep_split requires that the weights are sorted by split index
18635
+ if (params->keep_split) {
18636
+ std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
18637
+ if (a->idx == b->idx) {
18638
+ return a->offs < b->offs;
18639
+ }
18640
+ return a->idx < b->idx;
18641
+ });
18642
+ }
18643
+
18644
+ for (const auto * it : tensors) {
18645
+ const struct ggml_tensor * tensor = it->tensor;
18637
18646
18638
18647
const std::string name = ggml_get_name(tensor);
18639
18648
@@ -18673,22 +18682,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18673
18682
std::vector<no_init<float>> f32_conv_buf;
18674
18683
18675
18684
uint16_t n_split = 1;
18676
- const auto & weights_map = ml.weights_map;
18677
18685
18678
18686
// Assume split index is continuous
18679
18687
if (params->keep_split) {
18680
- for (const auto & it : weights_map ) {
18681
- n_split = std::max(uint16_t(it.second. idx + 1), n_split);
18688
+ for (const auto * it : tensors ) {
18689
+ n_split = std::max(uint16_t(it-> idx + 1), n_split);
18682
18690
}
18683
-
18684
18691
}
18685
18692
std::vector<gguf_context*> ctx_outs(n_split, NULL);
18686
18693
ctx_outs[0] = ctx_out;
18687
18694
18688
18695
// populate the original tensors so we get an initial meta data
18689
- for (const auto & it : weights_map ) {
18690
- uint16_t i_split = params->keep_split ? it.second. idx : 0;
18691
- struct ggml_tensor * tensor = it.second. tensor;
18696
+ for (const auto * it : tensors ) {
18697
+ uint16_t i_split = params->keep_split ? it-> idx : 0;
18698
+ struct ggml_tensor * tensor = it-> tensor;
18692
18699
if (ctx_outs[i_split] == NULL) {
18693
18700
ctx_outs[i_split] = gguf_init_empty();
18694
18701
}
@@ -18735,8 +18742,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18735
18742
18736
18743
const auto tn = LLM_TN(model.arch);
18737
18744
new_ofstream(0);
18738
- for (const auto & it : weights_map ) {
18739
- const auto & weight = it.second ;
18745
+ for (const auto * it : tensors ) {
18746
+ const auto & weight = *it ;
18740
18747
struct ggml_tensor * tensor = weight.tensor;
18741
18748
if (weight.idx != cur_split && params->keep_split) {
18742
18749
close_ofstream();
0 commit comments