Skip to content

Commit 6c7075d

Browse files
slaren authored and arthw committed
quantize : fix --keep-split (ggml-org#10114)
1 parent b9110bb commit 6c7075d

File tree

1 file changed

+30
-23
lines changed

1 file changed

+30
-23
lines changed

src/llama.cpp

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4864,19 +4864,12 @@ struct llama_model_loader {
48644864
*last = 0;
48654865
*addr = mapping->addr;
48664866
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
4867-
try {
4868-
const auto * weight = get_weight(ggml_get_name(tensor));
4869-
if (!weight) {
4870-
continue;
4871-
}
4872-
if (weight->idx != idx) {
4873-
continue;
4874-
}
4875-
*first = std::min(*first, weight->offs);
4876-
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
4877-
} catch(...) {
4878-
// the tensor is not in the model
4867+
const auto * weight = get_weight(ggml_get_name(tensor));
4868+
if (!weight || weight->idx != idx) {
4869+
continue;
48794870
}
4871+
*first = std::min(*first, weight->offs);
4872+
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
48804873
}
48814874
}
48824875

@@ -5053,7 +5046,6 @@ struct llama_model_loader {
50535046
ggml_backend_tensor_set(cur, data, 0, n_size);
50545047
}
50555048
} else {
5056-
GGML_ASSERT(weight->idx < files.size());
50575049
const auto & file = files.at(weight->idx);
50585050
if (ggml_backend_buffer_is_host(cur->buffer)) {
50595051
file->seek(weight->offs, SEEK_SET);
@@ -18632,8 +18624,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1863218624
}
1863318625
}
1863418626

18627+
// make a list of weights
18628+
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
18629+
tensors.reserve(ml.weights_map.size());
1863518630
for (const auto & it : ml.weights_map) {
18636-
const struct ggml_tensor * tensor = it.second.tensor;
18631+
tensors.push_back(&it.second);
18632+
}
18633+
18634+
// keep_split requires that the weights are sorted by split index
18635+
if (params->keep_split) {
18636+
std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
18637+
if (a->idx == b->idx) {
18638+
return a->offs < b->offs;
18639+
}
18640+
return a->idx < b->idx;
18641+
});
18642+
}
18643+
18644+
for (const auto * it : tensors) {
18645+
const struct ggml_tensor * tensor = it->tensor;
1863718646

1863818647
const std::string name = ggml_get_name(tensor);
1863918648

@@ -18673,22 +18682,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1867318682
std::vector<no_init<float>> f32_conv_buf;
1867418683

1867518684
uint16_t n_split = 1;
18676-
const auto & weights_map = ml.weights_map;
1867718685

1867818686
// Assume split index is continuous
1867918687
if (params->keep_split) {
18680-
for (const auto & it : weights_map) {
18681-
n_split = std::max(uint16_t(it.second.idx + 1), n_split);
18688+
for (const auto * it : tensors) {
18689+
n_split = std::max(uint16_t(it->idx + 1), n_split);
1868218690
}
18683-
1868418691
}
1868518692
std::vector<gguf_context*> ctx_outs(n_split, NULL);
1868618693
ctx_outs[0] = ctx_out;
1868718694

1868818695
// populate the original tensors so we get an initial meta data
18689-
for (const auto & it : weights_map) {
18690-
uint16_t i_split = params->keep_split ? it.second.idx : 0;
18691-
struct ggml_tensor * tensor = it.second.tensor;
18696+
for (const auto * it : tensors) {
18697+
uint16_t i_split = params->keep_split ? it->idx : 0;
18698+
struct ggml_tensor * tensor = it->tensor;
1869218699
if (ctx_outs[i_split] == NULL) {
1869318700
ctx_outs[i_split] = gguf_init_empty();
1869418701
}
@@ -18735,8 +18742,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1873518742

1873618743
const auto tn = LLM_TN(model.arch);
1873718744
new_ofstream(0);
18738-
for (const auto & it : weights_map) {
18739-
const auto & weight = it.second;
18745+
for (const auto * it : tensors) {
18746+
const auto & weight = *it;
1874018747
struct ggml_tensor * tensor = weight.tensor;
1874118748
if (weight.idx != cur_split && params->keep_split) {
1874218749
close_ofstream();

0 commit comments

Comments (0)