Commit 1e9f949

quantize : fix --keep-split (#10114)
1 parent c02e5ab commit 1e9f949

File tree

1 file changed: +30 −23 lines

src/llama.cpp

Lines changed: 30 additions & 23 deletions
@@ -4860,19 +4860,12 @@ struct llama_model_loader {
         *last  = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
             }
+            *first = std::min(*first, weight->offs);
+            *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
         }
     }
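The hunk above drops the try/catch around get_weight() in favor of a plain null check while accumulating the byte range covered by one split. As a rough illustration of that filter-and-accumulate pattern, here is a minimal standalone sketch; weight_entry, nbytes, and get_range_for_split are hypothetical stand-ins, not the actual llama.cpp types.

    // Hypothetical sketch: compute the [first, last) byte range occupied by the
    // tensors of one split, skipping weights that belong to a different split.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct weight_entry {     // stand-in for llama_model_loader::llama_tensor_weight
        uint16_t idx;         // split (shard) index
        size_t   offs;        // byte offset of the tensor data within the split
        size_t   nbytes;      // tensor size in bytes
    };

    static void get_range_for_split(const std::vector<weight_entry> & weights, uint16_t idx,
                                    size_t * first, size_t * last) {
        *first = SIZE_MAX;
        *last  = 0;
        for (const auto & w : weights) {
            if (w.idx != idx) {
                continue;     // tensor lives in another split
            }
            *first = std::min(*first, w.offs);
            *last  = std::max(*last,  w.offs + w.nbytes);
        }
    }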

@@ -5049,7 +5042,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
@@ -18623,8 +18615,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }

+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;

         const std::string name = ggml_get_name(tensor);
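This is the core of the --keep-split fix: instead of iterating ml.weights_map directly (whose order is the map's key order), the weights are first collected into a vector of pointers and, when --keep-split is requested, sorted by split index and then by offset within the split. Below is a minimal standalone sketch of that comparator, using a hypothetical weight_entry struct in place of llama_model_loader::llama_tensor_weight.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct weight_entry {     // hypothetical stand-in
        uint16_t idx;         // split (shard) index the tensor came from
        size_t   offs;        // byte offset of the tensor within that split
    };

    // Order pointers so split 0 comes first, then split 1, ..., and tensors within
    // the same split keep their original file order (ascending offset).
    static void sort_by_split(std::vector<const weight_entry *> & tensors) {
        std::sort(tensors.begin(), tensors.end(),
                  [](const weight_entry * a, const weight_entry * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }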

@@ -18664,22 +18673,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;

     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;

     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }
@@ -18726,8 +18733,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
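With the list sorted, the write loop above can detect split boundaries simply by watching the idx field change. A rough sketch of that grouping pattern follows, with write bookkeeping as a hypothetical placeholder for the new_ofstream()/close_ofstream() logic in llama.cpp.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct weight_entry {     // hypothetical stand-in
        uint16_t idx;         // split index this tensor belongs to
    };

    static void write_all(const std::vector<const weight_entry *> & tensors, bool keep_split) {
        uint16_t cur_split = 0;   // shard currently being written
        for (const weight_entry * w : tensors) {
            if (keep_split && w->idx != cur_split) {
                // because the list is sorted by idx, a change here marks a shard boundary
                std::printf("finish shard %u, start shard %u\n", cur_split, w->idx);
                cur_split = w->idx;
            }
            // ... quantize and append the tensor to the current shard here ...
        }
    }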
