Commit 1966eb2

zj040045, z5269887, and ggerganov authored
quantize : add '--keep-split' to quantize model into shards (#6688)
* Implement '--keep-split' to quantize model into several shards

* Add test script

* Update examples/quantize/quantize.cpp

  Co-authored-by: Georgi Gerganov <[email protected]>

* Split model correctly even if tensor id is out-of-order

* Update llama_model_quantize_params

* Fix preci failures

---------

Co-authored-by: z5269887 <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 784e11d commit 1966eb2

File tree: 4 files changed, +148 -26 lines changed


examples/quantize/quantize.cpp

Lines changed: 13 additions & 2 deletions
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf("     Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
             } else {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+            params.keep_split = true;
         } else {
            usage(argv[0]);
        }
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
     std::string fname_out;
 
     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
     } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
         arg_idx++;
 
         if (argc <= arg_idx) {
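
Note on the output-name handling above: with --keep-split the tool keeps a bare prefix (the ".gguf" suffix is either not appended or stripped) and lets llama_split_path() from llama.h append the shard numbering. A minimal sketch of how that naming is expected to play out; the prefix and shard count below are illustrative, not taken from the commit:

// Sketch only (not part of this commit): how a --keep-split output prefix expands
// into per-shard file names via llama_split_path() from llama.h.
// "ggml-model-Q4_K" and n_split = 6 are hypothetical values for illustration.
#include <cstdio>
#include "llama.h"

int main() {
    const char * prefix  = "ggml-model-Q4_K"; // the prefix quantize.cpp ends up with under --keep-split
    const int    n_split = 6;                 // assumed shard count of the input model

    char path[1024];
    for (int i = 0; i < n_split; ++i) {
        llama_split_path(path, sizeof(path), prefix, i, n_split);
        printf("%s\n", path); // e.g. ggml-model-Q4_K-00001-of-00006.gguf
    }
    return 0;
}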

examples/quantize/test.sh

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+  echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+  echo "example: $0 ../../build/bin ../../tmp"
+  exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+  TMP_DIR=$2
+else
+  TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+  cd $WORK_PATH
+  "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4b. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf

llama.cpp

Lines changed: 69 additions & 24 deletions
@@ -3297,6 +3297,10 @@ struct llama_model_loader {
         return nullptr;
     }
 
+    const llama_tensor_weight * get_weight(int i) const {
+        return get_weight(get_tensor_name(i));
+    }
+
     const llama_tensor_weight & require_weight(const char * name) const {
         const llama_tensor_weight * weight = get_weight(name);
         if (!weight) {
@@ -14528,26 +14532,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<uint8_t>> work;
     std::vector<no_init<float>> f32_conv_buf;
 
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx + 1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
     }
 
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
 
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index = 0) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
 
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
 
     const auto tn = LLM_TN(model.arch);
-
+    new_ofstream();
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
 
         const std::string name = ggml_get_name(tensor);
 
@@ -14702,26 +14754,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c : ctx_outs) {
+        gguf_free(c);
     }
 
-    fout.close();
-
-    gguf_free(ctx_out);
-
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
@@ -15077,6 +15121,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.quantize_output_tensor =*/ true,
        /*.only_copy              =*/ false,
        /*.pure                   =*/ false,
+       /*.keep_split             =*/ false,
        /*.imatrix                =*/ nullptr,
        /*.kv_overrides           =*/ nullptr,
     };

llama.h

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // quantize all tensors to the default type
+        bool keep_split;             // quantize to the same number of shards
         void * imatrix;              // pointer to importance matrix data
         void * kv_overrides;         // pointer to vector containing overrides
     } llama_model_quantize_params;
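
For callers that go through the C API instead of the quantize tool, keep_split is just another field on llama_model_quantize_params. A small hedged sketch of programmatic use; the file names and the Q4_K_M target below are illustrative, not taken from the commit:

// Sketch only: enabling keep_split via the public API. Input/output paths are hypothetical.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype      = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    params.keep_split = true;                      // reproduce the input's shard layout

    // With keep_split the output argument acts as a prefix; the -0000X-of-0000N.gguf
    // shard suffixes are appended internally (see the llama_split_path sketch above).
    const uint32_t rc = llama_model_quantize("ggml-model-split-00001-of-00006.gguf",
                                             "ggml-model-Q4_K", &params);

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}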
