Commit 5d6f19f

Allow quantize to only copy tensors, some other improvements (#2931)
* Allow quantize tool to only copy tensors to allow repackaging models.
* Slightly better logic when requantizing.
* Change help message to go to `stdout`.
1 parent 0d58936 commit 5d6f19f

File tree (3 files changed, +37 −13 lines):

  examples/quantize/quantize.cpp
  llama.cpp
  llama.h

examples/quantize/quantize.cpp

Lines changed: 19 additions & 5 deletions
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",  LLAMA_FTYPE_MOSTLY_F16,  "13.00G              @ 7B", },
     { "F32",  LLAMA_FTYPE_ALL_F32,     "26.00G              @ 7B", },
+    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+    { "COPY", LLAMA_FTYPE_ALL_F32,     "only copy tensors, no quantizing", },
 };
@@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
-    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "\nAllowed quantization types:\n");
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        if (it.name != "COPY") {
+            printf("  %2d  or  ", it.ftype);
+        } else {
+            printf("          ");
+        }
+        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
@@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
         // export as [inp path]/ggml-model-[ftype].gguf
         fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
+        }
     }
     else {
         fname_out = argv[arg_idx];
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
         if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
+        } else {
+            if (ftype_str == "COPY") {
+                params.only_copy = true;
+            }
         }
         arg_idx++;
     }
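
With this change, passing COPY where a quantization type is expected repackages the model without quantizing anything. A hypothetical invocation (the model paths are placeholders), following the usage string shown above:

    ./quantize models/llama/ggml-model-f16.gguf models/llama/ggml-model-copy.gguf COPY

Internally this sets params.only_copy, so the tensors below are written out unchanged.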

llama.cpp

Lines changed: 17 additions & 8 deletions
@@ -4683,6 +4683,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);

+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
@@ -4769,18 +4773,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;

         enum ggml_type new_type;
         void * new_data;
         size_t new_size;

-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4879,7 +4878,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);

             float * f32_data;
@@ -5310,6 +5318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                     =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
+        /*.only_copy                 =*/ false,
     };

     return result;

llama.h

Lines changed: 1 addition & 0 deletions
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;

     // grammar types
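
For callers of the C API, the new flag slots into the existing quantize parameters. A minimal sketch, not part of the commit: file names are placeholders, error handling is omitted, and it assumes the existing llama_model_quantize entry point that takes these params.

    #include "llama.h"

    int main(void) {
        // Start from the library defaults shown above, then request a plain copy.
        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_copy = true; // ftype, allow_requantize and quantize_output_tensor are ignored

        // Tensor data is written out unchanged, effectively repackaging the model.
        return llama_model_quantize("ggml-model-f16.gguf", "ggml-model-copy.gguf", &params);
    }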
