
Commit 1d0331c

Authored by ikawrakow (Kawrakow), with co-author Iwan Kawrakow
quantize: options for output and token embedding tensors qtype (#6239)
* quantize: be able to specify the output tensor type

* quantize: be able to specify the token embedding tensor type

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent dba1af6 commit 1d0331c

File tree (3 files changed, +61 −26 lines):

* examples/quantize/quantize.cpp
* llama.cpp
* llama.h


examples/quantize/quantize.cpp

Lines changed: 24 additions & 0 deletions
@@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file,
     }
 }
 
+static ggml_type parse_ggml_type(const char * arg) {
+    ggml_type result = GGML_TYPE_COUNT;
+    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
+        auto type = ggml_type(j);
+        const auto * name = ggml_type_name(type);
+        if (name && strcmp(arg, name) == 0) {
+            result = type; break;
+        }
+    }
+    return result;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -203,6 +215,18 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
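
With these options, a hypothetical invocation of the quantize example could look like the following (file names are placeholders; the argument to each flag must match a ggml_type_name() string such as f16, q8_0, q6_K or q2_K, and an unrecognized name leaves the parameter at GGML_TYPE_COUNT so the built-in choice still applies):

    ./quantize --output-tensor-type q8_0 --token-embedding-type q2_K \
        model-f16.gguf model-Q4_K_M.gguf Q4_K_M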

llama.cpp

Lines changed: 28 additions & 19 deletions
@@ -12141,27 +12141,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            int nx = tensor->ne[0];
+            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {

@@ -13051,6 +13058,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
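
The new fields use GGML_TYPE_COUNT as the "not specified" sentinel, which is what llama_model_quantize_default_params() installs, so any value below GGML_TYPE_COUNT overrides the ftype/arch heuristic and anything else leaves it untouched. A minimal sketch of that precedence, with a hypothetical helper name:

    // Hypothetical helper, not part of the patch: a caller-supplied type below
    // GGML_TYPE_COUNT wins, otherwise the heuristically chosen type is kept.
    static ggml_type pick_type(ggml_type requested, ggml_type heuristic) {
        return requested < GGML_TYPE_COUNT ? requested : heuristic;
    }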

llama.h

Lines changed: 9 additions & 7 deletions
@@ -275,13 +275,15 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
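
Callers going through the C API instead of the quantize example can set the same fields directly. A minimal sketch (file names are placeholders; leaving either field at GGML_TYPE_COUNT keeps the per-tensor heuristics):

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q8_0; // force output.weight to Q8_0
        params.token_embedding_type = GGML_TYPE_Q2_K; // force token_embd.weight to Q2_K
        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &params);
    }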
