@@ -12141,27 +12141,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            int nx = tensor->ne[0];
+            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
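Note: in both new branches, any ggml_type strictly below GGML_TYPE_COUNT is treated as an explicit user override, and GGML_TYPE_COUNT itself means "unset, keep the existing ftype-based heuristics". A minimal sketch of that precedence follows; it is illustration only, not code from this patch, and pick_type / heuristic_type are made-up names.

// sketch: how a value below GGML_TYPE_COUNT overrides the heuristic choice
#include "ggml.h"

static enum ggml_type pick_type(enum ggml_type user_type, enum ggml_type heuristic_type) {
    // mirrors: if (qs.params->output_tensor_type < GGML_TYPE_COUNT) use it, else fall through
    return user_type < GGML_TYPE_COUNT ? user_type : heuristic_type;
}

// pick_type(GGML_TYPE_COUNT, GGML_TYPE_Q6_K) -> GGML_TYPE_Q6_K  (no override, heuristic wins)
// pick_type(GGML_TYPE_Q8_0,  GGML_TYPE_Q6_K) -> GGML_TYPE_Q8_0  (explicit override wins)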
@@ -13051,6 +13058,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread =*/ 0,
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type =*/ GGML_TYPE_COUNT,
         /*.allow_requantize =*/ false,
         /*.quantize_output_tensor =*/ true,
         /*.only_copy =*/ false,
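Both new fields default to GGML_TYPE_COUNT, i.e. "no override". A hedged usage sketch of how a caller might set them, assuming the llama.h quantization API as of this change; the file names and the Q8_0 choice are placeholders, and backend setup and error handling are omitted.

#include "llama.h"

int main() {
    // start from the library defaults added above: both override fields are GGML_TYPE_COUNT ("unset")
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base scheme for the remaining tensors
    params.output_tensor_type   = GGML_TYPE_Q8_0;            // explicit override for the output tensor
    params.token_embedding_type = GGML_TYPE_COUNT;           // leave token_embd.weight to the heuristics

    // paths are placeholders; llama_model_quantize returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) == 0 ? 0 : 1;
}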