@@ -11636,7 +11636,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -11951,40 +11951,40 @@ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const flo
 }
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-    ggml_type quantized_type;
+    ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
     switch (params->ftype) {
-        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
-        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -12125,23 +12125,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight") == std::string::npos;
-        quantize &= name.find("ssm_dt.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight")      == std::string::npos;
+        quantize &= name.find("ssm_dt.weight")     == std::string::npos;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
         if (quantize) {
-            new_type = quantized_type;
-            if (!params->pure) {
-                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            new_type = default_type;
+
+            // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (!params->pure && ggml_is_quantized(default_type)) {
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
         }
+
         if (!quantize) {
             new_type = tensor->type;
             new_data = tensor->data;
@@ -12187,7 +12190,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.data();
         }
 
-        LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
+        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         if (work.size() < nelements * 4) {
@@ -12235,7 +12238,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
     }
 }
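
For context, a minimal sketch of how the changed path above is reached through the public llama.h API (not part of this diff; the GGUF file names are made up, and a real tool such as examples/quantize also performs backend initialization and error reporting):

    // Quantize an F16 GGUF to Q4_K_M. With params.pure == false and a quantized
    // default_type, llama_tensor_get_type() may override the type per tensor
    // (e.g. a different type for the output tensor); with pure == true every
    // quantized tensor keeps default_type.
    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // default_type becomes GGML_TYPE_Q4_K
        params.pure  = false;                     // allow per-tensor type selection
        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    }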