Commit ee35600

llama : fix F16/F32 downcast + improve names (#5980)
1 parent be858f6 commit ee35600

File tree: 2 files changed (+36, −33)

  llama.cpp
  llama.h


llama.cpp

Lines changed: 35 additions & 32 deletions
@@ -11636,7 +11636,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }

-static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -11951,40 +11951,40 @@ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const flo
 }

 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-    ggml_type quantized_type;
+    ggml_type default_type;
     llama_ftype ftype = params->ftype;

     switch (params->ftype) {
-        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
-        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  quantized_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q4_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  quantized_type = GGML_TYPE_Q4_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  quantized_type = GGML_TYPE_Q5_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:   quantized_type = GGML_TYPE_IQ2_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:   quantized_type = GGML_TYPE_IQ2_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  quantized_type = GGML_TYPE_IQ4_XS;  break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:   quantized_type = GGML_TYPE_IQ3_S;   break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:   quantized_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -12125,23 +12125,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight") == std::string::npos;
-        quantize &= name.find("ssm_dt.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight")      == std::string::npos;
+        quantize &= name.find("ssm_dt.weight")     == std::string::npos;

         enum ggml_type new_type;
         void * new_data;
         size_t new_size;

         if (quantize) {
-            new_type = quantized_type;
-            if (!params->pure) {
-                new_type = get_k_quant_type(qs, new_type, tensor, ftype);
+            new_type = default_type;
+
+            // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (!params->pure && ggml_is_quantized(default_type)) {
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }

             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             quantize = tensor->type != new_type;
         }
+
         if (!quantize) {
             new_type = tensor->type;
             new_data = tensor->data;
@@ -12187,7 +12190,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.data();
         }

-        LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
+        LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

         if (work.size() < nelements * 4) {
@@ -12235,7 +12238,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
     }
 }
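The substantive fix is the new ggml_is_quantized(default_type) guard: per the commit title, with an F16 or F32 target the per-tensor type selection in llama_tensor_get_type is now skipped entirely, so individual tensors can no longer be downcast through it. Below is a minimal, self-contained C++ sketch of the corrected control flow; the trimmed enum, the simplified ggml_is_quantized predicate, and the pick_tensor_type stand-in are illustrative assumptions, not the real llama.cpp/ggml definitions.

#include <cstdio>

// Illustrative subset of ggml's type enum (not the real definition).
enum ggml_type { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K };

// Simplified predicate: only the quantized block formats count, F16/F32 do not.
static bool ggml_is_quantized(ggml_type t) {
    return t != GGML_TYPE_F32 && t != GGML_TYPE_F16;
}

// Hypothetical stand-in for llama_tensor_get_type: per-tensor heuristics
// that may override the default type for certain tensors.
static ggml_type pick_tensor_type(ggml_type /*def*/, const char * /*name*/) {
    return GGML_TYPE_Q6_K; // e.g. output.weight bumped to a higher-bit type
}

static ggml_type select_type(ggml_type default_type, bool pure, const char * name) {
    ggml_type new_type = default_type;
    // The fix: run the heuristics only for quantized targets, so an
    // F16/F32 conversion never changes the type of individual tensors.
    if (!pure && ggml_is_quantized(default_type)) {
        new_type = pick_tensor_type(default_type, name);
    }
    return new_type;
}

int main() {
    printf("F16 target  -> %d\n", select_type(GGML_TYPE_F16,  false, "output.weight")); // stays F16
    printf("Q4_K target -> %d\n", select_type(GGML_TYPE_Q4_K, false, "output.weight")); // heuristics may apply
    return 0;
}

This also explains the renamed log message: when the target is F16 or F32 the operation is a conversion, not a quantization, so "converting to %s" is the accurate wording.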

llama.h

Lines changed: 1 addition & 1 deletion
@@ -278,7 +278,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure;                   // quantize all tensors to the default type
         void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;

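For context on the reworded comment, here is a hedged usage sketch of the pure flag from the API side. It assumes the llama_model_quantize_default_params() and llama_model_quantize() entry points that llama.h declares alongside this struct; the file names are placeholders.

// Usage sketch: with `pure` set, every tensor is quantized to the default
// type for the chosen ftype and the per-tensor heuristics are skipped.
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // default type becomes GGML_TYPE_Q4_K
    params.pure  = true;                      // quantize all tensors to the default type

    // Returns 0 on success (placeholder file names).
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
}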
