
Commit 205b919

iq1_m: checking pure iq1_m quantization
It is pretty bad: PPL(LLaMA-v2-7B) = 34 if we quantize output.weight with Q4_K.
Parent: 0dc1214

File tree: 3 files changed, +16 −6 lines
ggml-quants.c

Lines changed: 7 additions & 3 deletions
@@ -11968,7 +11968,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     const int * kmap_q2xs = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights && "missing quantization weights");
+    //GGML_ASSERT(quant_weights && "missing quantization weights");
     GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -12006,8 +12006,12 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
 
         for (int ib = 0; ib < QK_K/block_size; ++ib) {
             const float * xb = xbl + block_size*ib;
-            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
-            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
             float max = fabsf(xb[0]);
             for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
             if (!max) {
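The key change: with the GGML_ASSERT commented out, quantize_row_iq1_m_impl now accepts quant_weights == NULL and falls back to weighting each element by its squared magnitude instead of the imatrix-scaled weight. A minimal standalone sketch of the two modes (the helper name and signature are hypothetical, not part of ggml-quants.c):

#include <math.h>

// Hypothetical helper illustrating the weighting logic above; sigma2 is
// derived from the mean squared value of the row being quantized.
static void compute_block_weights(const float * xb, const float * qw,
                                  float sigma2, int block_size, float * weight) {
    if (qw) {
        // imatrix path: importance weight scaled by a magnitude-dependent term
        for (int i = 0; i < block_size; ++i) {
            weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
        }
    } else {
        // fallback path: weight each element by its own energy only
        for (int i = 0; i < block_size; ++i) {
            weight[i] = xb[i]*xb[i];
        }
    }
}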

ggml.c

Lines changed: 2 additions & 2 deletions
@@ -20353,8 +20353,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
         type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S   ||
-        type == GGML_TYPE_IQ1_M;
+        type == GGML_TYPE_IQ1_S;// ||
+        //type == GGML_TYPE_IQ1_M;
 }
 
 size_t ggml_quantize_chunk(
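With IQ1_M dropped from this list, ggml no longer hard-requires an importance matrix for IQ1_M quantization (IQ1_S still does). A quick sanity check against the public function shown above:

#include <assert.h>
#include "ggml.h"

int main(void) {
    // IQ1_S still refuses to quantize without an importance matrix ...
    assert(ggml_quantize_requires_imatrix(GGML_TYPE_IQ1_S));
    // ... while IQ1_M now merely falls back to magnitude-based weights.
    assert(!ggml_quantize_requires_imatrix(GGML_TYPE_IQ1_M));
    return 0;
}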

llama.cpp

Lines changed: 7 additions & 1 deletion
@@ -12919,6 +12919,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (!params->pure && ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
         }
+        else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+            new_type = params->token_embedding_type;
+        }
+        else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+            new_type = params->output_tensor_type;
+        }
 
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
@@ -12951,7 +12957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type == GGML_TYPE_IQ2_XS ||
             new_type == GGML_TYPE_IQ2_S ||
             new_type == GGML_TYPE_IQ1_S ||
-            new_type == GGML_TYPE_IQ1_M ||
+            (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
             (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
