Skip to content

Commit 6fa6a9a

Browse files
committed
ggml : fix quants nans when all the group weights are very close to zero
1 parent e1b40ac commit 6fa6a9a

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

ggml-quants.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,7 +1109,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
11091109
float ax = fabsf(x[i]);
11101110
if (ax > amax) { amax = ax; max = x[i]; }
11111111
}
1112-
if (amax < 1e-30f) { // all zero
1112+
if (amax < 1e-20f) { // all zero
11131113
for (int i = 0; i < n; ++i) {
11141114
L[i] = 0;
11151115
}
@@ -1177,7 +1177,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
11771177
float ax = fabsf(x[i]);
11781178
if (ax > amax) { amax = ax; max = x[i]; }
11791179
}
1180-
if (!amax) { // all zero
1180+
if (amax < 1e-20f) { // all zero
11811181
for (int i = 0; i < n; ++i) { L[i] = 0; }
11821182
return 0.f;
11831183
}
@@ -2653,7 +2653,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
26532653

26542654
}
26552655

2656-
if (!max_abs_scale) {
2656+
if (max_abs_scale < 1e-20f) {
26572657
memset(&y[i], 0, sizeof(block_q6_K));
26582658
y[i].d = GGML_FP32_TO_FP16(0.f);
26592659
x += QK_K;
@@ -2805,7 +2805,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
28052805

28062806
}
28072807

2808-
if (!max_abs_scale) {
2808+
if (max_abs_scale < 1e-20f) {
28092809
memset(&y[i], 0, sizeof(block_q6_K));
28102810
y[i].d = GGML_FP32_TO_FP16(0.f);
28112811
x += QK_K;
@@ -13213,7 +13213,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
1321313213
}
1321413214
float max = xval[0];
1321513215
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13216-
if (!max) {
13216+
if (max < 1e-20f) {
1321713217
scales[ib] = 0;
1321813218
memset(L, 0, 32);
1321913219
continue;
@@ -13941,7 +13941,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
1394113941
}
1394213942
float max = fabsf(xb[0]);
1394313943
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13944-
if (!max) {
13944+
if (max < 1e-20f) {
1394513945
scales[ib] = 0;
1394613946
memset(L, 1, block_size);
1394713947
continue;
@@ -14205,7 +14205,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
1420514205
amax = ax; max = xb[j];
1420614206
}
1420714207
}
14208-
if (!amax) {
14208+
if (amax < 1e-20f) {
1420914209
scales[ib] = 0;
1421014210
continue;
1421114211
}
@@ -14426,7 +14426,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
1442614426
}
1442714427
float max = xval[0];
1442814428
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14429-
if (!max) {
14429+
if (max < 1e-20f) {
1443014430
scales[ib] = 0;
1443114431
continue;
1443214432
}

tests/test-backend-ops.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
4949
t.join();
5050
}
5151

52+
#if 0
53+
// test quantization with very small values that may result in nan scales due to division by zero
54+
if (ggml_is_quantized(tensor->type)) {
55+
for (int i = 0; i < 256; i++) {
56+
data[i] = 1e-24f;
57+
}
58+
}
59+
#endif
60+
5261
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
5362
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
5463
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {

0 commit comments

Comments
 (0)