Skip to content

Commit f59edee

Browse files
committed
increase eps to 1e-7
1 parent 6fa6a9a commit f59edee

File tree

2 files changed

+13
-11
lines changed

2 files changed

+13
-11
lines changed

ggml-quants.c

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414
#include <stdlib.h> // for qsort
1515
#include <stdio.h> // for GGML_ASSERT
1616

17+
#define GROUP_MAX_EPS 1e-7f
18+
1719
#if defined(_MSC_VER)
1820
// disable "possible loss of data" to avoid warnings for hundreds of casts
1921
// we should just be careful :)
@@ -1109,7 +1111,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
11091111
float ax = fabsf(x[i]);
11101112
if (ax > amax) { amax = ax; max = x[i]; }
11111113
}
1112-
if (amax < 1e-20f) { // all zero
1114+
if (amax < GROUP_MAX_EPS) { // all zero
11131115
for (int i = 0; i < n; ++i) {
11141116
L[i] = 0;
11151117
}
@@ -1177,7 +1179,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
11771179
float ax = fabsf(x[i]);
11781180
if (ax > amax) { amax = ax; max = x[i]; }
11791181
}
1180-
if (amax < 1e20f) { // all zero
1182+
if (amax < GROUP_MAX_EPS) { // all zero
11811183
for (int i = 0; i < n; ++i) { L[i] = 0; }
11821184
return 0.f;
11831185
}
@@ -2653,7 +2655,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
26532655

26542656
}
26552657

2656-
if (max_abs_scale < 1e-20f) {
2658+
if (max_abs_scale < GROUP_MAX_EPS) {
26572659
memset(&y[i], 0, sizeof(block_q6_K));
26582660
y[i].d = GGML_FP32_TO_FP16(0.f);
26592661
x += QK_K;
@@ -2805,7 +2807,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
28052807

28062808
}
28072809

2808-
if (max_abs_scale < 1e-20f) {
2810+
if (max_abs_scale < GROUP_MAX_EPS) {
28092811
memset(&y[i], 0, sizeof(block_q6_K));
28102812
y[i].d = GGML_FP32_TO_FP16(0.f);
28112813
x += QK_K;
@@ -12772,7 +12774,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
1277212774
}
1277312775
float max = xval[0];
1277412776
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
12775-
if (!max) {
12777+
if (max < GROUP_MAX_EPS) {
1277612778
scales[ib] = 0;
1277712779
memset(L, 0, 16);
1277812780
continue;
@@ -13213,7 +13215,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
1321313215
}
1321413216
float max = xval[0];
1321513217
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13216-
if (max < 1e-20f) {
13218+
if (max < GROUP_MAX_EPS) {
1321713219
scales[ib] = 0;
1321813220
memset(L, 0, 32);
1321913221
continue;
@@ -13753,7 +13755,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
1375313755
for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
1375413756
float max = fabsf(xb[0]);
1375513757
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13756-
if (!max) {
13758+
if (max < GROUP_MAX_EPS) {
1375713759
scales[ib] = 0;
1375813760
memset(L, 1, block_size);
1375913761
continue;
@@ -13941,7 +13943,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
1394113943
}
1394213944
float max = fabsf(xb[0]);
1394313945
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13944-
if (max < 1e-20f) {
13946+
if (max < GROUP_MAX_EPS) {
1394513947
scales[ib] = 0;
1394613948
memset(L, 1, block_size);
1394713949
continue;
@@ -14205,7 +14207,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
1420514207
amax = ax; max = xb[j];
1420614208
}
1420714209
}
14208-
if (amax < 1e-20f) {
14210+
if (amax < GROUP_MAX_EPS) {
1420914211
scales[ib] = 0;
1421014212
continue;
1421114213
}
@@ -14426,7 +14428,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
1442614428
}
1442714429
float max = xval[0];
1442814430
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14429-
if (max < 1e-20f) {
14431+
if (max < GROUP_MAX_EPS) {
1443014432
scales[ib] = 0;
1443114433
continue;
1443214434
}

tests/test-backend-ops.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
5353
// test quantization with very small values that may result in nan scales due to division by zero
5454
if (ggml_is_quantized(tensor->type)) {
5555
for (int i = 0; i < 256; i++) {
56-
data[i] = 1e-24f;
56+
data[i] = 1e-7f;
5757
}
5858
}
5959
#endif

0 commit comments

Comments (0)