|
14 | 14 | #include <stdlib.h> // for qsort
|
15 | 15 | #include <stdio.h> // for GGML_ASSERT
|
16 | 16 |
|
| 17 | +#define GROUP_MAX_EPS 1e-7f /* threshold on a group's max magnitude: groups below it are handled as all-zero by the quantizers */ |
| 18 | + |
17 | 19 | #if defined(_MSC_VER)
|
18 | 20 | // disable "possible loss of data" to avoid warnings for hundreds of casts
|
19 | 21 | // we should just be careful :)
|
@@ -1109,7 +1111,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
1109 | 1111 | float ax = fabsf(x[i]);
|
1110 | 1112 | if (ax > amax) { amax = ax; max = x[i]; }
|
1111 | 1113 | }
|
1112 |
| - if (amax < 1e-20f) { // all zero |
| 1114 | + if (amax < GROUP_MAX_EPS) { // all zero |
1113 | 1115 | for (int i = 0; i < n; ++i) {
|
1114 | 1116 | L[i] = 0;
|
1115 | 1117 | }
|
@@ -1177,7 +1179,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
1177 | 1179 | float ax = fabsf(x[i]);
|
1178 | 1180 | if (ax > amax) { amax = ax; max = x[i]; }
|
1179 | 1181 | }
|
1180 |
| - if (amax < 1e20f) { // all zero |
| 1182 | + if (amax < GROUP_MAX_EPS) { // all zero |
1181 | 1183 | for (int i = 0; i < n; ++i) { L[i] = 0; }
|
1182 | 1184 | return 0.f;
|
1183 | 1185 | }
|
@@ -2653,7 +2655,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
2653 | 2655 |
|
2654 | 2656 | }
|
2655 | 2657 |
|
2656 |
| - if (max_abs_scale < 1e-20f) { |
| 2658 | + if (max_abs_scale < GROUP_MAX_EPS) { |
2657 | 2659 | memset(&y[i], 0, sizeof(block_q6_K));
|
2658 | 2660 | y[i].d = GGML_FP32_TO_FP16(0.f);
|
2659 | 2661 | x += QK_K;
|
@@ -2805,7 +2807,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
2805 | 2807 |
|
2806 | 2808 | }
|
2807 | 2809 |
|
2808 |
| - if (max_abs_scale < 1e-20f) { |
| 2810 | + if (max_abs_scale < GROUP_MAX_EPS) { |
2809 | 2811 | memset(&y[i], 0, sizeof(block_q6_K));
|
2810 | 2812 | y[i].d = GGML_FP32_TO_FP16(0.f);
|
2811 | 2813 | x += QK_K;
|
@@ -12772,7 +12774,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
12772 | 12774 | }
|
12773 | 12775 | float max = xval[0];
|
12774 | 12776 | for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
12775 |
| - if (!max) { |
| 12777 | + if (max < GROUP_MAX_EPS) { |
12776 | 12778 | scales[ib] = 0;
|
12777 | 12779 | memset(L, 0, 16);
|
12778 | 12780 | continue;
|
@@ -13213,7 +13215,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
13213 | 13215 | }
|
13214 | 13216 | float max = xval[0];
|
13215 | 13217 | for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
13216 |
| - if (max < 1e-20f) { |
| 13218 | + if (max < GROUP_MAX_EPS) { |
13217 | 13219 | scales[ib] = 0;
|
13218 | 13220 | memset(L, 0, 32);
|
13219 | 13221 | continue;
|
@@ -13753,7 +13755,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
13753 | 13755 | for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
13754 | 13756 | float max = fabsf(xb[0]);
|
13755 | 13757 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13756 |
| - if (!max) { |
| 13758 | + if (max < GROUP_MAX_EPS) { |
13757 | 13759 | scales[ib] = 0;
|
13758 | 13760 | memset(L, 1, block_size);
|
13759 | 13761 | continue;
|
@@ -13941,7 +13943,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
13941 | 13943 | }
|
13942 | 13944 | float max = fabsf(xb[0]);
|
13943 | 13945 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13944 |
| - if (max < 1e-20f) { |
| 13946 | + if (max < GROUP_MAX_EPS) { |
13945 | 13947 | scales[ib] = 0;
|
13946 | 13948 | memset(L, 1, block_size);
|
13947 | 13949 | continue;
|
@@ -14205,7 +14207,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
14205 | 14207 | amax = ax; max = xb[j];
|
14206 | 14208 | }
|
14207 | 14209 | }
|
14208 |
| - if (amax < 1e-20f) { |
| 14210 | + if (amax < GROUP_MAX_EPS) { |
14209 | 14211 | scales[ib] = 0;
|
14210 | 14212 | continue;
|
14211 | 14213 | }
|
@@ -14426,7 +14428,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
14426 | 14428 | }
|
14427 | 14429 | float max = xval[0];
|
14428 | 14430 | for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
14429 |
| - if (max < 1e-20f) { |
| 14431 | + if (max < GROUP_MAX_EPS) { |
14430 | 14432 | scales[ib] = 0;
|
14431 | 14433 | continue;
|
14432 | 14434 | }
|
|
0 commit comments