|
14 | 14 | #include <stdlib.h> // for qsort
|
15 | 15 | #include <stdio.h> // for GGML_ASSERT
|
16 | 16 |
|
17 |
| -#define GROUP_MAX_EPS 1e-7f |
| 17 | +#define GROUP_MAX_EPS 1e-15f |
| 18 | +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f |
| 19 | +#define GROUP_MAX_EPS_IQ2_S 1e-8f |
| 20 | +#define GROUP_MAX_EPS_IQ1_M 1e-7f |
| 21 | +#define GROUP_MAX_EPS_IQ1_S 1e-12f |
18 | 22 |
|
19 | 23 | #if defined(_MSC_VER)
|
20 | 24 | // disable "possible loss of data" to avoid warnings for hundreds of casts
|
@@ -1648,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
1648 | 1652 | break;
|
1649 | 1653 | }
|
1650 | 1654 | }
|
1651 |
| - return sumlx / suml2; |
| 1655 | + return sumlx/suml2; |
1652 | 1656 | }
|
1653 | 1657 |
|
1654 | 1658 | static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
@@ -12598,7 +12602,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
12598 | 12602 | }
|
12599 | 12603 | float max = xval[0];
|
12600 | 12604 | for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
12601 |
| - if (!max) { |
| 12605 | + if (max < GROUP_MAX_EPS) { |
12602 | 12606 | scales[ib] = 0;
|
12603 | 12607 | memset(L, 0, 32);
|
12604 | 12608 | continue;
|
@@ -13215,7 +13219,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
13215 | 13219 | }
|
13216 | 13220 | float max = xval[0];
|
13217 | 13221 | for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
13218 |
| - if (max < GROUP_MAX_EPS) { |
| 13222 | + if (max < GROUP_MAX_EPS_IQ3_XXS) { |
13219 | 13223 | scales[ib] = 0;
|
13220 | 13224 | memset(L, 0, 32);
|
13221 | 13225 | continue;
|
@@ -13755,7 +13759,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
13755 | 13759 | for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
13756 | 13760 | float max = fabsf(xb[0]);
|
13757 | 13761 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13758 |
| - if (max < GROUP_MAX_EPS) { |
| 13762 | + if (max < GROUP_MAX_EPS_IQ1_S) { |
13759 | 13763 | scales[ib] = 0;
|
13760 | 13764 | memset(L, 1, block_size);
|
13761 | 13765 | continue;
|
@@ -13943,7 +13947,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
13943 | 13947 | }
|
13944 | 13948 | float max = fabsf(xb[0]);
|
13945 | 13949 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13946 |
| - if (max < GROUP_MAX_EPS) { |
| 13950 | + if (max < GROUP_MAX_EPS_IQ1_M) { |
13947 | 13951 | scales[ib] = 0;
|
13948 | 13952 | memset(L, 1, block_size);
|
13949 | 13953 | continue;
|
@@ -14428,7 +14432,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
14428 | 14432 | }
|
14429 | 14433 | float max = xval[0];
|
14430 | 14434 | for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
14431 |
| - if (max < GROUP_MAX_EPS) { |
| 14435 | + if (max < GROUP_MAX_EPS_IQ2_S) { |
14432 | 14436 | scales[ib] = 0;
|
14433 | 14437 | continue;
|
14434 | 14438 | }
|
|
0 commit comments