|
14 | 14 | #include <stdlib.h> // for qsort
|
15 | 15 | #include <stdio.h> // for GGML_ASSERT
|
16 | 16 |
|
// Cutoffs for the "all zero" early-out in the quantization routines: a group
// whose maximum is below the cutoff takes the zero path. These replace the
// previous `!max` / `!amax` / `!max_abs_scale` tests (and the 1e-30f literal
// in make_qx_quants).
//   GROUP_MAX_EPS         - make_qx_quants, make_q3_quants, the q6_K quantizers,
//                           quantize_row_iq2_xxs_impl, quantize_row_iq2_xs_impl,
//                           quantize_row_iq4_nl_impl
//   GROUP_MAX_EPS_IQ3_XXS - quantize_row_iq3_xxs_impl
//   GROUP_MAX_EPS_IQ2_S   - quantize_row_iq2_s_impl
//   GROUP_MAX_EPS_IQ1_M   - quantize_row_iq1_m_impl
//   GROUP_MAX_EPS_IQ1_S   - quantize_row_iq1_s_impl
// NOTE(review): the rationale for the differing per-type values is not visible
// in this view - confirm before changing any of them.
| 17 | +#define GROUP_MAX_EPS 1e-15f |
| 18 | +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f |
| 19 | +#define GROUP_MAX_EPS_IQ2_S 1e-8f |
| 20 | +#define GROUP_MAX_EPS_IQ1_M 1e-7f |
| 21 | +#define GROUP_MAX_EPS_IQ1_S 1e-12f |
| 22 | + |
17 | 23 | #if defined(_MSC_VER)
|
18 | 24 | // disable "possible loss of data" to avoid warnings for hundreds of casts
|
19 | 25 | // we should just be careful :)
|
@@ -1109,7 +1115,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
1109 | 1115 | float ax = fabsf(x[i]);
|
1110 | 1116 | if (ax > amax) { amax = ax; max = x[i]; }
|
1111 | 1117 | }
|
1112 |
| - if (amax < 1e-30f) { // all zero |
| 1118 | + if (amax < GROUP_MAX_EPS) { // all zero |
1113 | 1119 | for (int i = 0; i < n; ++i) {
|
1114 | 1120 | L[i] = 0;
|
1115 | 1121 | }
|
@@ -1177,7 +1183,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
1177 | 1183 | float ax = fabsf(x[i]);
|
1178 | 1184 | if (ax > amax) { amax = ax; max = x[i]; }
|
1179 | 1185 | }
|
1180 |
| - if (!amax) { // all zero |
| 1186 | + if (amax < GROUP_MAX_EPS) { // all zero |
1181 | 1187 | for (int i = 0; i < n; ++i) { L[i] = 0; }
|
1182 | 1188 | return 0.f;
|
1183 | 1189 | }
|
@@ -1646,7 +1652,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
1646 | 1652 | break;
|
1647 | 1653 | }
|
1648 | 1654 | }
|
1649 |
| - return sumlx / suml2; |
| 1655 | + return sumlx/suml2; |
1650 | 1656 | }
|
1651 | 1657 |
|
1652 | 1658 | static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
|
@@ -2653,7 +2659,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
2653 | 2659 |
|
2654 | 2660 | }
|
2655 | 2661 |
|
2656 |
| - if (!max_abs_scale) { |
| 2662 | + if (max_abs_scale < GROUP_MAX_EPS) { |
2657 | 2663 | memset(&y[i], 0, sizeof(block_q6_K));
|
2658 | 2664 | y[i].d = GGML_FP32_TO_FP16(0.f);
|
2659 | 2665 | x += QK_K;
|
@@ -2805,7 +2811,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
2805 | 2811 |
|
2806 | 2812 | }
|
2807 | 2813 |
|
2808 |
| - if (!max_abs_scale) { |
| 2814 | + if (max_abs_scale < GROUP_MAX_EPS) { |
2809 | 2815 | memset(&y[i], 0, sizeof(block_q6_K));
|
2810 | 2816 | y[i].d = GGML_FP32_TO_FP16(0.f);
|
2811 | 2817 | x += QK_K;
|
@@ -12599,7 +12605,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
12599 | 12605 | }
|
12600 | 12606 | float max = xval[0];
|
12601 | 12607 | for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
12602 |
| - if (!max) { |
| 12608 | + if (max < GROUP_MAX_EPS) { |
12603 | 12609 | scales[ib] = 0;
|
12604 | 12610 | memset(L, 0, 32);
|
12605 | 12611 | continue;
|
@@ -12775,7 +12781,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
12775 | 12781 | }
|
12776 | 12782 | float max = xval[0];
|
12777 | 12783 | for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
12778 |
| - if (!max) { |
| 12784 | + if (max < GROUP_MAX_EPS) { |
12779 | 12785 | scales[ib] = 0;
|
12780 | 12786 | memset(L, 0, 16);
|
12781 | 12787 | continue;
|
@@ -13216,7 +13222,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
13216 | 13222 | }
|
13217 | 13223 | float max = xval[0];
|
13218 | 13224 | for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
13219 |
| - if (!max) { |
| 13225 | + if (max < GROUP_MAX_EPS_IQ3_XXS) { |
13220 | 13226 | scales[ib] = 0;
|
13221 | 13227 | memset(L, 0, 32);
|
13222 | 13228 | continue;
|
@@ -13756,7 +13762,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
13756 | 13762 | for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
13757 | 13763 | float max = fabsf(xb[0]);
|
13758 | 13764 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13759 |
| - if (!max) { |
| 13765 | + if (max < GROUP_MAX_EPS_IQ1_S) { |
13760 | 13766 | scales[ib] = 0;
|
13761 | 13767 | memset(L, 1, block_size);
|
13762 | 13768 | continue;
|
@@ -13944,7 +13950,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
13944 | 13950 | }
|
13945 | 13951 | float max = fabsf(xb[0]);
|
13946 | 13952 | for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
|
13947 |
| - if (!max) { |
| 13953 | + if (max < GROUP_MAX_EPS_IQ1_M) { |
13948 | 13954 | scales[ib] = 0;
|
13949 | 13955 | memset(L, 1, block_size);
|
13950 | 13956 | continue;
|
@@ -14208,7 +14214,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
14208 | 14214 | amax = ax; max = xb[j];
|
14209 | 14215 | }
|
14210 | 14216 | }
|
14211 |
| - if (!amax) { |
| 14217 | + if (amax < GROUP_MAX_EPS) { |
14212 | 14218 | scales[ib] = 0;
|
14213 | 14219 | continue;
|
14214 | 14220 | }
|
@@ -14429,7 +14435,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
14429 | 14435 | }
|
14430 | 14436 | float max = xval[0];
|
14431 | 14437 | for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
14432 |
| - if (!max) { |
| 14438 | + if (max < GROUP_MAX_EPS_IQ2_S) { |
14433 | 14439 | scales[ib] = 0;
|
14434 | 14440 | continue;
|
14435 | 14441 | }
|
|
0 commit comments