@@ -1109,7 +1109,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
1109
1109
float ax = fabsf(x[i]);
1110
1110
if (ax > amax) { amax = ax; max = x[i]; }
1111
1111
}
1112
- if (amax < 1e-30f ) { // all zero
1112
+ if (amax < 1e-20f ) { // all zero
1113
1113
for (int i = 0; i < n; ++i) {
1114
1114
L[i] = 0;
1115
1115
}
@@ -1177,7 +1177,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
1177
1177
float ax = fabsf(x[i]);
1178
1178
if (ax > amax) { amax = ax; max = x[i]; }
1179
1179
}
1180
- if (! amax) { // all zero
1180
+ if (amax < 1e-20f ) { // all zero
1181
1181
for (int i = 0; i < n; ++i) { L[i] = 0; }
1182
1182
return 0.f;
1183
1183
}
@@ -2653,7 +2653,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
2653
2653
2654
2654
}
2655
2655
2656
- if (! max_abs_scale) {
2656
+ if (max_abs_scale < 1e-20f ) {
2657
2657
memset(&y[i], 0, sizeof(block_q6_K));
2658
2658
y[i].d = GGML_FP32_TO_FP16(0.f);
2659
2659
x += QK_K;
@@ -2805,7 +2805,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
2805
2805
2806
2806
}
2807
2807
2808
- if (! max_abs_scale) {
2808
+ if (max_abs_scale < 1e-20f ) {
2809
2809
memset(&y[i], 0, sizeof(block_q6_K));
2810
2810
y[i].d = GGML_FP32_TO_FP16(0.f);
2811
2811
x += QK_K;
@@ -13213,7 +13213,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
13213
13213
}
13214
13214
float max = xval[0];
13215
13215
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13216
- if (! max) {
13216
+ if (max < 1e-20f ) {
13217
13217
scales[ib] = 0;
13218
13218
memset(L, 0, 32);
13219
13219
continue;
@@ -13941,7 +13941,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
13941
13941
}
13942
13942
float max = fabsf(xb[0]);
13943
13943
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13944
- if (! max) {
13944
+ if (max < 1e-20f ) {
13945
13945
scales[ib] = 0;
13946
13946
memset(L, 1, block_size);
13947
13947
continue;
@@ -14205,7 +14205,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
14205
14205
amax = ax; max = xb[j];
14206
14206
}
14207
14207
}
14208
- if (! amax) {
14208
+ if (amax < 1e-20f ) {
14209
14209
scales[ib] = 0;
14210
14210
continue;
14211
14211
}
@@ -14426,7 +14426,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
14426
14426
}
14427
14427
float max = xval[0];
14428
14428
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14429
- if (! max) {
14429
+ if (max < 1e-20f ) {
14430
14430
scales[ib] = 0;
14431
14431
continue;
14432
14432
}
0 commit comments