Skip to content

Commit 6fa6a9a

Browse files
committed
ggml : fix quants nans when all the group weights are very close to zero
1 parent e1b40ac commit 6fa6a9a

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

ggml-quants.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,7 +1109,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
11091109
float ax = fabsf(x[i]);
11101110
if (ax > amax) { amax = ax; max = x[i]; }
11111111
}
1112-
if (amax < 1e-30f) { // all zero
1112+
if (amax < 1e-20f) { // all zero
11131113
for (int i = 0; i < n; ++i) {
11141114
L[i] = 0;
11151115
}
@@ -1177,7 +1177,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
11771177
float ax = fabsf(x[i]);
11781178
if (ax > amax) { amax = ax; max = x[i]; }
11791179
}
1180-
if (!amax) { // all zero
1180+
if (amax < 1e-20f) { // all zero
11811181
for (int i = 0; i < n; ++i) { L[i] = 0; }
11821182
return 0.f;
11831183
}
@@ -2653,7 +2653,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
26532653

26542654
}
26552655

2656-
if (!max_abs_scale) {
2656+
if (max_abs_scale < 1e-20f) {
26572657
memset(&y[i], 0, sizeof(block_q6_K));
26582658
y[i].d = GGML_FP32_TO_FP16(0.f);
26592659
x += QK_K;
@@ -2805,7 +2805,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
28052805

28062806
}
28072807

2808-
if (!max_abs_scale) {
2808+
if (max_abs_scale < 1e-20f) {
28092809
memset(&y[i], 0, sizeof(block_q6_K));
28102810
y[i].d = GGML_FP32_TO_FP16(0.f);
28112811
x += QK_K;
@@ -13213,7 +13213,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
1321313213
}
1321413214
float max = xval[0];
1321513215
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
13216-
if (!max) {
13216+
if (max < 1e-20f) {
1321713217
scales[ib] = 0;
1321813218
memset(L, 0, 32);
1321913219
continue;
@@ -13941,7 +13941,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
1394113941
}
1394213942
float max = fabsf(xb[0]);
1394313943
for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
13944-
if (!max) {
13944+
if (max < 1e-20f) {
1394513945
scales[ib] = 0;
1394613946
memset(L, 1, block_size);
1394713947
continue;
@@ -14205,7 +14205,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
1420514205
amax = ax; max = xb[j];
1420614206
}
1420714207
}
14208-
if (!amax) {
14208+
if (amax < 1e-20f) {
1420914209
scales[ib] = 0;
1421014210
continue;
1421114211
}
@@ -14426,7 +14426,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
1442614426
}
1442714427
float max = xval[0];
1442814428
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
14429-
if (!max) {
14429+
if (max < 1e-20f) {
1443014430
scales[ib] = 0;
1443114431
continue;
1443214432
}

tests/test-backend-ops.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
4949
t.join();
5050
}
5151

52+
#if 0
53+
// test quantization with very small values that may result in nan scales due to division by zero
54+
if (ggml_is_quantized(tensor->type)) {
55+
for (int i = 0; i < 256; i++) {
56+
data[i] = 1e-24f;
57+
}
58+
}
59+
#endif
60+
5261
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
5362
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
5463
} else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {

0 commit comments

Comments
 (0)