
Commit de66009

Q4_0 scale selection using RMSE
1 parent cc9cee8 commit de66009

File tree

6 files changed: +215 additions, -40 deletions

SHA256SUMS

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
+5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth

examples/quantize/scale.py

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+import matplotlib.pyplot as plt
+
+# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
+# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
+data = (
+    (-10.0, 0),
+    (-9.9, 1),
+    (-9.8, 3),
+    (-9.7, 65),
+    (-9.6, 738),
+    (-9.5, 5779),
+    (-9.4, 30880),
+    (-9.3, 121078),
+    (-9.2, 375674),
+    (-9.1, 941350),
+    (-9.0, 1990278),
+    (-8.9, 3635317),
+    (-8.8, 5891752),
+    (-8.7, 8678748),
+    (-8.6, 11771759),
+    (-8.5, 14873993),
+    (-8.4, 17594260),
+    (-8.3, 19553100),
+    (-8.2, 20415428),
+    (-8.1, 20017134),
+    (-8.0, 18357204),
+    (-7.9, 15597612),
+    (-7.8, 11993688),
+    (-7.7, 7842970),
+    (-7.6, 2880878),
+    (-7.5, 3478),
+    (-7.4, 2648437),
+    (-7.3, 5641970),
+    (-7.2, 5935890),
+    (-7.1, 4910790),
+    (-7.0, 3425891),
+    (-6.9, 2068250),
+    (-6.8, 1089883),
+    (-6.7, 502462),
+    (-6.6, 156356),
+    (-6.5, 205),
+    (-6.4, 163500),
+    (-6.3, 386291),
+    (-6.2, 423018),
+    (-6.1, 319360),
+    (-6.0, 180783),
+    (-5.9, 78822),
+    (-5.8, 28254),
+    (-5.7, 8698),
+    (-5.6, 1969),
+    (-5.5, 0),
+    (-5.4, 2069),
+    (-5.3, 5722),
+    (-5.2, 7107),
+    (-5.1, 5113),
+    (-5.0, 2332),
+    (-4.9, 636),
+    (-4.8, 130),
+    (-4.7, 12),
+    (-4.6, 1),
+    (-4.5, 0),
+    (-4.4, 3),
+    (-4.3, 4),
+    (-4.2, 8),
+    (-4.1, 8),
+    (-4.0, 27),
+)
+x, y = zip(*data)
+
+fig, ax = plt.subplots()
+b = ax.bar(x, y, 0.1, bottom=1)
+ax.set_yscale("log")
+ax.set_xlabel("scale")
+ax.set_ylabel("N")
+plt.title("Quantization scale factor with lowest RMS error")
+plt.show()
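
The tally behind this histogram can be reproduced per block: quantize each block of 32 weights once for every scale in the -10.0 to -4.0 sweep, measure the squared reconstruction error, and count which scale wins. Below is a minimal standalone C sketch of that mechanism (not part of the commit); the random blocks are only a stand-in for the 7B weights that produced the data above, and QK = 32 mirrors the q4_0 block size.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define QK 32          // q4_0 block size
#define N_SCALES 61    // -10.0 .. -4.0 in steps of 0.1

// Squared reconstruction error of one block quantized with shared delta d = max/scale,
// mirroring the rounding and clamping of quantize_block_q4_0_reference in ggml.c.
static float block_sqerr(const float *x, float scale) {
    float amax = 0.0f, max = 0.0f;
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    const float d  = max / scale;
    const float id = d ? 1.0f/d : 0.0f;
    float sqerr = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float v = x[l]*id;
        const float q = fminf(fmaxf(roundf(v), -8.0f), 7.0f); // clamp to the signed 4-bit range
        const float e = (q - v)*d;                            // error back in the weight domain
        sqerr += e*e;
    }
    return sqerr;
}

int main(void) {
    long wins[N_SCALES] = { 0 };

    // The real histogram was tallied over every q4_0 block of the 7B model;
    // random blocks are used here only to show the mechanism.
    srand(1234);
    for (int b = 0; b < 100000; b++) {
        float x[QK];
        for (int l = 0; l < QK; l++) x[l] = (float)rand()/RAND_MAX - 0.5f;

        int   best_si  = 0;
        float best_err = INFINITY;
        for (int si = 0; si < N_SCALES; si++) {
            const float err = block_sqerr(x, -10.0f + 0.1f*si);
            if (err < best_err) { best_err = err; best_si = si; }
        }
        wins[best_si]++;
    }
    for (int si = 0; si < N_SCALES; si++) {
        printf("(%.1f, %ld)\n", -10.0f + 0.1f*si, wins[si]);
    }
    return 0;
}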

ggml.c

Lines changed: 110 additions & 23 deletions

@@ -73,11 +73,15 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+#define __attribute__(...)
 #else
 #include <pthread.h>
 #include <stdatomic.h>
 
 typedef void* thread_ret_t;
+
+#define __declspec(...)
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -517,39 +521,120 @@ typedef struct {
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
 
 // reference implementation for deterministic creation of model files
-static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
-
+static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) {
     uint8_t pp[QK/2];
 
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
+    float amax = 0.0f; // absolute max
+    float max = 0.0f;
 
-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
-            amax = MAX(amax, fabsf(v));
+    for (int l = 0; l < QK; l++) {
+        const float v = x[l];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            max = v;
         }
+    }
 
-        const float d = amax / ((1 << 3) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+    const float d = max / scale;
+    const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
+    y->d = d;
 
-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
+    for (int l = 0; l < QK; l += 2) {
+        const float v0 = x[l + 0]*id;
+        const float v1 = x[l + 1]*id;
 
-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+        int8_t vs0 = roundf(v0);
+        int8_t vs1 = roundf(v1);
 
-            assert(vi0 < 16);
-            assert(vi1 < 16);
+        vs0 = MIN(MAX(0 - 8, vs0), 15 - 8);
+        vs1 = MIN(MAX(0 - 8, vs1), 15 - 8);
 
-            pp[l/2] = vi0 | (vi1 << 4);
+        const uint8_t vi0 = vs0 + 8; // guaranteed to fit into 4 bits
+        const uint8_t vi1 = vs1 + 8; // thanks to the clamping of the signed values above
+
+        pp[l/2] = vi0 | (vi1 << 4);
+    }
+
+    memcpy(y->qs, pp, sizeof(pp));
+}
+
+static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) {
+    // For each q4_0 block, we try the following values to scale the shared float value
+    // and pick the one with lowest RMS error. We could do a more involved search,
+    // but this is a trade-off with speed of model generation and simplicity of the code.
+    // Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not
+    // manually done here.
+    // Values hand-picked according to histogram in examples/quantize/scale.py
+    // Include the value +7 of the old method to ensure we don't regress on RMSE on any block.
+#define Q4_0_SCALE_CANDIDATE_COUNT 8
+    static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };
+
+    assert(k % QK == 0);
+    const int nb = k / QK;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+
+        for (int l = 0; l < QK; l++) {
+            const float v = x[i*QK + l];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
         }
 
-        memcpy(y[i].qs, pp, sizeof(pp));
+        // find scale with lowest sum of squared errors, equivalent to lowest RMS error
+        float best_sqerr = +INFINITY;
+        float best_scale = NAN;
+
+        for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) {
+            const float scale = candidates[si];
+            const float d = max / scale;
+            const float id = d ? 1.0f / d : 0.0f;
+            float sqe_acc = 0.f;
+#ifdef __AVX2__
+            const __m256 clamp_lo = _mm256_set1_ps( 0 - 8);
+            const __m256 clamp_hi = _mm256_set1_ps(15 - 8);
+            const __m256 id256 = _mm256_set1_ps(id);
+            for (int l = 0; l < QK; l += 8) {
+                // TODO: why are the inputs not aligned to 32 bytes?
+                __m256 v = _mm256_loadu_ps(&x[i * QK + l]);
+                v = _mm256_mul_ps(v, id256);
+                __m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+                vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi);
+                const __m256 err = _mm256_sub_ps(vs, v);
+                const __m256 sqe = _mm256_mul_ps(err, err);
+
+                // this is far from optimal speed-wise, but ensures identical results to scalar implementation
+                // we have to add the floats in sqe to sqe_acc separately and in the correct order
+                // 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this:
+                __declspec(align(32)) float out[8] __attribute__((aligned(32)));
+                _mm256_store_ps(out, sqe);
+                for (int ei = 0; ei < 8; ei++) {
+                    sqe_acc += out[ei];
+                }
+            }
+#else
+            for (int l = 0; l < QK; l++) {
+                const float v = x[i * QK + l] * id;
+                int8_t vs = roundf(v);
+                vs = MIN(MAX(0 - 8, vs), 15 - 8);
+                sqe_acc += (vs - v) * (vs - v);
+            }
+#endif
+            // the square error sum is calculated on un-scaled q's inside the inner loop
+            sqe_acc *= d * d;
+
+            if (best_sqerr > sqe_acc) {
+                best_sqerr = sqe_acc;
+                best_scale = scale;
+            }
+        }
+        assert(isfinite(best_sqerr));
+        assert(isfinite(best_scale));
+        quantize_block_q4_0_reference(x + i * QK, y + i, best_scale);
     }
 }
 
@@ -803,7 +888,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
     }
 #else
     // scalar
-    quantize_row_q4_0_reference(x, y, k);
+    for (int i = 0; i < nb; i++) {
+        quantize_block_q4_0_reference(x + i*QK, y + i, 7);
+    }
 #endif
 }
 
@@ -10604,7 +10691,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     for (int j = 0; j < n; j += k) {
        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;
 
-        quantize_row_q4_0_reference(src + j, y, k);
+        quantize_row_q4_0_rmse(src + j, y, k);
 
        for (int i = 0; i < nb; i++) {
            for (int l = 0; l < QK; l += 2) {
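
Note that the inner loop accumulates errors on the un-scaled q values and multiplies by d*d afterwards; this works because q*d - x = d*(q - x*id), so the squared error in the weight domain is exactly d squared times the squared error of the 4-bit codes. The standalone sketch below (not part of the commit) round-trips one block through the same max/scale quantization and compares the RMS error of a fixed +7.0 scale against the best of the eight candidates; the random block is only an illustrative stand-in for real weights.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define QK 32

// RMS error of one block after a 4-bit round trip with shared delta d = max/scale,
// following the rounding/clamping of quantize_block_q4_0_reference.
static float block_rmse(const float *x, float scale) {
    float amax = 0.0f, max = 0.0f;
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    const float d  = max / scale;
    const float id = d ? 1.0f/d : 0.0f;
    float sqerr = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float q   = fminf(fmaxf(roundf(x[l]*id), -8.0f), 7.0f); // 4-bit code
        const float err = x[l] - q*d;                                 // reconstruction error
        sqerr += err*err;
    }
    return sqrtf(sqerr / QK);
}

int main(void) {
    static const float candidates[8] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };

    // Hypothetical block of weights; real blocks come from the model tensors.
    float x[QK];
    srand(7);
    for (int l = 0; l < QK; l++) x[l] = (float)rand()/RAND_MAX - 0.5f;

    float best_rmse = INFINITY, best_scale = candidates[0];
    for (int si = 0; si < 8; si++) {
        const float r = block_rmse(x, candidates[si]);
        if (r < best_rmse) { best_rmse = r; best_scale = candidates[si]; }
    }
    printf("fixed scale +7.0 : RMSE = %g\n", block_rmse(x, +7.0f));
    printf("best scale %+.1f : RMSE = %g\n", best_scale, best_rmse);
    return 0;
}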

llama.cpp

Lines changed: 26 additions & 5 deletions

@@ -644,7 +644,7 @@ static bool llama_model_load(
     size_t total_size = 0;
     model.n_loaded = 0;
 
-    while (true) {
+    while (size_t(fin.tellg()) + 12 < file_size) {
         int32_t n_dims;
         int32_t length;
         int32_t ftype;
@@ -653,10 +653,6 @@
         fin.read(reinterpret_cast<char *>(&length), sizeof(length));
         fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
 
-        if (fin.eof()) {
-            break;
-        }
-
         int32_t nelements = 1;
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
@@ -707,6 +703,10 @@
            offset = (offset + 31) & -32;
            tensor->data = mm_addr + offset;
            fin.seekg(offset + tensor_data_size);
+            if (fin.eof()) {
+                fprintf(stderr, "%s: Truncated file?\n", __func__);
+                return false;
+            }
            total_size += tensor_data_size;
            model.n_loaded++;
 
@@ -717,6 +717,18 @@
         }
     }
 
+    uint32_t version_minor = 0;
+    fin.read((char *)&version_minor, sizeof(version_minor));
+    if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) {
+#if LLAMA_FILE_VERSION_MINOR == 1
+        if (model.hparams.f16 == 2) {
+            fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__);
+        }
+#else
+#error Provide a helpful message that explains why the user may want to update their files
+#endif
+    }
+
     fin.close();
 
     fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
@@ -1583,6 +1595,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+#if LLAMA_FILE_VERSION_MINOR == 1
+    if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) {
+#else
+#error Check if this condition needs updating for minimal model checksum changes
+#endif
+        uint32_t version_minor = LLAMA_FILE_VERSION_MINOR;
+        fout.write((char *)&version_minor, sizeof(version_minor));
+    }
+
     finp.close();
     fout.close();
 
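
The loader change works because every tensor record begins with three int32 fields (n_dims, length, ftype), i.e. 12 bytes, so the loop can stop as soon as fewer than 12 bytes remain; anything left after the last tensor is the optional trailing minor-version field that the quantizer now appends as a single uint32. A minimal standalone sketch (not part of the commit; the file name and error handling are illustrative) that peeks at those trailing 4 bytes:

#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: print the trailing 4 bytes of a model file, which on files
// written by the updated quantizer hold LLAMA_FILE_VERSION_MINOR. On older files
// these bytes are simply the tail of the last tensor, so the value is meaningless;
// the real loader tells the two apart by reading past the tensor data and checking eof().
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s ggml-model-q4_0.bin\n", argv[0]);
        return 1;
    }
    FILE *f = fopen(argv[1], "rb");
    if (!f) {
        perror("fopen");
        return 1;
    }
    uint32_t version_minor = 0;
    if (fseek(f, -(long)sizeof(version_minor), SEEK_END) != 0 ||
        fread(&version_minor, sizeof(version_minor), 1, f) != 1) {
        fprintf(stderr, "failed to read trailing bytes\n");
        fclose(f);
        return 1;
    }
    printf("trailing u32: %u\n", version_minor);
    fclose(f);
    return 0;
}

On a freshly re-quantized q4_0 file this should print 1 (LLAMA_FILE_VERSION_MINOR); on pre-existing files the value is just whatever tensor data happens to sit at the end.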

llama.h

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@
 #endif
 
 #define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes
 #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

tests/test-quantize.c

Lines changed: 1 addition & 12 deletions

@@ -13,18 +13,7 @@ int main(void) {
         src[i] = (float)(i + 1);
     }
 
-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
+    size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
     assert(size == 24);
     float delta_result = ((float *)dst)[0];
     float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
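
The exact-value q4_0 assertions are removed here, presumably because the quantized codes now depend on which candidate scale wins for the block, so the old expectation of d = src[31]/7 no longer holds. If a q4_0 smoke test is still wanted, one option (a sketch only, not part of the commit; the helper name and the 5% error bound are arbitrary) is to bound the round-trip RMSE instead of pinning exact nibbles:

// Sketch only: a tolerance-based q4_0 check that survives the RMSE-driven scale selection.
// Layout per 32-value block: one float d, then QK/2 packed nibbles (20 bytes total).
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

#include "ggml.h"

#define QK 32

static void check_q4_0_roundtrip(void) {
    float   src[QK];
    uint8_t dst[24];        // 20 bytes are needed for one q4_0 block
    int64_t hist[16] = { 0 };

    for (int i = 0; i < QK; i++) {
        src[i] = (float)(i + 1);
    }

    const size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
    assert(size == 20);

    float d;
    memcpy(&d, dst, sizeof(d));

    float sqerr = 0.0f;
    for (int i = 0; i < QK; i++) {
        const uint8_t nib = (i % 2) ? (dst[sizeof(float) + i/2] >> 4)
                                    : (dst[sizeof(float) + i/2] & 0xF);
        const float   rec = ((int)nib - 8) * d;   // dequantize
        sqerr += (rec - src[i]) * (rec - src[i]);
    }
    // The exact codes depend on which candidate scale wins, so only bound the error.
    // The 5% threshold (relative to the largest input) is an arbitrary choice for this sketch.
    assert(sqrtf(sqerr / QK) < 0.05f * src[QK - 1]);
}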
