Skip to content

Commit 40ebf81

Browse files
committed
Q4_0 scale selection using RMSE
1 parent 62cfc54 commit 40ebf81

File tree

9 files changed

+304
-96
lines changed

9 files changed

+304
-96
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
133133
$(info I CXX: $(CXXV))
134134
$(info )
135135

136-
default: main quantize perplexity embedding
136+
default: main quantize quantize-stats perplexity embedding
137137

138138
#
139139
# Build library

SHA256SUMS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
2+
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
3+
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
24
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
35
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
46
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
7+
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
8+
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
59
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
610
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
711
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth

examples/quantize-stats/quantize-stats.cpp

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@
1717
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
1818
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
1919

20+
static const char * impl_strs[] = { "simd", "reference", "rmse" };
21+
static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
22+
2023
struct quantize_stats_params {
2124
std::string model = "models/7B/ggml-model-f16.bin";
2225
bool verbose = false;
2326
bool per_layer_stats = false;
2427
bool print_histogram = false;
25-
bool reference = false;
28+
std::vector<ggml_quantize_impl_t> include_impl;
2629
std::vector<std::string> include_layers;
2730
std::vector<std::string> exclude_layers;
2831
std::vector<enum ggml_type> include_types;
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
4851
fprintf(stderr, " -h, --help show this help message and exit\n");
4952
fprintf(stderr, " -m FNAME, --model FNAME\n");
5053
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
51-
fprintf(stderr, " -r, --reference\n");
52-
fprintf(stderr, " use reference implementation (default: false)\n");
54+
fprintf(stderr, " -i, --implementation\n");
55+
fprintf(stderr, " select implementation (simd, reference, rmse)\n");
5356
fprintf(stderr, " -v, --verbose\n");
5457
fprintf(stderr, " verbose output (default: false)\n");
5558
fprintf(stderr, " -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
104107
return INFINITY;
105108
}
106109

107-
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
110+
void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
108111
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
109112
double median = find_quantile(stats, .5);
110113
double pct95 = find_quantile(stats, .95);
111-
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
114+
printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
115+
name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
112116
if (print_histogram) {
113117
printf("Error distribution:\n");
114118
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
136140
std::string & name,
137141
bool print_layer_stats,
138142
const quantize_fns_t & qfns,
139-
bool use_reference,
143+
ggml_quantize_impl_t impl,
140144
const ggml_tensor * layer,
141145
float * input_scratch,
142146
char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
158162
input_scratch = ggml_get_data_f32(layer) + offset;
159163
}
160164

161-
if (use_reference) {
162-
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
163-
} else {
164-
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
165-
}
165+
qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
166166
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
167167

168168
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
171171
}
172172
}
173173
if (print_layer_stats) {
174-
print_error_stats(name, layer_error, false);
174+
print_error_stats(name, impl, layer_error, false);
175175
}
176176
}
177177

@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
190190
if (arg == "-h" || arg == "--help") {
191191
quantize_stats_print_usage(argc, argv);
192192
exit(0);
193-
} else if (arg == "-r" || arg == "--reference") {
194-
params.reference = true;
193+
} else if (arg == "-i" || arg == "--implementation") {
194+
if (++i >= argc) {
195+
invalid_param = true;
196+
break;
197+
}
198+
int j;
199+
for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
200+
// find match
201+
}
202+
if (j < GGML_QUANTIZE_IMPL_COUNT) {
203+
params.include_impl.push_back((ggml_quantize_impl_t)j);
204+
} else {
205+
fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
206+
invalid_param = true;
207+
}
195208
} else if (arg == "-v") {
196209
params.verbose = true;
197210
} else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
302315
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
303316
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
304317

305-
// loop throught quantization types
306-
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
307-
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
318+
// loop through quantization types
319+
for (int type = 0; type < GGML_TYPE_COUNT; type++) {
320+
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
308321
continue;
309322
}
310-
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
323+
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
311324
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
312-
if (params.verbose) {
313-
printf("testing %s ...\n", type_strs[i]);
314-
}
315-
316-
error_stats global_stats {};
317-
318-
for (const auto& kv_tensor : tensors_sorted) {
319-
if (!layer_included(params, kv_tensor.first)) {
325+
for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
326+
if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
320327
continue;
321328
}
329+
322330
if (params.verbose) {
323-
printf(" %s ...\n", kv_tensor.first.c_str());
331+
printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
324332
}
325-
std::string layer_name { type_strs[i] };
326-
layer_name += "::" + kv_tensor.first;
327-
test_roundtrip_on_layer(
328-
layer_name,
329-
params.per_layer_stats,
330-
qfns,
331-
params.reference,
332-
kv_tensor.second,
333-
input_scratch.data(),
334-
quantized_scratch.data(),
335-
output_scratch.data(),
336-
global_stats
337-
);
338-
}
339333

340-
print_error_stats(type_strs[i], global_stats, params.print_histogram);
334+
error_stats global_stats {};
335+
336+
for (const auto& kv_tensor : tensors_sorted) {
337+
if (!layer_included(params, kv_tensor.first)) {
338+
continue;
339+
}
340+
if (params.verbose) {
341+
printf(" %s ...\n", kv_tensor.first.c_str());
342+
}
343+
std::string layer_name { type_strs[type] };
344+
layer_name += "::" + kv_tensor.first;
345+
test_roundtrip_on_layer(
346+
layer_name,
347+
params.per_layer_stats,
348+
qfns,
349+
(ggml_quantize_impl_t)impl,
350+
kv_tensor.second,
351+
input_scratch.data(),
352+
quantized_scratch.data(),
353+
output_scratch.data(),
354+
global_stats
355+
);
356+
}
357+
358+
print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
359+
}
341360
}
342361
}
343362

examples/quantize/scale.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import matplotlib.pyplot as plt
2+
3+
# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
4+
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
5+
data = (
6+
(-10.0, 0),
7+
(-9.9, 1),
8+
(-9.8, 3),
9+
(-9.7, 65),
10+
(-9.6, 738),
11+
(-9.5, 5779),
12+
(-9.4, 30880),
13+
(-9.3, 121078),
14+
(-9.2, 375674),
15+
(-9.1, 941350),
16+
(-9.0, 1990278),
17+
(-8.9, 3635317),
18+
(-8.8, 5891752),
19+
(-8.7, 8678748),
20+
(-8.6, 11771759),
21+
(-8.5, 14873993),
22+
(-8.4, 17594260),
23+
(-8.3, 19553100),
24+
(-8.2, 20415428),
25+
(-8.1, 20017134),
26+
(-8.0, 18357204),
27+
(-7.9, 15597612),
28+
(-7.8, 11993688),
29+
(-7.7, 7842970),
30+
(-7.6, 2880878),
31+
(-7.5, 3478),
32+
(-7.4, 2648437),
33+
(-7.3, 5641970),
34+
(-7.2, 5935890),
35+
(-7.1, 4910790),
36+
(-7.0, 3425891),
37+
(-6.9, 2068250),
38+
(-6.8, 1089883),
39+
(-6.7, 502462),
40+
(-6.6, 156356),
41+
(-6.5, 205),
42+
(-6.4, 163500),
43+
(-6.3, 386291),
44+
(-6.2, 423018),
45+
(-6.1, 319360),
46+
(-6.0, 180783),
47+
(-5.9, 78822),
48+
(-5.8, 28254),
49+
(-5.7, 8698),
50+
(-5.6, 1969),
51+
(-5.5, 0),
52+
(-5.4, 2069),
53+
(-5.3, 5722),
54+
(-5.2, 7107),
55+
(-5.1, 5113),
56+
(-5.0, 2332),
57+
(-4.9, 636),
58+
(-4.8, 130),
59+
(-4.7, 12),
60+
(-4.6, 1),
61+
(-4.5, 0),
62+
(-4.4, 3),
63+
(-4.3, 4),
64+
(-4.2, 8),
65+
(-4.1, 8),
66+
(-4.0, 27),
67+
)
68+
x, y = zip(*data)
69+
70+
fig, ax = plt.subplots()
71+
b = ax.bar(x, y, 0.1, bottom=1)
72+
ax.set_yscale("log")
73+
ax.set_xlabel("scale")
74+
ax.set_ylabel("N")
75+
plt.title("Quantization scale factor with lowest RMS error")
76+
plt.show()

0 commit comments

Comments (0)