
Commit de66009

Q4_0 scale selection using RMSE
1 parent cc9cee8 commit de66009

File tree

6 files changed: +215 additions, -40 deletions

SHA256SUMS

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
+5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth

examples/quantize/scale.py

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+import matplotlib.pyplot as plt
+
+# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
+# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
+data = (
+    (-10.0, 0),
+    (-9.9, 1),
+    (-9.8, 3),
+    (-9.7, 65),
+    (-9.6, 738),
+    (-9.5, 5779),
+    (-9.4, 30880),
+    (-9.3, 121078),
+    (-9.2, 375674),
+    (-9.1, 941350),
+    (-9.0, 1990278),
+    (-8.9, 3635317),
+    (-8.8, 5891752),
+    (-8.7, 8678748),
+    (-8.6, 11771759),
+    (-8.5, 14873993),
+    (-8.4, 17594260),
+    (-8.3, 19553100),
+    (-8.2, 20415428),
+    (-8.1, 20017134),
+    (-8.0, 18357204),
+    (-7.9, 15597612),
+    (-7.8, 11993688),
+    (-7.7, 7842970),
+    (-7.6, 2880878),
+    (-7.5, 3478),
+    (-7.4, 2648437),
+    (-7.3, 5641970),
+    (-7.2, 5935890),
+    (-7.1, 4910790),
+    (-7.0, 3425891),
+    (-6.9, 2068250),
+    (-6.8, 1089883),
+    (-6.7, 502462),
+    (-6.6, 156356),
+    (-6.5, 205),
+    (-6.4, 163500),
+    (-6.3, 386291),
+    (-6.2, 423018),
+    (-6.1, 319360),
+    (-6.0, 180783),
+    (-5.9, 78822),
+    (-5.8, 28254),
+    (-5.7, 8698),
+    (-5.6, 1969),
+    (-5.5, 0),
+    (-5.4, 2069),
+    (-5.3, 5722),
+    (-5.2, 7107),
+    (-5.1, 5113),
+    (-5.0, 2332),
+    (-4.9, 636),
+    (-4.8, 130),
+    (-4.7, 12),
+    (-4.6, 1),
+    (-4.5, 0),
+    (-4.4, 3),
+    (-4.3, 4),
+    (-4.2, 8),
+    (-4.1, 8),
+    (-4.0, 27),
+)
+x, y = zip(*data)
+
+fig, ax = plt.subplots()
+b = ax.bar(x, y, 0.1, bottom=1)
+ax.set_yscale("log")
+ax.set_xlabel("scale")
+ax.set_ylabel("N")
+plt.title("Quantization scale factor with lowest RMS error")
+plt.show()
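
The tally behind this histogram can be reproduced per block: quantize each block of 32 weights once for every scale in the -10.0 to -4.0 sweep, measure the squared reconstruction error, and count which scale wins. Below is a minimal standalone C sketch of that mechanism (not part of the commit); the random blocks are only a stand-in for the 7B weights that produced the data above, and QK = 32 mirrors the q4_0 block size.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define QK 32          // q4_0 block size
#define N_SCALES 61    // -10.0 .. -4.0 in steps of 0.1

// Squared reconstruction error of one block quantized with shared delta d = max/scale,
// mirroring the rounding and clamping of quantize_block_q4_0_reference in ggml.c.
static float block_sqerr(const float *x, float scale) {
    float amax = 0.0f, max = 0.0f;
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    const float d  = max / scale;
    const float id = d ? 1.0f/d : 0.0f;
    float sqerr = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float v = x[l]*id;
        const float q = fminf(fmaxf(roundf(v), -8.0f), 7.0f); // clamp to the signed 4-bit range
        const float e = (q - v)*d;                            // error back in the weight domain
        sqerr += e*e;
    }
    return sqerr;
}

int main(void) {
    long wins[N_SCALES] = { 0 };

    // The real histogram was tallied over every q4_0 block of the 7B model;
    // random blocks are used here only to show the mechanism.
    srand(1234);
    for (int b = 0; b < 100000; b++) {
        float x[QK];
        for (int l = 0; l < QK; l++) x[l] = (float)rand()/RAND_MAX - 0.5f;

        int   best_si  = 0;
        float best_err = INFINITY;
        for (int si = 0; si < N_SCALES; si++) {
            const float err = block_sqerr(x, -10.0f + 0.1f*si);
            if (err < best_err) { best_err = err; best_si = si; }
        }
        wins[best_si]++;
    }
    for (int si = 0; si < N_SCALES; si++) {
        printf("(%.1f, %ld)\n", -10.0f + 0.1f*si, wins[si]);
    }
    return 0;
}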

ggml.c

Lines changed: 110 additions & 23 deletions

@@ -73,11 +73,15 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+#define __attribute__(...)
 #else
 #include <pthread.h>
 #include <stdatomic.h>
 
 typedef void* thread_ret_t;
+
+#define __declspec(...)
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -517,39 +521,120 @@ typedef struct {
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
 
 // reference implementation for deterministic creation of model files
-static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
-
+static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) {
     uint8_t pp[QK/2];
 
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
+    float amax = 0.0f; // absolute max
+    float max = 0.0f;
 
-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
-            amax = MAX(amax, fabsf(v));
+    for (int l = 0; l < QK; l++) {
+        const float v = x[l];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            max = v;
         }
+    }
 
-        const float d = amax / ((1 << 3) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+    const float d = max / scale;
+    const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
+    y->d = d;
 
-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
+    for (int l = 0; l < QK; l += 2) {
+        const float v0 = x[l + 0]*id;
+        const float v1 = x[l + 1]*id;
 
-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+        int8_t vs0 = roundf(v0);
+        int8_t vs1 = roundf(v1);
 
-            assert(vi0 < 16);
-            assert(vi1 < 16);
+        vs0 = MIN(MAX(0 - 8, vs0), 15 - 8);
+        vs1 = MIN(MAX(0 - 8, vs1), 15 - 8);
 
-            pp[l/2] = vi0 | (vi1 << 4);
+        const uint8_t vi0 = vs0 + 8; // guaranteed to fit into 4 bits
+        const uint8_t vi1 = vs1 + 8; // thanks to the clamping of the signed values above
+
+        pp[l/2] = vi0 | (vi1 << 4);
+    }
+
+    memcpy(y->qs, pp, sizeof(pp));
+}
+
+static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) {
+    // For each q4_0 block, we try the following values to scale the shared float value
+    // and pick the one with lowest RMS error. We could do a more involved search,
+    // but this is a trade-off with speed of model generation and simplicity of the code.
+    // Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not
+    // manually done here.
+    // Values hand-picked according to histogram in examples/quantize/scale.py
+    // Include the value +7 of the old method to ensure we don't regress on RMSE on any block.
+#define Q4_0_SCALE_CANDIDATE_COUNT 8
+    static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };
+
+    assert(k % QK == 0);
+    const int nb = k / QK;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+
+        for (int l = 0; l < QK; l++) {
+            const float v = x[i*QK + l];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
         }
 
-        memcpy(y[i].qs, pp, sizeof(pp));
+        // find scale with lowest sum of squared errors, equivalent to lowest RMS error
+        float best_sqerr = +INFINITY;
+        float best_scale = NAN;
+
+        for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) {
+            const float scale = candidates[si];
+            const float d = max / scale;
+            const float id = d ? 1.0f / d : 0.0f;
+            float sqe_acc = 0.f;
+#ifdef __AVX2__
+            const __m256 clamp_lo = _mm256_set1_ps( 0 - 8);
+            const __m256 clamp_hi = _mm256_set1_ps(15 - 8);
+            const __m256 id256 = _mm256_set1_ps(id);
+            for (int l = 0; l < QK; l += 8) {
+                // TODO: why are the inputs not aligned to 32 bytes?
+                __m256 v = _mm256_loadu_ps(&x[i * QK + l]);
+                v = _mm256_mul_ps(v, id256);
+                __m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+                vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi);
+                const __m256 err = _mm256_sub_ps(vs, v);
+                const __m256 sqe = _mm256_mul_ps(err, err);
+
+                // this is far from optimal speed-wise, but ensures identical results to scalar implementation
+                // we have to add the floats in sqe to sqe_acc separately and in the correct order
+                // 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this:
+                __declspec(align(32)) float out[8] __attribute__((aligned(32)));
+                _mm256_store_ps(out, sqe);
+                for (int ei = 0; ei < 8; ei++) {
+                    sqe_acc += out[ei];
+                }
+            }
+#else
+            for (int l = 0; l < QK; l++) {
+                const float v = x[i * QK + l] * id;
+                int8_t vs = roundf(v);
+                vs = MIN(MAX(0 - 8, vs), 15 - 8);
+                sqe_acc += (vs - v) * (vs - v);
+            }
+#endif
+            // the square error sum is calculated on un-scaled q's inside the inner loop
+            sqe_acc *= d * d;
+
+            if (best_sqerr > sqe_acc) {
+                best_sqerr = sqe_acc;
+                best_scale = scale;
+            }
+        }
+        assert(isfinite(best_sqerr));
+        assert(isfinite(best_scale));
+        quantize_block_q4_0_reference(x + i * QK, y + i, best_scale);
     }
 }
 
@@ -803,7 +888,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
     }
 #else
     // scalar
-    quantize_row_q4_0_reference(x, y, k);
+    for (int i = 0; i < nb; i++) {
+        quantize_block_q4_0_reference(x + i*QK, y + i, 7);
+    }
 #endif
 }
 
@@ -10604,7 +10691,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     for (int j = 0; j < n; j += k) {
        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;
 
-        quantize_row_q4_0_reference(src + j, y, k);
+        quantize_row_q4_0_rmse(src + j, y, k);
 
        for (int i = 0; i < nb; i++) {
            for (int l = 0; l < QK; l += 2) {
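
Note that the inner loop accumulates errors on the un-scaled q values and multiplies by d*d afterwards; this works because q*d - x = d*(q - x*id), so the squared error in the weight domain is exactly d squared times the squared error of the 4-bit codes. The standalone sketch below (not part of the commit) round-trips one block through the same max/scale quantization and compares the RMS error of a fixed +7.0 scale against the best of the eight candidates; the random block is only an illustrative stand-in for real weights.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define QK 32

// RMS error of one block after a 4-bit round trip with shared delta d = max/scale,
// following the rounding/clamping of quantize_block_q4_0_reference.
static float block_rmse(const float *x, float scale) {
    float amax = 0.0f, max = 0.0f;
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    const float d  = max / scale;
    const float id = d ? 1.0f/d : 0.0f;
    float sqerr = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float q   = fminf(fmaxf(roundf(x[l]*id), -8.0f), 7.0f); // 4-bit code
        const float err = x[l] - q*d;                                 // reconstruction error
        sqerr += err*err;
    }
    return sqrtf(sqerr / QK);
}

int main(void) {
    static const float candidates[8] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };

    // Hypothetical block of weights; real blocks come from the model tensors.
    float x[QK];
    srand(7);
    for (int l = 0; l < QK; l++) x[l] = (float)rand()/RAND_MAX - 0.5f;

    float best_rmse = INFINITY, best_scale = candidates[0];
    for (int si = 0; si < 8; si++) {
        const float r = block_rmse(x, candidates[si]);
        if (r < best_rmse) { best_rmse = r; best_scale = candidates[si]; }
    }
    printf("fixed scale +7.0 : RMSE = %g\n", block_rmse(x, +7.0f));
    printf("best scale %+.1f : RMSE = %g\n", best_scale, best_rmse);
    return 0;
}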

llama.cpp

Lines changed: 26 additions & 5 deletions

@@ -644,7 +644,7 @@ static bool llama_model_load(
     size_t total_size = 0;
     model.n_loaded = 0;
 
-    while (true) {
+    while (size_t(fin.tellg()) + 12 < file_size) {
         int32_t n_dims;
         int32_t length;
         int32_t ftype;
@@ -653,10 +653,6 @@
         fin.read(reinterpret_cast<char *>(&length), sizeof(length));
         fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
 
-        if (fin.eof()) {
-            break;
-        }
-
         int32_t nelements = 1;
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
@@ -707,6 +703,10 @@
            offset = (offset + 31) & -32;
            tensor->data = mm_addr + offset;
            fin.seekg(offset + tensor_data_size);
+            if (fin.eof()) {
+                fprintf(stderr, "%s: Truncated file?\n", __func__);
+                return false;
+            }
            total_size += tensor_data_size;
            model.n_loaded++;
 
@@ -717,6 +717,18 @@
         }
     }
 
+    uint32_t version_minor = 0;
+    fin.read((char *)&version_minor, sizeof(version_minor));
+    if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) {
+#if LLAMA_FILE_VERSION_MINOR == 1
+        if (model.hparams.f16 == 2) {
+            fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__);
+        }
+#else
+#error Provide a helpful message that explains why the user may want to update their files
+#endif
+    }
+
     fin.close();
 
     fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
@@ -1583,6 +1595,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }
 
+#if LLAMA_FILE_VERSION_MINOR == 1
+    if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) {
+#else
+#error Check if this condition needs updating for minimal model checksum changes
+#endif
+        uint32_t version_minor = LLAMA_FILE_VERSION_MINOR;
+        fout.write((char *)&version_minor, sizeof(version_minor));
+    }
+
     finp.close();
     fout.close();
 
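
The loader change works because every tensor record begins with three int32 fields (n_dims, length, ftype), i.e. 12 bytes, so the loop can stop as soon as fewer than 12 bytes remain; anything left after the last tensor is the optional trailing minor-version field that the quantizer now appends as a single uint32. A minimal standalone sketch (not part of the commit; the file name and error handling are illustrative) that peeks at those trailing 4 bytes:

#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: print the trailing 4 bytes of a model file, which on files
// written by the updated quantizer hold LLAMA_FILE_VERSION_MINOR. On older files
// these bytes are simply the tail of the last tensor, so the value is meaningless;
// the real loader tells the two apart by reading past the tensor data and checking eof().
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s ggml-model-q4_0.bin\n", argv[0]);
        return 1;
    }
    FILE *f = fopen(argv[1], "rb");
    if (!f) {
        perror("fopen");
        return 1;
    }
    uint32_t version_minor = 0;
    if (fseek(f, -(long)sizeof(version_minor), SEEK_END) != 0 ||
        fread(&version_minor, sizeof(version_minor), 1, f) != 1) {
        fprintf(stderr, "failed to read trailing bytes\n");
        fclose(f);
        return 1;
    }
    printf("trailing u32: %u\n", version_minor);
    fclose(f);
    return 0;
}

On a freshly re-quantized q4_0 file this should print 1 (LLAMA_FILE_VERSION_MINOR); on pre-existing files the value is just whatever tensor data happens to sit at the end.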

llama.h

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@
 #endif
 
 #define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes
 #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

tests/test-quantize.c

Lines changed: 1 addition & 12 deletions

@@ -13,18 +13,7 @@ int main(void) {
         src[i] = (float)(i + 1);
     }
 
-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
+    size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
     assert(size == 24);
     float delta_result = ((float *)dst)[0];
     float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
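
The exact-value q4_0 assertions are removed here, presumably because the quantized codes now depend on which candidate scale wins for the block, so the old expectation of d = src[31]/7 no longer holds. If a q4_0 smoke test is still wanted, one option (a sketch only, not part of the commit; the helper name and the 5% error bound are arbitrary) is to bound the round-trip RMSE instead of pinning exact nibbles:

// Sketch only: a tolerance-based q4_0 check that survives the RMSE-driven scale selection.
// Layout per 32-value block: one float d, then QK/2 packed nibbles (20 bytes total).
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

#include "ggml.h"

#define QK 32

static void check_q4_0_roundtrip(void) {
    float   src[QK];
    uint8_t dst[24];        // 20 bytes are needed for one q4_0 block
    int64_t hist[16] = { 0 };

    for (int i = 0; i < QK; i++) {
        src[i] = (float)(i + 1);
    }

    const size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
    assert(size == 20);

    float d;
    memcpy(&d, dst, sizeof(d));

    float sqerr = 0.0f;
    for (int i = 0; i < QK; i++) {
        const uint8_t nib = (i % 2) ? (dst[sizeof(float) + i/2] >> 4)
                                    : (dst[sizeof(float) + i/2] & 0xF);
        const float   rec = ((int)nib - 8) * d;   // dequantize
        sqerr += (rec - src[i]) * (rec - src[i]);
    }
    // The exact codes depend on which candidate scale wins, so only bound the error.
    // The 5% threshold (relative to the largest input) is an arbitrary choice for this sketch.
    assert(sqrtf(sqerr / QK) < 0.05f * src[QK - 1]);
}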
