
Commit 4bd781c

q4_0c: quantize support
1 parent a1e6fb9 commit 4bd781c

5 files changed: +50 −9 lines

examples/quantize/quantize.cpp
Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@
 
 static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
     {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_0c", LLAMA_FTYPE_MOSTLY_Q4_0C},
     {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
     {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
     {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},

ggml.c
Lines changed: 37 additions & 4 deletions

@@ -774,11 +774,17 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
 #define QK4_0C (4*32)
 #define QK4_0C_MUL (QK4_0C / QK4_0)
-// TODO: nicer description - pseudostruct?
-// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
+// typedef struct {
+//     uint8_t qs[QK4_0C/2][nb];
+//     float d[nb];
+// } block_q4_0c
 
 #define QK8_0C 32
-// q8_0c : uint8_t qs[n] || float d[n]
+// typedef struct {
+//     uint8_t qs[QK8_0C][nb];
+//     float d[nb];
+// } block_q8_0c
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
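The pseudostructs above stay in comments because q4_0c has no per-block struct: unlike block_q4_0, which stores one float scale inline with every 32 quantized values, q4_0c groups the packed nibbles together and the scales together. A minimal sketch of the size accounting, assuming the flat layout is exactly as the pseudostruct describes (the helper name is ours, not part of the commit):

    #include <assert.h>
    #include <stddef.h>

    #define QK4_0  32
    #define QK4_0C (4*32)
    #define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))

    // Bytes needed for k values in q4_0c format: one nibble per value plus
    // one float scale per QK4_0 sub-block. Numerically this matches q4_0
    // (20 bytes per 32 values); only the arrangement of the bytes differs.
    static size_t q4_0c_size(int k) {
        assert(k % QK4_0C == 0);
        return (size_t)(k / QK4_0C) * Q4_0C_QSIZE;
    }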
@@ -13102,6 +13108,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
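The return value reuses the q4_0 size arithmetic deliberately: both formats spend sizeof(block_q4_0) = 20 bytes per 32 values (16 nibble bytes plus one float scale), so the byte totals agree even though q4_0c arranges those bytes differently. A hedged usage sketch for a single row, assuming a build that includes this commit:

    #include <stdint.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        enum { K = 4*32 };               // one q4_0c super-block per row
        float src[K];
        for (int i = 0; i < K; i++) {
            src[i] = 0.01f * (float)(i - K/2);
        }

        uint8_t dst[(K/32) * 20];        // 20 bytes per 32 values
        int64_t hist[16] = {0};          // histogram of 4-bit quant values

        // n = total element count, k = row length; here one row of 128.
        size_t sz = ggml_quantize_q4_0c(src, dst, K, K, hist);
        printf("wrote %zu bytes\n", sz); // expect 80
        return 0;
    }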
@@ -13229,7 +13256,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case GGML_TYPE_Q4_0:

@@ -13238,6 +13265,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q4_0C:
+            {
+                GGML_ASSERT(start % QK4_0C == 0);
+                uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
+                result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
+            } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
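The new k parameter exists for this case: ggml_quantize_q4_0c works a whole row at a time (a row's scales are grouped with that row's quants), so the caller must supply the row length and chunks must start on row boundaries — which llama.cpp below arranges by rounding its chunk size. A restatement of the destination-offset math as a helper, under the same assumptions as the sketches above (the name is ours):

    #include <stddef.h>
    #include <stdint.h>

    #define QK4_0C (4*32)
    #define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))

    // start must be a multiple of QK4_0C: the chunk then begins
    // start/QK4_0C super-blocks in, each Q4_0C_QSIZE bytes long.
    static uint8_t * q4_0c_chunk_ptr(void * dst, int start) {
        return (uint8_t *) dst + Q4_0C_QSIZE * (size_t)(start / QK4_0C);
    }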

ggml.h
Lines changed: 2 additions & 1 deletion

@@ -871,13 +871,14 @@ extern "C" {
 //
 
 GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+GGML_API size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
 GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);
 
 //
 // system info

llama.cpp
Lines changed: 9 additions & 4 deletions

@@ -481,6 +481,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:

@@ -557,6 +558,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:

@@ -846,6 +848,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";

@@ -1880,6 +1883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;

@@ -1961,15 +1965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
                 new_data = work.addr;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                int chunk_size = 32 * 512;
+                int row_size = tensor.ne.at(0);
+                int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
                 const int nchunk = (nelements + chunk_size - 1)/chunk_size;
                 const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
                 if (nthread_use < 2) {
-                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                    new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
                 } else {
                     size_t counter = 0;
                     new_size = 0;
-                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
                         std::vector<int64_t> local_hist;
                         size_t local_size = 0;
                         while (true) {

@@ -1985,7 +1990,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
                             lock.unlock();
                             size_t last = std::min(nelements, first + chunk_size);
                             if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                         }
                     };
                     if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
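Previously chunk_size was a flat 32 * 512 elements; it is now rounded up to a whole number of rows so that every chunk handed to ggml_quantize_chunk starts on a row boundary, as the row-oriented q4_0c path requires. The float ceil() does the job; the same rounding in pure integer arithmetic would be (a sketch, not what the commit uses):

    // Round the ~16K-element target up to the next multiple of row_size.
    const int target = 32 * 512;
    const int chunk_size = ((target + row_size - 1) / row_size) * row_size;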

llama.h
Lines changed: 1 addition & 0 deletions

@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0C = 20, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
