
Commit f635735

q4_0c: quantize support

1 parent 5c55b33

5 files changed (+32, -0 lines)

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ int main(int argc, char ** argv) {
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_0c\n", LLAMA_FTYPE_MOSTLY_Q4_0C);
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
         return 1;
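With this entry in place, the new format is selected by its numeric type. Given the llama.h addition below (LLAMA_FTYPE_MOSTLY_Q4_0C = 6), a hypothetical invocation, using the placeholder file names from the usage string, would be:

    ./quantize model-f32.bin model-quant.bin 6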

ggml.c

Lines changed: 21 additions & 0 deletions
@@ -12023,6 +12023,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
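The histogram loop above relies on each byte of the quantized output packing two 4-bit values, low nibble first. A minimal standalone sketch of that unpack-and-bin step (bin_nibbles, packed, and nbytes are hypothetical names standing in for the loop over y):

    #include <stdint.h>
    #include <stdio.h>

    /* Bin two 4-bit quant values per packed byte, mirroring the
     * histogram loop in ggml_quantize_q4_0c above. */
    static void bin_nibbles(const uint8_t * packed, int nbytes, int64_t hist[16]) {
        for (int i = 0; i < nbytes; i++) {
            hist[packed[i] & 0xF]++;  /* low nibble  */
            hist[packed[i] >> 4]++;   /* high nibble */
        }
    }

    int main(void) {
        const uint8_t packed[2] = { 0x21, 0xFF };  /* quants 1,2 then 15,15 */
        int64_t hist[16] = { 0 };
        bin_nibbles(packed, 2, hist);
        for (int v = 0; v < 16; v++) {
            if (hist[v]) printf("quant %d: %lld\n", v, (long long) hist[v]);
        }
        return 0;
    }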

ggml.h

Lines changed: 1 addition & 0 deletions
@@ -808,6 +808,7 @@ enum ggml_opt_result ggml_opt(
 //
 
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
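As with the other quantizers, the caller provides the f32 source, a destination buffer, the element count n, the row length k, and a 16-bin histogram. A minimal caller sketch, assuming k is a multiple of QK4_0C and sizing the destination conservatively (the q4_0c output is strictly smaller than the f32 input):

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        const int k = 4096;     /* hypothetical row length */
        const int n = 2 * k;    /* hypothetical 2-row tensor */

        float * src = calloc(n, sizeof(float));   /* all-zero sample data */
        void  * dst = malloc(n * sizeof(float));  /* generous upper bound */
        int64_t hist[16] = { 0 };                 /* one bin per 4-bit value */

        /* Returns n/QK4_0 * sizeof(block_q4_0) bytes, per the definition above. */
        size_t bytes = ggml_quantize_q4_0c(src, dst, n, k, hist);
        (void) bytes;

        free(src);
        free(dst);
        return 0;
    }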

llama.cpp

Lines changed: 8 additions & 0 deletions
@@ -477,6 +477,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
                 break;
@@ -550,6 +551,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
                 break;
@@ -837,6 +839,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
@@ -1573,6 +1576,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         default: throw format("invalid output file type %d\n", ftype);
@@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 {
                     new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                 } break;
+            case GGML_TYPE_Q4_0C:
+                {
+                    new_size = ggml_quantize_q4_0c(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                } break;
             case GGML_TYPE_Q4_1:
                 {
                     new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
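The new_size reported for a q4_0c tensor therefore matches q4_0's footprint. A quick back-of-the-envelope check, assuming the historical 20-byte block_q4_0 (a 4-byte float scale plus 16 packed nibble bytes per 32 weights):

    #include <stdio.h>

    int main(void) {
        /* Assumed sizes: QK4_0 = 32 weights per block, sizeof(block_q4_0) = 20. */
        const long long nelements = 4096LL * 4096LL;  /* hypothetical tensor */
        const long long new_size  = nelements / 32 * 20;
        printf("f32: %lld bytes, q4_0c: %lld bytes (%.3f bytes/weight)\n",
               4 * nelements, new_size, (double) new_size / nelements);
        return 0;
    }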

llama.h

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,          // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0C = 6,         // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
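With the enum value exposed, the new format can be requested through the public quantize API. A hypothetical end-to-end call, assuming the (input path, output path, ftype) signature that llama_model_quantize had in this era of the API, with placeholder file names:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        /* Placeholder paths; LLAMA_FTYPE_MOSTLY_Q4_0C is the enum value added above. */
        int rc = llama_model_quantize("model-f32.bin", "model-q4_0c.bin",
                                      LLAMA_FTYPE_MOSTLY_Q4_0C);
        if (rc != 0) {
            fprintf(stderr, "quantize failed\n");
        }
        return rc;
    }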
