
Introduce IQ4_NL_4_4 format and its NEON implementation #10196

Closed · wants to merge 5 commits
Changes from all commits:
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -51,6 +51,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "IQ4_NL_4_4", LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4, " 4.50 bpw non-linear quantization", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
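As a usage sketch (the model filenames here are illustrative), the new option is selected like any other target type of the quantize example:

./llama-quantize model-f16.gguf model-iq4_nl_4_4.gguf IQ4_NL_4_4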
1 change: 1 addition & 0 deletions ggml/include/ggml.h
@@ -389,6 +389,7 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
GGML_TYPE_COUNT,
};

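The new value is appended at the end, just before GGML_TYPE_COUNT: ggml type IDs are persisted in model files, so existing entries must keep their numeric values.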
999 changes: 491 additions & 508 deletions ggml/src/ggml-aarch64.c

Large diffs are not rendered by default.
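Since this file's diff is not rendered, a minimal sketch of the core NEON idea follows (an illustration, not the PR's actual kernels). Unlike Q4_0, whose 4-bit values are dequantized arithmetically, IQ4_NL maps each 4-bit index through the non-linear kvalues_iq4nl table, which NEON can do with a single byte shuffle before the usual int8 dot products against the Q8_0 activations. The helper name iq4_nl_dot_step is hypothetical:

#include <arm_neon.h>

// the non-linear codebook from ggml-common.h
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// one 32-value step: `packed` holds 32 nibbles, q8_lo/q8_hi the matching
// 32 int8 activation values; requires __ARM_FEATURE_DOTPROD for vdotq_s32
static inline int32x4_t iq4_nl_dot_step(uint8x16_t packed,
                                        int8x16_t q8_lo, int8x16_t q8_hi,
                                        int32x4_t acc) {
    const int8x16_t  values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4     = vdupq_n_u8(0x0f);
    const int8x16_t  lo = vqtbl1q_s8(values, vandq_u8(packed, m4));  // low nibbles
    const int8x16_t  hi = vqtbl1q_s8(values, vshrq_n_u8(packed, 4)); // high nibbles
    acc = vdotq_s32(acc, lo, q8_lo); // 4 lanes of 4-wide int8 dot products
    acc = vdotq_s32(acc, hi, q8_hi);
    return acc; // caller scales the reduced sums by d * (q8_0 block scale)
}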

3 changes: 3 additions & 0 deletions ggml/src/ggml-aarch64.h
@@ -22,16 +22,19 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#ifdef __cplusplus
}
6 changes: 6 additions & 0 deletions ggml/src/ggml-common.h
@@ -418,6 +418,12 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

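The new struct mirrors the existing block_q4_0x4 layout: four consecutive rows each contribute one iq4_nl block, and their quant bytes are interleaved in 4-byte groups so a single NEON load picks up matching chunks from all four rows. A repacking sketch, modeled on the existing make_block_q4_0x4 helper (the name make_block_iq4_nlx4 is assumed here):

#include <string.h>

static block_iq4_nlx4 make_block_iq4_nlx4(const block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d; // one f16 scale per source block
    }

    // 64 destination bytes = 16 groups of 4 bytes; group i comes from source block i%4
    const int end = QK4_NL * 2 / blck_size_interleave;
    for (int i = 0; i < end; ++i) {
        const int src_id     = i % 4;
        const int src_offset = (i / 4) * blck_size_interleave;
        const int dst_offset = i * blck_size_interleave;
        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
    }

    return out;
}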
9 changes: 9 additions & 0 deletions ggml/src/ggml-cpu.c
@@ -424,6 +424,14 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ4_NL_4_4] = {
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_iq4_nl_4x4_q8_0,
.gemm = ggml_gemm_iq4_nl_4x4_q8_0,
},
};

const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -9119,6 +9127,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
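Note that .vec_dot is left NULL: matrix multiplications for this type are expected to run entirely through the .gemv/.gemm entry points, which process ncols = 4 interleaved rows at a time against Q8_0-quantized activations instead of going through the scalar dot-product path.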
4 changes: 4 additions & 0 deletions ggml/src/ggml-quants.c
@@ -15721,6 +15721,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_IQ4_NL_4_4:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_iq4_nlx4, data, nbytes / sizeof(block_iq4_nlx4), 4);
} break;

case GGML_TYPE_I8:
case GGML_TYPE_I16:
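For context, VALIDATE_ROW_DATA_DVEC_F16_IMPL is defined earlier in ggml-quants.c and, roughly, walks every block and rejects the row if any of the nr = 4 per-block f16 deltas is NaN or infinite:

#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
    const type * q = (const type *) (data); \
    for (size_t i = 0; i < (nb); ++i) { \
        for (size_t j = 0; j < (nr); ++j) { \
            if (!validate_fp16(q[i].d[j], i)) { \
                return false; \
            } \
        } \
    }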
11 changes: 11 additions & 0 deletions ggml/src/ggml.c
@@ -852,6 +852,16 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.from_float = quantize_row_tq2_0,
.from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
},
[GGML_TYPE_IQ4_NL_4_4] = {
.type_name = "iq4_nl_4x4",
.blck_size = QK4_NL,
.blck_size_interleave = 4,
.type_size = sizeof(block_iq4_nl),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
},
};

const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -6854,6 +6864,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL_4_4: result = quantize_iq4_nl_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);
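quantize_iq4_nl_4x4 itself is implemented in ggml-aarch64.c, whose diff is not rendered above. A minimal sketch of its likely shape, assuming it follows the quantize_q4_0_4x4 pattern: quantize each 32-value slice with the scalar iq4_nl routine, then interleave groups of four rows with the make_block_iq4_nlx4 helper sketched earlier (importance-matrix handling omitted):

size_t quantize_iq4_nl_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
                           int64_t nrows, int64_t n_per_row, const float * imatrix) {
    GGML_ASSERT(nrows % 4 == 0);
    GGML_ASSERT(n_per_row % QK4_NL == 0);
    (void) imatrix; // omitted in this sketch

    const int64_t nblocks = n_per_row / QK4_NL;
    block_iq4_nlx4 * out = (block_iq4_nlx4 *) dst;

    for (int64_t row = 0; row < nrows; row += 4) {
        for (int64_t b = 0; b < nblocks; b++) {
            block_iq4_nl tmp[4];
            for (int i = 0; i < 4; i++) {
                // each QK4_NL-wide slice quantizes independently of its neighbours
                quantize_iq4_nl(src + (row + i) * n_per_row + b * QK4_NL,
                                &tmp[i], 1, QK4_NL, NULL);
            }
            *out++ = make_block_iq4_nlx4(tmp, 4);
        }
    }

    // same total size as nrows plain iq4_nl rows: sizeof(block_iq4_nlx4) == 4 * sizeof(block_iq4_nl)
    return (nrows / 4) * nblocks * sizeof(block_iq4_nlx4);
}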
1 change: 1 addition & 0 deletions include/llama.h
@@ -176,6 +176,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4 = 38, // except 1d tensors

LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
11 changes: 11 additions & 0 deletions src/llama.cpp
@@ -4434,6 +4434,7 @@ struct llama_model_loader {
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
case GGML_TYPE_IQ4_NL_4_4: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5198,6 +5199,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
case LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4: return "IQ4_NL_4_4 - 4.5 bpw";

default: return "unknown, may not work";
}
@@ -18146,6 +18148,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (new_type == GGML_TYPE_IQ4_NL_4_4) {
new_type = GGML_TYPE_IQ4_NL;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
@@ -18471,6 +18476,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
case LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4: default_type = GGML_TYPE_IQ4_NL_4_4; break;

default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
@@ -18816,6 +18822,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}

if (new_type == GGML_TYPE_IQ4_NL_4_4) {
if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_IQ4_NL;
else chunk_size_multiplier = 4;
}

LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);

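The repacked layout spans four rows per block, which explains this final hunk: tensors whose row count (ne[1]) is not a multiple of 4 fall back to the plain scalar type, and chunk_size_multiplier = 4 keeps each quantization work chunk aligned to whole 4-row groups.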