Skip to content

Deduplicate q4 quantization functions #383

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
run: |
mkdir build
cd build
cmake ..
cmake -DLLAMA_AVX2=OFF ..
cmake --build . --config Release
ctest --output-on-failure

Expand Down
171 changes: 65 additions & 106 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,16 +400,63 @@ static inline __m128i packNibbles( __m256i bytes )
// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;

const size_t bs = sizeof(float) + QK/2;

uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

uint8_t pp[QK/2];

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *)pd = d;
pd += bs;

for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb, pp, sizeof(pp));
pb += bs;
}
}

void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);

#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
const int nb = k / QK;
const size_t bs = sizeof(float) + QK/2;

uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

uint8_t pp[QK/2];
#endif

#if __ARM_NEON
#if QK == 32
Expand Down Expand Up @@ -566,36 +613,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
#endif
#else
// scalar
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *)pd = d;
pd += bs;

for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb, pp, sizeof(pp));
pb += bs;
}
quantize_row_q4_0_reference(x, y, k);
#endif
}

Expand Down Expand Up @@ -10702,119 +10720,60 @@ enum ggml_opt_result ggml_opt(

////////////////////////////////////////////////////////////////////////////////

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
amax = MAX(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *) pd = d;
pd += bs;
quantize_row_q4_0_reference(src + j, pd, k);

for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0])*id;
const float v1 = (src[j + i*qk + l + 1])*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

hist[vi0]++;
hist[vi1]++;

pp[l/2] = vi0 | (vi1 << 4);
}
for (int i = 0; i < nb; i++) {
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

memcpy(pb, pp, pp_size);
pb += bs;
hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;

assert(k % qk == 0);

const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
quantize_row_q4_1(src + j, pd, k);

for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;

{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
if (v < min) min = v;
if (v > max) max = v;
}

const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;

*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pm += bs;

for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id;
const float v1 = (src[j + i*qk + l + 1] - min)*id;

const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

hist[vi0]++;
hist[vi1]++;

pp[l/2] = vi0 | (vi1 << 4);
}
for (int l = 0; l < qk; l += 2) {
const uint8_t vi0 = pb[l/2] & 0xF;
const uint8_t vi1 = pb[l/2] >> 4;

memcpy(pb, pp, pp_size);
pb += bs;
hist[vi0]++;
hist[vi1]++;
}
pb += bs;
}
}

Expand Down
4 changes: 2 additions & 2 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
// quantization
//

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);

//
// system info
Expand Down
13 changes: 9 additions & 4 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
set(TEST_TARGET test-tokenizer-0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()

llama_add_test(test-quantize.c)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
42 changes: 42 additions & 0 deletions tests/test-quantize.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "ggml.h"
#undef NDEBUG
#include <assert.h>
#include <math.h>

int main(void) {
#define QK 32
float src[QK];
uint8_t dst[24];
int64_t hist[16];

for (int i = 0; i < QK; i++) {
src[i] = (float)(i + 1);
}

size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
assert(size == 20);
float max_result = ((float *)dst)[0];
float max_expected = src[31] / ((1 << 3) - 1);
assert(max_result == max_expected);
for (int i = 0; i < QK; i++) {
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
assert(q4_result == q4_expected);
}

size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
assert(size == 24);
float delta_result = ((float *)dst)[0];
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
assert(delta_result == delta_expected);
float min_result = ((float *)dst)[1];
float min_expected = src[0];
assert(min_result == min_expected);
for (int i = 0; i < QK; i++) {
uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
assert(q4_result == q4_expected);
}

return 0;
}