Skip to content

Commit 639949f

Browse files
committed
backend-cpu: add online flow for aarch64 Q4_0 GEMV/GEMM kernels
1 parent b11f9ba commit 639949f

File tree

10 files changed

+870
-91
lines changed

10 files changed

+870
-91
lines changed

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2047,6 +2047,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20472047
common_log_set_timestamps(common_log_main(), true);
20482048
}
20492049
).set_env("LLAMA_LOG_TIMESTAMPS"));
2050+
add_opt(common_arg(
2051+
{"-rtrp", "--runtime-repack"},
2052+
string_format("Allow runtime requantization and repacking of Q4_0 to enable optimized GEMM and GEMV kernels (default: %d)", params.runtime_repack),
2053+
[](common_params & params) {
2054+
params.runtime_repack = true;
2055+
}
2056+
).set_examples({LLAMA_EXAMPLE_MAIN}));
20502057

20512058
return ctx_arg;
20522059
}

common/common.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -983,7 +983,7 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
983983
mparams.main_gpu = params.main_gpu;
984984
mparams.split_mode = params.split_mode;
985985
mparams.tensor_split = params.tensor_split;
986-
mparams.use_mmap = params.use_mmap;
986+
mparams.use_mmap = params.use_mmap && !params.runtime_repack;
987987
mparams.use_mlock = params.use_mlock;
988988
mparams.check_tensors = params.check_tensors;
989989
if (params.kv_overrides.empty()) {
@@ -1053,6 +1053,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
10531053
cparams.offload_kqv = !params.no_kv_offload;
10541054
cparams.flash_attn = params.flash_attn;
10551055
cparams.no_perf = params.no_perf;
1056+
cparams.runtime_repack = params.runtime_repack;
10561057

10571058
if (params.reranking) {
10581059
cparams.embeddings = true;

common/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ struct common_params {
271271
bool warmup = true; // warmup run
272272
bool check_tensors = false; // validate tensor data
273273

274+
bool runtime_repack = false; // runtime repack weight for optimized kernels
275+
274276
std::string cache_type_k = "f16"; // KV cache data type for the K
275277
std::string cache_type_v = "f16"; // KV cache data type for the V
276278

examples/llama-bench/llama-bench.cpp

Lines changed: 112 additions & 84 deletions
Large diffs are not rendered by default.

ggml/include/ggml-backend.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,19 @@ extern "C" {
305305
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
306306
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
307307

308-
// CPU buffer types are always available
308+
//
309+
// CPU backend
310+
//
311+
312+
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
313+
314+
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
315+
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
316+
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
317+
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
318+
GGML_API void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack);
319+
320+
// Create a backend buffer from an existing pointer
309321
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
310322
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
311323

ggml/src/ggml-aarch64.c

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3476,3 +3476,102 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
34763476
}
34773477
}
34783478
}
3479+
3480+
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
3481+
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3482+
GGML_ASSERT(t->ne[0] % 8 == 0);
3483+
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3484+
3485+
// Do in-place transformation. Allocate scratch buffer
3486+
size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
3487+
if (size > *psize) {
3488+
uint8_t *new_mem = realloc(*pmem, size);
3489+
if (!new_mem) {
3490+
return -1;
3491+
}
3492+
*pmem = new_mem;
3493+
*psize = size;
3494+
}
3495+
block_q4_0x4 *dst = (block_q4_0x4*) *pmem;
3496+
block_q4_0 *src = (block_q4_0*) t->data;
3497+
block_q4_0 dst_tmp[4];
3498+
int n = t->ne[0];
3499+
int nrow = t->ne[1]; // Number of rows
3500+
int nrows_interleaved = 4;
3501+
int nblocks = t->ne[0] / QK4_0;
3502+
for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
3503+
int cnt = 0;
3504+
for (int64_t x = 0; x < nblocks; x++) {
3505+
for (int i = 0; i < nrows_interleaved; i++ ) {
3506+
dst_tmp[i] = src[x + i * nblocks];
3507+
}
3508+
dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
3509+
}
3510+
memcpy(src, dst, size);
3511+
src += cnt * 4;
3512+
}
3513+
return 0;
3514+
}
3515+
3516+
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
3517+
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3518+
GGML_ASSERT(t->ne[0] % 8 == 0);
3519+
GGML_ASSERT(interleave_block == 8);
3520+
3521+
// Do in-place transformation. Allocate scratch buffer
3522+
size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
3523+
if (size > *psize) {
3524+
uint8_t *new_mem = realloc(*pmem, size);
3525+
if (!new_mem) {
3526+
return -1;
3527+
}
3528+
*pmem = new_mem;
3529+
*psize = size;
3530+
}
3531+
block_q4_0x8 *dst = (block_q4_0x8*) *pmem;
3532+
block_q4_0 *src = (block_q4_0*) t->data;
3533+
block_q4_0 dst_tmp[8];
3534+
int n = t->ne[0];
3535+
int nrow = t->ne[1]; // Number of rows
3536+
int nrows_interleaved = 8;
3537+
int nblocks = t->ne[0] / QK4_0;
3538+
for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
3539+
int cnt = 0;
3540+
for (int64_t x = 0; x < nblocks; x++) {
3541+
for (int i = 0; i < nrows_interleaved; i++ ) {
3542+
dst_tmp[i] = src[x + i * nblocks];
3543+
}
3544+
dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
3545+
}
3546+
memcpy(src, dst, size);
3547+
src += cnt * 4;
3548+
}
3549+
return 0;
3550+
}
3551+
3552+
// If the tensor is Q4_0 and the running CPU has a matching aarch64 extension,
// repack it in place to the interleaved layout with the widest supported
// kernel and retag its type accordingly:
//   SVE + i8mm (SVE width == QK8_0) -> Q4_0_8_8
//   NEON + i8mm                     -> Q4_0_4_8
//   NEON only                       -> Q4_0_4_4
// On repack failure (scratch allocation) or on non-ARM builds the tensor is
// left untouched. pmem/psize is the caller-owned scratch buffer shared across
// calls.
void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize) {
    UNUSED(cur);
    UNUSED(pmem);
    UNUSED(psize);

#if defined(__ARM_ARCH)
    if (cur->type != GGML_TYPE_Q4_0) {
        return;
    }

    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
        if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
            cur->type = GGML_TYPE_Q4_0_8_8;
        }
        return;
    }

    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
        if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
            cur->type = GGML_TYPE_Q4_0_4_8;
        }
        return;
    }

    if (ggml_cpu_has_neon()) {
        if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
            cur->type = GGML_TYPE_Q4_0_4_4;
        }
    }
#endif
}

ggml/src/ggml-aarch64.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
3333
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
3434
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
3535

36+
void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize);
37+
3638
#ifdef __cplusplus
3739
}
3840
#endif

0 commit comments

Comments
 (0)