Skip to content

Commit 1ac01fb

Browse files
committed
add ggml_backend_buffer_clear
zero-init KV cache buffer
1 parent 0c5ee7c commit 1ac01fb

File tree

6 files changed

+44
-16
lines changed

6 files changed

+44
-16
lines changed

ggml-backend-impl.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,16 @@ extern "C" {
3131
typedef void * ggml_backend_buffer_context_t;
3232

3333
struct ggml_backend_buffer_i {
34-
void (*free_buffer)(ggml_backend_buffer_t buffer);
34+
void (*free_buffer) (ggml_backend_buffer_t buffer);
3535
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
36-
void * (*get_base) (ggml_backend_buffer_t buffer);
37-
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
38-
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
39-
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
36+
void * (*get_base) (ggml_backend_buffer_t buffer);
37+
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
38+
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
39+
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
4040
// (optional) copy tensor between different buffer-type, allow for single-copy transfers
41-
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
42-
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
41+
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
42+
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
43+
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
4344
};
4445

4546
struct ggml_backend_buffer {

ggml-backend.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
9494
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
9595
}
9696

97+
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
98+
buffer->iface.clear(buffer, value);
99+
}
100+
97101
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
98102
return buffer->buft;
99103
}
@@ -410,6 +414,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
410414
GGML_UNUSED(buffer);
411415
}
412416

417+
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
418+
memset(buffer->context, value, buffer->size);
419+
}
420+
413421
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
414422
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
415423
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -418,6 +426,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
418426
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
419427
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
420428
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
429+
/* .clear = */ ggml_backend_cpu_buffer_clear,
421430
};
422431

423432
// for buffers from ptr, free is not called
@@ -429,6 +438,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
429438
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
430439
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
431440
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
441+
/* .clear = */ ggml_backend_cpu_buffer_clear,
432442
};
433443

434444
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

ggml-backend.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ extern "C" {
2929
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
3030
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
3131
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
32+
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
3233
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
3334

3435
//

ggml-cuda.cu

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9494,6 +9494,15 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
94949494
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
94959495
}
94969496

9497+
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
9498+
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
9499+
9500+
ggml_cuda_set_device(ctx->device);
9501+
CUDA_CHECK(cudaDeviceSynchronize());
9502+
9503+
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
9504+
}
9505+
94979506
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
94989507
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
94999508
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
@@ -9502,6 +9511,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
95029511
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
95039512
/* .cpy_tensor_from = */ NULL,
95049513
/* .cpy_tensor_to = */ NULL,
9514+
/* .clear = */ ggml_backend_cuda_buffer_clear,
95059515
};
95069516

95079517
// cuda buffer type

ggml-metal.m

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2429,8 +2429,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
24292429
}
24302430

24312431
free(ctx);
2432-
2433-
UNUSED(buffer);
24342432
}
24352433

24362434
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -2457,6 +2455,12 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
24572455
UNUSED(buffer);
24582456
}
24592457

2458+
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2459+
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2460+
2461+
memset(ctx->all_data, value, ctx->all_size);
2462+
}
2463+
24602464
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
24612465
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
24622466
/* .get_base = */ ggml_backend_metal_buffer_get_base,
@@ -2465,6 +2469,7 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
24652469
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
24662470
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
24672471
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
2472+
/* .clear = */ ggml_backend_metal_buffer_clear,
24682473
};
24692474

24702475
// default buffer type

llama.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,9 +1551,8 @@ static bool llama_kv_cache_init(
15511551

15521552
// buf may be NULL with full offload
15531553
if (cache.buf) {
1554-
// TODO: ggml_backend_buffer_memset
1555-
// this is only valid with CPU buffers!
1556-
//memset(ggml_backend_buffer_get_base(cache.buf), 0, ggml_backend_buffer_get_size(cache.buf));
1554+
// initialize the buffer to avoid NaNs in the padding
1555+
ggml_backend_buffer_clear(cache.buf, 0);
15571556
}
15581557

15591558
if (vram_kv_cache > 0) {
@@ -3569,8 +3568,12 @@ static void llm_load_tensors(
35693568
{
35703569
size_t sys_mem_required = ctx_size + buf_size;
35713570

3572-
LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3573-
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3571+
{
3572+
LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3573+
}
3574+
if (vram_weights > 0) {
3575+
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3576+
}
35743577

35753578
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
35763579
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3586,7 +3589,6 @@ static void llm_load_tensors(
35863589
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
35873590
#else
35883591
GGML_UNUSED(n_gpu_layers);
3589-
GGML_UNUSED(vram_weights);
35903592
GGML_UNUSED(tensor_split);
35913593
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
35923594
}
@@ -3601,7 +3603,6 @@ static void llm_load_tensors(
36013603
ggml_cuda_set_tensor_split(tensor_split);
36023604
#endif // GGML_USE_CUBLAS
36033605

3604-
// TODO: only pass buf if it is a mmap buffer
36053606
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
36063607

36073608
if (progress_callback) {

0 commit comments

Comments
 (0)