Skip to content

Commit c8bd5d8

Browse files
committed
add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data
1 parent 1ac01fb commit c8bd5d8

File tree

6 files changed

+67
-28
lines changed

6 files changed

+67
-28
lines changed

ggml-backend-impl.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ extern "C" {
2020
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
2121
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
2222
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
23+
// check if tensor data is in host memory
24+
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
25+
bool (*is_host) (ggml_backend_buffer_type_t buft);
2326
};
2427

2528
struct ggml_backend_buffer_type {
@@ -79,7 +82,7 @@ extern "C" {
7982
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
8083
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
8184

82-
void (*synchronize) (ggml_backend_t backend);
85+
void (*synchronize)(ggml_backend_t backend);
8386

8487
// compute graph with a plan
8588
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

ggml-backend.c

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
3535
return buft->iface.supports_backend(buft, backend);
3636
}
3737

38+
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
39+
if (buft->iface.is_host) {
40+
return buft->iface.is_host(buft);
41+
}
42+
return false;
43+
}
44+
3845
// backend buffer
3946

4047
ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -98,6 +105,10 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
98105
buffer->iface.clear(buffer, value);
99106
}
100107

108+
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
109+
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
110+
}
111+
101112
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
102113
return buffer->buft;
103114
}
@@ -464,13 +475,20 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
464475
GGML_UNUSED(buft);
465476
}
466477

478+
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
479+
return true;
480+
481+
GGML_UNUSED(buft);
482+
}
483+
467484
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
468485
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
469486
/* .iface = */ {
470487
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
471488
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
472489
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
473490
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
491+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
474492
},
475493
/* .context = */ NULL,
476494
};
@@ -479,9 +497,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
479497
}
480498

481499
#ifdef GGML_USE_CPU_HBM
500+
501+
// buffer type HBM
502+
482503
#include <hbwmalloc.h>
483504

484-
// HBM buffer type
485505
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
486506
hbw_free(buffer->context);
487507
}
@@ -503,16 +523,15 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
503523
return buffer;
504524
}
505525

506-
struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
507-
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
508-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
509-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
510-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
511-
};
512-
513526
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
514527
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
515-
/* .iface = */ cpu_backend_hbm_buffer_type_interface,
528+
/* .iface = */ {
529+
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
530+
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
531+
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
532+
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
533+
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
534+
},
516535
/* .context = */ NULL,
517536
};
518537

ggml-backend.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern "C" {
2121
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
2222
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
2323
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
24+
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
2425

2526
// buffer
2627
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -30,6 +31,7 @@ extern "C" {
3031
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
3132
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
3233
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
34+
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
3335
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
3436

3537
//

ggml-cuda.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9568,6 +9568,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
95689568
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
95699569
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
95709570
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
9571+
/* .is_host = */ nullptr,
95719572
};
95729573

95739574
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -9606,16 +9607,15 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
96069607
return buffer;
96079608
}
96089609

9609-
struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
9610-
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9611-
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9612-
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9613-
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9614-
};
9615-
96169610
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
96179611
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
9618-
/* .iface = */ ggml_backend_cuda_host_buffer_type_interface,
9612+
/* .iface = */ {
9613+
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
9614+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
9615+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
9616+
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
9617+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
9618+
},
96199619
/* .context = */ nullptr,
96209620
};
96219621

ggml-metal.m

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2521,13 +2521,20 @@ static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_
25212521
UNUSED(buft);
25222522
}
25232523

2524+
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
2525+
return true;
2526+
2527+
UNUSED(buft);
2528+
}
2529+
25242530
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
25252531
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
25262532
/* .iface = */ {
25272533
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
25282534
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
25292535
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
25302536
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
2537+
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
25312538
},
25322539
/* .context = */ NULL,
25332540
};

llama.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1539,7 +1539,7 @@ static bool llama_kv_cache_init(
15391539
ggml_cuda_assign_buffers_no_scratch(v);
15401540
vram_kv_cache += ggml_nbytes(k);
15411541
vram_kv_cache += ggml_nbytes(v);
1542-
// HACK: mark tensor as allocated, but crash if we try to use it from the CPU
1542+
// HACK: mark tensor as allocated
15431543
k->data = v->data = (void *)(uintptr_t)1;
15441544
}
15451545
}
@@ -2285,9 +2285,15 @@ struct llama_model_loader {
22852285
ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur));
22862286
}
22872287
} else {
2288-
// FIXME: use read_buf for device buffers without unified memory
2289-
file.seek(offs, SEEK_SET);
2290-
file.read_raw(cur->data, ggml_nbytes(cur));
2288+
if (ggml_backend_buffer_is_host(cur->buffer)) {
2289+
file.seek(offs, SEEK_SET);
2290+
file.read_raw(cur->data, ggml_nbytes(cur));
2291+
} else {
2292+
read_buf.resize(ggml_nbytes(cur));
2293+
file.seek(offs, SEEK_SET);
2294+
file.read_raw(read_buf.data(), ggml_nbytes(cur));
2295+
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
2296+
}
22912297
}
22922298

22932299
if (use_mmap && lmlock) {
@@ -2298,7 +2304,7 @@ struct llama_model_loader {
22982304

22992305
case GGML_BACKEND_GPU:
23002306
case GGML_BACKEND_GPU_SPLIT: {
2301-
// HACK: mark tensor as allocated, but crash if we try to use it from the CPU
2307+
// HACK: mark tensor as allocated
23022308
cur->data = (void *)(uintptr_t)1;
23032309
void * data;
23042310
if (use_mmap) {
@@ -5773,7 +5779,7 @@ static struct ggml_cgraph * llama_build_graph(
57735779
const int64_t n_tokens = cur->ne[1];
57745780

57755781
float * data;
5776-
if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
5782+
if (ggml_backend_buffer_is_host(cur->buffer)) {
57775783
data = (float *) cur->data;
57785784
} else {
57795785
lctx.buf_copy.resize(ggml_nbytes(cur));
@@ -5812,7 +5818,7 @@ static struct ggml_cgraph * llama_build_graph(
58125818
const int64_t n_ctx = cur->ne[0];
58135819

58145820
int32_t * data;
5815-
if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
5821+
if (ggml_backend_buffer_is_host(cur->buffer)) {
58165822
data = (int32_t *) cur->data;
58175823
} else {
58185824
lctx.buf_copy.resize(ggml_nbytes(cur));
@@ -9230,13 +9236,15 @@ struct llama_context * llama_new_context_with_model(
92309236
}
92319237
#endif
92329238

9233-
if (ctx->backend == nullptr) {
9234-
// FIXME: this may fail if the model buffer is not compatible with the CPU backend
9239+
if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
92359240
ctx->backend = ggml_backend_cpu_init();
9241+
if (ctx->backend == nullptr) {
9242+
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9243+
}
92369244
}
92379245

92389246
if (ctx->backend == nullptr) {
9239-
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9247+
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
92409248
delete ctx;
92419249
return nullptr;
92429250
}

0 commit comments

Comments (0)