
Commit 963a122

backend : add event API
1 parent 940c01e commit 963a122
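
This commit adds an event synchronization API to the ggml backend interface (ggml_backend_event_new/free/record/wait/synchronize), with a CUDA implementation built on cudaEvent_t; the CPU backend leaves the new hooks NULL. It also adds an assert and an ownership FIXME to ggml_tallocr_set_buffer, prints an error before asserting when no backend supports a tensor's buffer type, derives the number of duplicated compute buffers in llama.cpp from n_batch/n_ubatch instead of a hard-coded 16, and fixes the --tensor-split flag name in the llama-bench usage text.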

7 files changed (+133 additions, -30 deletions)

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -202,7 +202,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
+    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
     printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
     printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");

ggml-alloc.c

Lines changed: 5 additions & 0 deletions
@@ -320,6 +320,11 @@ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
 }
 
 void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer) {
+    GGML_ASSERT(talloc->measure == false);
+    // FIXME: buffer ownership semantics
+    // if the user is doing this, they probably want to take ownership of the buffer
+    // or they need to restore the original buffer before freeing the allocator
+    //talloc->buffer_owned = false;
     talloc->buffer = buffer;
     talloc->base = ggml_backend_buffer_get_base(buffer);
     talloc->alignment = ggml_backend_buffer_get_alignment(buffer);
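
The FIXME above leaves buffer ownership with the caller. A hypothetical sketch of the pattern the comment describes (swap in another buffer, then restore the original before the allocator is freed); here 'replacement' is a placeholder buffer, not part of this commit:

    // save the buffer the allocator currently owns
    ggml_backend_buffer_t original = ggml_tallocr_get_buffer(talloc);

    ggml_tallocr_set_buffer(talloc, replacement); // subsequent ggml_tallocr_alloc() calls place tensors in 'replacement'
    // ... allocate tensors ...

    ggml_tallocr_set_buffer(talloc, original);    // restore so the allocator does not free a buffer it no longer owns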

ggml-backend-impl.h

Lines changed: 13 additions & 2 deletions
@@ -85,7 +85,7 @@ extern "C" {
     // (optional) complete all pending operations
     void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
-    // compute graph with a plan
+    // compute graph with a plan (not used currently)
     ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
     void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
@@ -95,14 +95,25 @@ extern "C" {
 
     // check if the backend supports an operation
     bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // (optional) event synchronization
+    ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
+    void (*GGML_CALL event_free) (ggml_backend_event_t event);
+    void (*GGML_CALL event_record) (ggml_backend_event_t event);
+    void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+    void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
 };
 
 struct ggml_backend {
     struct ggml_backend_i iface;
-
     ggml_backend_context_t context;
 };
 
+struct ggml_backend_event {
+    ggml_backend_t backend;
+    void * context;
+};
+
 //
 // Backend registry
 //

ggml-backend.c

Lines changed: 29 additions & 1 deletion
@@ -303,6 +303,28 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
     }
 }
 
+// events
+
+ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
+    return backend->iface.event_new(backend);
+}
+
+void ggml_backend_event_free(ggml_backend_event_t event) {
+    event->backend->iface.event_free(event);
+    free(event);
+}
+
+void ggml_backend_event_record(ggml_backend_event_t event) {
+    event->backend->iface.event_record(event);
+}
+
+void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    event->backend->iface.event_synchronize(event);
+}
+
+void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    backend->iface.event_wait(backend, event);
+}
 
 // backend registry
 
@@ -716,6 +738,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_record = */ NULL,
+    /* .event_wait = */ NULL,
+    /* .event_synchronize = */ NULL,
 };
 
 ggml_backend_t ggml_backend_cpu_init(void) {
@@ -853,6 +880,8 @@ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_
             return sched->tallocs[i];
         }
     }
+
+    fprintf(stderr, "%s: error: no backend supports buffer type %s\n", __func__, ggml_backend_buffer_name(buffer));
     GGML_ASSERT(false && "tensor buffer type not supported by any backend");
 }
 
@@ -1336,7 +1365,6 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
-
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
             ggml_backend_graph_compute(split_backend, &split->graph);

ggml-backend.h

Lines changed: 17 additions & 5 deletions
@@ -9,6 +9,7 @@ extern "C" {
 
     typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
     typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
     typedef struct ggml_backend * ggml_backend_t;
     typedef void * ggml_backend_graph_plan_t;
 
@@ -47,7 +48,6 @@ extern "C" {
     // Backend
     //
 
-
     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
     GGML_API void ggml_backend_free(ggml_backend_t backend);
 
@@ -74,6 +74,13 @@ extern "C" {
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
 
+    // events
+    GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
+    GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
+    GGML_API void ggml_backend_event_record (ggml_backend_event_t event); // can only be called from the backend that created the event
+    GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); // can only be called from the backend that created the event
+    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // can be called from any backend
+
     //
     // CPU backend
     //
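
A minimal usage sketch of the new event API (not part of this commit), following the restrictions noted above. It assumes backend_gpu and backend_gpu2 are initialized backends whose implementations provide the event hooks (the CPU backend in this commit leaves them NULL), and that src/dst are placeholder tensors:

    // producer side: queue async work on backend_gpu, then record an event on it
    ggml_backend_event_t event = ggml_backend_event_new(backend_gpu);
    ggml_backend_tensor_copy_async(backend_gpu, backend_gpu2, src, dst);
    ggml_backend_event_record(event); // recorded on backend_gpu, which created the event

    // consumer side: backend_gpu2 waits for the event on its own timeline, without blocking the host
    ggml_backend_event_wait(backend_gpu2, event);
    // ... queue work on backend_gpu2 that depends on dst ...

    // optionally block the host until the event completes, then release it
    ggml_backend_event_synchronize(event);
    ggml_backend_event_free(event);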
@@ -118,17 +125,21 @@ extern "C" {
     /*
       Example usage:
 
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS
+        // will be assigned preferrably to run on the buffer backend by ggml_backend_sched
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE);
         // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
 
         // initialize buffers from a measure graph
         measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
 
         // in build_graph:
-        build_graph(...) {
+        void build_graph(...) {
             // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
+            alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
+            ggml_tallocr_alloc(alloc_cpu, tensor);
 
             // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
             struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
@@ -143,6 +154,7 @@ extern "C" {
         // compute
         graph = build_graph(sched);
         ggml_backend_sched_graph_compute(sched, graph);
+
     */
 
     struct ggml_backend_sched;

ggml-cuda.cu

Lines changed: 57 additions & 0 deletions
@@ -11070,6 +11070,58 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }
 
+static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    cudaEvent_t event;
+    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+    return new ggml_backend_event {
+        /* .backend = */ backend,
+        /* .context = */ event,
+    };
+}
+
+static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
+    CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
+
+    delete event;
+}
+
+static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
+}
+
+static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    if (ggml_backend_is_cuda(event->backend)) {
+
+        ggml_cuda_set_device(cuda_ctx->device);
+
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
+    } else {
+        auto wait_fn = [](void * user_data) {
+            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
+            ggml_backend_event_synchronize(event);
+        };
+
+        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+    }
+}
+
+static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
+    assert(backend == event->backend);
+
+    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name = */ ggml_backend_cuda_name,
     /* .free = */ ggml_backend_cuda_free,
@@ -11083,6 +11135,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_cuda_graph_compute,
     /* .supports_op = */ ggml_backend_cuda_supports_op,
+    /* .event_new = */ ggml_backend_cuda_event_new,
+    /* .event_free = */ ggml_backend_cuda_event_free,
+    /* .event_record = */ ggml_backend_cuda_event_record,
+    /* .event_wait = */ ggml_backend_cuda_event_wait,
+    /* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
 };
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
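
Note on the cross-backend path in ggml_backend_cuda_event_wait above: when the event was created by a CUDA backend, the wait is expressed as cudaStreamWaitEvent on the current device's stream; for events from any other backend, a host function is enqueued with cudaLaunchHostFunc that calls ggml_backend_event_synchronize on the foreign event, so the dependency is enforced on the stream timeline rather than by blocking the thread that submits the work.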

llama.cpp

Lines changed: 11 additions & 21 deletions
@@ -6601,8 +6601,6 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-    //const auto n_batch = cparams.n_batch;
-
     GGML_ASSERT((!all_batch.token && all_batch.embd) || (all_batch.token && !all_batch.embd)); // NOLINT
 
     GGML_ASSERT(n_tokens_all <= cparams.n_ctx);
@@ -6623,16 +6621,6 @@
 
     auto * logits_out = lctx.logits;
 
-    /*
-    if (all_batch.logits) {
-        logits_out.resize(n_vocab * n_tokens_all);
-    } else if (lctx.logits_all) {
-        logits_out.resize(n_vocab * n_tokens_all);
-    } else {
-        logits_out.resize(n_vocab);
-    }
-    */
-
 #ifndef NDEBUG
     auto & logits_valid = lctx.logits_valid;
     logits_valid.clear();
@@ -6643,7 +6631,8 @@
 
 
     const uint32_t n_ubatch = cparams.n_ubatch;
-    //const uint32_t n_microbatch = 256;
+
+    //printf("n_tokens_all = %u, n_ubatch = %u\n", n_tokens_all, n_ubatch);
 
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
@@ -10016,9 +10005,17 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
         ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
+        for (ggml_backend_t backend : ctx->backends) {
+            ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+            LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                    ggml_backend_buffer_name(buf),
+                    ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+        }
+
         // duplicate cpu buffers for microbatching
-        const int n_ub = 16;
+        const int n_ub = (cparams.n_batch + cparams.n_ubatch - 1) / cparams.n_ubatch;
         ctx->n_compute_bufs = n_ub;
+        LLAMA_LOG_INFO("%s: allocating %d compute buffers\n", __func__, n_ub);
 
         for (ggml_backend_t b : ctx->backends) {
             ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
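
For a sense of scale (hypothetical values, not from this diff): with n_batch = 512 and n_ubatch = 256, the new formula gives n_ub = (512 + 256 - 1) / 256 = 2, so two compute buffers are duplicated instead of the previous hard-coded 16.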
@@ -10049,13 +10046,6 @@
             LLAMA_LOG_INFO("%s: logits buffer size = %8.2f MiB, type = %s\n", __func__,
                     ggml_backend_buffer_get_size(ctx->buf_logits) / 1024.0 / 1024.0,
                     ggml_backend_buffer_name(ctx->buf_logits));
-
-            for (ggml_backend_t backend : ctx->backends) {
-                ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
-                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                        ggml_backend_buffer_name(buf),
-                        ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
-            }
         }
     }
