Commit af789e7

fix async copy between backends

1 parent dbbaf82

4 files changed (+36, -14)

ggml-backend-impl.h (1 addition, 1 deletion)

@@ -80,7 +80,7 @@ extern "C" {
     // (optional) asynchronous tensor data access
     void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-    bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+    bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
     // (optional) complete all pending operations
     void (*synchronize)(ggml_backend_t backend);
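
The callback now receives both endpoints, so an implementation can order the copy against the source backend's queue instead of assuming a single backend owns both tensors. A minimal sketch of an implementation under the new signature (hypothetical backend; example_backend_is_mine and example_queue_copy are placeholder names, not part of ggml):

    static bool example_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                         const struct ggml_tensor * src, struct ggml_tensor * dst) {
        // decline (return false) unless both endpoints belong to this backend;
        // the caller then falls back to a synchronous copy
        if (!example_backend_is_mine(backend_src) || !example_backend_is_mine(backend_dst)) {
            return false;
        }
        // queue the copy so it runs after pending work on backend_src
        example_queue_copy(backend_src, backend_dst, src->data, dst->data, ggml_nbytes(src));
        return true;
    }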

ggml-backend.c (9 additions, 8 deletions)

@@ -279,24 +279,24 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
     size_t nbytes = ggml_nbytes(src);
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+        // wait for src to be ready before copy
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, nbytes);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
@@ -1304,6 +1304,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = get_allocr_backend(sched, node_allocr(split->inputs[j]));
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
 
@@ -1312,7 +1313,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
             // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
             // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+            ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
         int64_t copy_end_us = ggml_time_us();
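
The scheduler now passes the backend that produced each input (input_backend) instead of assuming the split backend owns both sides. A hypothetical call sequence (not from this commit) showing the hazard the added ggml_backend_synchronize(backend_src) guards against:

    #include "ggml-backend.h"

    // sketch: move a tensor produced on backend_src into a buffer owned by backend_dst
    static void move_result(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                            struct ggml_cgraph * graph, struct ggml_tensor * src, struct ggml_tensor * dst) {
        ggml_backend_graph_compute(backend_src, graph); // may return before src is fully written
        // with this fix, the host-buffer fallback first calls ggml_backend_synchronize(backend_src),
        // so src->data is complete before ggml_backend_tensor_set_async(backend_dst, ...) reads it
        ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);
    }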

ggml-backend.h (1 addition, 1 deletion)

@@ -72,7 +72,7 @@ extern "C" {
 
     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
 
     //
     // CPU backend
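
Callers of the public API now name both endpoints explicitly. A minimal usage sketch, assuming a CUDA build (the backends and tensors are set up elsewhere; this is illustrative only):

    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    static void upload(struct ggml_tensor * src, struct ggml_tensor * dst,
                       ggml_backend_t cpu, ggml_backend_t cuda) {
        // src lives in a host buffer, dst in a CUDA buffer; the copy is queued on
        // the CUDA backend, with an automatic fallback to a synchronous copy
        ggml_backend_tensor_copy_async(cpu, cuda, src, dst);
        ggml_backend_synchronize(cuda); // wait for the upload before using dst
    }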

ggml-cuda.cu (25 additions, 4 deletions)

@@ -10763,11 +10763,32 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
+        return false;
+    }
+
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+
+    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
+    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+
+    if (backend_src == backend_dst) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    } else {
+        cudaEvent_t event;
+        CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+
+        // record event on src stream
+        CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));
+        // wait on dst stream
+        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
 
-    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+        CUDA_CHECK(cudaEventDestroy(event));
         return true;
     }
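
For copies between two different CUDA backends, the destination stream is ordered after the source stream through an event. The same pattern as a standalone sketch (the stream arguments are illustrative; only standard CUDA runtime calls are used):

    #include <cuda_runtime.h>
    #include <stddef.h>

    // copy n bytes from src to dst, ordered after all pending work on stream_src
    static void ordered_copy_async(void * dst, const void * src, size_t n,
                                   cudaStream_t stream_src, cudaStream_t stream_dst) {
        cudaEvent_t event;
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

        cudaEventRecord(event, stream_src);        // fires once prior src-stream work completes
        cudaStreamWaitEvent(stream_dst, event, 0); // dst stream waits for the event
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_dst);

        // safe even while the copy is in flight: the driver defers
        // destruction until the event has completed
        cudaEventDestroy(event);
    }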