
Commit 16e12ab

also duplicate gpu compute buffers to avoid races
1 parent a971987 commit 16e12ab

2 files changed, 64 additions and 26 deletions

ggml-cuda.cu

Lines changed: 19 additions & 7 deletions
@@ -10842,8 +10842,11 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
     GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));

+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
     // host -> device
-    if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_src) && ggml_backend_buffer_is_cuda(buf_dst)) {
         ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
         // make sure the data is ready on the source backend
         // the CPU backend does not support async compute, so this does nothing at the moment
@@ -10854,7 +10857,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }

     // device -> host
-    if (ggml_backend_buffer_is_cuda_host(dst->buffer) && ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_dst) && ggml_backend_buffer_is_cuda(buf_src)) {
         // this shoudln't happen currently because the dst backend is our own backend, which does not support host buffers
         GGML_ASSERT(false);
         ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
@@ -10875,22 +10878,31 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

     if (backend_src != backend_dst) {
-        //printf("async copy between devices %s, %d -> %d\n", src->name, cuda_ctx_src->device, cuda_ctx_dst->device);
-        cudaDeviceSynchronize();
-        // TODO: reuse event?
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
+        ggml_cuda_set_device(cuda_ctx_src->device);
+
         cudaEvent_t event;
         CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

         // record event on src stream
         CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));

         // wait on dst stream
+        ggml_cuda_set_device(cuda_ctx_dst->device);
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));

+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_dst->device][0]));
+
         CUDA_CHECK(cudaEventDestroy(event));
+    } else {
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
     }
-    // copy
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
     return true;
 }

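The cross-device branch above replaces a blocking cudaDeviceSynchronize() with event ordering between the two devices' streams: an event recorded on the source stream is waited on by the destination stream, and the copy itself becomes a cudaMemcpyPeerAsync enqueued on the destination stream. A minimal standalone sketch of that pattern, with hypothetical arguments in place of ggml's tensors, g_cudaStreams table, and CUDA_CHECK macro:

#include <cuda_runtime.h>
#include <cstddef>

// Event-ordered async copy between two devices (error checking omitted).
static void copy_peer_async(void * dst, int dev_dst, const void * src, int dev_src,
                            size_t nbytes, cudaStream_t stream_src, cudaStream_t stream_dst) {
    cudaEvent_t event;

    // record a "source data is ready" marker on the source device's stream
    cudaSetDevice(dev_src);
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, stream_src);

    // the destination stream waits for the marker, then the copy is enqueued;
    // the host thread never blocks
    cudaSetDevice(dev_dst);
    cudaStreamWaitEvent(stream_dst, event, 0);
    cudaMemcpyPeerAsync(dst, dev_dst, src, dev_src, nbytes, stream_dst);

    // safe to destroy immediately: CUDA releases the event only after the
    // recorded work and the stream wait that reference it have completed
    cudaEventDestroy(event);
}

The version in the diff additionally asserts that each tensor's buffer lives on the same device as its backend context before issuing the peer copy, and keeps the plain cudaMemcpyAsync path for copies within a single backend.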
llama.cpp

Lines changed: 45 additions & 19 deletions
@@ -1663,13 +1663,24 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
+        for (auto & it : bufs_compute) {
+            // restore the original buffer in the tallocr
+            ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
+            ggml_tallocr_set_buffer(allocr, it.second[0]);
+            // free the rest of the buffers
+            for (size_t i = 1; i < it.second.size(); ++i) {
+                ggml_backend_buffer_free(it.second[i]);
+            }
+        }
+
         ggml_backend_sched_free(sched);

         for (ggml_backend_t backend : backends) {
             ggml_backend_free(backend);
         }

         ggml_backend_buffer_free(buf_logits);
+
     }

     llama_cparams cparams;
@@ -1719,10 +1730,11 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
     // allocator for the input tensors
-    ggml_tallocr * alloc_cpu = nullptr;
+    ggml_tallocr_t alloc_cpu = nullptr;

-    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
-    size_t buf_cpu_ub_cur = 0;
+    std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
+    size_t n_compute_bufs = 0;
+    size_t i_compute_buf = 0;

     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
@@ -6704,15 +6716,17 @@ static int llama_decode_internal(
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

         // change the CPU compute buffer to avoid overwriting inputs
-        size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
-        lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
-        if (buf_cpu_ub_cur == 0 && cur_token > 0) {
+        size_t i_compute_buf = lctx.i_compute_buf;
+        lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
+        if (i_compute_buf == 0 && cur_token > 0) {
            // sync all backends to ensure that the current buffer is not in use
            printf("not enough buffers, syncing now\n");
            ggml_backend_sched_synchronize(lctx.sched);
         }
-
-        ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
+        for (auto it : lctx.bufs_compute) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
+            ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
+        }

         ggml_backend_sched_reset(lctx.sched);

@@ -6833,7 +6847,7 @@ static int llama_decode_internal(
     }

     ggml_backend_sched_synchronize(lctx.sched);
-    lctx.buf_cpu_ub_cur = 0;
+    lctx.i_compute_buf = 0;

     // measure the performance only for the single-token evals
     if (n_tokens_all == 1) {
@@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

         // duplicate cpu buffers for microbatching
-        ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
-        size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
-        ctx->buf_cpu_ub.push_back(buf_cpu);
-        int n_ub = 64;
-        for (int i = 1; i < n_ub; ++i) {
-            ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
-            ctx->buf_cpu_ub.push_back(buf);
+        const int n_ub = 16;
+        ctx->n_compute_bufs = n_ub;
+
+        for (ggml_backend_t b : ctx->backends) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
+            ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
+            size_t buf_size = ggml_backend_buffer_get_size(buf);
+            ctx->bufs_compute[b].push_back(buf);
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            for (int i = 1; i < n_ub; ++i) {
+                ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+                if (buf == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->bufs_compute[b].push_back(buf);
+            }
         }
+
         // allocate buffer for logits output
         ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
         if (ctx->buf_logits == nullptr) {
@@ -10816,21 +10842,21 @@ int32_t llama_decode(

 float * llama_get_logits(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
     return ctx->logits;
 }

 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     assert(ctx->logits_valid.at(i));
     return ctx->logits + i*ctx->model.hparams.n_vocab;
 }

 float * llama_get_embeddings(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     return ctx->embedding.data();
 }
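Taken together, the llama.cpp changes generalize the earlier CPU-only buf_cpu_ub rotation to every backend: each backend keeps n_ub copies of its compute buffer, llama_decode_internal points every allocator at the i-th copy for each micro-batch, and a full ggml_backend_sched_synchronize() is forced only when the index wraps around to a buffer that may still be in use. A condensed sketch of that round-robin scheme, using hypothetical handle types in place of the ggml backend/allocator API:

#include <cstddef>
#include <map>
#include <vector>

// Hypothetical stand-ins for the ggml backend/buffer handles used in the diff.
using backend_t = void *;
using buffer_t  = void *;

struct compute_buffer_pool {
    // n_bufs duplicated compute buffers per backend; element 0 is the original.
    std::map<backend_t, std::vector<buffer_t>> bufs;
    size_t n_bufs = 0; // assumed > 0
    size_t i_buf  = 0; // index handed to the next micro-batch

    // Select the buffer each backend should compute into for the next micro-batch.
    // Returns true when the pool wrapped around, i.e. the caller should synchronize
    // first because the reused buffer may still be read by an in-flight batch
    // (the diff additionally skips this sync on the very first micro-batch).
    bool next(std::map<backend_t, buffer_t> & out) {
        const size_t i = i_buf;
        i_buf = (i_buf + 1) % n_bufs;
        for (const auto & it : bufs) {
            out[it.first] = it.second.at(i);
        }
        return i == 0;
    }

    // After a full synchronization nothing is in flight, so start over at buffer 0.
    void reset() { i_buf = 0; }
};

With n_ub = 16 copies per backend, up to 16 micro-batches can have their inputs staged before a wrap-around forces a sync; llama_get_logits, llama_get_logits_ith, and llama_get_embeddings reset i_compute_buf after they synchronize, since nothing remains in flight at that point.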
