Skip to content

Commit 24cc321

Browse files
committed
update session copy/set to use ggml-backend
ggml-ci
1 parent bcd87ca commit 24cc321

File tree

3 files changed

+49
-32
lines changed

3 files changed

+49
-32
lines changed

ggml-alloc.c

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -792,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
792792
} else {
793793
ggml_backend_view_init(buffer, t);
794794
}
795+
} else {
796+
if (t->view_src != NULL) {
797+
// view of a pre-allocated tensor
798+
ggml_backend_view_init(buffer, t);
799+
}
795800
}
796801
}
797802

ggml-backend.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1250,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
12501250
// utils
12511251
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
12521252
GGML_ASSERT(tensor->buffer == NULL);
1253-
GGML_ASSERT(tensor->data == NULL);
1253+
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized
12541254
GGML_ASSERT(tensor->view_src != NULL);
12551255
GGML_ASSERT(tensor->view_src->buffer != NULL);
12561256
GGML_ASSERT(tensor->view_src->data != NULL);

llama.cpp

Lines changed: 43 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3661,7 +3661,7 @@ static void llm_load_tensors(
36613661
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
36623662
}
36633663

3664-
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3664+
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
36653665
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
36663666

36673667
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -9830,17 +9830,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
98309830
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
98319831
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
98329832

9833-
std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
9834-
std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
9833+
std::vector<struct ggml_tensor *> kout2d(n_layer);
9834+
std::vector<struct ggml_tensor *> vout2d(n_layer);
98359835

98369836
for (int il = 0; il < (int) n_layer; ++il) {
9837-
ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9838-
kout2d_data[il].resize(ggml_nbytes(kout2d));
9839-
kout2d->data = kout2d_data[il].data();
9840-
9841-
ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9842-
vout2d_data[il].resize(ggml_nbytes(vout2d));
9843-
vout2d->data = vout2d_data[il].data();
9837+
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9838+
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
98449839

98459840
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
98469841
n_embd, kv_head,
@@ -9850,21 +9845,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
98509845
kv_head, n_embd,
98519846
elt_size*n_ctx, 0);
98529847

9853-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
9854-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
9848+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
9849+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
98559850
}
98569851

9857-
std::vector<uint8_t> work_buffer;
9858-
ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
9852+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
98599853

9860-
ggml_free(cpy_ctx);
9854+
ggml_backend_graph_compute(ctx->backend, gf);
9855+
9856+
std::vector<uint8_t> tmp_buf;
9857+
for (int il = 0; il < (int) n_layer; ++il) {
9858+
tmp_buf.resize(ggml_nbytes(kout2d[il]));
9859+
ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
9860+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
98619861

9862-
// our data is now in the kout2d_data and vout2d_data buffers
9863-
// write them to file
9864-
for (uint32_t il = 0; il < n_layer; ++il) {
9865-
data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
9866-
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
9862+
tmp_buf.resize(ggml_nbytes(vout2d[il]));
9863+
ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
9864+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
98679865
}
9866+
9867+
ggml_free(cpy_ctx);
9868+
9869+
ggml_backend_buffer_free(buf);
98689870
}
98699871

98709872
for (uint32_t i = 0; i < kv_size; ++i) {
@@ -9969,14 +9971,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
99699971
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
99709972
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
99719973

9972-
for (int il = 0; il < n_layer; ++il) {
9973-
ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9974-
kin2d->data = (void *) inp;
9975-
inp += ggml_nbytes(kin2d);
9974+
std::vector<struct ggml_tensor *> kin2d(n_layer);
9975+
std::vector<struct ggml_tensor *> vin2d(n_layer);
99769976

9977-
ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9978-
vin2d->data = (void *) inp;
9979-
inp += ggml_nbytes(vin2d);
9977+
for (int il = 0; il < n_layer; ++il) {
9978+
kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9979+
vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
99809980

99819981
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
99829982
n_embd, kv_head,
@@ -9986,14 +9986,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
99869986
kv_head, n_embd,
99879987
elt_size*n_ctx, 0);
99889988

9989-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
9990-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
9989+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
9990+
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
9991+
}
9992+
9993+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
9994+
9995+
// load data into the tensors
9996+
for (int il = 0; il < n_layer; ++il) {
9997+
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
9998+
inp += ggml_nbytes(kin2d[il]);
9999+
10000+
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
10001+
inp += ggml_nbytes(vin2d[il]);
999110002
}
999210003

9993-
std::vector<uint8_t> work_buffer;
9994-
ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
10004+
ggml_backend_graph_compute(ctx->backend, gf);
999510005

999610006
ggml_free(cpy_ctx);
10007+
10008+
ggml_backend_buffer_free(buf);
999710009
}
999810010

999910011
ctx->kv_self.head = kv_head;

0 commit comments

Comments (0)