Skip to content

Commit 506c0ad

Browse files
committed
kompute : make partial tensor copies faster by syncing less data (#15)
Signed-off-by: Jared Van Bortel <[email protected]>
1 parent 6596b7c commit 506c0ad

File tree

2 files changed

+40
-23
lines changed

2 files changed

+40
-23
lines changed

ggml-kompute.cpp

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262

6363
typedef ggml_fp16_t half;
6464

65+
static const std::shared_ptr<kp::Tensor> nullTensor = nullptr;
66+
6567
// Build the backend display name for a Vulkan device index, e.g. "Kompute0".
static std::string ggml_kompute_format_name(int device) {
    std::string name = "Kompute";
    name += std::to_string(device);
    return name;
}
@@ -585,31 +587,47 @@ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & of
585587
}
586588

587589
static
588-
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
589-
uint64_t originalOffset = 0;
590-
auto * res = ggml_vk_find_tensor(t, originalOffset);
590+
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor_aligned(const struct ggml_tensor * t, uint32_t * aligned_offset) {
591+
uint64_t original_offset = 0;
592+
auto * res = ggml_vk_find_tensor(t, original_offset);
591593
if (!res) {
592-
static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
593594
return nullTensor;
594595
}
595596

596597
// Create a tensor whose memory will be composed of our buffers at the correct offset
597-
const size_t nelements = ggml_nelements(t);
598598
size_t nbytes = ggml_nbytes(t);
599+
size_t vulkan_offset = ggml_vk_aligned_offset(t->buffer, original_offset);
600+
*aligned_offset = original_offset - vulkan_offset;
601+
nbytes += *aligned_offset;
602+
603+
return komputeManager()->tensor(
604+
t->data,
605+
ggml_nelements(t), nbytes,
606+
kp::Tensor::TensorDataTypes::eFloat,
607+
res->primaryMemory, res->primaryBuffer,
608+
res->stagingMemory, res->stagingBuffer,
609+
vulkan_offset);
610+
}
599611

600-
size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
601-
if (alignedOffset) {
602-
*alignedOffset = originalOffset - vulkanOffset;
603-
nbytes += *alignedOffset;
612+
static
613+
const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor_slice(const struct ggml_tensor * t, size_t offset, size_t nbytes) {
614+
uint64_t tensor_offset = 0;
615+
auto * res = ggml_vk_find_tensor(t, tensor_offset);
616+
if (!res) {
617+
return nullTensor;
604618
}
605619

620+
size_t elsz = ggml_element_size(t);
621+
GGML_ASSERT(nbytes % elsz == 0);
622+
623+
// Create a tensor whose memory will be composed of our buffers at the correct offset
606624
return komputeManager()->tensor(
607-
t->data,
608-
nelements,
609-
nbytes, kp::Tensor::TensorDataTypes::eFloat,
625+
reinterpret_cast<char *>(t->data) + offset,
626+
nbytes / elsz, nbytes,
627+
kp::Tensor::TensorDataTypes::eFloat,
610628
res->primaryMemory, res->primaryBuffer,
611629
res->stagingMemory, res->stagingBuffer,
612-
vulkanOffset);
630+
tensor_offset + offset);
613631
}
614632

615633
static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
@@ -1551,13 +1569,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
15511569
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
15521570
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
15531571

1554-
const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
15551572
uint32_t off_src0 = 0;
15561573
uint32_t off_src1 = 0;
15571574
uint32_t off_dst = 0;
1558-
const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1559-
const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
1560-
const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1575+
const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor_aligned(src0, &off_src0) : nullTensor;
1576+
const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor_aligned(src1, &off_src1) : nullTensor;
1577+
const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor_aligned(dst, &off_dst) : nullTensor;
15611578

15621579
switch (dst->op) {
15631580
case GGML_OP_ADD:
@@ -1876,7 +1893,7 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer)
18761893
static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
18771894
GGML_UNUSED(buffer);
18781895

1879-
const auto res = ggml_vk_get_tensor(tensor);
1896+
const auto res = ggml_vk_get_tensor_slice(tensor, offset, size);
18801897
GGML_ASSERT(res);
18811898

18821899
memcpy((char *)tensor->data + offset, data, size);
@@ -1887,7 +1904,7 @@ static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer,
18871904
static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
18881905
GGML_UNUSED(buffer);
18891906

1890-
const auto res = ggml_vk_get_tensor(tensor);
1907+
const auto res = ggml_vk_get_tensor_slice(tensor, offset, size);
18911908
GGML_ASSERT(res);
18921909

18931910
komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});

llama.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16447,7 +16447,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
1644716447
data_ctx->write(&kv_used, sizeof(kv_used));
1644816448
data_ctx->write(&v_trans, sizeof(v_trans));
1644916449

16450-
if (kv_buf_size) {
16450+
if (kv_buf_size && kv_head) {
1645116451
const size_t pre_kv_buf_size = data_ctx->get_size_written();
1645216452

1645316453
std::vector<uint8_t> tmp_buf;
@@ -16611,10 +16611,10 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
1661116611

1661216612
llama_kv_cache_clear(ctx);
1661316613

16614-
if (kv_buf_size) {
16615-
const size_t pre_kv_buf_size = inp - src;
16614+
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
1661616615

16617-
GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
16616+
if (kv_buf_size && kv_head) {
16617+
const size_t pre_kv_buf_size = inp - src;
1661816618

1661916619
for (int il = 0; il < (int) n_layer; ++il) {
1662016620
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);

0 commit comments

Comments (0)