Skip to content

Commit fa654d0

Browse files
committed
kompute : make partial tensor copies faster by syncing less data (#15)
Signed-off-by: Jared Van Bortel <[email protected]>
1 parent 75204ba commit fa654d0

File tree

2 files changed

+39
-22
lines changed

2 files changed

+39
-22
lines changed

ggml-kompute.cpp

Lines changed: 36 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,8 @@
 
 typedef ggml_fp16_t half;
 
+static const std::shared_ptr<kp::Tensor> nullTensor = nullptr;
+
 static std::string ggml_kompute_format_name(int device) {
     return "Kompute" + std::to_string(device);
 }
@@ -585,31 +587,47 @@ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & of
 }
 
 static
-const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
-    uint64_t originalOffset = 0;
-    auto * res = ggml_vk_find_tensor(t, originalOffset);
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor_aligned(const struct ggml_tensor * t, uint32_t * aligned_offset) {
+    uint64_t original_offset = 0;
+    auto * res = ggml_vk_find_tensor(t, original_offset);
     if (!res) {
-        static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
         return nullTensor;
     }
 
     // Create a tensor whose memory will be composed of our buffers at the correct offset
-    const size_t nelements = ggml_nelements(t);
     size_t nbytes = ggml_nbytes(t);
+    size_t vulkan_offset = ggml_vk_aligned_offset(t->buffer, original_offset);
+    *aligned_offset = original_offset - vulkan_offset;
+    nbytes += *aligned_offset;
+
+    return komputeManager()->tensor(
+        t->data,
+        ggml_nelements(t), nbytes,
+        kp::Tensor::TensorDataTypes::eFloat,
+        res->primaryMemory, res->primaryBuffer,
+        res->stagingMemory, res->stagingBuffer,
+        vulkan_offset);
+}
 
-    size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
-    if (alignedOffset) {
-        *alignedOffset = originalOffset - vulkanOffset;
-        nbytes += *alignedOffset;
+static
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor_slice(const struct ggml_tensor * t, size_t offset, size_t nbytes) {
+    uint64_t tensor_offset = 0;
+    auto * res = ggml_vk_find_tensor(t, tensor_offset);
+    if (!res) {
+        return nullTensor;
     }
 
+    size_t elsz = ggml_element_size(t);
+    GGML_ASSERT(nbytes % elsz == 0);
+
+    // Create a tensor whose memory will be composed of our buffers at the correct offset
     return komputeManager()->tensor(
-        t->data,
-        nelements,
-        nbytes, kp::Tensor::TensorDataTypes::eFloat,
+        reinterpret_cast<char *>(t->data) + offset,
+        nbytes / elsz, nbytes,
+        kp::Tensor::TensorDataTypes::eFloat,
         res->primaryMemory, res->primaryBuffer,
         res->stagingMemory, res->stagingBuffer,
-        vulkanOffset);
+        tensor_offset + offset);
 }
 
 static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
@@ -1546,13 +1564,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
     const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
 
-    const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
     uint32_t off_src0 = 0;
     uint32_t off_src1 = 0;
     uint32_t off_dst = 0;
-    const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
-    const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
-    const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
+    const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor_aligned(src0, &off_src0) : nullTensor;
+    const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor_aligned(src1, &off_src1) : nullTensor;
+    const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor_aligned(dst, &off_dst) : nullTensor;
 
     switch (dst->op) {
         case GGML_OP_ADD:
@@ -1865,7 +1882,7 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer)
 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_UNUSED(buffer);
 
-    const auto res = ggml_vk_get_tensor(tensor);
+    const auto res = ggml_vk_get_tensor_slice(tensor, offset, size);
     GGML_ASSERT(res);
 
     memcpy((char *)tensor->data + offset, data, size);
@@ -1876,7 +1893,7 @@ static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer,
 static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_UNUSED(buffer);
 
-    const auto res = ggml_vk_get_tensor(tensor);
+    const auto res = ggml_vk_get_tensor_slice(tensor, offset, size);
     GGML_ASSERT(res);
 
     komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});

llama.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12180,7 +12180,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     data_ctx->write(&kv_size, sizeof(kv_size));
     data_ctx->write(&kv_used, sizeof(kv_used));
 
-    if (kv_buf_size) {
+    if (kv_buf_size && kv_head) {
         const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
         std::vector<uint8_t> tmp_buf;
@@ -12291,9 +12291,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
     memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
 
-    if (kv_buf_size) {
-        GGML_ASSERT(kv_self.total_size() == kv_buf_size);
+    GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
+    if (kv_buf_size && kv_head) {
         const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
         for (int il = 0; il < (int) n_layer; ++il) {

0 commit comments

Comments (0)