Skip to content

Commit fdd2188

Browse files
authored
vulkan: Use push constant offset to handle misaligned descriptors (#10987)
1 parent f865ea1 commit fdd2188

19 files changed

+103
-42
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
411411
uint32_t ne;
412412
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413413
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414-
uint32_t d_offset;
414+
uint32_t misalign_offsets;
415415
float param1; float param2;
416416
uint32_t ne0_012mp; uint32_t ne0_012L;
417417
uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
459459
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460460
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461461
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462-
uint32_t d_offset;
462+
uint32_t misalign_offsets;
463463
float param1; float param2; int32_t param3;
464464
};
465465

@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
546546
};
547547

548548
struct vk_op_upscale_push_constants {
549-
uint32_t ne; uint32_t d_offset;
549+
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
550550
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551551
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552552
float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
50765076
}
50775077
}
50785078

5079+
// Returns how many bytes the tensor's start (vk_tensor_offset(t) + t->view_offs) lies past
// the previous multiple of the device's minStorageBufferOffsetAlignment.
// The mask form relies on that alignment being a power of two.
static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
{
    const uint64_t align_mask = ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1;
    // The masked value is at most (alignment - 1); the narrowing to 32 bits is made explicit
    // instead of happening silently on return.
    return static_cast<uint32_t>((vk_tensor_offset(t) + t->view_offs) & align_mask);
}
5083+
5084+
// Generic fallback for push-constant structs without a dedicated specialization.
// Nothing is written to `p`; instead each tensor that is present (non-null) must already
// start on an aligned offset, i.e. get_misalign_bytes() must be 0 for it.
template <typename T>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p,
                                   const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    // T must not be deduced as a const-qualified type.
    static_assert(!std::is_const<T>::value, "unexpected type");

    GGML_UNUSED(p);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(src2);
    GGML_UNUSED(dst);

    // Checked in the order src0, src1, src2, dst; absent tensors are skipped.
    GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
    GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
    GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
    GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
}
5096+
5097+
// Unary push constants: convert the byte misalignment of src0 and dst into element counts
// (bytes / ggml_type_size) and pack both into misalign_offsets, with src0 shifted up by
// 16 bits and dst in the low bits. src1 and src2 are not used.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p,
                                   const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src1);
    GGML_UNUSED(src2);

    const uint32_t src_elems = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t dst_elems = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

    p.misalign_offsets = dst_elems | (src_elems << 16);
}
5106+
5107+
// Binary push constants: convert the byte misalignment of src0, src1 and dst into element
// counts (bytes / ggml_type_size) and pack them into misalign_offsets as
// (a << 16) | (b << 8) | d. src2 is not used.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p,
                                   const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src2);

    const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
    const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

    // GET_ROWS is only accepted when all three tensors are aligned.
    GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));

    uint32_t packed = d_offset;
    packed |= b_offset << 8;
    packed |= a_offset << 16;
    p.misalign_offsets = packed;
}
5118+
5119+
// Upscale push constants have separate a_offset / d_offset fields, so the element-granular
// misalignment (bytes / ggml_type_size) of src0 and dst is stored unpacked.
// src1 and src2 are not used.
template <>
void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p,
                                   const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2,
                                   ggml_tensor * dst) {
    GGML_UNUSED(src1);
    GGML_UNUSED(src2);

    const uint32_t src_elems = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
    const uint32_t dst_elems = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);

    p.a_offset = src_elems;
    p.d_offset = dst_elems;
}
5129+
50795130
template<typename PC>
50805131
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
50815132
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
51795230
}
51805231

51815232
GGML_ASSERT(d_D != nullptr);
5182-
uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
5183-
GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
5233+
uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
51845234
if(!src0_uma) {
51855235
d_X = src0_buf_ctx->dev_buffer;
51865236
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
51965246
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
51975247
GGML_ASSERT(d_Z != nullptr);
51985248
}
5249+
// Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
5250+
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
5251+
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5252+
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5253+
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
5254+
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
51995255

52005256
if (op_supports_incontiguous) {
52015257
x_sz = ggml_nbytes(src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
53835439
const uint32_t src0_type_size = ggml_type_size(src0->type);
53845440
const uint32_t src1_type_size = ggml_type_size(src1->type);
53855441
const uint32_t dst_type_size = ggml_type_size(dst->type);
5386-
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
53875442

53885443
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
53895444
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
53955450
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
53965451
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
53975452
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
5398-
d_offset,
5453+
0,
53995454
0.0f, 0.0f, offset,
54005455
}, dryrun);
54015456
}
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
55995654
const float sf3 = (float)dst->ne[3] / src0->ne[3];
56005655

56015656
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
5602-
(uint32_t)ggml_nelements(dst), 0,
5657+
(uint32_t)ggml_nelements(dst), 0, 0,
56035658
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
56045659
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
56055660
sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
57095764
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
57105765
const uint32_t src0_type_size = ggml_type_size(src0->type);
57115766
const uint32_t dst_type_size = ggml_type_size(dst->type);
5712-
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
57135767

57145768
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
57155769
(uint32_t)ggml_nelements(src0),
57165770
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
57175771
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
5718-
d_offset,
5772+
0,
57195773
0.0f, 0.0f,
57205774
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57215775
}, dryrun);

ggml/src/ggml-vulkan/vulkan-shaders/acc.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ void main() {
2121
get_indices(idx, i00, i01, i02, i03);
2222

2323
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
24-
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
24+
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
2525
} else {
26-
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]));
26+
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
2727
}
2828
}
2929

ggml/src/ggml-vulkan/vulkan-shaders/add.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ void main() {
2222
uint i00, i01, i02, i03;
2323
get_indices(idx, i00, i01, i02, i03);
2424

25-
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
25+
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
2626

2727
idx += num_threads;
2828
}

ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ void main() {
1212
return;
1313
}
1414

15-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
15+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
1717
}

ggml/src/ggml-vulkan/vulkan-shaders/concat.comp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ void main() {
3030
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
3131

3232
#ifndef OPTIMIZATION_ERROR_WORKAROUND
33-
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
33+
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
3434
#else
3535
if (is_src0) {
36-
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
36+
data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
3737
} else {
38-
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
38+
data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
3939
}
4040
#endif
4141
}

ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ void main() {
1919
if (idx + (num_iter-1)*num_threads < p.ne) {
2020
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
2121
#ifndef OPTIMIZATION_ERROR_WORKAROUND
22-
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
22+
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
2323
#else
24-
data_d[p.d_offset + idx] = data_a[idx];
24+
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
2525
#endif
2626
idx += num_threads;
2727
}
@@ -32,9 +32,9 @@ void main() {
3232
}
3333

3434
#ifndef OPTIMIZATION_ERROR_WORKAROUND
35-
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
35+
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
3636
#else
37-
data_d[p.d_offset + idx] = data_a[idx];
37+
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
3838
#endif
3939
idx += num_threads;
4040
}

ggml/src/ggml-vulkan/vulkan-shaders/copy.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ void main() {
1313
}
1414

1515
#ifndef OPTIMIZATION_ERROR_WORKAROUND
16-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
16+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
1717
#else
18-
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
18+
data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
1919
#endif
2020
}

ggml/src/ggml-vulkan/vulkan-shaders/cos.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ void main() {
1212
return;
1313
}
1414

15-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
15+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
1717
}

ggml/src/ggml-vulkan/vulkan-shaders/div.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ void main() {
2020
uint i00, i01, i02, i03;
2121
get_indices(idx, i00, i01, i02, i03);
2222

23-
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
23+
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
2424

2525
idx += num_threads;
2626
}

ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
77
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
88
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
99
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
10-
uint d_offset;
10+
uint misalign_offsets;
1111
float param1; float param2; int param3;
1212
} p;
1313

@@ -22,6 +22,10 @@ uint get_idx() {
2222
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
2323
}
2424

25+
uint get_aoffset() { return p.misalign_offsets >> 16; } // src0 (A) element offset: bits 16 and up of the packed word
26+
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; } // src1 (B) element offset: bits 8..15 of the packed word
27+
uint get_doffset() { return p.misalign_offsets & 0xFF; } // dst (D) element offset: bits 0..7 of the packed word
28+
2529
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
2630
uint fastmod(uint a, uint b) {
2731
if ((b & (b-1)) == 0) {

ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
66
uint ne;
77
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
88
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
9-
uint d_offset;
9+
uint misalign_offsets;
1010
float param1; float param2;
1111

1212
uint ne0_012mp; uint ne0_012L;
@@ -24,6 +24,9 @@ uint get_idx() {
2424
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
2525
}
2626

27+
uint get_aoffset() { return p.misalign_offsets >> 16; } // src0 (A) element offset: upper 16 bits of the packed word
28+
uint get_doffset() { return p.misalign_offsets & 0xFFFF; } // dst (D) element offset: lower 16 bits of the packed word
29+
2730
// see init_fastdiv_values in ggml-vulkan.cpp
2831
uint fastdiv(uint n, uint mp, uint L) {
2932
uint msbs, lsbs;

ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ void main() {
1515
return;
1616
}
1717

18-
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
18+
const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
1919

20-
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21-
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
20+
const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
21+
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
2222

2323
#ifndef OPTIMIZATION_ERROR_WORKAROUND
2424
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);

ggml/src/ggml-vulkan/vulkan-shaders/mul.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ void main() {
2020
uint i00, i01, i02, i03;
2121
get_indices(idx, i00, i01, i02, i03);
2222

23-
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)]));
23+
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
2424

2525
idx += num_threads;
2626
}

ggml/src/ggml-vulkan/vulkan-shaders/pad.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@ void main() {
2424

2525
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
2626

27-
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
27+
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
2828
}

ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ void main() {
2222
return;
2323
}
2424

25-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
25+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
2626
}

ggml/src/ggml-vulkan/vulkan-shaders/scale.comp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ void main() {
1818
continue;
1919
}
2020

21-
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
21+
data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
2222
idx += num_threads;
2323
}
2424
}

ggml/src/ggml-vulkan/vulkan-shaders/sin.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ void main() {
1212
return;
1313
}
1414

15-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
15+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
1717
}

ggml/src/ggml-vulkan/vulkan-shaders/square.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ void main() {
1212
return;
1313
}
1414

15-
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
16-
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
15+
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
16+
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
1717
}

ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
layout (push_constant) uniform parameter
44
{
5-
uint ne; uint d_offset;
5+
uint ne; uint a_offset; uint d_offset;
66
uint nb00; uint nb01; uint nb02; uint nb03;
77
uint ne10; uint ne11; uint ne12; uint ne13;
88
float sf0; float sf1; float sf2; float sf3;
@@ -32,5 +32,5 @@ void main() {
3232
const uint i02 = uint(i12 / p.sf2);
3333
const uint i03 = uint(i13 / p.sf3);
3434

35-
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
35+
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
3636
}

0 commit comments

Comments
 (0)