@@ -411,7 +411,7 @@ struct vk_op_unary_push_constants {
411
411
uint32_t ne;
412
412
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
413
413
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
414
- uint32_t d_offset ;
414
+ uint32_t misalign_offsets ;
415
415
float param1; float param2;
416
416
uint32_t ne0_012mp; uint32_t ne0_012L;
417
417
uint32_t ne0_01mp; uint32_t ne0_01L;
@@ -459,7 +459,7 @@ struct vk_op_binary_push_constants {
459
459
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
460
460
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
461
461
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
462
- uint32_t d_offset ;
462
+ uint32_t misalign_offsets ;
463
463
float param1; float param2; int32_t param3;
464
464
};
465
465
@@ -546,7 +546,7 @@ struct vk_staging_memcpy {
546
546
};
547
547
548
548
struct vk_op_upscale_push_constants {
549
- uint32_t ne; uint32_t d_offset;
549
+ uint32_t ne; uint32_t a_offset; uint32_t d_offset;
550
550
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
551
551
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
552
552
float sf0; float sf1; float sf2; float sf3;
@@ -5076,6 +5076,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
5076
5076
}
5077
5077
}
5078
5078
5079
+ static uint32_t get_misalign_bytes (ggml_backend_vk_context * ctx, const ggml_tensor * t)
5080
+ {
5081
+ return ((vk_tensor_offset (t) + t->view_offs ) & (ctx->device ->properties .limits .minStorageBufferOffsetAlignment - 1 ));;
5082
+ }
5083
+
5084
+ template <typename T> void init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5085
+ GGML_UNUSED (p);
5086
+ GGML_UNUSED (src0);
5087
+ GGML_UNUSED (src1);
5088
+ GGML_UNUSED (src2);
5089
+ GGML_UNUSED (dst);
5090
+ static_assert (!std::is_const<T>::value, " unexpected type" );
5091
+ GGML_ASSERT (!src0 || get_misalign_bytes (ctx, src0) == 0 );
5092
+ GGML_ASSERT (!src1 || get_misalign_bytes (ctx, src1) == 0 );
5093
+ GGML_ASSERT (!src2 || get_misalign_bytes (ctx, src2) == 0 );
5094
+ GGML_ASSERT (!dst || get_misalign_bytes (ctx, dst) == 0 );
5095
+ }
5096
+
5097
+ template <> void init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5098
+ const uint32_t a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5099
+ const uint32_t d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5100
+
5101
+ p.misalign_offsets = (a_offset << 16 ) | d_offset;
5102
+
5103
+ GGML_UNUSED (src1);
5104
+ GGML_UNUSED (src2);
5105
+ }
5106
+
5107
+ template <> void init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5108
+ const uint32_t a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5109
+ const uint32_t b_offset = get_misalign_bytes (ctx, src1) / ggml_type_size (src1->type );
5110
+ const uint32_t d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5111
+
5112
+ GGML_ASSERT (dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0 ));
5113
+
5114
+ p.misalign_offsets = (a_offset << 16 ) | (b_offset << 8 ) | d_offset;
5115
+
5116
+ GGML_UNUSED (src2);
5117
+ }
5118
+
5119
+ template <> void init_pushconst_tensor_offsets (ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
5120
+ const uint32_t a_offset = get_misalign_bytes (ctx, src0) / ggml_type_size (src0->type );
5121
+ const uint32_t d_offset = get_misalign_bytes (ctx, dst) / ggml_type_size (dst->type );
5122
+
5123
+ p.a_offset = a_offset;
5124
+ p.d_offset = d_offset;
5125
+
5126
+ GGML_UNUSED (src1);
5127
+ GGML_UNUSED (src2);
5128
+ }
5129
+
5079
5130
template <typename PC>
5080
5131
static void ggml_vk_op_f32 (ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false ) {
5081
5132
VK_LOG_DEBUG (" ggml_vk_op_f32((" << src0 << " , name=" << src0->name << " , type=" << src0->type << " , ne0=" << src0->ne [0 ] << " , ne1=" << src0->ne [1 ] << " , ne2=" << src0->ne [2 ] << " , ne3=" << src0->ne [3 ] << " , nb0=" << src0->nb [0 ] << " , nb1=" << src0->nb [1 ] << " , nb2=" << src0->nb [2 ] << " , nb3=" << src0->nb [3 ];
@@ -5179,8 +5230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5179
5230
}
5180
5231
5181
5232
GGML_ASSERT (d_D != nullptr );
5182
- uint64_t d_buf_offset = ((vk_tensor_offset (dst) + dst->view_offs ) / ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) * ctx->device ->properties .limits .minStorageBufferOffsetAlignment ;
5183
- GGML_ASSERT (d_buf_offset == vk_tensor_offset (dst) || op == GGML_OP_CPY); // NOLINT
5233
+ uint64_t d_buf_offset = vk_tensor_offset (dst) + dst->view_offs ;
5184
5234
if (!src0_uma) {
5185
5235
d_X = src0_buf_ctx->dev_buffer ;
5186
5236
x_buf_offset = vk_tensor_offset (src0) + src0->view_offs ;
@@ -5196,6 +5246,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
5196
5246
z_buf_offset = vk_tensor_offset (src2) + src2->view_offs ;
5197
5247
GGML_ASSERT (d_Z != nullptr );
5198
5248
}
5249
+ // Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
5250
+ init_pushconst_tensor_offsets (ctx, pc, src0, src1, src2, dst);
5251
+ x_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment - 1 );
5252
+ y_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment - 1 );
5253
+ z_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment - 1 );
5254
+ d_buf_offset &= ~(ctx->device ->properties .limits .minStorageBufferOffsetAlignment - 1 );
5199
5255
5200
5256
if (op_supports_incontiguous) {
5201
5257
x_sz = ggml_nbytes (src0);
@@ -5383,7 +5439,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5383
5439
const uint32_t src0_type_size = ggml_type_size (src0->type );
5384
5440
const uint32_t src1_type_size = ggml_type_size (src1->type );
5385
5441
const uint32_t dst_type_size = ggml_type_size (dst->type );
5386
- const uint32_t d_offset = ((vk_tensor_offset (dst) + dst->view_offs ) % ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) / dst_type_size;
5387
5442
5388
5443
int nb1 = dst->op_params [0 ] / 4 ; // 4 bytes of float32
5389
5444
int nb2 = dst->op_params [1 ] / 4 ; // 4 bytes of float32
@@ -5395,7 +5450,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
5395
5450
(uint32_t )src0->ne [0 ], (uint32_t )src0->ne [1 ], (uint32_t )src0->ne [2 ],(uint32_t )src0->ne [3 ], (uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )nb1, (uint32_t )nb2, (uint32_t )src0->nb [3 ] / src0_type_size,
5396
5451
(uint32_t )src1->ne [0 ], (uint32_t )src1->ne [1 ], (uint32_t )src1->ne [2 ],(uint32_t )src1->ne [3 ], (uint32_t )src1->nb [0 ] / src1_type_size, (uint32_t )src1->nb [1 ] / src1_type_size, (uint32_t )src1->nb [2 ] / src1_type_size, (uint32_t )src1->nb [3 ] / src1_type_size,
5397
5452
(uint32_t ) dst->ne [0 ], (uint32_t ) dst->ne [1 ], (uint32_t ) dst->ne [2 ],(uint32_t ) dst->ne [3 ], (uint32_t ) dst->nb [0 ] / dst_type_size, (uint32_t )nb1, (uint32_t )nb2, (uint32_t ) dst->nb [3 ] / dst_type_size,
5398
- d_offset ,
5453
+ 0 ,
5399
5454
0 .0f , 0 .0f , offset,
5400
5455
}, dryrun);
5401
5456
}
@@ -5599,7 +5654,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
5599
5654
const float sf3 = (float )dst->ne [3 ] / src0->ne [3 ];
5600
5655
5601
5656
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr , nullptr , dst, GGML_OP_UPSCALE, {
5602
- (uint32_t )ggml_nelements (dst), 0 ,
5657
+ (uint32_t )ggml_nelements (dst), 0 , 0 ,
5603
5658
(uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )src0->nb [1 ] / src0_type_size, (uint32_t )src0->nb [2 ] / src0_type_size, (uint32_t )src0->nb [3 ] / src0_type_size,
5604
5659
(uint32_t )dst->ne [0 ], (uint32_t )dst->ne [1 ], (uint32_t )dst->ne [2 ],(uint32_t )dst->ne [3 ],
5605
5660
sf0, sf1, sf2, sf3,
@@ -5709,13 +5764,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
5709
5764
static void ggml_vk_cpy (ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false ) {
5710
5765
const uint32_t src0_type_size = ggml_type_size (src0->type );
5711
5766
const uint32_t dst_type_size = ggml_type_size (dst->type );
5712
- const uint32_t d_offset = ((vk_tensor_offset (dst) + dst->view_offs ) % ctx->device ->properties .limits .minStorageBufferOffsetAlignment ) / dst_type_size;
5713
5767
5714
5768
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr , nullptr , dst, GGML_OP_CPY, {
5715
5769
(uint32_t )ggml_nelements (src0),
5716
5770
(uint32_t )src0->ne [0 ], (uint32_t )src0->ne [1 ], (uint32_t )src0->ne [2 ], (uint32_t )src0->ne [3 ], (uint32_t )src0->nb [0 ] / src0_type_size, (uint32_t )src0->nb [1 ] / src0_type_size, (uint32_t )src0->nb [2 ] / src0_type_size, (uint32_t )src0->nb [3 ] / src0_type_size,
5717
5771
(uint32_t ) dst->ne [0 ], (uint32_t ) dst->ne [1 ], (uint32_t ) dst->ne [2 ], (uint32_t ) dst->ne [3 ], (uint32_t ) dst->nb [0 ] / dst_type_size, (uint32_t ) dst->nb [1 ] / dst_type_size, (uint32_t ) dst->nb [2 ] / dst_type_size, (uint32_t ) dst->nb [3 ] / dst_type_size,
5718
- d_offset ,
5772
+ 0 ,
5719
5773
0 .0f , 0 .0f ,
5720
5774
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
5721
5775
}, dryrun);
0 commit comments