@@ -224,12 +224,14 @@ struct ggml_backend_opencl_context {
     cl_program program;
     cl_program program_1;
     cl_program program_2;
+    cl_program program_im2col;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
+    cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
@@ -239,6 +241,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
     cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
     cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
+    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
     cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
     cl_kernel kernel_mul_mat_f32_f32;
     cl_kernel kernel_mul_mat_f16_f16;
@@ -252,6 +255,7 @@ struct ggml_backend_opencl_context {
               kernel_mul_mat_q4_0_f32_flat_img_v0;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
     cl_kernel kernel_mul_mv_q6_K_f32;
+    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -708,6 +712,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
     CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
     CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program, "kernel_gelu_quick", &err), err));
+    CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_quick_4", &err), err));
     CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
     CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
     CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
@@ -722,6 +728,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
     CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
     CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f16", &err), err));
     CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
     CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
     CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
@@ -769,6 +779,19 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
 
+    // im2col kernels
+#ifdef GGML_OPENCL_EMBED_KERNELS
+    const std::string kernel_src_im2col {
+        #include "ggml-opencl_im2col.cl.h"
+    };
+#else
+    const std::string kernel_src_im2col = read_file("ggml-opencl_im2col.cl");
+#endif
+    backend_ctx->program_im2col = build_program_from_source(context, device, kernel_src_im2col.c_str(), compile_opts);
+
+    CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f32", &err), err));
+    CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f16", &err), err));
+
     // Kernels for Adreno
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1187,6 +1210,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
+            case GGML_UNARY_OP_GELU_QUICK:
                 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             default:
                 return false;
@@ -1216,14 +1240,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return op->ne[3] == 1;
         case GGML_OP_ROPE: {
             const int mode = ((const int32_t *) op->op_params)[2];
-            if (mode & GGML_ROPE_TYPE_MROPE) {
+            const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+            const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+            if (is_mrope && !is_vision) {
+                if (op->src[0]->type == GGML_TYPE_F32 ||
+                    op->src[0]->type == GGML_TYPE_F16) {
+                    return true;
+                }
                 return false;
             }
-            if (mode & GGML_ROPE_TYPE_VISION) {
+            if (is_vision) {
+                if (op->src[0]->type == GGML_TYPE_F32 ||
+                    op->src[0]->type == GGML_TYPE_F16) {
+                    return true;
+                }
                 return false;
             }
             return true;
         }
+        case GGML_OP_IM2COL:
+            return true;
         default:
             return false;
     }
@@ -2582,6 +2618,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
 #endif
 }
 
+static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_gelu_quick_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_gelu_quick;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+#endif
+}
+
 static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -3980,36 +4063,70 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     float attn_factor;
     float beta_fast;
     float beta_slow;
+    int32_t sections[4];
 
     memcpy(&freq_base,   (int32_t *) dst->op_params + 5,  sizeof(float));
     memcpy(&freq_scale,  (int32_t *) dst->op_params + 6,  sizeof(float));
     memcpy(&ext_factor,  (int32_t *) dst->op_params + 7,  sizeof(float));
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8,  sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params + 9,  sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
 
     const bool is_neox = mode & 2;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }
 
     cl_kernel kernel;
 
-    if (!is_neox) {
+    if (is_neox) {
         switch (src0->type) {
             case GGML_TYPE_F32:
-                kernel = backend_ctx->kernel_rope_norm_f32;
+                kernel = backend_ctx->kernel_rope_neox_f32;
                 break;
             case GGML_TYPE_F16:
-                kernel = backend_ctx->kernel_rope_norm_f16;
+                kernel = backend_ctx->kernel_rope_neox_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        };
+    } else if (is_mrope && !is_vision) {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_multi_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_multi_f16;
                 break;
             default:
                 GGML_ASSERT(false);
         };
+    } else if (is_vision) {
+        switch (src0->type) {
+            case GGML_TYPE_F32:
+                kernel = backend_ctx->kernel_rope_vision_f32;
+                break;
+            case GGML_TYPE_F16:
+                kernel = backend_ctx->kernel_rope_vision_f16;
+                break;
+            default:
+                GGML_ASSERT(false);
+        }
     } else {
         switch (src0->type) {
             case GGML_TYPE_F32:
-                kernel = backend_ctx->kernel_rope_neox_f32;
+                kernel = backend_ctx->kernel_rope_norm_f32;
                 break;
             case GGML_TYPE_F16:
-                kernel = backend_ctx->kernel_rope_neox_f16;
+                kernel = backend_ctx->kernel_rope_norm_f16;
                 break;
             default:
                 GGML_ASSERT(false);
@@ -4049,6 +4166,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float),   &attn_factor));
     CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float),   &beta_fast));
     CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float),   &beta_slow));
+    if (is_mrope || is_vision) {
+        CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
+    }
 
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
@@ -4064,6 +4184,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
 #endif
 }
 
+static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    // src0 - filter, src1 - input
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const cl_long IC = src1->ne[is_2D ? 2 : 1];
+    const cl_long IH = is_2D ? src1->ne[1] : 1;
+    const cl_long IW = src1->ne[0];
+
+    const cl_long KH = is_2D ? src0->ne[1] : 1;
+    const cl_long KW = src0->ne[0];
+
+    const cl_long OH = is_2D ? dst->ne[2] : 1;
+    const cl_long OW = dst->ne[1];
+
+    // nb is byte offset, src is type float32
+    const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
+    const cl_long  batch        = src1->ne[is_2D ? 3 : 2];
+    const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
+
+    const cl_long pelements = OW*KW*KH;
+    const cl_long CHW       = IC*KH*KW;
+
+    cl_kernel kernel;
+
+    if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_im2col_f16;
+    } else {
+        kernel = backend_ctx->kernel_im2col_f32;
+    }
+
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &batch_offset));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &delta_offset));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_long),  &IW));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_long),  &IH));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_long),  &IC));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_long),  &OW));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long),  &OH));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long),  &KW));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long),  &KH));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long),  &pelements));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long),  &CHW));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &s0));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &s1));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &p0));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &p1));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &d0));
+    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &d1));
+
+    const int num_blocks = (pelements + 256 - 1) / 256;
+    size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
+    size_t local_work_size[] = {256, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 // ------------------------------------------------------------------------------
 // Op offloading
 // ------------------------------------------------------------------------------
@@ -4122,6 +4334,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                 }
                 func = ggml_cl_gelu;
                 break;
+            case GGML_UNARY_OP_GELU_QUICK:
+                if (!any_on_device) {
+                    return false;
+                }
+                func = ggml_cl_gelu_quick;
+                break;
             case GGML_UNARY_OP_SILU:
                 if (!any_on_device) {
                     return false;
@@ -4194,6 +4412,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rope;
             break;
+        case GGML_OP_IM2COL:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_im2col;
+            break;
         default:
             return false;
     }