Skip to content

Commit 5dec47d

Browse files
authored
opencl: add multi and vision rope, gelu_quick and im2col (#12600)
* opencl: add `im2col` * opencl: add `gelu_quick` * opencl: add mrope * opencl: add vision rope
1 parent f125b8d commit 5dec47d

File tree

4 files changed

+767
-7
lines changed

4 files changed

+767
-7
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ set(GGML_OPENCL_KERNELS
6363
ggml-opencl_transpose_16
6464
ggml-opencl_transpose_32
6565
ggml-opencl_transpose_32_16
66+
ggml-opencl_im2col
6667
)
6768

6869
foreach (K ${GGML_OPENCL_KERNELS})

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 231 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,14 @@ struct ggml_backend_opencl_context {
224224
cl_program program;
225225
cl_program program_1;
226226
cl_program program_2;
227+
cl_program program_im2col;
227228

228229
cl_kernel kernel_add, kernel_add_row;
229230
cl_kernel kernel_mul, kernel_mul_row;
230231
cl_kernel kernel_scale;
231232
cl_kernel kernel_silu, kernel_silu_4;
232233
cl_kernel kernel_gelu, kernel_gelu_4;
234+
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
233235
cl_kernel kernel_relu;
234236
cl_kernel kernel_clamp;
235237
cl_kernel kernel_norm;
@@ -239,6 +241,7 @@ struct ggml_backend_opencl_context {
239241
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
240242
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
241243
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
244+
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
242245
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
243246
cl_kernel kernel_mul_mat_f32_f32;
244247
cl_kernel kernel_mul_mat_f16_f16;
@@ -252,6 +255,7 @@ struct ggml_backend_opencl_context {
252255
kernel_mul_mat_q4_0_f32_flat_img_v0;
253256
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
254257
cl_kernel kernel_mul_mv_q6_K_f32;
258+
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
255259

256260
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
257261
// Transpose kernels
@@ -708,6 +712,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
708712
CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
709713
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
710714
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
715+
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program, "kernel_gelu_quick", &err), err));
716+
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_quick_4", &err), err));
711717
CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
712718
CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
713719
CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
@@ -722,6 +728,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
722728
CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
723729
CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
724730
CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
731+
CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f32", &err), err));
732+
CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_multi_f16", &err), err));
733+
CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f32", &err), err));
734+
CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_vision_f16", &err), err));
725735
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
726736
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
727737
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
@@ -769,6 +779,19 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
769779

770780
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
771781

782+
// im2col kernels
783+
#ifdef GGML_OPENCL_EMBED_KERNELS
784+
const std::string kernel_src_im2col {
785+
#include "ggml-opencl_im2col.cl.h"
786+
};
787+
#else
788+
const std::string kernel_src_im2col = read_file("ggml-opencl_im2col.cl");
789+
#endif
790+
backend_ctx->program_im2col = build_program_from_source(context, device, kernel_src_im2col.c_str(), compile_opts);
791+
792+
CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f32", &err), err));
793+
CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col, "kernel_im2col_f16", &err), err));
794+
772795
// Kernels for Adreno
773796
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
774797
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1187,6 +1210,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
11871210
case GGML_UNARY_OP_GELU:
11881211
case GGML_UNARY_OP_SILU:
11891212
case GGML_UNARY_OP_RELU:
1213+
case GGML_UNARY_OP_GELU_QUICK:
11901214
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
11911215
default:
11921216
return false;
@@ -1216,14 +1240,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
12161240
return op->ne[3] == 1;
12171241
case GGML_OP_ROPE: {
12181242
const int mode = ((const int32_t *) op->op_params)[2];
1219-
if (mode & GGML_ROPE_TYPE_MROPE) {
1243+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
1244+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
1245+
if (is_mrope && !is_vision) {
1246+
if (op->src[0]->type == GGML_TYPE_F32 ||
1247+
op->src[0]->type == GGML_TYPE_F16) {
1248+
return true;
1249+
}
12201250
return false;
12211251
}
1222-
if (mode & GGML_ROPE_TYPE_VISION) {
1252+
if (is_vision) {
1253+
if (op->src[0]->type == GGML_TYPE_F32 ||
1254+
op->src[0]->type == GGML_TYPE_F16) {
1255+
return true;
1256+
}
12231257
return false;
12241258
}
12251259
return true;
12261260
}
1261+
case GGML_OP_IM2COL:
1262+
return true;
12271263
default:
12281264
return false;
12291265
}
@@ -2582,6 +2618,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
25822618
#endif
25832619
}
25842620

2621+
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2622+
GGML_ASSERT(src0);
2623+
GGML_ASSERT(src0->extra);
2624+
GGML_ASSERT(dst);
2625+
GGML_ASSERT(dst->extra);
2626+
2627+
UNUSED(src1);
2628+
2629+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
2630+
cl_command_queue queue = backend_ctx->queue;
2631+
2632+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
2633+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
2634+
2635+
cl_ulong offset0 = extra0->offset + src0->view_offs;
2636+
cl_ulong offsetd = extrad->offset + dst->view_offs;
2637+
2638+
cl_kernel kernel;
2639+
2640+
int n = ggml_nelements(dst);
2641+
2642+
if (n % 4 == 0) {
2643+
kernel = backend_ctx->kernel_gelu_quick_4;
2644+
n /= 4;
2645+
} else {
2646+
kernel = backend_ctx->kernel_gelu_quick;
2647+
}
2648+
2649+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
2650+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
2651+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
2652+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
2653+
2654+
size_t global_work_size[] = {(size_t)n, 1, 1};
2655+
size_t local_work_size[] = {64, 1, 1};
2656+
2657+
#ifdef GGML_OPENCL_PROFILING
2658+
cl_event evt;
2659+
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
2660+
2661+
g_profiling_info.emplace_back();
2662+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
2663+
#else
2664+
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
2665+
#endif
2666+
}
2667+
25852668
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
25862669
GGML_ASSERT(src0);
25872670
GGML_ASSERT(src0->extra);
@@ -3980,36 +4063,70 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
39804063
float attn_factor;
39814064
float beta_fast;
39824065
float beta_slow;
4066+
int32_t sections[4];
39834067

39844068
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
39854069
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
39864070
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
39874071
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
39884072
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
39894073
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
4074+
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
39904075

39914076
const bool is_neox = mode & 2;
4077+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
4078+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
4079+
4080+
if (is_mrope) {
4081+
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
4082+
}
4083+
4084+
if (is_vision) {
4085+
GGML_ASSERT(n_dims == ne00/2);
4086+
}
39924087

39934088
cl_kernel kernel;
39944089

3995-
if (!is_neox) {
4090+
if (is_neox) {
39964091
switch (src0->type) {
39974092
case GGML_TYPE_F32:
3998-
kernel = backend_ctx->kernel_rope_norm_f32;
4093+
kernel = backend_ctx->kernel_rope_neox_f32;
39994094
break;
40004095
case GGML_TYPE_F16:
4001-
kernel = backend_ctx->kernel_rope_norm_f16;
4096+
kernel = backend_ctx->kernel_rope_neox_f16;
4097+
break;
4098+
default:
4099+
GGML_ASSERT(false);
4100+
};
4101+
} else if (is_mrope && !is_vision) {
4102+
switch (src0->type) {
4103+
case GGML_TYPE_F32:
4104+
kernel = backend_ctx->kernel_rope_multi_f32;
4105+
break;
4106+
case GGML_TYPE_F16:
4107+
kernel = backend_ctx->kernel_rope_multi_f16;
40024108
break;
40034109
default:
40044110
GGML_ASSERT(false);
40054111
};
4112+
} else if (is_vision) {
4113+
switch (src0->type) {
4114+
case GGML_TYPE_F32:
4115+
kernel = backend_ctx->kernel_rope_vision_f32;
4116+
break;
4117+
case GGML_TYPE_F16:
4118+
kernel = backend_ctx->kernel_rope_vision_f16;
4119+
break;
4120+
default:
4121+
GGML_ASSERT(false);
4122+
}
40064123
} else {
40074124
switch (src0->type) {
40084125
case GGML_TYPE_F32:
4009-
kernel = backend_ctx->kernel_rope_neox_f32;
4126+
kernel = backend_ctx->kernel_rope_norm_f32;
40104127
break;
40114128
case GGML_TYPE_F16:
4012-
kernel = backend_ctx->kernel_rope_neox_f16;
4129+
kernel = backend_ctx->kernel_rope_norm_f16;
40134130
break;
40144131
default:
40154132
GGML_ASSERT(false);
@@ -4049,6 +4166,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
40494166
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
40504167
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
40514168
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
4169+
if (is_mrope || is_vision) {
4170+
CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
4171+
}
40524172

40534173
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
40544174
size_t local_work_size[] = {(size_t)nth, 1, 1};
@@ -4064,6 +4184,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
40644184
#endif
40654185
}
40664186

4187+
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4188+
GGML_ASSERT(src0);
4189+
GGML_ASSERT(src1);
4190+
GGML_ASSERT(src1->extra);
4191+
GGML_ASSERT(dst);
4192+
GGML_ASSERT(dst->extra);
4193+
4194+
// src0 - filter, src1 - input
4195+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
4196+
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
4197+
4198+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4199+
cl_command_queue queue = backend_ctx->queue;
4200+
4201+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
4202+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4203+
4204+
cl_ulong offset1 = extra1->offset + src1->view_offs;
4205+
cl_ulong offsetd = extrad->offset + dst->view_offs;
4206+
4207+
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
4208+
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
4209+
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
4210+
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
4211+
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
4212+
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
4213+
4214+
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
4215+
4216+
const cl_long IC = src1->ne[is_2D ? 2 : 1];
4217+
const cl_long IH = is_2D ? src1->ne[1] : 1;
4218+
const cl_long IW = src1->ne[0];
4219+
4220+
const cl_long KH = is_2D ? src0->ne[1] : 1;
4221+
const cl_long KW = src0->ne[0];
4222+
4223+
const cl_long OH = is_2D ? dst->ne[2] : 1;
4224+
const cl_long OW = dst->ne[1];
4225+
4226+
// nb is byte offset, src is type float32
4227+
const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
4228+
const cl_long batch = src1->ne[is_2D ? 3 : 2];
4229+
const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
4230+
4231+
const cl_long pelements = OW*KW*KH;
4232+
const cl_long CHW = IC*KH*KW;
4233+
4234+
cl_kernel kernel;
4235+
4236+
if(dst->type == GGML_TYPE_F16) {
4237+
kernel = backend_ctx->kernel_im2col_f16;
4238+
} else {
4239+
kernel = backend_ctx->kernel_im2col_f32;
4240+
}
4241+
4242+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
4243+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
4244+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4245+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
4246+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &batch_offset));
4247+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &delta_offset));
4248+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &IW));
4249+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &IH));
4250+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &IC));
4251+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &OW));
4252+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long), &OH));
4253+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long), &KW));
4254+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long), &KH));
4255+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long), &pelements));
4256+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long), &CHW));
4257+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &s0));
4258+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &s1));
4259+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &p0));
4260+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &p1));
4261+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &d0));
4262+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &d1));
4263+
4264+
const int num_blocks = (pelements + 256 - 1) / 256;
4265+
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
4266+
size_t local_work_size[] = {256, 1, 1};
4267+
4268+
#ifdef GGML_OPENCL_PROFILING
4269+
cl_event evt;
4270+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4271+
4272+
g_profiling_info.emplace_back();
4273+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
4274+
#else
4275+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
4276+
#endif
4277+
}
4278+
40674279
//------------------------------------------------------------------------------
40684280
// Op offloading
40694281
//------------------------------------------------------------------------------
@@ -4122,6 +4334,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
41224334
}
41234335
func = ggml_cl_gelu;
41244336
break;
4337+
case GGML_UNARY_OP_GELU_QUICK:
4338+
if (!any_on_device) {
4339+
return false;
4340+
}
4341+
func = ggml_cl_gelu_quick;
4342+
break;
41254343
case GGML_UNARY_OP_SILU:
41264344
if (!any_on_device) {
41274345
return false;
@@ -4194,6 +4412,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
41944412
}
41954413
func = ggml_cl_rope;
41964414
break;
4415+
case GGML_OP_IM2COL:
4416+
if (!any_on_device) {
4417+
return false;
4418+
}
4419+
func = ggml_cl_im2col;
4420+
break;
41974421
default:
41984422
return false;
41994423
}

0 commit comments

Comments
 (0)