@@ -304,6 +304,7 @@ struct ggml_backend_opencl_context {
     cl_program program_norm;
     cl_program program_relu;
     cl_program program_rms_norm;
+    cl_program program_group_norm;
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
@@ -328,6 +329,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
+    cl_kernel kernel_group_norm;
    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
    cl_kernel kernel_soft_max, kernel_soft_max_4;
    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -1079,6 +1081,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // group_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "group_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("group_norm.cl");
+#endif
+        backend_ctx->program_group_norm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1970,6 +1988,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
@@ -4029,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
 #endif
 }
 
+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    int32_t n_groups   = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float   eps        = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne   = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),    &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[]  = {(size_t)sgs, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
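Note: the device-side source for group_norm.cl is not part of these hunks. The following is a minimal sketch that merely matches the host-side argument list and launch geometry above (one work-group of size sgs per group of group_size contiguous floats); the kernel body, and in particular the use of sub_group_reduce_add (which assumes the subgroup support implied by sgs = 64 on Adreno and 32 on Intel), are assumptions, not the kernel actually added by this change.

    // Hypothetical group-norm kernel, NOT the file shipped with this PR.
    kernel void kernel_group_norm(
            global float * src0,
            ulong          offset0,
            global float * dst,
            ulong          offsetd,
            int            ne,
            int            group_size,
            float          eps
    ) {
        src0 = (global float *)((global char *)src0 + offset0);
        dst  = (global float *)((global char *)dst  + offsetd);

        // each work-group handles one group of group_size elements, clamped to ne
        int start = get_group_id(0) * group_size;
        int end   = start + group_size;
        if (end > ne) {
            end = ne;
        }
        start += get_local_id(0);

        // mean over the group
        float tmp = 0.0f;
        for (int j = start; j < end; j += get_local_size(0)) {
            tmp += src0[j];
        }
        tmp = sub_group_reduce_add(tmp);   // assumes work-group size == sub-group size
        const float mean = tmp / group_size;

        // variance over the group, storing centered values on the way
        tmp = 0.0f;
        for (int j = start; j < end; j += get_local_size(0)) {
            const float xi = src0[j] - mean;
            dst[j] = xi;
            tmp += xi * xi;
        }
        tmp = sub_group_reduce_add(tmp);
        const float scale = rsqrt(tmp / group_size + eps);

        for (int j = start; j < end; j += get_local_size(0)) {
            dst[j] *= scale;
        }
    }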
@@ -5609,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
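For context, GGML_OP_GROUP_NORM nodes are built on the graph side by ggml_group_norm(), which is where the n_groups and eps read back from dst->op_params[0] and op_params[1] in ggml_cl_group_norm() come from. A hypothetical construction, assuming the ggml_group_norm(ctx, a, n_groups, eps) signature at this revision (shapes and values are illustrative, not from this PR):

    // a 64x64 feature map with 320 channels, normalized in 32 groups
    struct ggml_tensor * a   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 320);
    struct ggml_tensor * out = ggml_group_norm(ctx, a, /*n_groups=*/32, /*eps=*/1e-6f);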