Skip to content

Commit be52192

Browse files
committed
opencl: add group_norm
1 parent 15f56d5 commit be52192

File tree

3 files changed

+158
-0
lines changed

3 files changed

+158
-0
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ set(GGML_OPENCL_KERNELS
6565
gemv_noshuffle_general
6666
gemv_noshuffle
6767
get_rows
68+
group_norm
6869
im2col_f32
6970
im2col_f16
7071
mul_mat_Ab_Bi_8x4

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ struct ggml_backend_opencl_context {
304304
cl_program program_norm;
305305
cl_program program_relu;
306306
cl_program program_rms_norm;
307+
cl_program program_group_norm;
307308
cl_program program_rope;
308309
cl_program program_scale;
309310
cl_program program_silu;
@@ -328,6 +329,7 @@ struct ggml_backend_opencl_context {
328329
cl_kernel kernel_clamp;
329330
cl_kernel kernel_norm;
330331
cl_kernel kernel_rms_norm;
332+
cl_kernel kernel_group_norm;
331333
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
332334
cl_kernel kernel_soft_max, kernel_soft_max_4;
333335
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -1079,6 +1081,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
10791081
GGML_LOG_CONT(".");
10801082
}
10811083

1084+
// group_norm
1085+
{
1086+
#ifdef GGML_OPENCL_EMBED_KERNELS
1087+
const std::string kernel_src {
1088+
#include "group_norm.cl.h"
1089+
};
1090+
#else
1091+
const std::string kernel_src = read_file("group_norm.cl");
1092+
#endif
1093+
backend_ctx->program_group_norm =
1094+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1095+
1096+
CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
1097+
GGML_LOG_CONT(".");
1098+
}
1099+
10821100
// Adreno kernels
10831101
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
10841102
// transpose
@@ -1970,6 +1988,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
19701988
case GGML_OP_NORM:
19711989
case GGML_OP_RMS_NORM:
19721990
return true;
1991+
case GGML_OP_GROUP_NORM:
1992+
return ggml_is_contiguous(op->src[0]);
19731993
case GGML_OP_MUL_MAT:
19741994
if (op->src[0]->type == GGML_TYPE_F16) {
19751995
return true;
@@ -4029,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
40294049
#endif
40304050
}
40314051

4052+
// Enqueue the group_norm kernel: normalize src0 over groups of channels,
// writing the result to dst. src1 is unused (group_norm is unary).
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra_src = (ggml_tensor_extra_cl *) src0->extra;
    ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *) dst->extra;

    cl_ulong offset0 = extra_src->offset + src0->view_offs;
    cl_ulong offsetd = extra_dst->offset + dst->view_offs;

    // op_params layout: [0] = number of groups (int32), [1] = epsilon (float).
    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
    float   eps      = ((const float *) dst->op_params)[1];

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne   = ne00*ne01*ne02;

    // Nominal elements per group; channels are split into n_groups with the
    // channel count rounded up, so the last group may be clamped in-kernel.
    int32_t group_size = ne00 * ne01 * ((ne02 + n_groups - 1) / n_groups);

    cl_kernel kernel = backend_ctx->kernel_group_norm;

    // One workgroup per group; the workgroup size must equal the subgroup
    // size the kernel was compiled for.
    size_t sgs = 64;
    if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra_src->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &group_size));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),    &eps));

    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
    size_t local_work_size[]  = {(size_t)sgs, 1, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
4110+
40324111
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
40334112
GGML_ASSERT(src0);
40344113
GGML_ASSERT(src0->extra);
@@ -5609,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
56095688
}
56105689
func = ggml_cl_rms_norm;
56115690
break;
5691+
case GGML_OP_GROUP_NORM:
5692+
if (!any_on_device) {
5693+
return false;
5694+
}
5695+
func = ggml_cl_group_norm;
5696+
break;
56125697
case GGML_OP_MUL_MAT:
56135698
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
56145699
return false;
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

// group_norm: each workgroup normalizes one group of `group_size`
// consecutive elements of src0 to zero mean / unit variance (plus eps).
//
// Workgroup must be a subgroup: the reductions below use
// sub_group_reduce_add, which only covers a single subgroup.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne,          // total number of elements in src0
        int group_size,  // nominal elements per group (last group may be clamped)
        float eps        // added to variance before the rsqrt, for stability
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst = (global float *)((global char *)dst + offsetd);

    // Bounds of this workgroup's group; the last group is clamped to ne
    // when the channel count is not divisible by the number of groups.
    int start = get_group_id(0) * group_size;
    int end = start + group_size;

    if (end > ne) {
        end = ne;
    }

    // Number of elements actually in this group. Dividing by the nominal
    // group_size would skew mean/variance for a clamped last group (the
    // CPU reference divides by the actual count). max(..., 1) guards
    // against a division by zero for a fully out-of-range group.
    const float count = (float) max(end - start, 1);

    start += get_local_id(0);

    float tmp = 0.0f;

    for (int j = start; j < end; j += get_local_size(0)) {
        tmp += src0[j];
    }

    tmp = sub_group_reduce_add(tmp);

    const float mean = tmp / count;
    tmp = 0.0f;

    // Second pass: store centered values so the final scale pass can
    // reuse them, while accumulating the sum of squares for the variance.
    for (int j = start; j < end; j += get_local_size(0)) {
        float xi = src0[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = sub_group_reduce_add(tmp);

    const float variance = tmp / count;
    const float scale = 1.0f/sqrt(variance + eps);
    for (int j = start; j < end; j += get_local_size(0)) {
        dst[j] *= scale;
    }
}

0 commit comments

Comments
 (0)