Skip to content

Commit 15f56d5

Browse files
committed
opencl: add sigmoid, both f16 and f32
1 parent 3767817 commit 15f56d5

File tree

3 files changed

+110
-1
lines changed

3 files changed

+110
-1
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ set(GGML_OPENCL_KERNELS
8585
rms_norm
8686
rope
8787
scale
88+
sigmoid
8889
silu
8990
softmax_4_f32
9091
softmax_4_f16

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ struct ggml_backend_opencl_context {
307307
cl_program program_rope;
308308
cl_program program_scale;
309309
cl_program program_silu;
310+
cl_program program_sigmoid;
310311
cl_program program_softmax_f32;
311312
cl_program program_softmax_f16;
312313
cl_program program_softmax_4_f32;
@@ -323,6 +324,7 @@ struct ggml_backend_opencl_context {
323324
cl_kernel kernel_gelu, kernel_gelu_4;
324325
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
325326
cl_kernel kernel_relu;
327+
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
326328
cl_kernel kernel_clamp;
327329
cl_kernel kernel_norm;
328330
cl_kernel kernel_rms_norm;
@@ -1060,6 +1062,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
10601062
GGML_LOG_CONT(".");
10611063
}
10621064

1065+
// sigmoid
1066+
{
1067+
#ifdef GGML_OPENCL_EMBED_KERNELS
1068+
const std::string kernel_src {
1069+
#include "sigmoid.cl.h"
1070+
};
1071+
#else
1072+
const std::string kernel_src = read_file("sigmoid.cl");
1073+
#endif
1074+
backend_ctx->program_sigmoid =
1075+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1076+
1077+
CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
1078+
CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
1079+
GGML_LOG_CONT(".");
1080+
}
1081+
10631082
// Adreno kernels
10641083
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
10651084
// transpose
@@ -1939,7 +1958,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
19391958
case GGML_UNARY_OP_SILU:
19401959
case GGML_UNARY_OP_RELU:
19411960
case GGML_UNARY_OP_GELU_QUICK:
1942-
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1961+
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1962+
case GGML_UNARY_OP_SIGMOID:
1963+
return ggml_is_contiguous(op->src[0]);
19431964
default:
19441965
return false;
19451966
}
@@ -3759,6 +3780,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
37593780
#endif
37603781
}
37613782

3783+
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3784+
GGML_ASSERT(src0);
3785+
GGML_ASSERT(src0->extra);
3786+
GGML_ASSERT(dst);
3787+
GGML_ASSERT(dst->extra);
3788+
3789+
UNUSED(src1);
3790+
3791+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3792+
cl_command_queue queue = backend_ctx->queue;
3793+
3794+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
3795+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
3796+
3797+
cl_ulong offset0 = extra0->offset + src0->view_offs;
3798+
cl_ulong offsetd = extrad->offset + dst->view_offs;
3799+
3800+
cl_kernel kernel;
3801+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3802+
kernel = backend_ctx->kernel_sigmoid_f32;
3803+
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3804+
kernel = backend_ctx->kernel_sigmoid_f16;
3805+
} else {
3806+
GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
3807+
}
3808+
3809+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
3810+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
3811+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
3812+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
3813+
3814+
const int64_t n = ggml_nelements(dst);
3815+
3816+
size_t global_work_size[] = {(size_t)n, 1, 1};
3817+
size_t local_work_size[] = {64, 1, 1};
3818+
3819+
size_t * local_work_size_ptr = local_work_size;
3820+
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
3821+
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
3822+
}
3823+
3824+
#ifdef GGML_OPENCL_PROFILING
3825+
cl_event evt;
3826+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
3827+
3828+
g_profiling_info.emplace_back();
3829+
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
3830+
#else
3831+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
3832+
#endif
3833+
}
3834+
37623835
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
37633836
GGML_ASSERT(src0);
37643837
GGML_ASSERT(src0->extra);
@@ -5509,6 +5582,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
55095582
}
55105583
func = ggml_cl_relu;
55115584
break;
5585+
case GGML_UNARY_OP_SIGMOID:
5586+
if (!any_on_device) {
5587+
return false;
5588+
}
5589+
func = ggml_cl_sigmoid;
5590+
break;
55125591
default:
55135592
return false;
55145593
} break;
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
//------------------------------------------------------------------------------
4+
// sigmoid
5+
//------------------------------------------------------------------------------
6+
7+
kernel void kernel_sigmoid_f32(
8+
global float * src0,
9+
ulong offset0,
10+
global float * dst,
11+
ulong offsetd
12+
) {
13+
src0 = (global float*)((global char*)src0 + offset0);
14+
dst = (global float*)((global char*)dst + offsetd);
15+
16+
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
17+
}
18+
19+
kernel void kernel_sigmoid_f16(
20+
global half * src0,
21+
ulong offset0,
22+
global half * dst,
23+
ulong offsetd
24+
) {
25+
src0 = (global half*)((global char*)src0 + offset0);
26+
dst = (global half*)((global char*)dst + offsetd);
27+
28+
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
29+
}

0 commit comments

Comments
 (0)