@@ -307,6 +307,7 @@ struct ggml_backend_opencl_context {
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
+    cl_program program_sigmoid;
     cl_program program_softmax_f32;
     cl_program program_softmax_f16;
     cl_program program_softmax_4_f32;
@@ -323,6 +324,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_gelu, kernel_gelu_4;
     cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
     cl_kernel kernel_relu;
+    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
@@ -1060,6 +1062,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // sigmoid
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "sigmoid.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("sigmoid.cl");
+#endif
+        backend_ctx->program_sigmoid =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1939,7 +1958,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_RELU:
         case GGML_UNARY_OP_GELU_QUICK:
-            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+        case GGML_UNARY_OP_SIGMOID:
+            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -3759,6 +3780,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
 #endif
 }
 
+static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_sigmoid_f32;
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_sigmoid_f16;
+    } else {
+        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
+    }
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -5509,6 +5582,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                     }
                     func = ggml_cl_relu;
                     break;
+                case GGML_UNARY_OP_SIGMOID:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cl_sigmoid;
+                    break;
                 default:
                     return false;
             } break;
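
Note: the new program is built from `sigmoid.cl`, which is not part of this hunk view. The sketch below shows what the two device kernels could plausibly look like, inferred only from the host-side argument setup above (a `cl_mem` plus a byte offset for both source and destination, one work-item per element). It is an illustration under those assumptions, not the literal file from this change.

```c
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// One work-item per element; src0/dst arrive as cl_mem plus a byte offset,
// matching the four clSetKernelArg calls in ggml_cl_sigmoid.
kernel void kernel_sigmoid_f32(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global float *)((global char *)dst  + offsetd);

    const size_t i = get_global_id(0);
    dst[i] = 1.0f / (1.0f + exp(-src0[i]));
}

kernel void kernel_sigmoid_f16(
        global half * src0,
        ulong         offset0,
        global half * dst,
        ulong         offsetd
) {
    src0 = (global half *)((global char *)src0 + offset0);
    dst  = (global half *)((global char *)dst  + offsetd);

    // Compute in f32 and narrow back to half to keep the math simple.
    const size_t i = get_global_id(0);
    dst[i] = (half)(1.0f / (1.0f + exp(-(float)src0[i])));
}
```

The byte-offset arguments mirror how the backend addresses views into device buffers, hence the `global char *` arithmetic before indexing.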