@@ -75,6 +75,15 @@ static __global__ void hardswish_f32(const float * x, float * dst, const int k)
     dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 }
 
+static __global__ void exp_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = expf(x[i]);
+}
+
 static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
@@ -159,6 +168,11 @@ static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaSt
     hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
+    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
@@ -296,6 +310,20 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
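
Note (not part of the diff): below is a minimal standalone harness sketching the element-wise launch pattern the new op uses. The kernel and the ceiling-division grid calculation are copied from exp_f32/exp_f32_cuda above; the CUDA_EXP_BLOCK_SIZE value and the main() scaffolding are assumptions for illustration, since the constant is defined elsewhere in the tree.

// Standalone sketch of the launch pattern above.
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

#define CUDA_EXP_BLOCK_SIZE 256 // assumption for this sketch; the PR defines it elsewhere

// copied from the diff above
static __global__ void exp_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = expf(x[i]);
}

int main() {
    const int k = 1000; // deliberately not a multiple of the block size
    float h_x[1000], h_y[1000];
    for (int i = 0; i < k; ++i) {
        h_x[i] = 0.01f*i;
    }

    float * d_x = nullptr;
    float * d_y = nullptr;
    cudaMalloc(&d_x, k*sizeof(float));
    cudaMalloc(&d_y, k*sizeof(float));
    cudaMemcpy(d_x, h_x, k*sizeof(float), cudaMemcpyHostToDevice);

    // same ceiling-division grid calculation as exp_f32_cuda above;
    // the i >= k guard in the kernel makes the partial last block safe
    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, 0>>>(d_x, d_y, k);

    cudaMemcpy(h_y, d_y, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("exp(%.2f) = %f (reference: %f)\n", h_x[100], h_y[100], expf(h_x[100]));

    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}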