Skip to content

Commit 712ffc8

Browse files
author
noemotiovon
committed
[CANN] Support ELU and CONV_TRANSPOSE_1D
Signed-off-by: noemotiovon <[email protected]>
1 parent d0d5b22 commit 712ffc8

File tree

3 files changed

+181
-67
lines changed

3 files changed

+181
-67
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
#include <aclnnop/aclnn_sub.h>
5858
#include <aclnnop/aclnn_mul.h>
5959
#include <aclnnop/aclnn_div.h>
60+
#include <aclnnop/aclnn_convolution.h>
61+
#include <aclnnop/aclnn_elu.h>
6062
#include <float.h>
6163

6264
#include <cmath>
@@ -2585,3 +2587,55 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
25852587
ACL_CHECK(aclDestroyTensor(acl_src));
25862588
ACL_CHECK(aclDestroyTensor(acl_dst));
25872589
}
2590+
2591+
/**
 * @brief Computes a 1D transposed convolution (deconvolution) using the CANN
 *        backend.
 *
 * src0 holds the convolution kernel, src1 the input signal; the result is
 * written to `dst`. Only the stride is taken from op_params; padding, output
 * padding and dilation are fixed to their defaults (0, 0, 1).
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
 */
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
    ggml_tensor * src0 = dst->src[0];  // kernel / weight tensor
    ggml_tensor * src1 = dst->src[1];  // input tensor

    // stride along the single spatial dimension
    int64_t s0 = ((const int32_t*)(dst->op_params))[0];

    // 3-dim views in NCL layout (batch, channels, length)
    aclTensor* acl_input  = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
    aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
    aclTensor* acl_dst    = ggml_cann_create_tensor(dst,  dst->ne,  dst->nb,  3, ACL_FORMAT_NCL);

    int64_t strideVal[]        = {s0};
    int64_t paddingVal[]       = {0};
    int64_t dilationVal[]      = {1};
    int64_t outputPaddingVal[] = {0};
    aclIntArray *stride        = aclCreateIntArray(strideVal, 1);
    aclIntArray *padding       = aclCreateIntArray(paddingVal, 1);
    aclIntArray *dilation      = aclCreateIntArray(dilationVal, 1);
    aclIntArray *outputPadding = aclCreateIntArray(outputPaddingVal, 1);

    bool transposed = true;   // request the transposed (deconvolution) path
    int64_t groups = 1;
    int8_t cubeMathType = 0;  // default cube math mode

    GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
        padding, dilation, transposed, outputPadding, groups, acl_dst, cubeMathType);

    // Release every ACL handle created above: the original code leaked
    // acl_input and all four aclIntArray objects.
    ACL_CHECK(aclDestroyTensor(acl_input));
    ACL_CHECK(aclDestroyTensor(acl_weight));
    ACL_CHECK(aclDestroyTensor(acl_dst));
    ACL_CHECK(aclDestroyIntArray(stride));
    ACL_CHECK(aclDestroyIntArray(padding));
    ACL_CHECK(aclDestroyIntArray(dilation));
    ACL_CHECK(aclDestroyIntArray(outputPadding));
}
2621+
2622+
/**
 * @brief Applies the ELU activation element-wise using the CANN backend.
 *
 * Calls aclnnElu with alpha = scale = inputScale = 1.0f, i.e. the standard
 * ELU: x for x > 0, exp(x) - 1 otherwise. The result is written to `dst`;
 * the input is dst->src[0].
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor. dst->op is expected to be `GGML_OP_ELU`.
 */
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
    ggml_tensor * src0 = dst->src[0];

    aclTensor* acl_input = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst   = ggml_cann_create_tensor(dst);

    // aclnnElu takes three scalar factors; all are 1.0 for the plain ELU.
    float ONE = 1.0f;
    aclScalar* alpha      = aclCreateScalar(&ONE, aclDataType::ACL_FLOAT);
    aclScalar* scale      = aclCreateScalar(&ONE, aclDataType::ACL_FLOAT);
    aclScalar* inputScale = aclCreateScalar(&ONE, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, scale, inputScale,
                            acl_dst);

    // Release all ACL handles: the original code leaked the three scalars.
    ACL_CHECK(aclDestroyScalar(alpha));
    ACL_CHECK(aclDestroyScalar(scale));
    ACL_CHECK(aclDestroyScalar(inputScale));
    ACL_CHECK(aclDestroyTensor(acl_input));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 114 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,4 @@
1-
#ifndef CANN_ACLNN_OPS
2-
#define CANN_ACLNN_OPS
3-
41
/**
5-
* @file acl_tensor
6-
* @brief This file contains related functions of ggml_tensor and acl_tensor.
7-
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
8-
* functions.
9-
* @author hipudding <[email protected]>
10-
* @author wangshuai09 <[email protected]>
11-
* @date July 15, 2024
12-
*
132
* Copyright (c) 2023-2024 The ggml authors
143
*
154
* Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,6 +20,9 @@
3120
* IN THE SOFTWARE.
3221
*/
3322

23+
#ifndef CANN_ACLNN_OPS
24+
#define CANN_ACLNN_OPS
25+
3426
#include <aclnnop/aclnn_abs.h>
3527
#include <aclnnop/aclnn_neg.h>
3628
#include <aclnnop/aclnn_exp.h>
@@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
483475
* operation is executed using the CANN backend for optimized performance.
484476
*
485477
* @param ctx The CANN context used for operations.
486-
* @param dst The destination tensor where the indices of the maximum values will be stored.
487-
* dst->op is `GGML_OP_ARGMAX`.
478+
* @param dst The destination tensor where the indices of the maximum values will
479+
* be stored. dst->op is `GGML_OP_ARGMAX`.
488480
*/
489481
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
490482

@@ -600,40 +592,8 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
600592
aclTensor* acl_dst);
601593

602594
/**
603-
* @brief Launches an asynchronous task using the memory allocator.
604-
*
605-
* This macro submits an asynchronous task on the specified stream.
606-
* The task uses memory allocated by the allocator. It is guaranteed
607-
* that the memory will not be accessed by other tasks until this task
608-
* completes, due to the sequential execution order within the same stream.
609-
*
610-
* @param OP_NAME aclnn operator name.
611-
* @param args Additional arguments required by the task.
612-
*
613-
* @note
614-
* Memory from the allocator will be "freed" immediately and can be
615-
* reallocated to other pointers. However, it won't be accessed by any
616-
* other task before this asynchronous task ends, because all tasks in the
617-
* same stream are executed in queue order.
618-
*/
619-
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
620-
do { \
621-
uint64_t workspaceSize = 0; \
622-
aclOpExecutor * executor; \
623-
void * workspaceAddr = nullptr; \
624-
\
625-
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
626-
\
627-
if (workspaceSize > 0) { \
628-
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
629-
workspaceAddr = workspace_allocator.get(); \
630-
} \
631-
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
632-
} while (0)
633-
634-
635-
/**
636-
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
595+
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
596+
* output tensor.
637597
*
638598
* This function checks whether broadcasting is needed between `src0` and `src1`.
639599
* If broadcasting is required, it calculates the proper shapes and creates
@@ -647,14 +607,57 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
647607
* @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
648608
* @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
649609
*/
650-
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
651-
aclTensor ** acl_src1, aclTensor ** acl_dst);
610+
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
611+
aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
612+
613+
/**
614+
* @brief Computes the 1D transposed convolution (deconvolution) of a ggml
615+
* tensor using the CANN backend.
616+
*
617+
* @details This function performs a 1D transposed convolution (also known as
618+
* deconvolution) operation on the input tensor. The computed result is stored
619+
* in the destination tensor `dst`. The operation is optimized using the CANN
620+
* backend for improved performance.
621+
*
622+
* @param ctx The CANN context used for operations.
623+
* @param dst The destination tensor where the transposed convolution result
624+
* will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
625+
*/
626+
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
627+
628+
/**
629+
* @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
630+
* using the CANN backend.
631+
*
632+
* @details This function performs an element-wise ELU activation on the input
633+
* tensor.
634+
* The result is written to the destination tensor `dst` in-place.
635+
* The ELU function is defined as:
636+
*
637+
* \text{ELU}(x) =
638+
* \begin{cases}
639+
* x, & \text{if } x > 0 \\
640+
* \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
641+
* \end{cases}
642+
*
643+
* where α (alpha) is a hyperparameter, typically set to 1.0.
644+
* This operation is optimized using the CANN backend for high-performance
645+
* inference or training.
646+
*
647+
* @param ctx The CANN context used for operations.
648+
* @param dst The destination tensor where the ELU-activated result will be stored.
649+
* dst->op is expected to be `GGML_OP_ELU`.
650+
*/
651+
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
652652

653653
/**
654-
* @brief Applies a element-wise operation to two input tensors using the CANN backend.
654+
* @brief Applies an element-wise operation to two input tensors using the CANN
655+
* backend.
655656
*
656-
* This templated function takes a binary operator and applies it to two source tensors
657-
* associated with the destination tensor. The function handles broadcasting as needed.
657+
* This templated function takes a binary operator and applies it to two source
658+
* tensors
659+
* associated with the destination tensor. The function handles broadcasting as
660+
* needed.
658661
*
659662
* @tparam binary_op A callable object (e.g., lambda or function pointer) representing
660663
* the binary operation to be performed. It must take three arguments:
@@ -681,6 +684,38 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
681684
ACL_CHECK(aclDestroyTensor(acl_dst));
682685
}
683686

687+
/**
688+
* @brief Launches an asynchronous task using the memory allocator.
689+
*
690+
* This macro submits an asynchronous task on the specified stream.
691+
* The task uses memory allocated by the allocator. It is guaranteed
692+
* that the memory will not be accessed by other tasks until this task
693+
* completes, due to the sequential execution order within the same stream.
694+
*
695+
* @param OP_NAME aclnn operator name.
696+
* @param args Additional arguments required by the task.
697+
*
698+
* @note
699+
* Memory from the allocator will be "freed" immediately and can be
700+
* reallocated to other pointers. However, it won't be accessed by any
701+
* other task before this asynchronous task ends, because all tasks in the
702+
* same stream are executed in queue order.
703+
*/
704+
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
705+
do { \
706+
uint64_t workspaceSize = 0; \
707+
aclOpExecutor * executor; \
708+
void * workspaceAddr = nullptr; \
709+
\
710+
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
711+
\
712+
if (workspaceSize > 0) { \
713+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
714+
workspaceAddr = workspace_allocator.get(); \
715+
} \
716+
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
717+
} while (0)
718+
684719
/**
685720
* @brief Applies a unary operation to an input tensor using the CANN backend.
686721
*
@@ -690,12 +725,13 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
690725
* @tparam unary_op A callable with the signature:
691726
* void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
692727
* where the first aclTensor is the source and the second is the destination.
693-
*
728+
* @param unary_op function unary_op.
694729
* @param ctx The CANN backend context for managing resources and execution.
695730
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
696731
*/
697-
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
698-
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
732+
using unary_func_ptr = void (*)(ggml_backend_cann_context&, aclTensor*, aclTensor*);
733+
734+
static void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst, unary_func_ptr unary_op) {
699735
ggml_tensor* src = dst->src[0];
700736

701737
aclTensor* acl_src = ggml_cann_create_tensor(src);
@@ -706,6 +742,28 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
706742
ACL_CHECK(aclDestroyTensor(acl_dst));
707743
}
708744

745+
#define DEFINE_ACLNN_WRAPPER(OP_NAME) \
746+
static void aclnn_##OP_NAME##_wrapper(ggml_backend_cann_context& ctx, \
747+
aclTensor* src, aclTensor* dst) { \
748+
GGML_CANN_CALL_ACLNN_OP(OP_NAME, src, dst); \
749+
}
750+
751+
DEFINE_ACLNN_WRAPPER(Abs)
752+
DEFINE_ACLNN_WRAPPER(Neg)
753+
DEFINE_ACLNN_WRAPPER(Gelu)
754+
DEFINE_ACLNN_WRAPPER(Silu)
755+
DEFINE_ACLNN_WRAPPER(Tanh)
756+
DEFINE_ACLNN_WRAPPER(Relu)
757+
DEFINE_ACLNN_WRAPPER(Sigmoid)
758+
DEFINE_ACLNN_WRAPPER(Hardsigmoid)
759+
DEFINE_ACLNN_WRAPPER(Hardswish)
760+
DEFINE_ACLNN_WRAPPER(Exp)
761+
DEFINE_ACLNN_WRAPPER(Sqrt)
762+
763+
static void aclnn_GeluV2_wrapper(ggml_backend_cann_context& ctx, aclTensor* src, aclTensor* dst) {
764+
GGML_CANN_CALL_ACLNN_OP(GeluV2, src, 0, dst);
765+
}
766+
709767
/**
710768
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
711769
*
@@ -725,10 +783,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
725783
*/
726784
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
727785
do { \
728-
auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
729-
GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
730-
}; \
731-
ggml_cann_unary_op<lambda>(ctx, dst); \
786+
ggml_cann_unary_op(ctx, dst, aclnn_##OP_NAME##_wrapper); \
732787
} \
733788
while (0)
734789

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,12 +1330,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
13301330
GGML_CANN_CALL_UNARY_OP(Silu);
13311331
break;
13321332
case GGML_UNARY_OP_GELU_QUICK: {
1333-
auto lambda = [](auto ctx, auto acl_src, auto acl_dst) {
1334-
GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
1335-
};
1336-
ggml_cann_unary_op<lambda>(ctx, dst);
1337-
}
1333+
GGML_CANN_CALL_UNARY_OP(GeluV2);
13381334
break;
1335+
}
13391336
case GGML_UNARY_OP_TANH:
13401337
GGML_CANN_CALL_UNARY_OP(Tanh);
13411338
break;
@@ -1354,6 +1351,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
13541351
case GGML_UNARY_OP_EXP:
13551352
GGML_CANN_CALL_UNARY_OP(Exp);
13561353
break;
1354+
case GGML_UNARY_OP_ELU:
1355+
ggml_cann_elu(ctx, dst);
1356+
break;
13571357
default:
13581358
return false;
13591359
}
@@ -1444,11 +1444,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
14441444
ggml_cann_argmax(ctx, dst);
14451445
break;
14461446
case GGML_OP_COS:
1447-
ggml_cann_unary_op<aclnn_cos>(ctx, dst);
1447+
ggml_cann_unary_op(ctx, dst, aclnn_cos);
14481448
break;
14491449
case GGML_OP_SIN:
1450-
ggml_cann_unary_op<aclnn_sin>(ctx, dst);
1451-
break;
1450+
ggml_cann_unary_op(ctx, dst, aclnn_sin);
1451+
break;
1452+
case GGML_OP_CONV_TRANSPOSE_1D:
1453+
ggml_cann_conv_transpose_1d(ctx, dst);
1454+
break;
14521455
default:
14531456
return false;
14541457
}
@@ -1710,6 +1713,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
17101713
case GGML_UNARY_OP_GELU_QUICK:
17111714
case GGML_UNARY_OP_TANH:
17121715
case GGML_UNARY_OP_EXP:
1716+
case GGML_UNARY_OP_ELU:
17131717
return true;
17141718
default:
17151719
return false;
@@ -1842,6 +1846,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
18421846
case GGML_OP_ARGMAX:
18431847
case GGML_OP_COS:
18441848
case GGML_OP_SIN:
1849+
case GGML_OP_CONV_TRANSPOSE_1D:
18451850
return true;
18461851
default:
18471852
return false;

0 commit comments

Comments
 (0)