Skip to content

Commit 6e1c4ce

Browse files
authored
CANN: Support Opt CONV_TRANSPOSE_1D and ELU (#12786)
* [CANN] Support ELU and CONV_TRANSPOSE_1D * [CANN]Modification review comments * [CANN]Modification review comments * [CANN]name adjustment * [CANN]remove lambda used in template * [CANN]Use std::func instead of template * [CANN]Modify the code according to the review comments --------- Signed-off-by: noemotiovon <[email protected]>
1 parent 0090950 commit 6e1c4ce

File tree

5 files changed

+193
-67
lines changed

5 files changed

+193
-67
lines changed

.devops/llama-cli-cann.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
1+
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
22

33
FROM ascendai/cann:$ASCEND_VERSION AS build
44

55
WORKDIR /app
66

77
COPY . .
88

9-
RUN yum install -y gcc g++ cmake make
9+
RUN yum install -y gcc g++ cmake make libcurl-devel
1010
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
1111
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
1212
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,7 +1771,7 @@ jobs:
17711771
strategy:
17721772
matrix:
17731773
cann:
1774-
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
1774+
- '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
17751775
device:
17761776
- 'ascend910b3'
17771777
build:
@@ -1784,7 +1784,7 @@ jobs:
17841784
- name: Dependencies
17851785
run: |
17861786
yum update -y
1787-
yum install -y git gcc gcc-c++ make cmake
1787+
yum install -y git gcc gcc-c++ make cmake libcurl-devel
17881788
17891789
- name: Build
17901790
run: |

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
#include <aclnnop/aclnn_sub.h>
5858
#include <aclnnop/aclnn_mul.h>
5959
#include <aclnnop/aclnn_div.h>
60+
#include <aclnnop/aclnn_convolution.h>
61+
#include <aclnnop/aclnn_elu.h>
6062
#include <float.h>
6163

6264
#include <cmath>
@@ -86,6 +88,20 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
8688
}
8789
}
8890

91+
void ggml_cann_unary_op(
92+
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
93+
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
94+
ggml_tensor* src = dst->src[0];
95+
96+
aclTensor* acl_src = ggml_cann_create_tensor(src);
97+
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
98+
99+
unary_op(ctx, acl_src, acl_dst);
100+
101+
ACL_CHECK(aclDestroyTensor(acl_src));
102+
ACL_CHECK(aclDestroyTensor(acl_dst));
103+
}
104+
89105
/**
90106
* @brief Repeats elements of a tensor along each dimension according to the
91107
* specified repeat array.
@@ -2585,3 +2601,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
25852601
ACL_CHECK(aclDestroyTensor(acl_src));
25862602
ACL_CHECK(aclDestroyTensor(acl_dst));
25872603
}
2604+
2605+
/**
 * @brief Computes a 1D transposed convolution (deconvolution) on the CANN
 * backend via aclnnConvolution with transposed=true.
 *
 * src0 holds the convolution kernel, src1 the input signal; the result is
 * written to dst. The stride is read from dst->op_params[0]; padding and
 * dilation are fixed to 0 and 1 respectively, with a single group.
 */
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
    ggml_tensor * src0 = dst->src[0];  // kernel weights
    ggml_tensor * src1 = dst->src[1];  // input signal

    // stride along the single spatial dimension
    int64_t s0 = ((const int32_t*)(dst->op_params))[0];

    // 3D (N, C, L) views of the ggml tensors for the ACL convolution op.
    aclTensor* acl_input  = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
    aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
    aclTensor* acl_dst    = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);

    int64_t strideVal[1];
    strideVal[0] = s0;
    aclIntArray *stride = aclCreateIntArray(strideVal, 1);
    int64_t paddingVal[] = {0};
    aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
    int64_t dilationVal[] = {1};
    aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
    bool transposed = true;     // transposed convolution
    int64_t groups = 1;
    int8_t cubeMathType = 0;    // default cube math mode

    // `padding` is passed twice: once as padding, once as outputPadding.
    GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride,
        padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);

    // Release every ACL handle created above. Previously acl_input and the
    // three aclIntArray handles were leaked on each call.
    ACL_CHECK(aclDestroyTensor(acl_input));
    ACL_CHECK(aclDestroyTensor(acl_weight));
    ACL_CHECK(aclDestroyTensor(acl_dst));
    ACL_CHECK(aclDestroyIntArray(stride));
    ACL_CHECK(aclDestroyIntArray(padding));
    ACL_CHECK(aclDestroyIntArray(dilation));
}
2633+
2634+
/**
 * @brief Applies the ELU activation element-wise on the CANN backend.
 *
 * Reads the input from dst->src[0] and writes the activated values to dst.
 * ggml's ELU op has a fixed alpha of 1.0; aclnnElu takes three scalar
 * coefficients (alpha, scale, input_scale), all set to 1.0 here.
 */
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
    ggml_tensor * src0 = dst->src[0];

    aclTensor* acl_input = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    float alphaValue = 1.0f;
    aclScalar* alpha = nullptr;
    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(Elu, acl_input, alpha, alpha, alpha,
        acl_dst);

    // Release all ACL handles. The alpha scalar was previously leaked.
    ACL_CHECK(aclDestroyScalar(alpha));
    ACL_CHECK(aclDestroyTensor(acl_input));
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 111 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,4 @@
1-
#ifndef CANN_ACLNN_OPS
2-
#define CANN_ACLNN_OPS
3-
41
/**
5-
* @file acl_tensor
6-
* @brief This file contains related functions of ggml_tensor and acl_tensor.
7-
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
8-
* functions.
9-
* @author hipudding <[email protected]>
10-
* @author wangshuai09 <[email protected]>
11-
* @date July 15, 2024
12-
*
132
* Copyright (c) 2023-2024 The ggml authors
143
*
154
* Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,6 +20,9 @@
3120
* IN THE SOFTWARE.
3221
*/
3322

23+
#ifndef CANN_ACLNN_OPS
24+
#define CANN_ACLNN_OPS
25+
3426
#include <aclnnop/aclnn_abs.h>
3527
#include <aclnnop/aclnn_neg.h>
3628
#include <aclnnop/aclnn_exp.h>
@@ -483,8 +475,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
483475
* operation is executed using the CANN backend for optimized performance.
484476
*
485477
* @param ctx The CANN context used for operations.
486-
* @param dst The destination tensor where the indices of the maximum values will be stored.
487-
* dst->op is `GGML_OP_ARGMAX`.
478+
* @param dst The destination tensor where the indices of the maximum values will
479+
* be stored. dst->op is `GGML_OP_ARGMAX`.
488480
*/
489481
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
490482

@@ -600,40 +592,8 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
600592
aclTensor* acl_dst);
601593

602594
/**
603-
* @brief Launches an asynchronous task using the memory allocator.
604-
*
605-
* This macro submit an asynchronous task on the specified stream.
606-
* The task uses memory allocated by the allocator. It is guaranteed
607-
* that the memory will not be accessed by other tasks until this task
608-
* completes, due to the sequential execution order within the same stream.
609-
*
610-
* @param OP_NAME aclnn operator name.
611-
* @param args Additional arguments required by the task.
612-
*
613-
* @note
614-
* Memory from the allocator will be "freed" immediately and can be
615-
* reallocated to other pointers. However, it won't be accessed by any
616-
* other task before this asynchronous task ends, because all tasks in the
617-
* same stream are executed in queue order.
618-
*/
619-
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
620-
do { \
621-
uint64_t workspaceSize = 0; \
622-
aclOpExecutor * executor; \
623-
void * workspaceAddr = nullptr; \
624-
\
625-
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
626-
\
627-
if (workspaceSize > 0) { \
628-
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
629-
workspaceAddr = workspace_allocator.get(); \
630-
} \
631-
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
632-
} while (0)
633-
634-
635-
/**
636-
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one output tensor.
595+
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
596+
* output tensor.
637597
*
638598
* This function checks whether broadcasting is needed between `src0` and `src1`.
639599
* If broadcasting is required, it calculates the proper shapes and creates
@@ -647,14 +607,57 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
647607
* @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
648608
* @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
649609
*/
650-
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
651-
aclTensor ** acl_src1, aclTensor ** acl_dst);
610+
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
611+
aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
612+
613+
/**
614+
* @brief Computes the 1D transposed convolution (deconvolution) of a ggml
615+
* tensor using the CANN backend.
616+
*
617+
* @details This function performs a 1D transposed convolution (also known as
618+
* deconvolution) operation on the input tensor. The computed result is stored
619+
* in the destination tensor `dst`. The operation is optimized using the CANN
620+
* backend for improved performance.
621+
*
622+
* @param ctx The CANN context used for operations.
623+
* @param dst The destination tensor where the transposed convolution result
624+
* will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
625+
*/
626+
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
652627

653628
/**
654-
* @brief Applies a element-wise operation to two input tensors using the CANN backend.
629+
* @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
630+
* using the CANN backend.
631+
*
632+
* @details This function performs an element-wise ELU activation on the input
633+
* tensor.
634+
* The result is written to the destination tensor `dst` in-place.
635+
* The ELU function is defined as:
636+
*
637+
* \text{ELU}(x) =
638+
* \begin{cases}
639+
* x, & \text{if } x > 0 \\
640+
* \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
641+
* \end{cases}
655642
*
656-
* This templated function takes a binary operator and applies it to two source tensors
657-
* associated with the destination tensor. The function handles broadcasting as needed.
643+
* where α (alpha) is a hyperparameter, typically set to 1.0.
644+
* This operation is optimized using the CANN backend for high-performance
645+
* inference or training.
646+
*
647+
* @param ctx The CANN context used for operations.
648+
* @param dst The destination tensor where the ELU-activated result will be stored.
649+
* dst->op is expected to be `GGML_OP_ELU`.
650+
*/
651+
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
652+
653+
/**
654+
* @brief Applies an element-wise operation to two input tensors using the CANN
655+
* backend.
656+
*
657+
* This templated function takes a binary operator and applies it to two source
658+
* tensors
659+
* associated with the destination tensor. The function handles broadcasting as
660+
* needed.
658661
*
659662
* @tparam binary_op A callable object (e.g., lambda or function pointer) representing
660663
* the binary operation to be performed. It must take three arguments:
@@ -681,6 +684,38 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
681684
ACL_CHECK(aclDestroyTensor(acl_dst));
682685
}
683686

687+
/**
688+
* @brief Launches an asynchronous task using the memory allocator.
689+
*
690+
* This macro submits an asynchronous task on the specified stream.
691+
* The task uses memory allocated by the allocator. It is guaranteed
692+
* that the memory will not be accessed by other tasks until this task
693+
* completes, due to the sequential execution order within the same stream.
694+
*
695+
* @param OP_NAME aclnn operator name.
696+
* @param args Additional arguments required by the task.
697+
*
698+
* @note
699+
* Memory from the allocator will be "freed" immediately and can be
700+
* reallocated to other pointers. However, it won't be accessed by any
701+
* other task before this asynchronous task ends, because all tasks in the
702+
* same stream are executed in queue order.
703+
*/
704+
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
705+
do { \
706+
uint64_t workspaceSize = 0; \
707+
aclOpExecutor * executor; \
708+
void * workspaceAddr = nullptr; \
709+
\
710+
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
711+
\
712+
if (workspaceSize > 0) { \
713+
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
714+
workspaceAddr = workspace_allocator.get(); \
715+
} \
716+
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
717+
} while (0)
718+
684719
/**
685720
* @brief Applies a unary operation to an input tensor using the CANN backend.
686721
*
@@ -690,7 +725,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
690725
* @tparam unary_op A callable with the signature:
691726
* void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
692727
* where the first aclTensor is the source and the second is the destination.
693-
*
694728
* @param ctx The CANN backend context for managing resources and execution.
695729
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
696730
*/
@@ -702,10 +736,30 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
702736
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
703737

704738
unary_op(ctx, acl_src, acl_dst);
739+
705740
ACL_CHECK(aclDestroyTensor(acl_src));
706741
ACL_CHECK(aclDestroyTensor(acl_dst));
707742
}
708743

744+
/**
745+
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
746+
*
747+
* @details This function performs a unary operation on the input tensor using
748+
* a user-provided lambda or callable object `unary_op`, which accepts the CANN
749+
* context and two ACL tensors (source and destination). Internally, this function
750+
* creates ACL representations of the ggml tensors and invokes the unary operation.
751+
* The result is stored in the destination tensor `dst`. This utility abstracts the
752+
* common boilerplate of tensor conversion and cleanup when implementing unary ops.
753+
*
754+
* @param unary_op A callable that performs the unary operation using CANN APIs.
755+
* @param ctx The CANN context used for operations.
756+
* @param dst The destination tensor where the result will be stored.
757+
* The source tensor is retrieved from `dst->src[0]`.
758+
*/
759+
void ggml_cann_unary_op(
760+
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
761+
ggml_backend_cann_context& ctx, ggml_tensor* dst);
762+
709763
/**
710764
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
711765
*
@@ -725,11 +779,12 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
725779
*/
726780
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
727781
do { \
728-
auto lambda = [](auto ctx, auto acl_src, auto acl_dst) { \
782+
auto lambda = [](ggml_backend_cann_context& ctx, \
783+
aclTensor* acl_src, \
784+
aclTensor* acl_dst) { \
729785
GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
730786
}; \
731-
ggml_cann_unary_op<lambda>(ctx, dst); \
787+
ggml_cann_unary_op(lambda, ctx, dst); \
732788
} \
733789
while (0)
734-
735790
#endif // CANN_ACLNN_OPS

0 commit comments

Comments
 (0)