[cann] concat optimization

noemotiovon · noemotiovon · commit c71809cbb336 · 2024-11-26T03:43:45.000Z
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;
 
@@ -2899,15 +2897,16 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     // TODO: with freq_factors
     GGML_ASSERT(src2 == NULL);
-
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
     GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
-
+    // TODO: type == GGML_TYPE_F16
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1670,6 +1670,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize requires the group
+                    // size (QK8_0 here) to be at most k-1, where k is src[0]->ne[0]
                     if (op->src[0]->ne[0] <= QK8_0) {
                         return false;
                     }
@@ -1706,22 +1708,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
-        case GGML_OP_IM2COL: {
-            switch (op->src[0]->type) {
-                case GGML_TYPE_F16:
-                    return true;
-                default:
-                    return false;
-            }
-            switch (op->src[1]->type) {
-                case GGML_TYPE_F32:
-                    return true;
-                default:
-                    return false;
-            }
-        }
         case GGML_OP_CONT: {
-            switch (op->type) {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                     return true;
@@ -1730,30 +1719,49 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         }
         case GGML_OP_ROPE: {
-            float freq_scale;
-            memcpy(&freq_scale, (int32_t*)op->op_params + 6, sizeof(float));
+            // TODO: with ops-test v == 1
+            float freq_scale, attn_factor, ext_factor;
+            memcpy(&freq_scale, (int32_t*)op->op_params + 6, sizeof(float)); 
+            memcpy(&attn_factor, (int32_t*)op->op_params + 8, sizeof(float));
+            memcpy(&ext_factor, (int32_t*)op->op_params + 7, sizeof(float));
+            // TODO: with freq_factors
             if (op->src[2] != NULL) {
                 return false;
             }
+            // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
             }
-            
-            if (op->op_params[7] != 0) {
+            // TODO: ext_factor != 0
+            if (ext_factor != 0) {
                 return false;
             }
+            // TODO: freq_scale != 1
             if (freq_scale != 1) {
                 return false;
             }
+            // TODO: attn_factor != 1
+            if (attn_factor != 1) {
+                return false;
+            }
+            // TODO: type == GGML_TYPE_F16
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                     return true;
                 default:
                     return false;
             }
         }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize does not support the case
+            // where selfDimN[2]/outDimN[2] and selfDimC[3]/outDimC[3] are not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL: 
         case GGML_OP_CONCAT: 
-        case GGML_OP_UPSCALE:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
         case GGML_OP_NONE: