Skip to content

Commit c71809c

Browse files
author
noemotiovon
committed
[cann] concat optimization
1 parent 4dea1d5 commit c71809c

File tree

2 files changed

+39
-32
lines changed

2 files changed

+39
-32
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
*/
2222

2323
#include "aclnn_ops.h"
24+
#include "ggml-impl.h"
2425

2526
#include <aclnnop/aclnn_avgpool2d.h>
2627
#include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
241242
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
242243
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
243244

244-
int64_t concat_dim = 1;
245+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
246+
247+
GGML_ASSERT(dim >= 0 && dim < 4);
248+
int32_t acl_dim = 3 - dim;
249+
245250
aclTensor* tensors[] = {acl_src0, acl_src1};
246251
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
247-
aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
252+
aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
248253

249254
ACL_CHECK(aclDestroyTensorList(tensorList));
250255
ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
14371442
ggml_tensor* src0 = dst->src[0]; // kernel
14381443
ggml_tensor* src1 = dst->src[1]; // input
14391444

1440-
GGML_ASSERT(src0->type == GGML_TYPE_F16);
1441-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
1442-
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
1443-
14441445
GGML_TENSOR_BINARY_OP_LOCALS;
14451446

14461447
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
14621463
const int64_t OH = is_2D ? ne2 : 1;
14631464
const int64_t OW = ne1;
14641465

1465-
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
1466-
GGML_ASSERT(nb10 == sizeof(float));
1467-
14681466
// memory allocated increased to 3x when is_2D == false
14691467
const int64_t n_bytes_factor = is_2D ? 1 : 3;
14701468

@@ -2899,15 +2897,16 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
28992897

29002898
// TODO: with freq_factors
29012899
GGML_ASSERT(src2 == NULL);
2902-
2900+
// TODO: attn_factor != 1
2901+
GGML_ASSERT(attn_factor == 1);
2902+
// TODO: n_dims <= ne0
29032903
GGML_ASSERT(n_dims == ne0);
29042904
GGML_ASSERT(n_dims % 2 == 0);
2905-
29062905
// TODO: ext_factor != 0
29072906
GGML_ASSERT(ext_factor == 0);
29082907
// TODO: freq_scale != 1
29092908
GGML_ASSERT(freq_scale == 1);
2910-
2909+
// TODO: type == GGML_TYPE_F16
29112910
GGML_ASSERT(src0->type == GGML_TYPE_F32);
29122911

29132912
const float theta_scale = powf(freq_base, -2.0f / n_dims);

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 28 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1670,6 +1670,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
16701670
case GGML_OP_MUL_MAT: {
16711671
switch (op->src[0]->type) {
16721672
case GGML_TYPE_Q8_0:
1673+
// Currently, the group size (QK8_0) must not be greater than k-1
1674+
// (k = src[0]->ne[0]) in aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
16731675
if (op->src[0]->ne[0] <= QK8_0) {
16741676
return false;
16751677
}
@@ -1706,22 +1708,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
17061708
return false;
17071709
}
17081710
}
1709-
case GGML_OP_IM2COL: {
1710-
switch (op->src[0]->type) {
1711-
case GGML_TYPE_F16:
1712-
return true;
1713-
default:
1714-
return false;
1715-
}
1716-
switch (op->src[1]->type) {
1717-
case GGML_TYPE_F32:
1718-
return true;
1719-
default:
1720-
return false;
1721-
}
1722-
}
17231711
case GGML_OP_CONT: {
1724-
switch (op->type) {
1712+
// TODO: support GGML_TYPE_BF16
1713+
switch (op->src[0]->type) {
17251714
case GGML_TYPE_F32:
17261715
case GGML_TYPE_F16:
17271716
return true;
@@ -1730,30 +1719,49 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
17301719
}
17311720
}
17321721
case GGML_OP_ROPE: {
1733-
float freq_scale;
1734-
memcpy(&freq_scale, (int32_t*)op->op_params + 6, sizeof(float));
1722+
// TODO: with ops-test v == 1
1723+
float freq_scale, attn_factor, ext_factor;
1724+
memcpy(&freq_scale, (int32_t*)op->op_params + 6, sizeof(float));
1725+
memcpy(&attn_factor, (int32_t*)op->op_params + 8, sizeof(float));
1726+
memcpy(&ext_factor, (int32_t*)op->op_params + 7, sizeof(float));
1727+
// TODO: with freq_factors
17351728
if (op->src[2] != NULL) {
17361729
return false;
17371730
}
1731+
// TODO: n_dims <= ne0
17381732
if (op->src[0]->ne[0] != op->op_params[1]) {
17391733
return false;
17401734
}
1741-
1742-
if (op->op_params[7] != 0) {
1735+
// TODO: ext_factor != 0
1736+
if (ext_factor != 0) {
17431737
return false;
17441738
}
1739+
// TODO: freq_scale != 1
17451740
if (freq_scale != 1) {
17461741
return false;
17471742
}
1743+
// TODO: attn_factor != 1
1744+
if (attn_factor != 1) {
1745+
return false;
1746+
}
1747+
// TODO: type == GGML_TYPE_F16
17481748
switch (op->src[0]->type) {
17491749
case GGML_TYPE_F32:
17501750
return true;
17511751
default:
17521752
return false;
17531753
}
17541754
}
1755+
case GGML_OP_UPSCALE: {
1756+
// aclnnUpsampleNearest2dGetWorkspaceSize does not support the case where
1757+
// selfDimN[2]/outDimN[2] and selfDimC[3]/outDimC[3] are not equal
1758+
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
1759+
return false;
1760+
}
1761+
return true;
1762+
}
1763+
case GGML_OP_IM2COL:
17551764
case GGML_OP_CONCAT:
1756-
case GGML_OP_UPSCALE:
17571765
case GGML_OP_DUP:
17581766
case GGML_OP_REPEAT:
17591767
case GGML_OP_NONE:

0 commit comments

Comments
 (0)