Commit 2deb900

rename bcast to acl_tensor

1 parent: 339a3fb

This commit renames ggml-cann/bcast.{h,cpp} to ggml-cann/acl_tensor.{h,cpp}, moves ACL runtime init/finalize from a static AclLifeCycle object into explicit calls at device registration and backend free, and routes OpCaller's temporary device buffers through the backend context's buffer pool.

File tree (8 files changed, +69 -79 lines):

ggml-cann.cpp
ggml-cann/acl_ops.cpp
ggml-cann/acl_ops.h
ggml-cann/bcast.cpp → ggml-cann/acl_tensor.cpp
ggml-cann/bcast.h → ggml-cann/acl_tensor.h
ggml-cann/aclnn_ops.cpp
ggml-cann/aclnn_ops.h
ggml-cann/common.h

ggml-cann.cpp

Lines changed: 9 additions & 10 deletions

@@ -6,17 +6,9 @@
 #include <mutex>
 
 #include "ggml-backend-impl.h"
+#include "ggml-cann/acl_ops.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
-#include "ggml-cann/acl_ops.h"
-
-struct AclLifeCycle {
-    AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); }
-
-    ~AclLifeCycle() { ACL_CHECK(aclFinalize()); }
-};
-
-AclLifeCycle acl_life_cycle;
 
 [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
                                   const char* file, int line, const char* msg) {

@@ -477,9 +469,15 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ACL_CHECK(aclrtSynchronizeDevice());
+    cann_ctx->free_buffers();
     ACL_CHECK(aclrtResetDevice(cann_ctx->device));
     delete cann_ctx;
     delete backend;
+
+    // Finalize when last device freed.
+    if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
+        ACL_CHECK(aclFinalize());
+    }
 }
 
 GGML_CALL static ggml_backend_buffer_type_t

@@ -678,7 +676,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_DIAG_MASK_INF:
             return false;
         case GGML_OP_SOFT_MAX:
-            return true;
+            return true;
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:

@@ -844,6 +842,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
 extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
 
 GGML_CALL int ggml_backend_cann_reg_devices() {
+    ACL_CHECK(aclInit(nullptr));
     uint32_t device_count = ggml_backend_cann_get_device_count();
     // initialization
     for (uint32_t i = 0; i < device_count; i++) {
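These hunks replace the static AclLifeCycle guard, whose constructor ran aclInit before main() and whose destructor ran aclFinalize during static teardown, with explicit calls: aclInit once in ggml_backend_cann_reg_devices(), aclFinalize when the last device's backend is freed. A minimal compilable sketch of that ownership model, with the ACL calls stubbed out and the last-device check simplified to a live-device count (the commit itself compares cann_ctx->device against ggml_backend_cann_get_device_count() - 1):

    #include <cassert>

    // Toy model of the new lifecycle: bring the runtime up when devices are
    // registered, tear it down when the last device is freed, instead of
    // relying on a static object's construction/destruction order.
    static bool acl_up        = false;  // stands in for the aclInit/aclFinalize state
    static int  devices_alive = 0;

    void reg_devices(int device_count) {
        assert(!acl_up);
        acl_up        = true;        // ACL_CHECK(aclInit(nullptr)) in the commit
        devices_alive = device_count;
    }

    void free_backend() {
        if (--devices_alive == 0) {  // simplified stand-in for the last-device check
            acl_up = false;          // ACL_CHECK(aclFinalize()) in the commit
        }
    }

    int main() {
        reg_devices(2);
        free_backend();
        free_backend();
        assert(!acl_up && devices_alive == 0);
    }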

ggml-cann/acl_ops.cpp

Lines changed: 9 additions & 13 deletions

@@ -15,10 +15,6 @@ OpCaller::~OpCaller() {
     for (aclDataBuffer* buffer : output_buffers) {
         aclDestroyDataBuffer(buffer);
     }
-    // TODO: may free before use.
-    for (void* ptr : ptrs) {
-        aclrtFree(ptr);
-    }
     aclopDestroyAttr(attrs);
 }
 

@@ -100,20 +96,21 @@ void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     OpCaller op;
     op.name("ViewCopy")
         .input_no_contiguous(dst, "dst")
-        .input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream())
-        .input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
+        .input(ctx, dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size",
+               ctx.stream())
+        .input(ctx, dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
                ctx.stream())
-        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
+        .input(ctx, storage_offset, ACL_INT64, 1, storage_offset_dim,
                "dst_storage_offset", ctx.stream())
         .input_no_contiguous(src, "src")
-        .input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream())
-        .input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
+        .input(ctx, src->ne, ACL_INT64, 1, size_stride_dim, "src_size",
+               ctx.stream())
+        .input(ctx, src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
                ctx.stream())
-        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
+        .input(ctx, storage_offset, ACL_INT64, 1, storage_offset_dim,
                "src_storage_offset", ctx.stream())
         .output(dst, "dst")
         .run(ctx.stream());
-    //aclrtSynchronizeStream(ctx.stream());
 }
 
 void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

@@ -125,8 +122,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     OpCaller op;
     op.name("Pad")
         .input(src, "x")
-        .input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
+        .input(ctx, paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
         .output(dst, "y")
         .run(ctx.stream());
-    //aclrtSynchronizeStream(ctx.stream());
 }
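The dropped destructor loop (with its "TODO: may free before use") and the dropped aclrtSynchronizeStream comments point at the same lifetime problem: per-call aclrtMalloc buffers were freed in ~OpCaller, potentially while the asynchronously enqueued op still read them. Passing ctx into .input() hands that memory to the context pool, which lives until free_buffers() at backend teardown. A toy, ACL-free reproduction of the hazard:

    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <vector>

    int main() {
        // A queue of deferred work stands in for an aclrtStream.
        std::vector<std::function<void()>> stream;
        {
            // Per-call malloc, as the old OpCaller::input did via aclrtMalloc.
            int* staging = (int*)std::malloc(sizeof(int));
            *staging = 42;
            stream.push_back([staging] { std::printf("%d\n", *staging); });
            std::free(staging);  // BUG: ~OpCaller freed here, before the op ran
        }
        for (auto& op : stream) op();  // use-after-free: undefined behavior
    }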

ggml-cann/acl_ops.h

Lines changed: 4 additions & 5 deletions

@@ -7,7 +7,7 @@
 #include <string>
 #include <vector>
 
-#include "bcast.h"
+#include "acl_tensor.h"
 #include "common.h"
 
 struct OpCaller {

@@ -38,17 +38,16 @@ struct OpCaller {
     OpCaller& attr(float value, const char* name);
 
     template <typename T>
-    OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim,
+    OpCaller& input(ggml_backend_cann_context& ctx, T* values,
+                    aclDataType dtype, size_t dims, int64_t* dim,
                     const char* name, aclrtStream stream = nullptr) {
-        void* device_ptr = nullptr;
         size_t n_elem = 1;
         for (size_t i = 0; i < dims; i++) {
             n_elem *= dim[i];
         }
 
         size_t n_bytes = n_elem * sizeof(T);
-        ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
-        ptrs.push_back(device_ptr);
+        void* device_ptr = ctx.alloc_buffer(n_bytes);
         if (stream == nullptr) {
             ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes,
                                   ACL_MEMCPY_HOST_TO_DEVICE));

ggml-cann/bcast.cpp renamed to ggml-cann/acl_tensor.cpp

Lines changed: 14 additions & 16 deletions

@@ -1,4 +1,5 @@
-#include "bcast.h"
+#include "acl_tensor.h"
+
 #include <algorithm>
 #include <cstring>
 

@@ -32,7 +33,8 @@ aclDataType type_mapping(ggml_type type) {
 * changed to satisfy the broadcast. @sa: get_bcast_shape.
 */
 aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
-                             size_t* bcast_nb, int64_t bcast_dims, aclFormat format) {
+                             size_t* bcast_nb, int64_t bcast_dims,
+                             aclFormat format) {
     size_t size = ggml_nbytes(tensor);
     void* deviceAddr = nullptr;
 

@@ -74,9 +76,9 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
     return acl_tensor;
 }
 
-aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne,
-                             size_t* nb, int64_t dims, aclFormat format) {
-
+aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
+                             size_t type_size, int64_t* ne, size_t* nb,
+                             int64_t dims, aclFormat format) {
     int64_t tmp_ne[GGML_MAX_DIMS * 2];
     int64_t tmp_stride[GGML_MAX_DIMS * 2];
 

@@ -88,9 +90,8 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
     std::reverse(tmp_ne, tmp_ne + dims);
     std::reverse(tmp_stride, tmp_stride + dims);
 
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0,
-                        format, tmp_ne, dims, data_ptr);
+    aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0,
+                                            format, tmp_ne, dims, data_ptr);
 
     return acl_tensor;
 }

@@ -132,8 +133,7 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
 */
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        size_t* bcast_nb_src0,
-                        size_t* bcast_nb_src1) {
+                        size_t* bcast_nb_src0, size_t* bcast_nb_src1) {
     GGML_ASSERT(ggml_can_repeat(src1, src0));
     int bcast_dim_cnt = 0;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {

@@ -147,12 +147,10 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
             // Need to add an extra dim.
             bcast_ne_src0[bcast_dim_cnt] = nr;
             bcast_ne_src1[bcast_dim_cnt] = 1;
-            bcast_nb_src0[bcast_dim_cnt] =
-                bcast_nb_src0[bcast_dim_cnt - 1] *
-                bcast_ne_src0[bcast_dim_cnt - 1];
-            bcast_nb_src1[bcast_dim_cnt] =
-                bcast_nb_src1[bcast_dim_cnt - 1] *
-                bcast_ne_src1[bcast_dim_cnt - 1];
+            bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] *
+                                           bcast_ne_src0[bcast_dim_cnt - 1];
+            bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] *
+                                           bcast_ne_src1[bcast_dim_cnt - 1];
             bcast_dim_cnt++;
         }
     }
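The reflowed lines at the end compute the byte stride of the inserted broadcast dimension with the contiguous-stride recurrence nb[i] = nb[i-1] * ne[i-1]. A self-contained numeric check of that recurrence (the shapes here are illustrative, not taken from the file):

    #include <cstdio>

    int main() {
        // ne = {4, 3, 2}: a 4x3 f32 tensor with an extra broadcast dim of
        // size 2 appended, as get_bcast_shape does when src1 must be repeated.
        long long     ne[3] = {4, 3, 2};
        unsigned long nb[3];
        nb[0] = sizeof(float);              // innermost stride: one element
        for (int i = 1; i < 3; i++) {
            nb[i] = nb[i - 1] * ne[i - 1];  // same recurrence as in the diff
        }
        std::printf("nb = {%lu, %lu, %lu}\n", nb[0], nb[1], nb[2]);  // 4, 16, 48
    }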

ggml-cann/bcast.h renamed to ggml-cann/acl_tensor.h

Lines changed: 6 additions & 7 deletions

@@ -1,5 +1,5 @@
-#ifndef CANN_BCAST_H
-#define CANN_BCAST_H
+#ifndef CANN_ACL_TENSOR_H
+#define CANN_ACL_TENSOR_H
 
 #include <aclnn/aclnn_base.h>
 

@@ -10,8 +10,7 @@ aclDataType type_mapping(ggml_type type);
 
 aclTensor* create_acl_tensor(const ggml_tensor* tensor,
                              int64_t* bcast_ne = nullptr,
-                             size_t* bcast_nb = nullptr,
-                             int64_t bcast_dims = 0,
+                             size_t* bcast_nb = nullptr, int64_t bcast_dims = 0,
                              aclFormat format = ACL_FORMAT_ND);
 
 aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,

@@ -28,12 +27,12 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
 #define BCAST_SHAPE(src0, src1)                                       \
     int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2];                       \
     int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_nb_##src0[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_nb_##src1[GGML_MAX_DIMS * 2];                        \
+    size_t bcast_nb_##src0[GGML_MAX_DIMS * 2];                        \
+    size_t bcast_nb_##src1[GGML_MAX_DIMS * 2];                        \
     int64_t bcast_dims =                                              \
         get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1, \
                         bcast_nb_##src0, bcast_nb_##src1);
 
 #define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims
 
-#endif  // CANN_BCAST_H
+#endif  // CANN_ACL_TENSOR_H
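BCAST_SHAPE and BCAST_PARAM are meant to be used as a pair around create_acl_tensor: the first declares the bcast_ne_/bcast_nb_ arrays and bcast_dims in the caller's scope, the second forwards one operand's view of the broadcast shape. A hypothetical call site, sketched under the assumption that the ggml-cann headers are included; example_binary_op is not a function from this commit:

    // Sketch only: the operands must be local variables literally named
    // src0/src1, because the macros paste those tokens into identifiers.
    void example_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ggml_tensor* src0 = dst->src[0];
        ggml_tensor* src1 = dst->src[1];

        BCAST_SHAPE(src0, src1)  // expands to the arrays + get_bcast_shape call
        aclTensor* acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0));
        aclTensor* acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1));

        // ... run the aclnn op on acl_src0/acl_src1 ...

        ACL_CHECK(aclDestroyTensor(acl_src0));
        ACL_CHECK(aclDestroyTensor(acl_src1));
    }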

ggml-cann/aclnn_ops.cpp

Lines changed: 24 additions & 21 deletions

@@ -1,10 +1,10 @@
 #include "aclnn_ops.h"
 
-#include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_cast.h>
 #include <aclnnop/aclnn_group_norm.h>
-#include <aclnnop/aclnn_softmax.h>
+#include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_repeat.h>
+#include <aclnnop/aclnn_softmax.h>
 
 #include <cmath>
 #include <cstring>

@@ -25,13 +25,14 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
                               dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
 
-    aclIntArray *repeats = aclCreateIntArray(repeatsArray, GGML_MAX_DIMS);
+    aclIntArray* repeats = aclCreateIntArray(repeatsArray, GGML_MAX_DIMS);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
-    ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, &workspaceSize, &executor));
+    ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
                                          &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
         workspaceAddr = ctx.alloc_buffer(workspaceSize);

@@ -42,7 +43,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyIntArray(repeats));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
 }
 
 void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

@@ -140,7 +140,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize,
                                        &executor));
     if (workspaceSize > 0) {
-        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();

@@ -262,7 +262,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     aclTensor* acl_src = create_acl_tensor(src);
     aclTensor* acl_dst = create_acl_tensor(dst);
-    void* buffer = ctx.alloc_buffer(ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t));
+    void* buffer = ctx.alloc_buffer(
+        ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t));
     aclTensor* tmp_tensor =
         create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne,
                           dst->nb, GGML_MAX_DIMS);

@@ -311,8 +312,8 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     std::vector<int64_t> normData = {dst->ne[0]};
     aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
-    ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, eps,
-                                             acl_dst, nullptr, nullptr,
+    ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr,
+                                             eps, acl_dst, nullptr, nullptr,
                                              &workspaceSize, &executor));
 
     if (workspaceSize > 0) {

@@ -381,33 +382,37 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src0 = create_acl_tensor(src0);
     aclTensor* acl_dst = create_acl_tensor(dst);
 
-    float scale = 1.0f;
+    float scale = 1.0f;
     float max_bias = 0.0f;
 
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
 
     aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
-    aclScalar* acl_max_bias = aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT);
+    aclScalar* acl_max_bias =
+        aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT);
 
     size_t n_bytes = ggml_nbytes(src0);
-    void *buffer = ctx.alloc_buffer(n_bytes);
-    aclTensor* temp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
+    void* buffer = ctx.alloc_buffer(n_bytes);
+    aclTensor* temp_tensor =
+        create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type),
+                          src0->ne, src0->nb, GGML_MAX_DIMS);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
-    aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor, &workspaceSize, &executor);
+    aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor, &workspaceSize,
                              &executor);
     if (workspaceSize > 0) {
         workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream stream = ctx.stream();
     aclnnMuls(workspaceAddr, workspaceSize, executor, stream);
 
-    ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(
-        temp_tensor, 3, acl_dst, &workspaceSize, &executor));
+    ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(temp_tensor, 3, acl_dst,
                                           &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
         workspaceAddr = ctx.alloc_buffer(workspaceSize);

@@ -419,6 +424,4 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-
-}
+void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {}
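Every op in this file follows the same two-phase aclnn convention visible in the hunks: an aclnn*GetWorkspaceSize call sizes the workspace and prepares an executor, the workspace is borrowed from the context pool, and the paired aclnn* call enqueues the kernel on the context stream. The sketch below assembles ggml_cann_repeat from the hunks above; the lines between the hunks (src extraction, tensor creation, and the aclnnRepeat execution call) are filled in as assumptions from the surrounding context:

    void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ggml_tensor* src = dst->src[0];               // assumed, outside the hunks
        aclTensor* acl_src = create_acl_tensor(src);  // assumed, outside the hunks
        aclTensor* acl_dst = create_acl_tensor(dst);

        int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
                                  dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
        aclIntArray* repeats = aclCreateIntArray(repeatsArray, GGML_MAX_DIMS);

        uint64_t workspaceSize = 0;
        aclOpExecutor* executor;
        void* workspaceAddr = nullptr;

        // Phase 1: size the workspace and build the executor.
        ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst,
                                              &workspaceSize, &executor));
        if (workspaceSize > 0) {
            workspaceAddr = ctx.alloc_buffer(workspaceSize);  // pooled scratch
        }

        // Phase 2: enqueue on the context stream (assumed paired call).
        ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream()));

        ACL_CHECK(aclDestroyIntArray(repeats));
        ACL_CHECK(aclDestroyTensor(acl_src));
        ACL_CHECK(aclDestroyTensor(acl_dst));
    }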

ggml-cann/aclnn_ops.h

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 #include <aclnnop/aclnn_silu.h>
 #include <aclnnop/aclnn_tanh.h>
 
-#include "bcast.h"
+#include "acl_tensor.h"
 #include "common.h"
 
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);

ggml-cann/common.h

Lines changed: 2 additions & 6 deletions

@@ -81,9 +81,7 @@ struct ggml_backend_cann_context {
         return buffer;
     }
 
-    void* alloc_buffer(size_t size) {
-        return alloc_buffer(size, 0);
-    }
+    void* alloc_buffer(size_t size) { return alloc_buffer(size, 0); }
 
     void free_buffers() {
         for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) {

@@ -107,6 +105,4 @@ struct ggml_backend_cann_context {
     aclrtStream stream() { return stream(0); }
 };
 
-
-
-#endif //CANN_COMMON_H
+#endif  // CANN_COMMON_H
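alloc_buffer(size) above is a convenience overload of a per-stream pool: allocations default to stream 0, and everything is returned in one sweep by free_buffers(), which ggml_backend_cann_free() now calls before resetting the device. A toy, host-memory-only sketch of that interface; the GGML_CANN_MAX_STREAMS value and the malloc/free stand-ins are assumptions, not the backend's actual implementation:

    #include <cstddef>
    #include <cstdlib>
    #include <vector>

    constexpr int GGML_CANN_MAX_STREAMS = 8;  // assumed value for this sketch

    struct toy_cann_context {
        std::vector<void*> buffers[GGML_CANN_MAX_STREAMS];

        void* alloc_buffer(size_t size, int stream) {
            void* ptr = std::malloc(size);  // aclrtMalloc in the real backend
            buffers[stream].push_back(ptr);
            return ptr;
        }
        void* alloc_buffer(size_t size) { return alloc_buffer(size, 0); }

        void free_buffers() {
            for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) {
                for (void* ptr : buffers[i]) {
                    std::free(ptr);         // aclrtFree in the real backend
                }
                buffers[i].clear();
            }
        }
    };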
