
Commit 6201f9f

free temp buffer when sync stream

1 parent 4b55e48 commit 6201f9f

File tree

4 files changed: +120 -112 lines changed

ggml-cann.cpp

Lines changed: 8 additions & 0 deletions
@@ -444,7 +444,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             // Do nothing with these ops.
             break;
         case GGML_OP_DIAG_MASK_INF:
+            return false;
         case GGML_OP_SOFT_MAX:
+            ggml_cann_softmax(ctx, dst);
+            break;
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
@@ -595,6 +598,9 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
         (ggml_backend_cann_context*)backend->context;
 
     ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
+
+    // Free temp buffers binding to each stream.
+    cann_ctx->free_buffers();
 }
 
 GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
@@ -670,7 +676,9 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_CONT:
            return true;
         case GGML_OP_DIAG_MASK_INF:
+            return false;
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
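
The synchronize hunk above is the heart of the commit: each ggml_cann_* op now borrows temp memory from its context instead of owning it, and the backend releases everything in ggml_backend_cann_synchronize() once the stream is idle. The implementation of alloc_buffer()/free_buffers() lives in ggml-cann/common.h (one of the four changed files, not rendered on this page); a minimal sketch of the assumed pattern, with illustrative member names, might look like:

    // Hypothetical sketch of the stream-bound temp-buffer pool this commit
    // relies on; the real definition is in ggml-cann/common.h. ACL_CHECK,
    // aclrtMalloc and aclrtFree are the same calls used in the diffs below.
    struct ggml_backend_cann_context {
        // ... device id, streams, etc. ...
        std::vector<void*> temp_buffers;  // assumed member name

        // Returns device memory that stays valid until the next stream sync.
        void* alloc_buffer(size_t size) {
            void* ptr = nullptr;
            ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
            temp_buffers.push_back(ptr);
            return ptr;
        }

        // Safe only after aclrtSynchronizeStream(): no queued kernel can
        // still be reading these buffers.
        void free_buffers() {
            for (void* ptr : temp_buffers) {
                ACL_CHECK(aclrtFree(ptr));
            }
            temp_buffers.clear();
        }
    };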

ggml-cann/aclnn_ops.cpp

Lines changed: 68 additions & 82 deletions
@@ -3,12 +3,14 @@
 #include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_cast.h>
 #include <aclnnop/aclnn_group_norm.h>
+#include <aclnnop/aclnn_softmax.h>
 
 #include <cmath>
 #include <cstring>
 #include <vector>
 
 // TODO: repeat is implemented through add to apply bcast. Optimize it.
+// change to use aclnnRepeat
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
     GGML_ASSERT(ggml_can_repeat(src, dst));
@@ -47,8 +49,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
                                                   &workspaceSize, &executor));
         if (workspaceSize > 0) {
-            ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
+            workspaceAddr = ctx.alloc_buffer(workspaceSize);
         }
 
         ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
@@ -57,10 +58,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ACL_CHECK(aclDestroyScalar(alpha));
         ACL_CHECK(aclDestroyTensor(acl_src));
         ACL_CHECK(aclDestroyTensor(acl_dst));
-
-        if (workspaceSize > 0) {
-            ACL_CHECK(aclrtFree(workspaceAddr));
-        }
     }
 }

@@ -95,11 +92,8 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
                                        &workspaceSize, &executor));
-    // TODO, workspace should free after sync. Add alloc memory to
-    // backend_buffer.
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();
@@ -109,10 +103,6 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_src0));
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -136,8 +126,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
         acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();
@@ -147,10 +136,6 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyScalar(acl_negative_slope));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -167,22 +152,18 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
-    ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 2, acl_dst, &workspaceSize,
+    // dim1 == ne2, dims in llama.cpp is reversed.
+    ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize,
                                        &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();
     ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream));
 
     aclDestroyTensorList(tensorList);
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
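
The new comment in ggml_cann_concat flags a general pitfall: ggml stores ne[0] as the innermost, fastest-varying dimension, while aclnn dim indices count from the outermost, so for a 4-D tensor ggml's ne[2] is aclnn's dim 1. A small illustrative mapping (hypothetical helper, assuming GGML_MAX_DIMS == 4):

    // ggml:  ne[3]  ne[2]  ne[1]  ne[0]   (innermost last)
    // aclnn: dim 0  dim 1  dim 2  dim 3   (outermost first)
    static int64_t ggml_axis_to_acl_dim(int ggml_axis) {
        return GGML_MAX_DIMS - 1 - ggml_axis;  // e.g. ne[2] -> dim 1
    }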
@@ -210,8 +191,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
                                           &workspaceSize, &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();
@@ -221,10 +201,6 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyScalar(acl_end));
     ACL_CHECK(aclDestroyScalar(acl_step));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -254,9 +230,9 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
                                          &workspaceSize, &executor));
-    if (workspaceSize > 0)
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }
 
     aclrtStream main_stream = ctx.stream();
     ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream));
@@ -265,10 +241,6 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyScalar(acl_max));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -288,20 +260,16 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
                                         &executor));
-    if (workspaceSize > 0)
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }
 
     aclrtStream main_stream = ctx.stream();
     ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream));
 
     ACL_CHECK(aclDestroyScalar(scale));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -310,10 +278,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     aclTensor* acl_src = create_acl_tensor(src);
     aclTensor* acl_dst = create_acl_tensor(dst);
-    void* buffer = nullptr;
-    ACL_CHECK(aclrtMalloc(
-        &buffer, ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t),
-        ACL_MEM_MALLOC_HUGE_FIRST));
+    void* buffer = ctx.alloc_buffer(ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t));
     aclTensor* tmp_tensor =
         create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne,
                           dst->nb, GGML_MAX_DIMS);
@@ -326,39 +291,25 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
         &workspaceSize, &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream main_stream = ctx.stream();
     ACL_CHECK(
         aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream));
 
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-        workspaceSize = 0;
-    }
-
+    workspaceSize = 0;
     ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type),
                                         acl_dst, &workspaceSize, &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream));
 
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(tmp_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    // TODO: optimize argsort kernel or free tmp buffers after stream sync.
-    ACL_CHECK(aclrtSynchronizeStream(main_stream));
-    ACL_CHECK(aclrtFree(buffer));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
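
A note on the argsort hunk: the temp buffer exists because aclnnArgsort emits int64 indices while the ggml dst is typically a narrower integer type, so the result is cast back afterwards. The data flow after this commit, sketched in comments:

    // src --aclnnArgsort--> tmp_tensor (ACL_INT64, pooled buffer)
    // tmp_tensor --aclnnCast--> dst (type_mapping(dst->type))
    //
    // Before this commit the function called aclrtSynchronizeStream() just to
    // free `buffer`, stalling the whole stream at every argsort; now the
    // buffer simply lives until the backend-level sync.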
@@ -381,8 +332,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                            &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream stream = ctx.stream();
@@ -392,10 +342,6 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyIntArray(norm));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
 }
 
 void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -419,8 +365,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     int64_t ne[] = {n_groups, N};
     size_t nb[] = {type_size, type_size * n_groups};
     size_t n_bytes = N * n_groups;
-    void* buffer;
-    ACL_CHECK(aclrtMalloc(&buffer, n_bytes * 2, ACL_MEM_MALLOC_HUGE_FIRST));
+    void* buffer = ctx.alloc_buffer(n_bytes * 2);
     aclTensor* acl_mean_out =
         create_acl_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
     aclTensor* acl_rstd_out = create_acl_tensor(
@@ -431,8 +376,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         acl_mean_out, acl_rstd_out, &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
 
     aclrtStream stream = ctx.stream();
@@ -443,12 +387,54 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(acl_dst));
     ACL_CHECK(aclDestroyTensor(acl_mean_out));
     ACL_CHECK(aclDestroyTensor(acl_rstd_out));
+}
+
+// TODO: need alibi.
+void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];  // mask; not used yet
+
+    aclTensor* acl_src0 = create_acl_tensor(src0);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
+
+    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+    aclScalar* acl_max_bias =
+        aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT);  // not used yet
+
+    // Temp tensor holds scale * src0 before the softmax.
+    size_t n_bytes = ggml_nbytes(src0);
+    void* buffer = ctx.alloc_buffer(n_bytes);
+    aclTensor* temp_tensor =
+        create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type),
+                          src0->ne, src0->nb, GGML_MAX_DIMS);
 
-    // TODO: free after sync.
-    ACL_CHECK(aclrtSynchronizeStream(stream));
-    ACL_CHECK(aclrtFree(buffer));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
 
+    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor,
+                                        &workspaceSize, &executor));
     if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
     }
+
+    aclrtStream stream = ctx.stream();
+    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, stream));
+
+    // Softmax over dim 3, the innermost dim in aclnn's outermost-first order.
+    ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(temp_tensor, 3, acl_dst,
+                                           &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }
+
+    ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+
+}
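
For reference, GGML_OP_SOFT_MAX carries {scale, max_bias} in op_params, and the aclnnMuls + aclnnSoftmax sequence above computes dst = softmax(scale * src0) over the innermost dimension (mask and max_bias/ALiBi handling are still TODO, as the comment says). A host-side sketch of that math, assuming contiguous float rows of ne0 elements:

    #include <cmath>
    #include <cstddef>

    // Reference for what the kernel pair computes: one numerically stable
    // softmax per row, after scaling. Illustrative only.
    static void softmax_ref(const float* src, float* dst, size_t nrows,
                            size_t ne0, float scale) {
        for (size_t r = 0; r < nrows; ++r) {
            const float* x = src + r * ne0;
            float* y = dst + r * ne0;
            float max_val = -INFINITY;
            for (size_t i = 0; i < ne0; ++i) {
                max_val = fmaxf(max_val, scale * x[i]);
            }
            float sum = 0.0f;
            for (size_t i = 0; i < ne0; ++i) {
                y[i] = expf(scale * x[i] - max_val);
                sum += y[i];
            }
            for (size_t i = 0; i < ne0; ++i) {
                y[i] /= sum;
            }
        }
    }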
