#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_group_norm.h>
+#include <aclnnop/aclnn_softmax.h>

#include <cmath>
#include <cstring>
#include <vector>

// TODO: repeat is implemented through add to apply bcast. Optimize it.
+// change to use aclnnRepeat
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    GGML_ASSERT(ggml_can_repeat(src, dst));
@@ -47,8 +49,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha,
                                                  &workspaceSize, &executor));
        if (workspaceSize > 0) {
-            ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
+            workspaceAddr = ctx.alloc_buffer(workspaceSize);
        }

        ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor,
@@ -57,10 +58,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ACL_CHECK(aclDestroyScalar(alpha));
        ACL_CHECK(aclDestroyTensor(acl_src));
        ACL_CHECK(aclDestroyTensor(acl_dst));
-
-        if (workspaceSize > 0) {
-            ACL_CHECK(aclrtFree(workspaceAddr));
-        }
    }
}

@@ -95,11 +92,8 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
                                       &workspaceSize, &executor));
-    // TODO, workspace should free after sync. Add alloc memory to
-    // backend_buffer.
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
@@ -109,10 +103,6 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_src0));
    ACL_CHECK(aclDestroyTensor(acl_src1));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -136,8 +126,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclnnLeakyReluGetWorkspaceSize(
        acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
@@ -147,10 +136,6 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyScalar(acl_negative_slope));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -167,22 +152,18 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

-    ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 2, acl_dst, &workspaceSize,
+    // dim1 == ne2, dims in llama.cpp is reversed.
+    ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize,
                                       &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream));

    aclDestroyTensorList(tensorList);
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -210,8 +191,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst,
                                          &workspaceSize, &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
@@ -221,10 +201,6 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyScalar(acl_end));
    ACL_CHECK(aclDestroyScalar(acl_step));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -254,9 +230,9 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst,
                                         &workspaceSize, &executor));
-    if (workspaceSize > 0)
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }

    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream));
@@ -265,10 +241,6 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyScalar(acl_max));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -288,20 +260,16 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
                                        &executor));
-    if (workspaceSize > 0)
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }

    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream));

    ACL_CHECK(aclDestroyScalar(scale));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -310,10 +278,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    aclTensor* acl_src = create_acl_tensor(src);
    aclTensor* acl_dst = create_acl_tensor(dst);
-    void* buffer = nullptr;
-    ACL_CHECK(aclrtMalloc(
-        &buffer, ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t),
-        ACL_MEM_MALLOC_HUGE_FIRST));
+    void* buffer = ctx.alloc_buffer(ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t));
    aclTensor* tmp_tensor =
        create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne,
                          dst->nb, GGML_MAX_DIMS);
@@ -326,39 +291,25 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
        &workspaceSize, &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(
        aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream));

-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-        workspaceSize = 0;
-    }
-
+    workspaceSize = 0;
    ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type),
                                        acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream));

    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(tmp_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    // TODO: optimize argsort kernel or free tmp buffers after stream sync.
-    ACL_CHECK(aclrtSynchronizeStream(main_stream));
-    ACL_CHECK(aclrtFree(buffer));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -381,8 +332,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        &workspaceSize, &executor));

    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream stream = ctx.stream();
@@ -392,10 +342,6 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyIntArray(norm));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
-
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
-    }
}

void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -419,8 +365,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    int64_t ne[] = {n_groups, N};
    size_t nb[] = {type_size, type_size * n_groups};
    size_t n_bytes = N * n_groups;
-    void* buffer;
-    ACL_CHECK(aclrtMalloc(&buffer, n_bytes * 2, ACL_MEM_MALLOC_HUGE_FIRST));
+    void* buffer = ctx.alloc_buffer(n_bytes * 2);
    aclTensor* acl_mean_out =
        create_acl_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
    aclTensor* acl_rstd_out = create_acl_tensor(
@@ -431,8 +376,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        acl_mean_out, acl_rstd_out, &workspaceSize, &executor));

    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
-                              ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }

    aclrtStream stream = ctx.stream();
@@ -443,12 +387,54 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_dst));
    ACL_CHECK(aclDestroyTensor(acl_mean_out));
    ACL_CHECK(aclDestroyTensor(acl_rstd_out));
+}
+
+// TODO: need alibi.
+void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
+    aclTensor* acl_src0 = create_acl_tensor(src0);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+    aclScalar* acl_max_bias = aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT);
+
+    size_t n_bytes = ggml_nbytes(src0);
+    void* buffer = ctx.alloc_buffer(n_bytes);
+    aclTensor* temp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);

-    // TODO: free after sync.
-    ACL_CHECK(aclrtSynchronizeStream(stream));
-    ACL_CHECK(aclrtFree(buffer));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;

+    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor, &workspaceSize, &executor));
    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtFree(workspaceAddr));
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
    }
+
+    aclrtStream stream = ctx.stream();
+    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, stream));
+
+    ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(
+        temp_tensor, 3, acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        workspaceAddr = ctx.alloc_buffer(workspaceSize);
+    }
+
+    ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream));
+
+    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
}
+
+void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+
+}
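Every hunk above makes the same substitution: the per-operator aclrtMalloc/aclrtFree pairs (and the stream syncs that guarded them) are replaced by ctx.alloc_buffer(), so temporary workspaces come from memory owned by the backend context and, per the removed TODOs ("workspace should free after sync", "free after sync"), are released only after the stream has been synchronized. The implementation of alloc_buffer is not part of this diff; the sketch below is only a hypothetical host-side illustration of such a context-owned scratch pool, with std::malloc/std::free standing in for aclrtMalloc/aclrtFree on the device.

    // Hypothetical illustration only: the real ggml_backend_cann_context::alloc_buffer()
    // is not shown in this diff.
    #include <cstddef>
    #include <cstdlib>
    #include <vector>

    struct scratch_pool {
        std::vector<void*> blocks;  // buffers handed out since the last release

        // Hand out a temporary buffer; it stays valid until release_all() runs.
        void* alloc_buffer(size_t size) {
            void* ptr = std::malloc(size);  // stands in for aclrtMalloc(..., ACL_MEM_MALLOC_HUGE_FIRST)
            blocks.push_back(ptr);
            return ptr;
        }

        // Called once by the backend after the stream sync, not by each operator.
        void release_all() {
            for (void* ptr : blocks) {
                std::free(ptr);  // stands in for aclrtFree
            }
            blocks.clear();
        }

        ~scratch_pool() { release_all(); }
    };

With a pool like this, an operator asks for pool.alloc_buffer(workspaceSize) where it previously called aclrtMalloc, and the backend releases everything in one place after aclrtSynchronizeStream, which is what lets the per-operator aclrtFree blocks be deleted throughout this diff.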