Commit e9d40dd

f16_get_row debug OK

1 parent c343f85 commit e9d40dd

3 files changed: +40 -32 lines

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 8 additions & 0 deletions

@@ -2329,6 +2329,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             }
             break;
         case GGML_TYPE_F16:
+        {
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
+                /* printf("\n\nggml_cann_get_rows: row elements:%d, src1->ne[0]:%d, src1->ne[1]:%d, src1->ne[2]:%d, src0->ne[0]:%d, ggml_type_size(GGML_TYPE_F32):%d, dst_len:%d.\n", src0->ne[0],
+                   src1->ne[0], src1->ne[1], src1->ne[2], src0->ne[0], ggml_type_size(GGML_TYPE_F32), dst_len); */
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2337,6 +2344,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
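
The guard added above zero-fills the whole f32 destination before launching the f16 kernel whenever the row width (src0->ne[0]) is not a multiple of 16 half-elements. Judging from the kernel change below, the last 32-byte block of each row is now written with an atomic add, so any stale bytes in dst would be accumulated into the result. A minimal host-side sketch of the size computation and guard; the helper name is hypothetical and std::memset stands in for the aclrtMemset call in the diff:

    #include <cstdint>
    #include <cstddef>
    #include <cstring>

    // Hypothetical helper mirroring the guard in ggml_cann_get_rows (F16 case).
    // dst is the f32 output buffer; src0_ne0 is the f16 source row width;
    // src1_ne[0..2] describe how many row indices are gathered.
    static void pre_zero_dst_if_unaligned(void* dst, int64_t src0_ne0, const int64_t src1_ne[3]) {
        if ((src0_ne0 % 16) != 0) {
            // rows gathered * elements per row * sizeof(float);
            // the output is f32 even though the input rows are f16.
            size_t dst_len = (size_t)src1_ne[0] * src1_ne[1] * src1_ne[2] * src0_ne0 * sizeof(float);
            std::memset(dst, 0, dst_len);  // aclrtMemset((char*)dst, dst_len, 0, dst_len) in the diff
        }
    }

    int main() {
        int64_t src1_ne[3] = {4, 1, 1};   // gather 4 rows
        float dst[4 * 10];                // row width 10 is not a multiple of 16
        pre_zero_dst_if_unaligned(dst, /*src0_ne0=*/10, src1_ne);
        return 0;
    }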

ggml/src/ggml-cann/kernels/get_row_f16.cpp

Lines changed: 29 additions & 32 deletions

@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                  int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();

         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,65 +59,61 @@ class GET_ROW_F16 {
     }

     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
         const size_t elem_per_block = 32 / sizeof(half);
         size_t tail = len % elem_per_block;
-        len = len & ~elem_per_block;
-        DataCopy(input_local, input_gm[offset], len);
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
-            // clean
-            for (int i = tail; i < elem_per_block; i++) {
-                input_local[len].SetValue(i, 0);
-            }
-#if 0
-            const half padVal = 0;
-            uint64_t mask0 = ((uint64_t)1ul << 16) - ((uint64_t)1ul << tail);
-            uint64_t mask[2] = {mask0, 0};
-            Duplicate<half>(input_local[len], padVal, mask, 1 /*no repeat*/, 1/*no gap in block*/, 8/*no gap between repeats*/);
-#endif
-
+            //printf("f16 get_row: copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", offset, len, origin_len, tail, elem_per_block);
+            //DumpTensor(input_local, 5, elem_per_block);
+            len += elem_per_block;
+            // DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
 #if 0
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(half);
             DataCopyPadExtParams<half> padParams;
             DataCopyPad(input_local[len], input_gm[offset + len],
                         dataCopyParams, padParams);
-
-            uint16_t rightPadNum = 32 / sizeof(half) - tail;
-            PadParams padParas{0, rightPadNum, 0};
-            Pad(input_local[len], input_gm[offset + len], padParas, tilingData.padTilingData);
 #endif
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }

     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
         const size_t elem_per_block = 32 / sizeof(float);
         size_t tail = len % elem_per_block;
-        len = len & ~elem_per_block;
-        // DataCopy(output_gm[offset], output_local, len);
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+#if 1
         if(tail != 0) {
-            len += elem_per_block;
+            /* printf("\nf16 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
+               DumpTensor(output_gm, 5, elem_per_block); */
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            // DumpTensor(output_local[len], 5, elem_per_block);
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+            // DumpTensor(output_gm, 5, elem_per_block);
+        }
+#endif
+
 #if 0
+        if(tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
-#endif
-        }
-        DataCopy(output_gm[offset], output_local, len);
-        // clean
-        if (tail != 0) {
-            for (int i = tail; i < elem_per_block; i++) {
-                output_gm[offset + len - elem_per_block].SetValue(i, 0);
-            }
-            DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
         }
-
+#endif
         output_queue.FreeTensor(output_local);
     }

@@ -182,6 +178,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };

 template <typename T>
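
The central fix in both copy_in and copy_out is the rounding mask: the old expression len & ~elem_per_block clears a single bit, while the new len & ~(elem_per_block - 1) rounds len down to a multiple of elem_per_block (16 halves, or 8 floats, per 32-byte block). The remaining tail is then handled as a full 32-byte block and written under SetAtomicAdd. A standalone host-side C++ sketch (not AscendC) showing how the two masks differ; all names are local to the example:

    #include <cstdio>
    #include <cstddef>
    #include <initializer_list>

    int main() {
        // 16 half (2-byte) elements fit in one 32-byte block, as in the kernel.
        const size_t elem_per_block = 32 / sizeof(unsigned short);
        for (size_t len : {7, 16, 23, 48, 50}) {
            size_t old_mask = len & ~elem_per_block;        // old expression: clears only bit 4
            size_t new_mask = len & ~(elem_per_block - 1);  // new expression: round down to multiple of 16
            printf("len=%2zu  old=%2zu  new=%2zu\n", len, old_mask, new_mask);
        }
        return 0;
    }

With elem_per_block = 16, the old expression turns len = 48 into 32 and len = 23 into 7, so even aligned lengths were mishandled; the new one yields 48 and 16, leaving tail = len % 16 elements for the atomic-add path.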

ggml/src/ggml-cann/kernels/get_row_f32.cpp

Lines changed: 3 additions & 0 deletions

@@ -107,6 +107,9 @@ class GET_ROW_F32 {

 #if 1
         if(tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
             //printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
             /* DumpTensor(output_gm[offset + len], 5, elem_per_block);
                DumpTensor(output_local[len], 5, elem_per_block); */
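
The f32 kernel gets the same tail handling as the f16 one: lanes [tail, elem_per_block) of the last local block are zeroed before the block is copied out under SetAtomicAdd, so only the valid tail values are accumulated into the destination. A plain C++ sketch of that idea, with ordinary arrays standing in for LocalTensor/GlobalTensor and an element-wise loop standing in for the atomic-add DataCopy:

    #include <cstdio>

    int main() {
        const int elem_per_block = 32 / sizeof(float);      // 8 f32 elements per 32-byte block
        float output_local[8] = {1, 2, 3, 9, 9, 9, 9, 9};   // 3 valid values, 5 stale lanes
        float output_gm[8]    = {0};                        // destination assumed to start at zero
        const int tail = 3;

        for (int i = tail; i < elem_per_block; i++)         // clear the padding lanes
            output_local[i] = 0;
        for (int i = 0; i < elem_per_block; i++)            // stand-in for the atomic-add block copy
            output_gm[i] += output_local[i];

        for (int i = 0; i < elem_per_block; i++)
            printf("%g ", output_gm[i]);                    // prints: 1 2 3 0 0 0 0 0
        printf("\n");
        return 0;
    }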
