Skip to content

Commit c343f85

Browse files
committed
f32 get row 310P debug OK
1 parent d901aff commit c343f85

File tree

2 files changed

+57
-30
lines changed

2 files changed

+57
-30
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,13 +2312,21 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23122312

23132313
switch (src0->type) {
23142314
case GGML_TYPE_F32:
2315-
aclrtlaunch_ascendc_get_row_f32(
2316-
24, ctx.stream(), src0->data, src1->data, dst->data,
2317-
((ggml_tensor*)src0->extra)->ne,
2318-
((ggml_tensor*)src0->extra)->nb,
2319-
((ggml_tensor*)src1->extra)->ne,
2320-
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2321-
((ggml_tensor*)dst->extra)->nb);
2315+
{
2316+
if ((src0->ne[0] % 8) != 0) {
2317+
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
2318+
/* printf("\n\nggml_cann_get_rows: row elements:%d, src1->ne[0]:%d, src1->ne[1]:%d, src1->ne[2]%d, src0->ne[0]:%d, ggml_type_size(GGML_TYPE_F32):%d, dst_len:%d.\n", src0->ne[0],
2319+
src1->ne[0], src1->ne[1], src1->ne[2], src0->ne[0], ggml_type_size(GGML_TYPE_F32), dst_len); */
2320+
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
2321+
}
2322+
aclrtlaunch_ascendc_get_row_f32(
2323+
24, ctx.stream(), src0->data, src1->data, dst->data,
2324+
((ggml_tensor*)src0->extra)->ne,
2325+
((ggml_tensor*)src0->extra)->nb,
2326+
((ggml_tensor*)src1->extra)->ne,
2327+
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
2328+
((ggml_tensor*)dst->extra)->nb);
2329+
}
23222330
break;
23232331
case GGML_TYPE_F16:
23242332
aclrtlaunch_ascendc_get_row_f16(

ggml/src/ggml-cann/kernels/get_row_f32.cpp

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
1313
int64_t *indices_ne_ub, size_t *indices_nb_ub,
1414
int64_t *output_ne_ub, size_t *output_nb_ub) {
1515
int64_t op_block_num = GetBlockNum();
16-
int64_t op_block_idx = GetBlockIdx();
16+
op_block_idx = GetBlockIdx();
1717

1818
for (int i = 0; i < 4; i++) {
1919
input_ne[i] = input_ne_ub[i];
@@ -51,25 +51,38 @@ class GET_ROW_F32 {
5151
// All data should be aligned to 32. It's OK because all data is aligned to 32.
5252
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
5353
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
54+
// printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
5455
}
5556

5657
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
58+
size_t origin_len = len;
5759
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
5860
const size_t elem_per_block = 32 / sizeof(float);
5961
size_t tail = len % elem_per_block;
60-
len = len & ~elem_per_block;
61-
DataCopy(input_local, input_gm[offset], len);
62+
len = len & ~(elem_per_block - 1);
63+
64+
//printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
65+
if (len > 0)
66+
DataCopy(input_local, input_gm[offset], len);
67+
//printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
6268
if(tail != 0) {
69+
#if 1
70+
/* //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71+
for (int i = 0; i < elem_per_block; i++) {
72+
printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
73+
}
74+
//DumpTensor(input_gm[offset + len], 5, elem_per_block);
75+
for (int i = 0; i < tail; i++) {
76+
printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]);
77+
} */
6378
DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
6479
// clean
65-
for (int i = tail; i < elem_per_block; i++) {
66-
input_local[len].SetValue(i, 0);
80+
/* for (int i = tail; i < elem_per_block; i++) {
81+
input_local[len + i].SetValue(0, 0);
6782
}
68-
#if 0
69-
const float padVal = 0;
70-
uint64_t mask0 = ((uint64_t)1ul << 8) - ((uint64_t)1ul << tail);
71-
uint64_t mask[2] = {mask0, 0};
72-
Duplicate<float>(input_local[len], padVal, mask, 1 /*no repeat*/, 1/*no gap in block*/, 8/*no gap between repeats*/);
83+
for (int i = 0; i < elem_per_block; i++) {
84+
printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
85+
} */
7386
#endif
7487
#if 0
7588
DataCopyExtParams dataCopyParams;
@@ -87,27 +100,32 @@ class GET_ROW_F32 {
87100
LocalTensor<float> output_local = output_queue.DeQue<float>();
88101
const size_t elem_per_block = 32 / sizeof(float);
89102
size_t tail = len % elem_per_block;
90-
len = len & ~elem_per_block;
91-
//DataCopy(output_gm[offset], output_local, len);
103+
len = len & ~(elem_per_block - 1);
104+
if (len > 0) {
105+
DataCopy(output_gm[offset], output_local, len);
106+
}
107+
108+
#if 1
92109
if(tail != 0) {
93-
len += elem_per_block;
110+
//printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
111+
/* DumpTensor(output_gm[offset + len], 5, elem_per_block);
112+
DumpTensor(output_local[len], 5, elem_per_block); */
113+
SetAtomicAdd<float>();
114+
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
115+
SetAtomicNone();
116+
/* DumpTensor(output_gm[offset + len], 5, elem_per_block); */
117+
}
118+
#endif
94119
#if 0
120+
if(tail != 0) {
121+
95122
DataCopyExtParams dataCopyParams;
96123
dataCopyParams.blockCount = 1;
97124
dataCopyParams.blockLen = tail * sizeof(float);
98125
DataCopyPad(output_gm[offset + len], output_local[len],
99126
dataCopyParams);
100-
#endif
101127
}
102-
DataCopy(output_gm[offset], output_local, len);
103-
104-
if (tail != 0) { // clean
105-
for (int i = tail; i < elem_per_block; i++) {
106-
output_gm[offset + len - elem_per_block].SetValue(i, 0);
107-
}
108-
DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
109-
}
110-
128+
#endif
111129
output_queue.FreeTensor(output_local);
112130
}
113131

@@ -171,6 +189,7 @@ class GET_ROW_F32 {
171189
GlobalTensor<float> output_gm;
172190
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
173191
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
192+
int64_t op_block_idx;
174193
};
175194

176195
template <typename T>

0 commit comments

Comments
 (0)