Skip to content

Commit d901aff

Browse files
committed
Adapt DataCopyPad for get row f16 and f32
1 parent 61715d5 commit d901aff

File tree

2 files changed

+69
-10
lines changed

2 files changed

+69
-10
lines changed

ggml/src/ggml-cann/kernels/get_row_f16.cpp

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,32 +60,64 @@ class GET_ROW_F16 {
6060

6161
// Stage `len` half elements starting at input_gm[offset] into a freshly
// allocated local tensor and enqueue it on input_queue.
//
// The transfer is issued in units of 32-byte blocks (elem_per_block
// elements): first the block-aligned body, then, if the length is not a
// multiple of a block, one extra full block whose surplus lanes are zeroed.
//
// offset: element index into input_gm where the copy starts.
// len:    number of valid elements to stage.
//
// NOTE(review): the tail copy reads (elem_per_block - tail) elements past the
// requested range in input_gm; this assumes that memory is readable - confirm.
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
    LocalTensor<half> input_local = input_queue.AllocTensor<half>();

    const size_t elem_per_block = 32 / sizeof(half);
    const size_t tail = len % elem_per_block;
    // Round down to a whole number of blocks. The previous expression
    // `len & ~elem_per_block` cleared a single bit only and did not align.
    const size_t aligned_len = len - tail;

    DataCopy(input_local, input_gm[offset], aligned_len);

    if (tail != 0) {
        // Fetch the last, partially valid block in full ...
        DataCopy(input_local[aligned_len], input_gm[offset + aligned_len],
                 elem_per_block);
        // ... and zero the lanes that lie beyond the requested length.
        for (size_t i = tail; i < elem_per_block; i++) {
            input_local[aligned_len].SetValue(i, 0);
        }
    }

    input_queue.EnQue(input_local);
}
7695

7796
// Dequeue the converted float row from output_queue, write `len` elements to
// output_gm[offset], and release the local tensor.
//
// The write is issued in units of 32-byte blocks (elem_per_block elements).
// When the length is not a multiple of a block, the copy is rounded up to the
// next block and the surplus elements in output_gm are then overwritten with
// zeros, followed by a cache clean/invalidate of that last block.
//
// offset: element index into output_gm where the row starts.
// len:    number of valid elements to write.
//
// NOTE(review): rounding up means (elem_per_block - tail) elements past the
// requested range in output_gm are written; confirm callers tolerate this.
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
    LocalTensor<float> output_local = output_queue.DeQue<float>();

    const size_t elem_per_block = 32 / sizeof(float);
    const size_t tail = len % elem_per_block;
    // Round down to a whole number of blocks. The previous expression
    // `len & ~elem_per_block` cleared a single bit only and did not align.
    size_t copy_len = len - tail;
    if (tail != 0) {
        // Include the final, partially valid block in the transfer.
        copy_len += elem_per_block;
    }

    DataCopy(output_gm[offset], output_local, copy_len);

    if (tail != 0) {
        // Zero the surplus lanes of the last block directly in global memory.
        const size_t last_block = offset + copy_len - elem_per_block;
        for (size_t i = tail; i < elem_per_block; i++) {
            output_gm[last_block].SetValue(i, 0);
        }
        DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(
            output_gm[last_block]);
    }

    output_queue.FreeTensor(output_local);
}
91123

ggml/src/ggml-cann/kernels/get_row_f32.cpp

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,32 +55,59 @@ class GET_ROW_F32 {
5555

5656
// Stage `len` float elements starting at input_gm[offset] into a freshly
// allocated local tensor and enqueue it on input_queue.
//
// The transfer is issued in units of 32-byte blocks (elem_per_block
// elements): first the block-aligned body, then, if the length is not a
// multiple of a block, one extra full block whose surplus lanes are zeroed.
//
// offset: element index into input_gm where the copy starts.
// len:    number of valid elements to stage.
//
// NOTE(review): the tail copy reads (elem_per_block - tail) elements past the
// requested range in input_gm; this assumes that memory is readable - confirm.
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
    LocalTensor<float> input_local = input_queue.AllocTensor<float>();

    const size_t elem_per_block = 32 / sizeof(float);
    const size_t tail = len % elem_per_block;
    // Round down to a whole number of blocks. The previous expression
    // `len & ~elem_per_block` cleared a single bit only and did not align.
    const size_t aligned_len = len - tail;

    DataCopy(input_local, input_gm[offset], aligned_len);

    if (tail != 0) {
        // Fetch the last, partially valid block in full ...
        DataCopy(input_local[aligned_len], input_gm[offset + aligned_len],
                 elem_per_block);
        // ... and zero the lanes that lie beyond the requested length.
        for (size_t i = tail; i < elem_per_block; i++) {
            input_local[aligned_len].SetValue(i, 0);
        }
    }

    input_queue.EnQue(input_local);
}
7185

7286
// Dequeue the gathered float row from output_queue, write `len` elements to
// output_gm[offset], and release the local tensor.
//
// The write is issued in units of 32-byte blocks (elem_per_block elements).
// When the length is not a multiple of a block, the copy is rounded up to the
// next block and the surplus elements in output_gm are then overwritten with
// zeros, followed by a cache clean/invalidate of that last block.
//
// offset: element index into output_gm where the row starts.
// len:    number of valid elements to write.
//
// NOTE(review): rounding up means (elem_per_block - tail) elements past the
// requested range in output_gm are written; confirm callers tolerate this.
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
    LocalTensor<float> output_local = output_queue.DeQue<float>();

    const size_t elem_per_block = 32 / sizeof(float);
    const size_t tail = len % elem_per_block;
    // Round down to a whole number of blocks. The previous expression
    // `len & ~elem_per_block` cleared a single bit only and did not align.
    size_t copy_len = len - tail;
    if (tail != 0) {
        // Include the final, partially valid block in the transfer.
        copy_len += elem_per_block;
    }

    DataCopy(output_gm[offset], output_local, copy_len);

    if (tail != 0) {
        // Zero the surplus lanes of the last block directly in global memory.
        const size_t last_block = offset + copy_len - elem_per_block;
        for (size_t i = tail; i < elem_per_block; i++) {
            output_gm[last_block].SetValue(i, 0);
        }
        DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(
            output_gm[last_block]);
    }

    output_queue.FreeTensor(output_local);
}
86113

0 commit comments

Comments
 (0)