@@ -18,6 +18,8 @@ constexpr int MASKED_LANE_NUM_REV = 1;
18
18
constexpr int NUM_RGBA_CHANNELS =
19
19
get_num_channels_enabled (sycl::ext::intel::esimd::rgba_channel_mask::ABGR);
20
20
21
+ template <class T > inline constexpr T marker = (T)0xcafebabe ;
22
+
21
23
template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
22
24
T *bufOut;
23
25
Kernel (T *bufOut) : bufOut(bufOut) {}
@@ -44,8 +46,8 @@ template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
44
46
}
45
47
46
48
// Prepare values to store into SLM in a SOA manner, e.g.:
47
- // R R R R ... G G G G ... B B B B ... A A A A ...
48
- // 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
49
+ // R R R R R ...G G G G G ...B B B B B ...A A A A A ...
50
+ // 00,04,08, 12,16 ...01,05,09, 13,17 ...02,06, 10,14,18 ...03,07, 11,15,19 ...
49
51
simd<T, VL * numChannels> valsIn;
50
52
for (unsigned i = 0 ; i < numChannels; i++)
51
53
for (unsigned j = 0 ; j < VL; j++)
@@ -58,10 +60,15 @@ template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
58
60
slm_scatter_rgba<T, VL, CH_MASK>(byteOffsets, valsIn);
59
61
60
62
// Load back values from SLM. They will be transposed back to SOA.
63
+ // "_" = "undefined" (masked out lane/pixel in each channel)
64
+ // 00,04,08,12,...,_,01,05,09,13,...,_,02,06,10,14,...,_,03,07,11,19,...,_
61
65
simd_mask<VL> pred = 1 ;
62
66
pred[VL - MASKED_LANE_NUM_REV] = 0 ; // mask out the last lane
63
67
simd<T, VL *numChannels> valsOut =
64
68
slm_gather_rgba<T, VL, CH_MASK>(byteOffsets, pred);
69
+ // replace undefined values in the masked out lane with something verifiable
70
+ valsOut.template select <NUM_RGBA_CHANNELS, VL>(VL - MASKED_LANE_NUM_REV) =
71
+ marker<T>;
65
72
66
73
// Copy results to the output USM buffer. Maximum write block size must be
67
74
// at most 8 owords, so conservatively write in chunks of 8 elements.
@@ -109,13 +116,14 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
109
116
// R R R R ... G G G G ... B B B B ... A A A A ...
110
117
// 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
111
118
for (unsigned i = 0 ; i < numChannels; i++)
112
- for (unsigned j = 0 ; j < VL; j++)
113
- gold[i * VL + j] = j * numChannels + i;
114
-
115
- // Account for masked out last lanes (with pred argument to slm_gather_rgba).
116
- unsigned maskedIndex = VL - 1 ;
117
- for (unsigned i = 0 ; i < numChannels; i++, maskedIndex += VL)
118
- gold[maskedIndex] = 0 ;
119
+ for (unsigned j = 0 ; j < VL; j++) {
120
+ // masked lane is assigned/verified separately:
121
+ if (j == VL - MASKED_LANE_NUM_REV) {
122
+ gold[i * VL + j] = marker<T>;
123
+ } else {
124
+ gold[i * VL + j] = j * numChannels + i;
125
+ }
126
+ }
119
127
120
128
try {
121
129
// We need that many workitems
@@ -137,7 +145,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
137
145
}
138
146
139
147
int err_cnt = 0 ;
140
- for (unsigned i = 0 ; i < size; ++i ) {
148
+ for (unsigned i = 0 ; i < size; i++ ) {
141
149
if (A[i] != gold[i]) {
142
150
if (++err_cnt < 35 ) {
143
151
std::cerr << " failed at index " << i << " : " << A[i]
0 commit comments