Skip to content

Commit 9a62df2

Browse files
authored
Merge pull request intel#849 from kbobrovs/scsel-cfl-11-testroot-topic
Fix bug (using undefined value) in ESIMD's slm_gather_scatter_rgba.cpp (CMPLRTST-16329).
2 parents 86a34fa + ed297f7 commit 9a62df2

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

SYCL/ESIMD/api/slm_gather_scatter_rgba.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ constexpr int MASKED_LANE_NUM_REV = 1;
1818
constexpr int NUM_RGBA_CHANNELS =
1919
get_num_channels_enabled(sycl::ext::intel::esimd::rgba_channel_mask::ABGR);
2020

21+
template <class T> inline constexpr T marker = (T)0xcafebabe;
22+
2123
template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
2224
T *bufOut;
2325
Kernel(T *bufOut) : bufOut(bufOut) {}
@@ -44,8 +46,8 @@ template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
4446
}
4547

4648
// Prepare values to store into SLM in a SOA manner, e.g.:
47-
// R R R R ... G G G G ... B B B B ... A A A A ...
48-
// 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
49+
// R R R R R ...G G G G G ...B B B B B ...A A A A A ...
50+
// 00,04,08,12,16...01,05,09,13,17...02,06,10,14,18...03,07,11,15,19...
4951
simd<T, VL * numChannels> valsIn;
5052
for (unsigned i = 0; i < numChannels; i++)
5153
for (unsigned j = 0; j < VL; j++)
@@ -58,10 +60,15 @@ template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
5860
slm_scatter_rgba<T, VL, CH_MASK>(byteOffsets, valsIn);
5961

6062
// Load back values from SLM. They will be transposed back to SOA.
63+
// "_" = "undefined" (masked out lane/pixel in each channel)
64+
// 00,04,08,12,...,_,01,05,09,13,...,_,02,06,10,14,...,_,03,07,11,19,...,_
6165
simd_mask<VL> pred = 1;
6266
pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
6367
simd<T, VL *numChannels> valsOut =
6468
slm_gather_rgba<T, VL, CH_MASK>(byteOffsets, pred);
69+
// replace undefined values in the masked out lane with something verifiable
70+
valsOut.template select<NUM_RGBA_CHANNELS, VL>(VL - MASKED_LANE_NUM_REV) =
71+
marker<T>;
6572

6673
// Copy results to the output USM buffer. Maximum write block size must be
6774
// at most 8 owords, so conservatively write in chunks of 8 elements.
@@ -109,13 +116,14 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
109116
// R R R R ... G G G G ... B B B B ... A A A A ...
110117
// 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
111118
for (unsigned i = 0; i < numChannels; i++)
112-
for (unsigned j = 0; j < VL; j++)
113-
gold[i * VL + j] = j * numChannels + i;
114-
115-
// Account for masked out last lanes (with pred argument to slm_gather_rgba).
116-
unsigned maskedIndex = VL - 1;
117-
for (unsigned i = 0; i < numChannels; i++, maskedIndex += VL)
118-
gold[maskedIndex] = 0;
119+
for (unsigned j = 0; j < VL; j++) {
120+
// masked lane is assigned/verified separately:
121+
if (j == VL - MASKED_LANE_NUM_REV) {
122+
gold[i * VL + j] = marker<T>;
123+
} else {
124+
gold[i * VL + j] = j * numChannels + i;
125+
}
126+
}
119127

120128
try {
121129
// We need that many workitems
@@ -137,7 +145,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
137145
}
138146

139147
int err_cnt = 0;
140-
for (unsigned i = 0; i < size; ++i) {
148+
for (unsigned i = 0; i < size; i++) {
141149
if (A[i] != gold[i]) {
142150
if (++err_cnt < 35) {
143151
std::cerr << "failed at index " << i << ": " << A[i]

0 commit comments

Comments
 (0)