Skip to content

Commit 30e08cf

Browse files
v-klochkovbb-sycl
authored andcommitted
[ESIMD] Add test for accessor based gather/scatter_rgba (intel#1022)
* [ESIMD] Add test for accessor based gather/scatter_rgba Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent fec6664 commit 30e08cf

File tree

6 files changed

+234
-32
lines changed

6 files changed

+234
-32
lines changed

SYCL/ESIMD/PrefixSum.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
160160
cnt_table.select<1, 1, TUPLE_SZ, 1>(0, 0) +=
161161
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
162162

163-
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
164-
simd_mask<8> p = voff < TUPLE_SZ; // predicate
163+
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
164+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
165165
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
166166
sizeof(unsigned);
167167
scatter<unsigned, 8>(buf, voff, S.select<8, 1>(0), p);
@@ -183,8 +183,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
183183

184184
simd_mask<32> p = elm32 < remaining;
185185

186-
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
187-
p);
186+
S = gather_rgba<GATHER_SCATTER_MASK>(buf, element_offset, p);
188187

189188
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
190189
cnt_table.column(0) += prev;
@@ -214,8 +213,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
214213
cnt_table.select<1, 1, 16, 1>(j, 16) +=
215214
cnt_table.replicate_vs_w_hs<1, 0, 16, 0>(j, 15);
216215
}
217-
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, S,
218-
p);
216+
scatter_rgba<GATHER_SCATTER_MASK>(buf, element_offset, S, p);
219217
elm32 += 32;
220218
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
221219
prev = cnt_table.column(31);
@@ -253,7 +251,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
253251
unsigned n_iter = n_entries / 32;
254252
for (unsigned i = 0; i < n_iter; i++) {
255253

256-
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
254+
S = gather_rgba<GATHER_SCATTER_MASK>(buf, element_offset);
257255

258256
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
259257
cnt_table.column(0) += prev;
@@ -289,7 +287,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
289287
if (i == n_iter - 1)
290288
cnt_table.column(31) -= cnt_table.column(30);
291289

292-
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, S);
290+
scatter_rgba<GATHER_SCATTER_MASK>(buf, element_offset, S);
293291

294292
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
295293
prev = cnt_table.column(31);

SYCL/ESIMD/Prefix_Local_sum2.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,13 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
7272

7373
simd<unsigned int, 32 * TUPLE_SZ> S, T;
7474

75-
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
75+
S = gather_rgba<GATHER_SCATTER_MASK>(buf, element_offset);
7676

7777
#pragma unroll
7878
for (int i = 1; i < PREFIX_ENTRIES / 32; i++) {
7979
element_offset += (stride_elems * 32 * TUPLE_SZ) * sizeof(unsigned);
8080
// scattered read, each inst reads 16 entries
81-
T = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
81+
T = gather_rgba<GATHER_SCATTER_MASK>(buf, element_offset);
8282
S += T;
8383
}
8484

@@ -93,8 +93,8 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
9393

9494
simd<unsigned, 8> result = 0;
9595
result.select<TUPLE_SZ, 1>(0) = sum;
96-
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
97-
simd_mask<8> p = voff < TUPLE_SZ; // predicate
96+
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
97+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
9898
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
9999
sizeof(unsigned);
100100
scatter<unsigned, 8>(buf, voff, result, p);

SYCL/ESIMD/Prefix_Local_sum3.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
122122
cnt_table.select<1, 1, TUPLE_SZ, 1>(0, 0) +=
123123
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
124124

125-
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
126-
simd_mask<8> p = voff < TUPLE_SZ; // predicate
125+
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
126+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
127127
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
128128
sizeof(unsigned);
129129
scatter<unsigned, 8>(buf, voff, S.select<8, 1>(0), p);
@@ -173,8 +173,8 @@ void cmk_acum_iterative_low(unsigned *buf, unsigned h_pos,
173173
cnt_table.select<1, 1, TUPLE_SZ, 1>(0, 0) +=
174174
cnt_table.select<1, 1, TUPLE_SZ, 1>(1, 0);
175175

176-
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
177-
simd_mask<8> p = voff < TUPLE_SZ; // predicate
176+
simd<unsigned, 8> voff(0, 1); // 0, 1, 2, 3
177+
simd_mask<8> p = voff < TUPLE_SZ; // predicate
178178
voff = (voff + (global_offset + stride_threads * TUPLE_SZ - TUPLE_SZ)) *
179179
sizeof(unsigned);
180180
scatter<unsigned, 8>(buf, voff, S.select<8, 1>(0), p);
@@ -196,8 +196,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
196196

197197
simd_mask<32> p = elm32 < remaining;
198198

199-
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
200-
p);
199+
S = gather_rgba<GATHER_SCATTER_MASK>(buf, element_offset, p);
201200

202201
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
203202
cnt_table.column(0) += prev;
@@ -226,8 +225,7 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
226225
cnt_table.select<1, 1, 16, 1>(j, 16) +=
227226
cnt_table.replicate_vs_w_hs<1, 0, 16, 0>(j, 15);
228227
}
229-
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, S,
230-
p);
228+
scatter_rgba<GATHER_SCATTER_MASK>(buf, element_offset, S, p);
231229
elm32 += 32;
232230
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
233231
prev = cnt_table.column(31);
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
//==-------- acc_gather_scatter_rgba.cpp - DPC++ ESIMD on-device test -----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
//
13+
// The test checks functionality of the gather_rgba/scatter_rgba accessor-based
14+
// ESIMD intrinsics.
15+
16+
#include "esimd_test_utils.hpp"
17+
18+
#include <CL/sycl.hpp>
19+
#include <iostream>
20+
#include <sycl/ext/intel/esimd.hpp>
21+
22+
using namespace cl::sycl;
23+
24+
template <typename T>
25+
using AccT = accessor<T, 1, access_mode::read_write, access::target::device>;
26+
27+
constexpr int MASKED_LANE_NUM_REV = 1;
28+
constexpr int NUM_RGBA_CHANNELS =
29+
get_num_channels_enabled(sycl::ext::intel::esimd::rgba_channel_mask::ABGR);
30+
31+
template <typename T, unsigned VL, unsigned STRIDE, auto CH_MASK>
32+
struct Kernel {
33+
AccT<T> InAcc;
34+
AccT<T> OutAcc;
35+
Kernel(AccT<T> InAcc, AccT<T> OutAcc) : InAcc(InAcc), OutAcc(OutAcc) {}
36+
37+
void operator()(id<1> i) const SYCL_ESIMD_KERNEL {
38+
using namespace sycl::ext::intel::esimd;
39+
constexpr int numChannels = get_num_channels_enabled(CH_MASK);
40+
41+
// Every workitem accesses contiguous block of VL * STRIDE elements,
42+
// where each element consists of RGBA channels.
43+
uint32_t global_offset = i * VL * STRIDE * NUM_RGBA_CHANNELS * sizeof(T);
44+
45+
simd<uint32_t, VL> byteOffsets(0, STRIDE * sizeof(T) * NUM_RGBA_CHANNELS);
46+
simd<T, VL * numChannels> v;
47+
if constexpr (CH_MASK == rgba_channel_mask::ABGR)
48+
// Check that the default mask value is ABGR.
49+
v = gather_rgba(InAcc, byteOffsets, global_offset);
50+
else
51+
v = gather_rgba<CH_MASK>(InAcc, byteOffsets, global_offset);
52+
v += (int)i;
53+
54+
simd_mask<VL> pred = 1;
55+
pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
56+
scatter_rgba<CH_MASK>(OutAcc, byteOffsets, v, global_offset, pred);
57+
}
58+
};
59+
60+
std::string convertMaskToStr(sycl::ext::intel::esimd::rgba_channel_mask mask) {
61+
using namespace sycl::ext::intel::esimd;
62+
switch (mask) {
63+
case rgba_channel_mask::R:
64+
return "R";
65+
case rgba_channel_mask::GR:
66+
return "GR";
67+
case rgba_channel_mask::ABGR:
68+
return "ABGR";
69+
default:
70+
return "";
71+
}
72+
return "";
73+
}
74+
75+
template <typename T, unsigned VL, unsigned STRIDE, auto CH_MASK>
76+
bool test(queue q) {
77+
size_t numWorkItems = 2;
78+
size_t size = VL * STRIDE * NUM_RGBA_CHANNELS * numWorkItems;
79+
using namespace sycl::ext::intel::esimd;
80+
constexpr int numChannels = get_num_channels_enabled(CH_MASK);
81+
82+
std::cout << "Testing T=" << typeid(T).name() << " VL=" << VL
83+
<< " STRIDE=" << STRIDE << " MASK=" << convertMaskToStr(CH_MASK)
84+
<< "...\t";
85+
86+
T *A = new T[size];
87+
T *B = new T[size];
88+
T *gold = new T[size];
89+
90+
for (int i = 0; i < size; ++i) {
91+
A[i] = (T)i;
92+
B[i] = (T)-i;
93+
gold[i] = (T)-i;
94+
}
95+
96+
// Fill out the array with gold values. The kernel only writes the elements
97+
// that are not masked. For example,
98+
// for STRIDE=1 and MASK=R, we have the following indices written:
99+
// 0, 4, 8, 12 ...
100+
// for STRIDE=2 and MASK=RG, we have the following indices written:
101+
// 0, 1, 8, 9, 16, 17 ...
102+
// All the other elements will be equal to '-A[i]'.
103+
auto blockSize = VL * STRIDE * NUM_RGBA_CHANNELS;
104+
for (unsigned i = 0; i < size; i += NUM_RGBA_CHANNELS * STRIDE)
105+
for (unsigned j = 0; j < numChannels; j++)
106+
gold[i + j] = A[i + j] + (i / (blockSize));
107+
108+
// Account for masked out last lanes (with pred argument to scatter_rgba).
109+
auto maskedElementOffset = (VL - 1) * STRIDE * NUM_RGBA_CHANNELS;
110+
for (unsigned i = maskedElementOffset; i < size; i += blockSize)
111+
for (unsigned j = 0; j < numChannels; j++)
112+
gold[i + j] = -A[i + j];
113+
114+
try {
115+
buffer<T, 1> InBuf(A, range<1>(size));
116+
buffer<T, 1> OutBuf(B, range<1>(size));
117+
range<1> glob_range{numWorkItems};
118+
auto e = q.submit([&](handler &cgh) {
119+
auto InAcc = InBuf.template get_access<access::mode::read_write>(cgh);
120+
auto OutAcc = OutBuf.template get_access<access::mode::read_write>(cgh);
121+
Kernel<T, VL, STRIDE, CH_MASK> kernel(InAcc, OutAcc);
122+
cgh.parallel_for(glob_range, kernel);
123+
});
124+
e.wait();
125+
} catch (sycl::exception const &e) {
126+
std::cerr << "SYCL exception caught: " << e.what() << '\n';
127+
delete[] A;
128+
delete[] B;
129+
delete[] gold;
130+
return false; // not success
131+
}
132+
133+
int err_cnt = 0;
134+
for (unsigned i = 0; i < size; ++i) {
135+
if (B[i] != gold[i]) {
136+
if (++err_cnt < 35) {
137+
std::cout << "\nFAILED at index " << i << ": " << B[i]
138+
<< " != " << gold[i] << " (gold)";
139+
}
140+
}
141+
}
142+
143+
if (err_cnt > 0) {
144+
std::cout << "\n pass rate: "
145+
<< ((float)(size - err_cnt) / (float)size) * 100.0f << "% ("
146+
<< (size - err_cnt) << "/" << size << ")\n";
147+
}
148+
149+
delete[] A;
150+
delete[] B;
151+
delete[] gold;
152+
153+
if (err_cnt == 0)
154+
std::cout << "Passed\n";
155+
return err_cnt == 0;
156+
}
157+
158+
template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
159+
using namespace sycl::ext::intel::esimd;
160+
bool passed = true;
161+
passed &= test<T, VL, STRIDE, rgba_channel_mask::R>(q);
162+
passed &= test<T, VL, STRIDE, rgba_channel_mask::GR>(q);
163+
passed &= test<T, VL, STRIDE, rgba_channel_mask::ABGR>(q);
164+
return passed;
165+
}
166+
167+
int main(void) {
168+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
169+
170+
auto dev = q.get_device();
171+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
172+
173+
bool passed = true;
174+
passed &= test<int, 16, 1>(q);
175+
passed &= test<int, 16, 2>(q);
176+
passed &= test<int, 16, 4>(q);
177+
passed &= test<int, 32, 1>(q);
178+
passed &= test<int, 32, 3>(q);
179+
passed &= test<int, 32, 8>(q);
180+
passed &= test<float, 16, 1>(q);
181+
passed &= test<float, 16, 2>(q);
182+
passed &= test<float, 16, 4>(q);
183+
passed &= test<float, 32, 1>(q);
184+
passed &= test<float, 32, 3>(q);
185+
passed &= test<float, 32, 8>(q);
186+
187+
passed &= test<int, 8, 1>(q);
188+
passed &= test<int, 8, 3>(q);
189+
passed &= test<int, 8, 8>(q);
190+
passed &= test<float, 8, 1>(q);
191+
passed &= test<float, 8, 2>(q);
192+
passed &= test<float, 8, 4>(q);
193+
194+
std::cout << (passed ? "All tests passed.\n" : "Some tests failed!\n");
195+
return passed ? 0 : 1;
196+
}

SYCL/ESIMD/api/esimd_rgba_smoke.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ bool test_impl(queue q) {
9292
cgh.single_task<TestID<T, NPixels, static_cast<int>(ChMask)>>(
9393
[=]() SYCL_ESIMD_KERNEL {
9494
constexpr unsigned NElems = NPixels * NOnChs;
95-
simd<T, NPixels> offsets(0, sizeof(T) * NAllChs);
96-
simd<T, NElems> p = gather_rgba<T, NPixels, ChMask>(A, offsets);
95+
simd<unsigned int, NPixels> offsets(0, sizeof(T) * NAllChs);
96+
simd<T, NElems> p = gather_rgba<ChMask>(A, offsets);
9797
// simply scatter back to B - should give same results as A in
9898
// enabled channels, the rest should remain zero:
99-
scatter_rgba<T, NPixels, ChMask>(B, offsets, p);
99+
scatter_rgba<ChMask>(B, offsets, p);
100100
// copy instead of scattering to C - thus getting AOS to SOA layout
101101
// layout conversion:
102102
// R0 R1 ... G0 G1 ... B0 B1 ... A0 A1 ...

0 commit comments

Comments
 (0)