Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit c8093ac

Browse files
[ESIMD] Rename gather4/scatter4 to gather_rgba/scatter_rgba (#367)
1 parent 5bf349c commit c8093ac

File tree

5 files changed

+192
-8
lines changed

5 files changed

+192
-8
lines changed

SYCL/ESIMD/PrefixSum.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
184184

185185
simd<ushort, 32> p = elm32 < remaining;
186186

187-
S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
187+
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
188+
p);
188189

189190
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
190191
cnt_table.column(0) += prev;
@@ -214,7 +215,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
214215
cnt_table.select<1, 1, 16, 1>(j, 16) +=
215216
cnt_table.replicate<1, 0, 16, 0>(j, 15);
216217
}
217-
scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
218+
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
219+
p);
218220
elm32 += 32;
219221
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
220222
prev = cnt_table.column(31);
@@ -252,7 +254,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
252254
unsigned n_iter = n_entries / 32;
253255
for (unsigned i = 0; i < n_iter; i++) {
254256

255-
S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
257+
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
256258

257259
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
258260
cnt_table.column(0) += prev;
@@ -288,7 +290,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
288290
if (i == n_iter - 1)
289291
cnt_table.column(31) -= cnt_table.column(30);
290292

291-
scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
293+
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
292294

293295
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
294296
prev = cnt_table.column(31);

SYCL/ESIMD/Prefix_Local_sum2.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,13 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
7373

7474
simd<unsigned int, 32 * TUPLE_SZ> S, T;
7575

76-
S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
76+
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
7777

7878
#pragma unroll
7979
for (int i = 1; i < PREFIX_ENTRIES / 32; i++) {
8080
element_offset += (stride_elems * 32 * TUPLE_SZ) * sizeof(unsigned);
8181
// scattered read, each inst reads 16 entries
82-
T = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
82+
T = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
8383
S += T;
8484
}
8585

SYCL/ESIMD/Prefix_Local_sum3.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
197197

198198
simd<ushort, 32> p = elm32 < remaining;
199199

200-
S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
200+
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
201+
p);
201202

202203
auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
203204
cnt_table.column(0) += prev;
@@ -226,7 +227,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
226227
cnt_table.select<1, 1, 16, 1>(j, 16) +=
227228
cnt_table.replicate<1, 0, 16, 0>(j, 15);
228229
}
229-
scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
230+
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
231+
p);
230232
elm32 += 32;
231233
element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
232234
prev = cnt_table.column(31);

SYCL/ESIMD/accessor_gather_scatter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
6767
Kernel<T, VL, STRIDE> kernel(acc);
6868
cgh.parallel_for(glob_range, kernel);
6969
});
70+
e.wait();
7071
} catch (cl::sycl::exception const &e) {
7172
std::cout << "SYCL exception caught: " << e.what() << '\n';
7273
delete[] A;
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
//==-------- usm_gather_scatter_rgba.cpp - DPC++ ESIMD on-device test -----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu
9+
// UNSUPPORTED: cuda
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
//
13+
// The test checks functionality of the gather_rgba/scatter_rgba USM-based ESIMD
14+
// intrinsics.
15+
16+
#include "esimd_test_utils.hpp"
17+
18+
#include <CL/sycl.hpp>
19+
#include <CL/sycl/INTEL/esimd.hpp>
20+
#include <iostream>
21+
22+
using namespace cl::sycl;
23+
24+
constexpr int MASKED_LANE_NUM_REV = 1;
25+
constexpr int NUM_RGBA_CHANNELS = get_num_channels_enabled(
26+
sycl::ext::intel::experimental::esimd::rgba_channel_mask::ABGR);
27+
28+
template <typename T, unsigned VL, unsigned STRIDE, auto CH_MASK>
29+
struct Kernel {
30+
T *bufIn;
31+
T *bufOut;
32+
Kernel(T *bufIn, T *bufOut) : bufIn(bufIn), bufOut(bufOut) {}
33+
34+
void operator()(id<1> i) const SYCL_ESIMD_KERNEL {
35+
using namespace sycl::ext::intel::experimental::esimd;
36+
constexpr int numChannels = get_num_channels_enabled(CH_MASK);
37+
38+
// every workitem accesses contiguous block of VL * STRIDE elements,
39+
// where each element consists of RGBA channels.
40+
uint32_t global_offset = i * VL * STRIDE * NUM_RGBA_CHANNELS;
41+
42+
simd<uint32_t, VL> byteOffsets(0, STRIDE * sizeof(T) * NUM_RGBA_CHANNELS);
43+
simd<T, VL *numChannels> v =
44+
gather_rgba<T, VL, CH_MASK>(bufIn + global_offset, byteOffsets);
45+
v += i;
46+
47+
simd<uint16_t, VL> pred = 1;
48+
pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
49+
scatter_rgba<T, VL, CH_MASK>(bufOut + global_offset, v, byteOffsets, pred);
50+
}
51+
};
52+
53+
std::string convertMaskToStr(
54+
sycl::ext::intel::experimental::esimd::rgba_channel_mask mask) {
55+
using namespace sycl::ext::intel::experimental::esimd;
56+
switch (mask) {
57+
case rgba_channel_mask::R:
58+
return "R";
59+
case rgba_channel_mask::GR:
60+
return "GR";
61+
case rgba_channel_mask::ABGR:
62+
return "ABGR";
63+
default:
64+
return "";
65+
}
66+
return "";
67+
}
68+
69+
template <typename T, unsigned VL, unsigned STRIDE, auto CH_MASK>
70+
bool test(queue q) {
71+
size_t numWorkItems = 2;
72+
size_t size = VL * STRIDE * NUM_RGBA_CHANNELS * numWorkItems;
73+
using namespace sycl::ext::intel::experimental::esimd;
74+
constexpr int numChannels = get_num_channels_enabled(CH_MASK);
75+
76+
std::cout << "Testing T=" << typeid(T).name() << " VL=" << VL
77+
<< " STRIDE=" << STRIDE << " MASK=" << convertMaskToStr(CH_MASK)
78+
<< "...\n";
79+
80+
auto dev = q.get_device();
81+
auto ctxt = q.get_context();
82+
T *A = static_cast<T *>(malloc_shared(size * sizeof(T), dev, ctxt));
83+
T *B = static_cast<T *>(malloc_shared(size * sizeof(T), dev, ctxt));
84+
T *gold = new T[size];
85+
86+
for (int i = 0; i < size; ++i) {
87+
A[i] = (T)i;
88+
B[i] = (T)-i;
89+
gold[i] = (T)-i;
90+
}
91+
92+
// Fill out the array with gold values. The kernel only writes the elements
93+
// that are not masked. For example,
94+
// for STRIDE=1 and MASK=R, we have the following indices written:
95+
// 0, 4, 8, 12 ...
96+
// for STRIDE=2 and MASK=RG, we have the following indices written:
97+
// 0, 1, 8, 9, 16, 17 ...
98+
// All the other elements will be equal to '-A[i]'.
99+
auto blockSize = VL * STRIDE * NUM_RGBA_CHANNELS;
100+
for (unsigned i = 0; i < size; i += NUM_RGBA_CHANNELS * STRIDE)
101+
for (unsigned j = 0; j < numChannels; j++)
102+
gold[i + j] = A[i + j] + (i / (blockSize));
103+
104+
// Account for masked out last lanes (with pred argument to scatter_rgba).
105+
auto maskedElementOffset = (VL - 1) * STRIDE * NUM_RGBA_CHANNELS;
106+
for (unsigned i = maskedElementOffset; i < size; i += blockSize)
107+
for (unsigned j = 0; j < numChannels; j++)
108+
gold[i + j] = -A[i + j];
109+
110+
try {
111+
range<1> glob_range{numWorkItems};
112+
auto e = q.submit([&](handler &cgh) {
113+
Kernel<T, VL, STRIDE, CH_MASK> kernel(A, B);
114+
cgh.parallel_for(glob_range, kernel);
115+
});
116+
e.wait();
117+
} catch (cl::sycl::exception const &e) {
118+
std::cout << "SYCL exception caught: " << e.what() << '\n';
119+
free(A, ctxt);
120+
free(B, ctxt);
121+
delete[] gold;
122+
return e.get_cl_code();
123+
}
124+
125+
int err_cnt = 0;
126+
for (unsigned i = 0; i < size; ++i) {
127+
if (B[i] != gold[i]) {
128+
if (++err_cnt < 35) {
129+
std::cout << "failed at index " << i << ": " << B[i]
130+
<< " != " << gold[i] << " (gold)\n";
131+
}
132+
}
133+
}
134+
135+
if (err_cnt > 0) {
136+
std::cout << " pass rate: "
137+
<< ((float)(size - err_cnt) / (float)size) * 100.0f << "% ("
138+
<< (size - err_cnt) << "/" << size << ")\n";
139+
}
140+
141+
free(A, ctxt);
142+
free(B, ctxt);
143+
delete[] gold;
144+
145+
std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n");
146+
return err_cnt > 0 ? false : true;
147+
}
148+
149+
template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
150+
using namespace sycl::ext::intel::experimental::esimd;
151+
bool passed = true;
152+
passed &= test<T, VL, STRIDE, rgba_channel_mask::R>(q);
153+
passed &= test<T, VL, STRIDE, rgba_channel_mask::GR>(q);
154+
passed &= test<T, VL, STRIDE, rgba_channel_mask::ABGR>(q);
155+
return passed;
156+
}
157+
158+
int main(void) {
159+
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
160+
161+
auto dev = q.get_device();
162+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
163+
164+
bool passed = true;
165+
passed &= test<int, 16, 1>(q);
166+
passed &= test<int, 16, 2>(q);
167+
passed &= test<int, 16, 4>(q);
168+
passed &= test<int, 32, 1>(q);
169+
passed &= test<int, 32, 3>(q);
170+
passed &= test<int, 32, 8>(q);
171+
passed &= test<float, 16, 1>(q);
172+
passed &= test<float, 16, 2>(q);
173+
passed &= test<float, 16, 4>(q);
174+
passed &= test<float, 32, 1>(q);
175+
passed &= test<float, 32, 3>(q);
176+
passed &= test<float, 32, 8>(q);
177+
178+
return passed ? 0 : 1;
179+
}

0 commit comments

Comments
 (0)