
Commit bb4b380

[ESIMD] rename slm_load4/slm_store4 to slm_gather_rgba/slm_scatter_rgba (#372)

Co-authored-by: Vyacheslav N Klochkov <[email protected]>
1 parent 7f04719 commit bb4b380
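
The rename is purely mechanical at each call site. As a rough before/after sketch (a code fragment, not a complete program; it reuses the element type, vector length, and ABGR channel mask from the tests touched below, and the variable names are illustrative):

using namespace sycl::ext::intel::experimental::esimd;

simd<uint, 16 * 4> vals;                         // 16 pixels x 4 RGBA channels
simd<uint32_t, 16> offsets(0, sizeof(uint) * 4); // per-pixel byte offsets in SLM

// Before this commit (old names):
//   slm_store4<uint, 16, rgba_channel_mask::ABGR>(vals, offsets);
//   simd<uint, 16 * 4> res = slm_load4<uint, 16, rgba_channel_mask::ABGR>(offsets);

// After this commit (renamed, same arguments):
slm_scatter_rgba<uint, 16, rgba_channel_mask::ABGR>(vals, offsets);
simd<uint16_t, 16> pred = 1; // all lanes enabled
simd<uint, 16 * 4> res =
    slm_gather_rgba<uint, 16, rgba_channel_mask::ABGR>(offsets, pred);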

File tree

6 files changed: +197 -10 lines changed

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
// REQUIRES: gpu
// UNSUPPORTED: cuda
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// The test checks functionality of the slm_gather_rgba/slm_scatter_rgba ESIMD
// API.

#include "../esimd_test_utils.hpp"

#include <CL/sycl.hpp>
#include <CL/sycl/INTEL/esimd.hpp>
#include <iostream>

using namespace cl::sycl;

constexpr int MASKED_LANE_NUM_REV = 1;
constexpr int NUM_RGBA_CHANNELS = get_num_channels_enabled(
    sycl::ext::intel::experimental::esimd::rgba_channel_mask::ABGR);

template <typename T, unsigned VL, auto CH_MASK> struct Kernel {
  T *bufOut;
  Kernel(T *bufOut) : bufOut(bufOut) {}

  void operator()(sycl::nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
    using namespace sycl::ext::intel::experimental::esimd;
    constexpr int numChannels = get_num_channels_enabled(CH_MASK);
    uint32_t i = ndi.get_global_id(0);

    // In this test, each group consists of one workitem. No barriers required.
    // Each workitem accesses a contiguous block of VL elements, where
    // each element consists of RGBA channels.
    slm_init(VL * NUM_RGBA_CHANNELS * sizeof(T));

    // Prepare initial values in SLM:
    // 0, -1, -2, -3, -4 ...
    // slm_scatter only supports VL = 16 or 32, so conservatively write in
    // chunks of 16 elements.
    constexpr unsigned numStores = (VL * NUM_RGBA_CHANNELS) / 16;
    for (int i = 0; i < numStores; i++) {
      simd<T, 16> vals(-i * 16, -1);
      simd<uint32_t, 16> fourByteOffsets(i * 16 * sizeof(T), sizeof(T));
      slm_scatter<T, 16>(vals, fourByteOffsets);
    }

    // Prepare values to store into SLM in a SOA manner, e.g.:
    // R R R R ... G G G G ... B B B B ... A A A A ...
    // 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
    simd<T, VL * numChannels> valsIn;
    for (unsigned i = 0; i < numChannels; i++)
      for (unsigned j = 0; j < VL; j++)
        valsIn[i * VL + j] = j * numChannels + i;

    // Store values to SLM. In the SLM they will be transposed into AOS:
    // R G B A R G B A ...
    // 0, 1, 2, 3, 4, 5, 6, 7 ...
    simd<uint32_t, VL> byteOffsets(0, sizeof(T) * NUM_RGBA_CHANNELS);
    slm_scatter_rgba<T, VL, CH_MASK>(valsIn, byteOffsets);

    // Load back values from SLM. They will be transposed back to SOA.
    simd<uint16_t, VL> pred = 1;
    pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
    simd<T, VL * numChannels> valsOut =
        slm_gather_rgba<T, VL, CH_MASK>(byteOffsets, pred);

    // Copy results to the output USM buffer. Maximum write block size must be
    // at most 8 owords, so conservatively write in chunks of 8 elements.
    uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
    for (unsigned i = 0; i < (VL * numChannels) / 8; i++) {
      simd<T, 8> valsToWrite = valsOut.template select<8, 1>(i * 8);
      valsToWrite.copy_to(bufOut + global_offset + i * 8);
    }
  }
};

std::string convertMaskToStr(
    sycl::ext::intel::experimental::esimd::rgba_channel_mask mask) {
  using namespace sycl::ext::intel::experimental::esimd;
  switch (mask) {
  case rgba_channel_mask::R:
    return "R";
  case rgba_channel_mask::GR:
    return "GR";
  case rgba_channel_mask::ABGR:
    return "ABGR";
  default:
    return "";
  }
  return "";
}

template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
  using namespace sycl::ext::intel::experimental::esimd;
  constexpr int numChannels = get_num_channels_enabled(CH_MASK);
  constexpr size_t size = VL * numChannels;

  std::cout << "Testing T=" << typeid(T).name() << " VL=" << VL
            << " MASK=" << convertMaskToStr(CH_MASK) << "...\n";

  auto dev = q.get_device();
  auto ctxt = q.get_context();
  T *A = static_cast<T *>(malloc_shared(size * sizeof(T), dev, ctxt));
  T *gold = new T[size];

  for (int i = 0; i < size; ++i) {
    A[i] = (T)-i;
  }

  // Fill out the array with gold values.
  // R R R R ... G G G G ... B B B B ... A A A A ...
  // 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
  for (unsigned i = 0; i < numChannels; i++)
    for (unsigned j = 0; j < VL; j++)
      gold[i * VL + j] = j * numChannels + i;

  // Account for masked out last lanes (with pred argument to slm_gather_rgba).
  unsigned maskedIndex = VL - 1;
  for (unsigned i = 0; i < numChannels; i++, maskedIndex += VL)
    gold[maskedIndex] = 0;

  try {
    // We need that many workitems
    sycl::range<1> GlobalRange{1};
    // Number of workitems in a workgroup
    sycl::range<1> LocalRange{1};
    sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};

    auto e = q.submit([&](handler &cgh) {
      Kernel<T, VL, CH_MASK> kernel(A);
      cgh.parallel_for(Range, kernel);
    });
    e.wait();
  } catch (sycl::exception const &e) {
    std::cerr << "SYCL exception caught: " << e.what() << '\n';
    free(A, ctxt);
    delete[] gold;
    return static_cast<bool>(e.code());
  }

  int err_cnt = 0;
  for (unsigned i = 0; i < size; ++i) {
    if (A[i] != gold[i]) {
      if (++err_cnt < 35) {
        std::cerr << "failed at index " << i << ": " << A[i]
                  << " != " << gold[i] << " (gold)\n";
      }
    }
  }

  if (err_cnt > 0) {
    std::cout << " pass rate: "
              << ((float)(size - err_cnt) / (float)size) * 100.0f << "% ("
              << (size - err_cnt) << "/" << size << ")\n";
  }

  free(A, ctxt);
  delete[] gold;

  std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n");
  return err_cnt > 0 ? false : true;
}

template <typename T, unsigned VL> bool test(queue q) {
  using namespace sycl::ext::intel::experimental::esimd;
  bool passed = true;
  passed &= test<T, VL, rgba_channel_mask::R>(q);
  passed &= test<T, VL, rgba_channel_mask::GR>(q);
  passed &= test<T, VL, rgba_channel_mask::ABGR>(q);
  return passed;
}

int main(void) {
  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());

  auto dev = q.get_device();
  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";

  bool passed = true;
  passed &= test<int, 8>(q);
  passed &= test<int, 16>(q);
  passed &= test<int, 32>(q);
  passed &= test<float, 8>(q);
  passed &= test<float, 16>(q);
  passed &= test<float, 32>(q);

  return passed ? 0 : 1;
}
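
The SOA-to-AOS transposition that the comments above describe can be checked with a small host-only sketch; this is not part of the commit, just plain C++ that mirrors the layout slm_scatter_rgba is expected to produce for VL = 4 with all four channels enabled (array names are hypothetical):

// Host-only illustration: SOA input (as the test builds valsIn) vs. the AOS
// order the values take in SLM after slm_scatter_rgba.
#include <cstdio>

int main() {
  constexpr int VL = 4, CH = 4;
  int soa[VL * CH], aos[VL * CH];
  // SOA input: R R R R  G G G G  B B B B   A A A A
  //            0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
  for (int c = 0; c < CH; ++c)
    for (int j = 0; j < VL; ++j)
      soa[c * VL + j] = j * CH + c;
  // AOS layout in SLM: R G B A  R G B A ... -> consecutive values 0, 1, 2, ...
  for (int j = 0; j < VL; ++j)
    for (int c = 0; c < CH; ++c)
      aos[j * CH + c] = soa[c * VL + j];
  for (int i = 0; i < VL * CH; ++i)
    std::printf("%d ", aos[i]); // prints 0 1 2 ... 15
  std::printf("\n");
  return 0;
}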

SYCL/ESIMD/histogram_256_slm.cpp

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<uint, 16> slm_data = 0;
-  slm_store<uint, 16>(slm_data, slm_offset);
+  slm_scatter<uint, 16>(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -60,7 +60,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
-  local_histogram = slm_load<uint32_t, 16>(slm_offset);
+  local_histogram = slm_gather<uint32_t, 16>(slm_offset);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
                                            local_histogram.select<8, 1>(0), 1);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),

SYCL/ESIMD/histogram_256_slm_spec.cpp

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<uint, 16> slm_data = 0;
-  slm_store<uint, 16>(slm_data, slm_offset);
+  slm_scatter<uint, 16>(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -61,7 +61,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
-  local_histogram = slm_load<uint32_t, 16>(slm_offset);
+  local_histogram = slm_gather<uint32_t, 16>(slm_offset);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
                                            local_histogram.select<8, 1>(0), 1);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),

SYCL/ESIMD/histogram_256_slm_spec_2020.cpp

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<uint, 16> slm_data = 0;
-  slm_store<uint, 16>(slm_data, slm_offset);
+  slm_scatter<uint, 16>(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -55,7 +55,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<uint, 16> local_histogram;
-  local_histogram = slm_load<uint32_t, 16>(slm_offset);
+  local_histogram = slm_gather<uint32_t, 16>(slm_offset);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(0),
                                            local_histogram.select<8, 1>(0), 1);
   flat_atomic<atomic_op::add, uint32_t, 8>(output, slm_offset.select<8, 1>(8),

SYCL/ESIMD/slm_barrier.cpp

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
     rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
     rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
 
-    slm_store4<uint, 16, rgba_channel_mask::ABGR>(rowTrans, vOffsets);
+    slm_scatter_rgba<uint, 16, rgba_channel_mask::ABGR>(rowTrans, vOffsets);
     threadOffsetInMemory += grpSize * 256;
     vOffsets += (grpSize * 256);
   }
@@ -123,7 +123,7 @@ int main(void) {
 
       v_Off = v_Off + shiftID * 64;
 
-      v_slmData = slm_load<uint, VL>(v_Off);
+      v_slmData = slm_gather<uint, VL>(v_Off);
 
       v_slmData.copy_to(B + globalID * VL);
     });

SYCL/ESIMD/slm_split_barrier.cpp

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
     rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
     rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
 
-    slm_store4<uint, 16, rgba_channel_mask::ABGR>(rowTrans, vOffsets);
+    slm_scatter_rgba<uint, 16, rgba_channel_mask::ABGR>(rowTrans, vOffsets);
     threadOffsetInMemory += grpSize * 256;
     vOffsets += (grpSize * 256);
   }
@@ -125,7 +125,7 @@ int main(void) {
 
      v_Off = v_Off + shiftID * 64;
 
-     v_slmData = slm_load<uint, VL>(v_Off);
+     v_slmData = slm_gather<uint, VL>(v_Off);
 
      v_slmData.copy_to(B + globalID * VL);
    });
