Skip to content

Commit 1b8b26a

Browse files
authored
[SYCL] Fix errors caused by half/double types in reduction (#1912)
1) Added a fix/workaround in half types support that caused a compilation error for unary minus applied to half type operand. 2) Separated the test cases for half and double types to two new reduction LIT tests. It is needed to handle them more easily on devices where 'half' and/or 'double' extensions are not supported. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent e94d82f commit 1b8b26a

10 files changed

+125
-29
lines changed

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class reducer<T, BinaryOperation,
219219
static enable_if_t<IsMaximumIdentityOp<_T, _BinaryOperation>::value, _T>
220220
getIdentity() {
221221
return std::numeric_limits<_T>::has_infinity
222-
? -std::numeric_limits<_T>::infinity()
222+
? static_cast<_T>(-std::numeric_limits<_T>::infinity())
223223
: std::numeric_limits<_T>::lowest();
224224
}
225225

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// UNSUPPORTED: cuda
2+
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
3+
//
4+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
5+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
6+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
7+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
10+
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
11+
12+
// This test performs basic checks of parallel_for(nd_range, reduction, func)
13+
// used with 'double' type.
14+
15+
#include "reduction_nd_ext_type.hpp"
16+
17+
int main() {
18+
return runTests<double>("cl_khr_double");
19+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// UNSUPPORTED: cuda
2+
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
3+
//
4+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6+
7+
// TODO: Enable the test for CPU/ACC when they support half type.
8+
// RUNx: %CPU_RUN_PLACEHOLDER %t.out
9+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
10+
11+
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
12+
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
13+
14+
// This test performs basic checks of parallel_for(nd_range, reduction, func)
15+
// used with 'half' type.
16+
17+
#include "reduction_nd_ext_type.hpp"
18+
19+
int main() {
20+
return runTests<half>("cl_khr_fp16");
21+
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// This test performs basic checks of parallel_for(nd_range, reduction, func)
2+
// with types that may require additional runtime checks for extensions
3+
// supported by the device, e.g. 'half' or 'double'
4+
5+
#include "reduction_utils.hpp"
6+
#include <CL/sycl.hpp>
7+
#include <cassert>
8+
9+
using namespace cl::sycl;
10+
11+
template <typename T, int Dim, class BinaryOperation>
12+
class SomeClass;
13+
14+
template <typename T, int Dim, access::mode Mode, class BinaryOperation>
15+
void test(T Identity, size_t WGSize, size_t NWItems) {
16+
buffer<T, 1> InBuf(NWItems);
17+
buffer<T, 1> OutBuf(1);
18+
19+
// Initialize.
20+
BinaryOperation BOp;
21+
T CorrectOut;
22+
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
23+
24+
if (Mode == access::mode::read_write)
25+
(OutBuf.template get_access<access::mode::write>())[0] = Identity;
26+
27+
// Compute.
28+
queue Q;
29+
Q.submit([&](handler &CGH) {
30+
auto In = InBuf.template get_access<access::mode::read>(CGH);
31+
accessor<T, Dim, Mode, access::target::global_buffer>
32+
Out(OutBuf, CGH);
33+
auto Redu = intel::reduction(Out, Identity, BOp);
34+
35+
range<1> GlobalRange(NWItems);
36+
range<1> LocalRange(WGSize);
37+
nd_range<1> NDRange(GlobalRange, LocalRange);
38+
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
39+
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
40+
Sum.combine(In[NDIt.get_global_linear_id()]);
41+
});
42+
});
43+
44+
// Check correctness.
45+
auto Out = OutBuf.template get_access<access::mode::read>();
46+
T ComputedOut = *(Out.get_pointer());
47+
T MaxDiff = 3 * std::numeric_limits<T>::epsilon() * std::fabs(ComputedOut + CorrectOut);
48+
if (std::fabs(static_cast<T>(ComputedOut - CorrectOut)) > MaxDiff) {
49+
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
50+
std::cout << "Computed value: " << ComputedOut
51+
<< ", Expected value: " << CorrectOut
52+
<< ", MaxDiff = " << MaxDiff << "\n";
53+
assert(0 && "Wrong value.");
54+
}
55+
}
56+
57+
template <typename T>
58+
int runTests(const string_class &ExtensionName) {
59+
device D = default_selector().select_device();
60+
if (!D.is_host() && !D.has_extension(ExtensionName)) {
61+
std::cout << "Test skipped\n";
62+
return 0;
63+
}
64+
65+
// Check some less standards WG sizes and corner cases first.
66+
test<T, 1, access::mode::read_write, std::multiplies<T>>(0, 4, 4);
67+
test<T, 0, access::mode::discard_write, intel::plus<T>>(0, 4, 64);
68+
69+
test<T, 0, access::mode::read_write, intel::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
70+
test<T, 1, access::mode::discard_write, intel::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
71+
72+
#if __cplusplus >= 201402L
73+
test<T, 1, access::mode::read_write, intel::plus<>>(1, 3, 3 * 5);
74+
test<T, 1, access::mode::discard_write, intel::minimum<>>(getMaximumFPValue<T>(), 3, 3);
75+
test<T, 0, access::mode::discard_write, intel::maximum<>>(getMinimumFPValue<T>(), 3, 3);
76+
#endif // __cplusplus >= 201402L
77+
78+
std::cout << "Test passed\n";
79+
return 0;
80+
}

sycl/test/reduction/reduction_nd_s0_dw.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,6 @@ int main() {
8787
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
8888
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
8989

90-
test<double, 0, std::multiplies<double>>(1, 8, 256);
91-
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
92-
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
93-
9490
// Check with CUSTOM type.
9591
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
9692

sycl/test/reduction/reduction_nd_s0_rw.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,6 @@ int main() {
8989
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
9090
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9191

92-
test<double, 0, std::multiplies<double>>(1, 8, 256);
93-
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
94-
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
95-
9692
// Check with CUSTOM type.
9793
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
9894

sycl/test/reduction/reduction_nd_s1_dw.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,6 @@ int main() {
8888
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
8989
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9090

91-
test<double, 1, std::multiplies<double>>(1, 8, 256);
92-
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
93-
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
94-
9591
// Check with CUSTOM type.
9692
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
9793

sycl/test/reduction/reduction_nd_s1_rw.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,6 @@ int main() {
9090
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
9191
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9292

93-
test<double, 1, std::multiplies<double>>(1, 8, 256);
94-
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
95-
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
96-
9793
// Check with CUSTOM type.
9894
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
9995

sycl/test/reduction/reduction_transparent.cpp

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,8 @@
33

44
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
55
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6-
7-
// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
8-
// for 'cl::sycl::half' type.
9-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
10-
// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
11-
// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out
6+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
7+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
128

139
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
1410
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
@@ -114,13 +110,9 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
114110

115111
int main() {
116112
#if __cplusplus >= 201402L
117-
test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
113+
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
118114
test<signed char, 0, intel::plus<>>(0, 7, 49);
119115
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
120-
#ifndef SKIP_FOR_HALF
121-
test<half, 1, intel::plus<>>(0, 4, 8);
122-
test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
123-
#endif // SKIP_FOR_HALF
124116
#endif // __cplusplus >= 201402L
125117

126118
std::cout << "Test passed\n";

sycl/test/reduction/reduction_utils.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct CustomVecPlus {
5656
template <typename T>
5757
T getMinimumFPValue() {
5858
return std::numeric_limits<T>::has_infinity
59-
? -std::numeric_limits<T>::infinity()
59+
? static_cast<T>(-std::numeric_limits<T>::infinity())
6060
: std::numeric_limits<T>::lowest();
6161
}
6262

0 commit comments

Comments
 (0)