Skip to content

Commit 92e6226

Browse files
authored
[SYCL] Fix types and transparent functors recognition in reduction (#1709)
* [SYCL] Fix types and transparent functors recognition in reduction 1. Enable operator*, operator+, operator|, operator&, operator^= for corresponding transparent functors used in reduction. 2. Fixed the case when reduction object is passed to parallel_for an R-value. 3. Allow identity-less constructors for reductions with transparent functors. 4. Replaced some 'auto' declarations with Reduction::result_type and added intermediate assignments/casts to avoid type ambiguities caused by using sycl::half type, and which may also be caused by custom/user types as well. 5. Fix statically known identity values for MIN/MAX reductions Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent ea2f012 commit 92e6226

11 files changed

+269
-163
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,7 @@ class __SYCL_EXPORT handler {
853853
int Dims, typename Reduction>
854854
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
855855
Reduction::has_fast_atomics>
856-
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
856+
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
857857
if (Reduction::is_usm)
858858
Redu.associateWithHandler(*this);
859859
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
@@ -886,7 +886,7 @@ class __SYCL_EXPORT handler {
886886
int Dims, typename Reduction>
887887
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
888888
Reduction::has_fast_atomics>
889-
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
889+
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
890890
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
891891
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
892892
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
@@ -920,7 +920,7 @@ class __SYCL_EXPORT handler {
920920
template <typename KernelName = detail::auto_name, typename KernelType,
921921
int Dims, typename Reduction>
922922
detail::enable_if_t<!Reduction::has_fast_atomics>
923-
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
923+
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
924924
size_t NWorkGroups = Range.get_group_range().size();
925925

926926
// This parallel_for() is lowered to the following sequence:

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 147 additions & 80 deletions
Large diffs are not rendered by default.

sycl/test/reduction/reduction_ctor.cpp

Lines changed: 5 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,15 @@
11
// RUN: %clangxx -fsycl %s -o %t.out
22
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
3-
//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
4-
//
5-
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6-
// See https://llvm.org/LICENSE.txt for license information.
7-
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8-
//
9-
//===----------------------------------------------------------------------===//
103

114
// This performs basic checks such as reduction creation, getIdentity() method,
125
// and the combine() method of the aux class 'reducer'.
136

7+
#include "reduction_utils.hpp"
148
#include <CL/sycl.hpp>
159
#include <cassert>
1610

1711
using namespace cl::sycl;
1812

19-
template <typename T, class BinaryOperation, int N>
20-
struct init_data_t {
21-
void initInputData(T IdentityVal,
22-
buffer<T, 1> &InBuf,
23-
T &ExpectedReduValue) {
24-
ExpectedReduValue = IdentityVal;
25-
BinaryOperation Op;
26-
auto In = InBuf.template get_access<access::mode::write>();
27-
for (int I = 0; I < N; ++I) {
28-
In[I] = ((I + 1) % 5) + 1;
29-
ExpectedReduValue = Op(ExpectedReduValue, In[I]);
30-
}
31-
}
32-
};
33-
34-
template <typename T, int N>
35-
struct init_data_t<T, std::multiplies<T>, N> {
36-
void initInputData(T IdentityVal, buffer<T, 1> &InBuf, T &ExpectedReduValue) {
37-
ExpectedReduValue = IdentityVal;
38-
std::multiplies<T> Op;
39-
auto In = InBuf.template get_access<access::mode::write>();
40-
for (int I = 0; I < N; ++I) {
41-
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
42-
ExpectedReduValue = Op(ExpectedReduValue, In[I]);
43-
}
44-
}
45-
};
4613

4714
template <typename T, typename Reduction>
4815
void test_reducer(Reduction &Redu, T A, T B) {
@@ -157,15 +124,13 @@ int main() {
157124
testBoth<int, intel::bit_or<int>>(0, 1, 8);
158125
testBoth<int, intel::bit_xor<int>>(0, 7, 3);
159126
testBoth<int, intel::bit_and<int>>(~0, 7, 3);
160-
testBoth<int, intel::minimum<int>>(std::numeric_limits<int>::max(), 7, 3);
161-
testBoth<int, intel::maximum<int>>(std::numeric_limits<int>::min(), 7, 3);
127+
testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
128+
testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
162129

163130
testBoth<float, intel::plus<float>>(0, 1, 7);
164131
testBoth<float, std::multiplies<float>>(1, 1, 7);
165-
testBoth<float, intel::minimum<float>>(
166-
std::numeric_limits<float>::max(), 7, 3);
167-
testBoth<float, intel::maximum<float>>(
168-
std::numeric_limits<float>::min(), 7, 3);
132+
testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
133+
testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
169134

170135
testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
171136
testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));

sycl/test/reduction/reduction_nd_s0_dw.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,17 +79,17 @@ int main() {
7979
test<int, 0, intel::bit_or<int>>(0, 8, 256);
8080
test<int, 0, intel::bit_xor<int>>(0, 8, 256);
8181
test<int, 0, intel::bit_and<int>>(~0, 8, 256);
82-
test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
83-
test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
82+
test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
83+
test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
8484

8585
// Check with various types.
8686
test<float, 0, std::multiplies<float>>(1, 8, 256);
87-
test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
88-
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
87+
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
88+
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
8989

9090
test<double, 0, std::multiplies<double>>(1, 8, 256);
91-
test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
92-
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
91+
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
92+
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
9393

9494
// Check with CUSTOM type.
9595
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);

sycl/test/reduction/reduction_nd_s0_rw.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,17 @@ int main() {
8181
test<int, 0, intel::bit_or<int>>(0, 8, 256);
8282
test<int, 0, intel::bit_xor<int>>(0, 8, 256);
8383
test<int, 0, intel::bit_and<int>>(~0, 8, 256);
84-
test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
85-
test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
84+
test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
85+
test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
8686

8787
// Check with various types.
8888
test<float, 0, std::multiplies<float>>(1, 8, 256);
89-
test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
90-
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
89+
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
90+
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9191

9292
test<double, 0, std::multiplies<double>>(1, 8, 256);
93-
test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
94-
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
93+
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
94+
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
9595

9696
// Check with CUSTOM type.
9797
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);

sycl/test/reduction/reduction_nd_s1_dw.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,17 @@ int main() {
8080
test<int, 1, intel::bit_or<int>>(0, 8, 256);
8181
test<int, 1, intel::bit_xor<int>>(0, 8, 256);
8282
test<int, 1, intel::bit_and<int>>(~0, 8, 256);
83-
test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
84-
test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
83+
test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
84+
test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
8585

8686
// Check with various types.
8787
test<float, 1, std::multiplies<float>>(1, 8, 256);
88-
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
89-
test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
88+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
89+
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9090

9191
test<double, 1, std::multiplies<double>>(1, 8, 256);
92-
test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
93-
test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
92+
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
93+
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
9494

9595
// Check with CUSTOM type.
9696
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);

sycl/test/reduction/reduction_nd_s1_rw.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,17 +82,17 @@ int main() {
8282
test<int, 1, intel::bit_or<int>>(0, 8, 256);
8383
test<int, 1, intel::bit_xor<int>>(0, 8, 256);
8484
test<int, 1, intel::bit_and<int>>(~0, 8, 256);
85-
test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
86-
test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
85+
test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
86+
test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
8787

8888
// Check with various types.
8989
test<float, 1, std::multiplies<float>>(1, 8, 256);
90-
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
91-
test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
90+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
91+
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
9292

9393
test<double, 1, std::multiplies<double>>(1, 8, 256);
94-
test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
95-
test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
94+
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
95+
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
9696

9797
// Check with CUSTOM type.
9898
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);

sycl/test/reduction/reduction_placeholder.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ int main() {
7272
test<int, 1, intel::bit_or<int>>(0, 4, 128);
7373

7474
// fast reduce
75-
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 5, 5 * 7);
76-
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 4, 128);
75+
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
76+
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
7777

7878
// generic algorithm
7979
test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);

sycl/test/reduction/reduction_transparent.cpp

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
// UNSUPPORTED: cuda
2+
// Reductions use work-group builtins not yet supported by CUDA.
3+
14
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2-
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
3-
// RUN: %CPU_RUN_PLACEHOLDER %t.out
45
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5-
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6+
7+
// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
8+
// for 'cl::sycl::half' type.
9+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
10+
// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
11+
// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out
12+
13+
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
14+
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
615

716
// This test performs basic checks of parallel_for(nd_range, reduction, func)
817
// where func is a transparent functor.
@@ -14,10 +23,14 @@
1423
using namespace cl::sycl;
1524

1625
template <typename T, int Dim, class BinaryOperation>
17-
class SomeClass;
26+
class SomeIdClass;
27+
template <typename T, int Dim, class BinaryOperation>
28+
class SomeNoIdClass;
1829

30+
// Checks reductions initialized with transparent functor and explicitly set
31+
// identity value.
1932
template <typename T, int Dim, class BinaryOperation>
20-
void test(T Identity, size_t WGSize, size_t NWItems) {
33+
void testId(T Identity, size_t WGSize, size_t NWItems) {
2134
buffer<T, 1> InBuf(NWItems);
2235
buffer<T, 1> OutBuf(1);
2336

@@ -32,13 +45,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
3245
auto In = InBuf.template get_access<access::mode::read>(CGH);
3346
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
3447
Out(OutBuf, CGH);
35-
auto Redu = intel::reduction(Out, Identity, BOp);
3648

3749
range<1> GlobalRange(NWItems);
3850
range<1> LocalRange(WGSize);
3951
nd_range<1> NDRange(GlobalRange, LocalRange);
40-
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
41-
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
52+
CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
53+
NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
4254
Sum.combine(In[NDIt.get_global_linear_id()]);
4355
});
4456
});
@@ -54,12 +66,62 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
5466
}
5567
}
5668

69+
// Checks reductions initialized with transparent functor and identity
70+
// value not explicitly specified. The parameter 'Identity' is passed here
71+
// only to pre-initialize input data correctly.
72+
template <typename T, int Dim, class BinaryOperation>
73+
void testNoId(T Identity, size_t WGSize, size_t NWItems) {
74+
buffer<T, 1> InBuf(NWItems);
75+
buffer<T, 1> OutBuf(1);
76+
77+
// Initialize.
78+
BinaryOperation BOp;
79+
T CorrectOut;
80+
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
81+
82+
// Compute.
83+
queue Q;
84+
Q.submit([&](handler &CGH) {
85+
auto In = InBuf.template get_access<access::mode::read>(CGH);
86+
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
87+
Out(OutBuf, CGH);
88+
89+
range<1> GlobalRange(NWItems);
90+
range<1> LocalRange(WGSize);
91+
nd_range<1> NDRange(GlobalRange, LocalRange);
92+
CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
93+
NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
94+
Sum.combine(In[NDIt.get_global_linear_id()]);
95+
});
96+
});
97+
98+
// Check correctness.
99+
auto Out = OutBuf.template get_access<access::mode::read>();
100+
T ComputedOut = *(Out.get_pointer());
101+
if (ComputedOut != CorrectOut) {
102+
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
103+
std::cout << "Computed value: " << ComputedOut
104+
<< ", Expected value: " << CorrectOut << "\n";
105+
assert(0 && "Wrong value.");
106+
}
107+
}
108+
109+
template <typename T, int Dim, class BinaryOperation>
110+
void test(T Identity, size_t WGSize, size_t NWItems) {
111+
testId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
112+
testNoId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
113+
}
114+
57115
int main() {
58116
#if __cplusplus >= 201402L
59-
test<double, 0, intel::maximum<>>(std::numeric_limits<double>::min(), 7, 7 * 5);
117+
test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
60118
test<signed char, 0, intel::plus<>>(0, 7, 49);
61119
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
62-
#endif
120+
#ifndef SKIP_FOR_HALF
121+
test<half, 1, intel::plus<>>(0, 4, 8);
122+
test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
123+
#endif // SKIP_FOR_HALF
124+
#endif // __cplusplus >= 201402L
63125

64126
std::cout << "Test passed\n";
65127
return 0;

sycl/test/reduction/reduction_usm.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,8 @@ int main() {
113113
testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);
114114

115115
// fast reduce
116-
testUSM<float, 1, intel::minimum<float>>(
117-
(std::numeric_limits<float>::max)(), 5, 5 * 7);
118-
testUSM<float, 0, intel::maximum<float>>(
119-
(std::numeric_limits<float>::min)(), 4, 128);
116+
testUSM<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
117+
testUSM<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
120118

121119
// generic algorithm
122120
testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);

sycl/test/reduction/reduction_utils.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,17 @@ struct CustomVecPlus {
5252
return CV(A.X + B.X, A.Y + B.Y);
5353
}
5454
};
55+
56+
template <typename T>
57+
T getMinimumFPValue() {
58+
return std::numeric_limits<T>::has_infinity
59+
? -std::numeric_limits<T>::infinity()
60+
: std::numeric_limits<T>::lowest();
61+
}
62+
63+
template <typename T>
64+
T getMaximumFPValue() {
65+
return std::numeric_limits<T>::has_infinity
66+
? std::numeric_limits<T>::infinity()
67+
: (std::numeric_limits<T>::max)();
68+
}

0 commit comments

Comments
 (0)