@@ -26,8 +26,7 @@ auto createReduction(T *USMPtr, T Identity, BinaryOperation BOp) {
26
26
return ONEAPI::reduction (USMPtr, Identity, BOp);
27
27
}
28
28
29
- template <typename Name, bool IsSYCL2020Mode, typename T, int Dim,
30
- class BinaryOperation >
29
+ template <typename Name, bool IsSYCL2020Mode, typename T, class BinaryOperation >
31
30
void test (T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
32
31
queue Q;
33
32
auto Dev = Q.get_device ();
@@ -46,11 +45,10 @@ void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
46
45
if (ReduVarPtr == nullptr )
47
46
return ;
48
47
if (AllocType == usm::alloc::device) {
49
- event E = Q.submit ([&](handler &CGH) {
50
- CGH.single_task <KernelNameGroup<Name, class Init >>(
51
- [=]() { *ReduVarPtr = Identity; });
52
- });
53
- E.wait ();
48
+ Q.submit ([&](handler &CGH) {
49
+ CGH.single_task <KernelNameGroup<Name, class Init >>(
50
+ [=]() { *ReduVarPtr = Identity; });
51
+ }).wait ();
54
52
} else {
55
53
*ReduVarPtr = Identity;
56
54
}
@@ -64,26 +62,24 @@ void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
64
62
65
63
// Compute.
66
64
Q.submit ([&](handler &CGH) {
67
- auto In = InBuf.template get_access <access::mode::read>(CGH);
68
- auto Redu = createReduction<IsSYCL2020Mode>(ReduVarPtr, Identity, BOp);
69
- range<1 > GlobalRange (NWItems);
70
- range<1 > LocalRange (WGSize);
71
- nd_range<1 > NDRange (GlobalRange, LocalRange);
72
- CGH.parallel_for <KernelNameGroup<Name, class Test >>(
73
- NDRange, Redu, [=](nd_item<1 > NDIt, auto &Sum) {
74
- Sum.combine (In[NDIt.get_global_linear_id ()]);
75
- });
76
- });
77
- Q.wait ();
65
+ auto In = InBuf.template get_access <access::mode::read>(CGH);
66
+ auto Redu = createReduction<IsSYCL2020Mode>(ReduVarPtr, Identity, BOp);
67
+ nd_range<1 > NDRange (range<1 >{NWItems}, range<1 >{WGSize});
68
+ CGH.parallel_for <KernelNameGroup<Name, class Test >>(
69
+ NDRange, Redu, [=](nd_item<1 > NDIt, auto &Sum) {
70
+ Sum.combine (In[NDIt.get_global_linear_id ()]);
71
+ });
72
+ }).wait ();
78
73
79
74
// Check correctness.
80
75
T ComputedOut;
81
76
if (AllocType == usm::alloc::device) {
82
77
buffer<T, 1 > Buf (&ComputedOut, range<1 >(1 ));
83
- event E = Q.submit ([&](handler &CGH) {
84
- auto OutAcc = Buf.template get_access <access::mode::discard_write>(CGH);
85
- CGH.copy (ReduVarPtr, OutAcc);
86
- });
78
+ Q.submit ([&](handler &CGH) {
79
+ auto OutAcc = Buf.template get_access <access::mode::discard_write>(CGH);
80
+ CGH.single_task <KernelNameGroup<Name, class Check >>(
81
+ [=]() { OutAcc[0 ] = *ReduVarPtr; });
82
+ }).wait ();
87
83
ComputedOut = (Buf.template get_access <access::mode::read>())[0 ];
88
84
} else {
89
85
ComputedOut = *ReduVarPtr;
@@ -99,42 +95,41 @@ void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
99
95
free (ReduVarPtr, Q.get_context ());
100
96
}
101
97
102
- template <typename Name, typename T, int Dim, class BinaryOperation >
98
+ template <typename Name, typename T, class BinaryOperation >
103
99
void testUSM (T Identity, size_t WGSize, size_t NWItems) {
104
- test<KernelNameGroup<Name, class SharedCase >, false , T, Dim, BinaryOperation>(
100
+ test<KernelNameGroup<Name, class SharedCase >, false , T, BinaryOperation>(
105
101
Identity, WGSize, NWItems, usm::alloc::shared);
106
- test<KernelNameGroup<Name, class HostCase >, false , T, Dim, BinaryOperation>(
102
+ test<KernelNameGroup<Name, class HostCase >, false , T, BinaryOperation>(
107
103
Identity, WGSize, NWItems, usm::alloc::host);
108
- test<KernelNameGroup<Name, class DeviceCase >, false , T, Dim, BinaryOperation>(
104
+ test<KernelNameGroup<Name, class DeviceCase >, false , T, BinaryOperation>(
109
105
Identity, WGSize, NWItems, usm::alloc::device);
110
106
111
- test<KernelNameGroup<Name, class SharedCase2020 >, true , T, Dim,
112
- BinaryOperation>( Identity, WGSize, NWItems, usm::alloc::shared);
113
- test<KernelNameGroup<Name, class HostCase2020 >, true , T, Dim,
114
- BinaryOperation>( Identity, WGSize, NWItems, usm::alloc::host);
115
- test<KernelNameGroup<Name, class DeviceCase2020 >, true , T, Dim,
116
- BinaryOperation>( Identity, WGSize, NWItems, usm::alloc::device);
107
+ test<KernelNameGroup<Name, class SharedCase2020 >, true , T, BinaryOperation>(
108
+ Identity, WGSize, NWItems, usm::alloc::shared);
109
+ test<KernelNameGroup<Name, class HostCase2020 >, true , T, BinaryOperation>(
110
+ Identity, WGSize, NWItems, usm::alloc::host);
111
+ test<KernelNameGroup<Name, class DeviceCase2020 >, true , T, BinaryOperation>(
112
+ Identity, WGSize, NWItems, usm::alloc::device);
117
113
}
118
114
119
115
int main () {
120
116
// fast atomics and fast reduce
121
- testUSM<class AtomicReduce1 , int , 1 , ONEAPI::plus<int >>(0 , 49 , 49 * 5 );
122
- testUSM<class AtomicReduce2 , int , 0 , ONEAPI::plus<int >>(0 , 8 , 128 );
117
+ testUSM<class AtomicReduce1 , int , ONEAPI::plus<int >>(0 , 49 , 49 );
118
+ testUSM<class AtomicReduce2 , int , ONEAPI::plus<int >>(0 , 8 , 32 );
123
119
124
120
// fast atomics
125
- testUSM<class Atomic1 , int , 0 , ONEAPI::bit_or<int >>(0 , 7 , 7 * 3 );
126
- testUSM<class Atomic2 , int , 1 , ONEAPI::bit_or<int >>(0 , 4 , 128 );
121
+ testUSM<class Atomic1 , int , ONEAPI::bit_or<int >>(0 , 7 , 7 * 3 );
122
+ testUSM<class Atomic2 , int , ONEAPI::bit_or<int >>(0 , 4 , 32 );
127
123
128
124
// fast reduce
129
- testUSM<class Reduce1 , float , 1 , ONEAPI::minimum<float >>(
130
- getMaximumFPValue<float >(), 5 , 5 * 7 );
131
- testUSM<class Reduce2 , float , 0 , ONEAPI::maximum<float >>(
132
- getMinimumFPValue<float >(), 4 , 128 );
125
+ testUSM<class Reduce1 , float , ONEAPI::minimum<float >>(
126
+ getMaximumFPValue<float >(), 17 , 17 );
127
+ testUSM<class Reduce2 , float , ONEAPI::maximum<float >>(
128
+ getMinimumFPValue<float >(), 4 , 32 );
133
129
134
130
// generic algorithm
135
- testUSM<class Generic1 , int , 0 , std::multiplies<int >>(1 , 7 , 7 * 5 );
136
- testUSM<class Generic2 , int , 1 , std::multiplies<int >>(1 , 8 , 16 );
137
- testUSM<class Generic3 , CustomVec<short >, 0 , CustomVecPlus<short >>(
131
+ testUSM<class Generic1 , int , std::multiplies<int >>(1 , 7 , 7 );
132
+ testUSM<class Generic2 , CustomVec<short >, CustomVecPlus<short >>(
138
133
CustomVec<short >(0 ), 8 , 8 * 3 );
139
134
140
135
std::cout << " Test passed\n " ;
0 commit comments