Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 89d93f3

Browse files
authored
[SYCL] Add tests for SYCL2020 queue::parallel_for() shortcuts (#405)
* [SYCL] Add tests for SYCL2020 queue::parallel_for() shortcuts using reductions Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 0640465 commit 89d93f3

File tree

4 files changed

+344
-60
lines changed

4 files changed

+344
-60
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
6+
// `Group algorithms are not supported on host device.` on NVidia.
7+
// XFAIL: hip_nvidia
8+
9+
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
10+
// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and
11+
// barrier()
12+
13+
// This test only checks that the method queue::parallel_for() accepting
14+
// reduction, can be properly translated into queue::submit + parallel_for().
15+
16+
#include "reduction_utils.hpp"
17+
18+
using namespace sycl;
19+
20+
template <typename T, int I> class KName;
21+
22+
enum TestCase { NoDependencies, Dependency, DependenciesVector };
23+
24+
// Allocates 'Size' elements of shared USM on the device behind 'Q'.
// Returns nullptr when the device does not support shared USM allocations,
// letting the caller skip the test gracefully.
template <typename T> T *allocUSM(queue &Q, size_t Size) {
  if (Q.get_device().has(getUSMAspect(usm::alloc::shared)))
    return malloc_shared<T>(Size, Q);

  return nullptr;
}
30+
31+
// Prints a start/error label for the multi-variable reduction tests.
// By default writes a "Start" line to std::cout; with ToCERR=true writes an
// "Error" prefix to std::cerr and intentionally omits the trailing newline so
// the caller can append failure details on the same line.
template <typename RangeT>
void printNVarsTestLabel(bool IsSYCL2020, const RangeT &Range,
                         bool ToCERR = false) {
  std::ostream &OS = ToCERR ? std::cerr : std::cout;
  OS << (ToCERR ? "Error" : "Start")
     << ": Mode=" << (IsSYCL2020 ? std::string("SYCL2020") : std::string("ONEAPI  "))
     << ", Range=" << Range;
  if (!ToCERR)
    OS << std::endl;
}
41+
42+
// Runs one two-variable reduction through the SYCL2020 queue::parallel_for()
// shortcut, exercising one of three dependency-passing forms selected at
// compile time by TC: no dependencies, a single event, or a vector of events.
// Returns the number of mismatches found (0 on success or skip).
template <typename T1, typename T2, TestCase TC, int Dims, typename BOpT1,
          typename BOpT2>
int test(queue &Q, BOpT1 BOp1, BOpT2 BOp2, const nd_range<Dims> &Range) {
  printNVarsTestLabel(true /*SYCL2020*/, Range);

  size_t NElems = Range.get_global_range().size();
  // Two reduction result cells plus two per-work-item input arrays in
  // shared USM; any of these may be nullptr if shared USM is unsupported.
  T1 *Sum1 = allocUSM<T1>(Q, 1);
  T2 *Sum2 = allocUSM<T2>(Q, 1);
  T1 *Arr1 = allocUSM<T1>(Q, NElems);
  T2 *Arr2 = allocUSM<T2>(Q, NElems);
  if (!Sum1 || !Sum2 || !Arr1 || !Arr2) {
    // sycl::free is a no-op for nullptr, so unconditional cleanup is safe.
    sycl::free(Sum1, Q);
    sycl::free(Sum2, Q);
    sycl::free(Arr1, Q);
    sycl::free(Arr2, Q);
    std::cout << " SKIPPED due to unrelated problems with USM" << std::endl;
    return 0;
  }

  // R1 uses initialize_to_identity, so *Sum1 needs no host-side init.
  // R2 passes an explicit identity but no property, so the pre-existing
  // value of *Sum2 is combined into the result — hence the explicit zeroing.
  *Sum2 = 0;
  auto R1 = sycl::reduction(
      Sum1, BOp1, property_list(property::reduction::initialize_to_identity{}));
  auto R2 = sycl::reduction(Sum2, static_cast<T2>(0), BOp2);

  if constexpr (TC == TestCase::NoDependencies) {
    // Inputs filled on the host; no events passed to the shortcut.
    std::fill(Arr1, Arr1 + NElems, 1);
    std::fill(Arr2, Arr2 + NElems, 2);
    Q.parallel_for(Range, R1, R2,
                   [=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
                     size_t LinId = It.get_global_linear_id();
                     Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
                     Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
                   })
        .wait();
  } else if constexpr (TC == TestCase::Dependency) {
    // Inputs filled by a device task; its event is passed as a dependency.
    auto E = Q.single_task([=]() {
      std::fill(Arr1, Arr1 + NElems, 1);
      std::fill(Arr2, Arr2 + NElems, 2);
    });
    Q.parallel_for(Range, E, R1, R2,
                   [=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
                     size_t LinId = It.get_global_linear_id();
                     Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
                     Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
                   })
        .wait();
  } else {
    // Two independent fill tasks; both events passed via a std::vector.
    auto E1 = Q.single_task([=]() { std::fill(Arr1, Arr1 + NElems, 1); });
    auto E2 = Q.single_task([=]() { std::fill(Arr2, Arr2 + NElems, 2); });
    std::vector<event> EVec{E1, E2};
    Q.parallel_for(Range, EVec, R1, R2,
                   [=](nd_item<Dims> It, auto &Sum1, auto &Sum2) {
                     size_t LinId = It.get_global_linear_id();
                     Sum1.combine(static_cast<T1>(LinId) + Arr1[LinId]);
                     Sum2.combine(static_cast<T2>(LinId) + Arr2[LinId]);
                   })
        .wait();
  }

  // Each work item contributes (LinId + fill value); sum of LinId over all
  // items is (NElems - 1) * NElems / 2, plus NElems * fill (1 resp. 2).
  T1 ExpectedSum1 = NElems + (NElems - 1) * NElems / 2;
  T2 ExpectedSum2 = 2 * NElems + (NElems - 1) * NElems / 2;
  std::string AddInfo = "TestCase=";
  int Error = checkResults(Q, true /*SYCL2020*/, BOp1, Range, *Sum1,
                           ExpectedSum1, AddInfo + std::to_string(1));
  Error += checkResults(Q, true /*SYCL2020*/, BOp2, Range, *Sum2, ExpectedSum2,
                        AddInfo + std::to_string(2));

  sycl::free(Sum1, Q);
  sycl::free(Sum2, Q);
  sycl::free(Arr1, Q);
  sycl::free(Arr2, Q);
  return Error;
}
115+
116+
template <typename T1, typename T2, int Dims, typename BinaryOperation1,
117+
typename BinaryOperation2>
118+
int tests(queue &Q, BinaryOperation1 BOp1, BinaryOperation2 BOp2,
119+
const nd_range<Dims> &Range) {
120+
int NumErrors = 0;
121+
NumErrors += test<T1, T2, TestCase::NoDependencies>(Q, BOp1, BOp2, Range);
122+
NumErrors += test<T1, T2, TestCase::Dependency>(Q, BOp1, BOp2, Range);
123+
NumErrors += test<T1, T2, TestCase::DependenciesVector>(Q, BOp1, BOp2, Range);
124+
return NumErrors;
125+
}
126+
127+
int main() {
  queue Q;
  printDeviceInfo(Q);

  // Generic addition for the second reduction variable; the first one uses
  // std::plus<> so both functor kinds are exercised.
  auto SumOp = [](auto X, auto Y) { return (X + Y); };

  // Cover 1-, 2- and 3-dimensional nd_ranges.
  int NumErrors = 0;
  NumErrors += tests<int, short>(Q, std::plus<>{}, SumOp, nd_range<1>{32, 16});
  NumErrors += tests<int, short>(Q, std::plus<>{}, SumOp,
                                 nd_range<2>{range<2>{4, 4}, range<2>{2, 2}});
  NumErrors +=
      tests<int, short>(Q, std::plus<>{}, SumOp,
                        nd_range<3>{range<3>{4, 4, 3}, range<3>{1, 2, 3}});

  printFinalStatus(NumErrors);
  return NumErrors;
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
6+
// `Group algorithms are not supported on host device.` on NVidia.
7+
// XFAIL: hip_nvidia
8+
9+
// RUNx: %HOST_RUN_PLACEHOLDER %t.out
10+
// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and
11+
// barrier()
12+
13+
// This test only checks that the method queue::parallel_for() accepting
14+
// reduction, can be properly translated into queue::submit + parallel_for().
15+
16+
#include "reduction_utils.hpp"
17+
18+
using namespace sycl;
19+
20+
template <typename T, int I> class KName;
21+
22+
enum TestCase { NoDependencies, Dependency, DependenciesVector };
23+
24+
// Returns a shared-USM allocation of 'Size' elements of T on Q's device,
// or nullptr if the device lacks the shared-USM aspect.
template <typename T> T *allocUSM(queue &Q, size_t Size) {
  const bool HasSharedUSM =
      Q.get_device().has(getUSMAspect(usm::alloc::shared));
  return HasSharedUSM ? malloc_shared<T>(Size, Q) : nullptr;
}
30+
31+
template <typename T, TestCase TC, int Dims, typename BinaryOperation>
32+
int test(queue &Q, BinaryOperation BOp, const nd_range<Dims> &Range) {
33+
printTestLabel<T, BinaryOperation>(true /*SYCL2020*/, Range);
34+
35+
size_t NElems = Range.get_global_range().size();
36+
T *Sum = allocUSM<T>(Q, 1);
37+
T *Arr = allocUSM<T>(Q, NElems);
38+
if (!Sum || !Arr) {
39+
std::cout << " SKIPPED due to unrelated problems with USM" << std::endl;
40+
sycl::free(Sum, Q);
41+
sycl::free(Arr, Q);
42+
return 0;
43+
}
44+
45+
auto Redu = sycl::reduction(
46+
Sum, BOp, property_list(property::reduction::initialize_to_identity{}));
47+
if constexpr (TC == TestCase::NoDependencies) {
48+
std::fill(Arr, Arr + NElems, 1);
49+
Q.parallel_for(Range, Redu, [=](nd_item<Dims> It, auto &Sum) {
50+
size_t LinId = It.get_global_linear_id();
51+
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
52+
}).wait();
53+
} else if constexpr (TC == TestCase::Dependency) {
54+
auto E = Q.single_task([=]() { std::fill(Arr, Arr + NElems, 1); });
55+
Q.parallel_for(Range, E, Redu, [=](nd_item<Dims> It, auto &Sum) {
56+
size_t LinId = It.get_global_linear_id();
57+
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
58+
}).wait();
59+
} else {
60+
auto E = Q.single_task([=]() { std::fill(Arr, Arr + NElems, 1); });
61+
std::vector<event> EVec{E};
62+
Q.parallel_for(Range, EVec, Redu, [=](nd_item<Dims> It, auto &Sum) {
63+
size_t LinId = It.get_global_linear_id();
64+
Sum.combine(static_cast<T>(LinId) + Arr[LinId]);
65+
}).wait();
66+
}
67+
68+
T ExpectedSum = NElems + (NElems - 1) * NElems / 2;
69+
int Error = checkResults(Q, true /*SYCL2020*/, BOp, Range, *Sum, ExpectedSum);
70+
free(Sum, Q);
71+
free(Arr, Q);
72+
return Error;
73+
}
74+
75+
template <typename T, int Dims, typename BinaryOperation>
76+
int tests(queue &Q, BinaryOperation BOp, const nd_range<Dims> &Range) {
77+
int NumErrors = 0;
78+
NumErrors += test<T, TestCase::NoDependencies>(Q, BOp, Range);
79+
NumErrors += test<T, TestCase::Dependency>(Q, BOp, Range);
80+
NumErrors += test<T, TestCase::DependenciesVector>(Q, BOp, Range);
81+
return NumErrors;
82+
}
83+
84+
int main() {
  queue Q;
  printDeviceInfo(Q);

  // Cover 1-, 2- and 3-dimensional nd_ranges with a plain std::plus<>.
  int NumErrors = 0;
  NumErrors += tests<int>(Q, std::plus<>{}, nd_range<1>{32, 16});
  NumErrors += tests<int>(Q, std::plus<>{},
                          nd_range<2>{range<2>{4, 4}, range<2>{2, 2}});
  NumErrors += tests<int>(Q, std::plus<>{},
                          nd_range<3>{range<3>{4, 4, 3}, range<3>{1, 2, 3}});

  printFinalStatus(NumErrors);
  return NumErrors;
}

SYCL/Reduction/reduction_queue_parallel_for.cpp

Lines changed: 0 additions & 60 deletions
This file was deleted.

0 commit comments

Comments (0)