Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit f53c4a4

Browse files
committed
[SYCL] Improve reduction tests time by about 200x by re-using queue
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent f96a788; commit f53c4a4

14 files changed

+194
-344
lines changed

SYCL/Reduction/reduction_big_data.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,7 @@ size_t getSafeMaxWGSize(size_t MaxWGSize, size_t MemSize, size_t OneElemSize) {
2828
}
2929

3030
template <typename KernelName, typename T, int Dim, class BinaryOperation>
31-
void test(T Identity) {
32-
queue Q;
31+
void test(queue &Q, T Identity) {
3332
device Device = Q.get_device();
3433

3534
std::size_t MaxWGSize = Device.get_info<info::device::max_work_group_size>();
@@ -99,10 +98,11 @@ template <class T> struct BigCustomVecPlus {
9998
};
10099

101100
int main() {
102-
test<class Test1, float, 0, ONEAPI::maximum<>>(getMinimumFPValue<float>());
101+
queue Q;
102+
test<class Test1, float, 0, ONEAPI::maximum<>>(Q, getMinimumFPValue<float>());
103103

104104
using BCV = BigCustomVec<long long>;
105-
test<class Test2, BCV, 1, BigCustomVecPlus<long long>>(BCV(0));
105+
test<class Test2, BCV, 1, BigCustomVecPlus<long long>>(Q, BCV(0));
106106

107107
std::cout << "Test passed\n";
108108
return 0;

SYCL/Reduction/reduction_nd_N_vars.cpp

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ template <class Name, bool IsSYCL2020, typename T1, access::mode Mode1,
5050
typename T4, access::mode Mode4, class BinaryOperation1,
5151
class BinaryOperation2, class BinaryOperation3,
5252
class BinaryOperation4>
53-
int testOne(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
53+
int testOne(queue &Q, T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
5454
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
5555
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
5656
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
@@ -63,7 +63,6 @@ int testOne(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
6363
buffer<T2, 1> OutBuf2(1);
6464
buffer<T3, 1> OutBuf3(1);
6565

66-
queue Q;
6766
auto Dev = Q.get_device();
6867
if (AllocType4 == usm::alloc::shared &&
6968
!Dev.get_info<info::device::usm_shared_allocations>())
@@ -213,32 +212,34 @@ template <class Name, typename T1, access::mode Mode1, typename T2,
213212
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
214213
access::mode Mode4, class BinaryOperation1, class BinaryOperation2,
215214
class BinaryOperation3, class BinaryOperation4>
216-
int testBoth(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
215+
int testBoth(queue &Q, T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
217216
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
218217
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
219218
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
220219
usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
221220
int Error =
222221
testOne<KName<Name, false>, false, T1, Mode1, T2, Mode2, T3, Mode3, T4,
223-
Mode4>(IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
224-
IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
225-
AllocType4, NWorkItems, WGSize);
226-
227-
Error += testOne<KName<Name, true>, true, T1, Mode1, T2, Mode2, T3, Mode3, T4,
228-
Mode4>(IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2,
229-
BOp2, IdentityVal3, InitVal3, BOp3, IdentityVal4,
230-
InitVal4, BOp4, AllocType4, NWorkItems, WGSize);
222+
Mode4>(Q, IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2,
223+
BOp2, IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4,
224+
BOp4, AllocType4, NWorkItems, WGSize);
225+
226+
Error +=
227+
testOne<KName<Name, true>, true, T1, Mode1, T2, Mode2, T3, Mode3, T4,
228+
Mode4>(Q, IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2,
229+
BOp2, IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4,
230+
BOp4, AllocType4, NWorkItems, WGSize);
231231
return Error;
232232
}
233233

234234
int main() {
235+
queue Q;
235236
int Error = testBoth<class Case1, float, DW, int, RW, short, RW, int, RW>(
236-
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
237+
Q, 0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
237238
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, 16);
238239

239240
auto Add = [](auto x, auto y) { return (x + y); };
240241
Error += testBoth<class Case2, float, RW, int, RW, short, DW, int, DW>(
241-
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
242+
Q, 0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
242243
8000, std::plus<>{}, usm::alloc::device, 5 * (256 + 1), 5);
243244

244245
if (!Error)

SYCL/Reduction/reduction_nd_conditional.cpp

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ template <class T> struct VecPlus {
5656

5757
template <typename SpecializationKernelName, typename T, int Dim,
5858
class BinaryOperation>
59-
void test(T Identity, size_t WGSize, size_t NWItems) {
59+
void test(queue &Q, T Identity, size_t WGSize, size_t NWItems) {
6060
buffer<T, 1> InBuf(NWItems);
6161
buffer<T, 1> OutBuf(1);
6262

@@ -66,7 +66,6 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
6666
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
6767

6868
// Compute.
69-
queue Q;
7069
Q.submit([&](handler &CGH) {
7170
auto In = InBuf.template get_access<access::mode::read>(CGH);
7271
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
@@ -100,11 +99,11 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
10099
}
101100

102101
int main() {
103-
test<class KernelName_lAx, int, 0, ONEAPI::plus<int>>(0, 2, 2);
104-
test<class KernelName_eVBkBK, int, 1, ONEAPI::plus<int>>(0, 7, 7);
105-
test<class KernelName_vMSyszeYKJbaXATnPL, int, 0, ONEAPI::plus<int>>(0, 2,
106-
64);
107-
test<class KernelName_UPKnfG, short, 1, ONEAPI::plus<short>>(0, 16, 256);
102+
queue Q;
103+
test<class A, int, 0, ONEAPI::plus<int>>(Q, 0, 2, 2);
104+
test<class B, int, 1, ONEAPI::plus<int>>(Q, 0, 7, 7);
105+
test<class C, int, 0, ONEAPI::plus<int>>(Q, 0, 2, 64);
106+
test<class D, short, 1, ONEAPI::plus<short>>(Q, 0, 16, 256);
108107

109108
std::cout << "Test passed\n";
110109
return 0;

SYCL/Reduction/reduction_nd_ext_type.hpp

Lines changed: 20 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ constexpr access::mode DW = access::mode::discard_write;
1414

1515
template <typename Name, bool IsSYCL2020Mode, typename T, int Dim,
1616
access::mode Mode, class BinaryOperation>
17-
void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
17+
void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
1818
buffer<T, 1> InBuf(NWItems);
1919
buffer<T, 1> OutBuf(1);
2020

@@ -28,7 +28,6 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
2828
(OutBuf.template get_access<access::mode::write>())[0] = Init;
2929

3030
// Compute.
31-
queue Q;
3231
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
3332
if constexpr (IsSYCL2020Mode) {
3433
Q.submit([&](handler &CGH) {
@@ -69,38 +68,38 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
6968

7069
template <typename Name, typename T, int Dim, access::mode Mode,
7170
class BinaryOperation>
72-
void testBoth(T Identity, T Init, size_t WGSize, size_t NWItems) {
71+
void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
7372
test<KName<Name, false>, false, T, Dim, Mode, BinaryOperation>(
74-
Identity, Init, WGSize, NWItems);
73+
Q, Identity, Init, WGSize, NWItems);
7574

76-
test<KName<Name, true>, true, T, Dim, Mode, BinaryOperation>(Identity, Init,
77-
WGSize, NWItems);
75+
test<KName<Name, true>, true, T, Dim, Mode, BinaryOperation>(
76+
Q, Identity, Init, WGSize, NWItems);
7877
}
7978

8079
template <typename T> int runTests(const string_class &ExtensionName) {
81-
device D = default_selector().select_device();
80+
queue Q;
81+
device D = Q.get_device();
8282
if (!D.is_host() && !D.has_extension(ExtensionName)) {
8383
std::cout << "Test skipped\n";
8484
return 0;
8585
}
8686

87-
testBoth<class A, T, 1, RW, std::multiplies<T>>(1, 77, 4, 4);
87+
testBoth<class A, T, 1, RW, std::multiplies<T>>(Q, 1, 77, 4, 4);
8888

89-
testBoth<class B1, T, 0, DW, ONEAPI::plus<T>>(0, 77, 4, 64);
90-
testBoth<class B2, T, 1, RW, ONEAPI::plus<>>(0, 33, 3, 3 * 5);
89+
testBoth<class B1, T, 0, DW, ONEAPI::plus<T>>(Q, 0, 77, 4, 64);
90+
testBoth<class B2, T, 1, RW, ONEAPI::plus<>>(Q, 0, 33, 3, 3 * 5);
9191

92-
testBoth<class C1, T, 0, RW, ONEAPI::minimum<T>>(getMaximumFPValue<T>(),
92+
testBoth<class C1, T, 0, RW, ONEAPI::minimum<T>>(Q, getMaximumFPValue<T>(),
9393
-10.0, 7, 7);
94-
testBoth<class C2, T, 0, RW, ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 99.0,
95-
7, 7);
96-
testBoth<class C3, T, 1, DW, ONEAPI::minimum<>>(getMaximumFPValue<T>(), -99.0,
97-
3, 3);
98-
99-
testBoth<class D1, T, 0, DW, ONEAPI::maximum<>>(getMinimumFPValue<T>(), 99.0,
100-
3, 3);
101-
testBoth<class D2, T, 1, RW, ONEAPI::maximum<T>>(getMinimumFPValue<T>(), 99.0,
102-
7, 7 * 5);
103-
94+
testBoth<class C2, T, 0, RW, ONEAPI::minimum<T>>(Q, getMaximumFPValue<T>(),
95+
99.0, 7, 7);
96+
testBoth<class C3, T, 1, DW, ONEAPI::minimum<>>(Q, getMaximumFPValue<T>(),
97+
-99.0, 3, 3);
98+
99+
testBoth<class D1, T, 0, DW, ONEAPI::maximum<>>(Q, getMinimumFPValue<T>(),
100+
99.0, 3, 3);
101+
testBoth<class D2, T, 1, RW, ONEAPI::maximum<T>>(Q, getMinimumFPValue<T>(),
102+
99.0, 7, 7 * 5);
104103
std::cout << "Test passed\n";
105104
return 0;
106105
}

SYCL/Reduction/reduction_nd_s0_dw.cpp

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ struct CustomAllocator : public sycl::buffer_allocator {};
1919
template <typename T, bool B> class KName;
2020

2121
template <typename Name, bool IsSYCL2020Mode, typename T, class BinaryOperation>
22-
void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
22+
void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
2323
buffer<T, 1> InBuf(NWItems);
2424
buffer<T, 1, CustomAllocator> OutBuf(1);
2525

@@ -33,7 +33,7 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
3333
(OutBuf.template get_access<access::mode::write>())[0] = Init;
3434

3535
// Compute.
36-
queue Q;
36+
3737
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
3838
if constexpr (IsSYCL2020Mode) {
3939
Q.submit([&](handler &CGH) {
@@ -70,37 +70,39 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
7070
}
7171

7272
template <typename Name, typename T, class BinaryOperation>
73-
void testBoth(T Identity, T Init, size_t WGSize, size_t NWItems) {
74-
test<KName<Name, false>, false, T, BinaryOperation>(Identity, Init, WGSize,
75-
NWItems);
76-
test<KName<Name, true>, true, T, BinaryOperation>(Identity, Init, WGSize,
73+
void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
74+
test<KName<Name, false>, false, T, BinaryOperation>(Q, Identity, Init,
75+
WGSize, NWItems);
76+
test<KName<Name, true>, true, T, BinaryOperation>(Q, Identity, Init, WGSize,
7777
NWItems);
7878
}
7979

8080
int main() {
81+
queue Q;
82+
8183
// Check some non power-of-two work-group sizes.
82-
testBoth<class A1, int, ONEAPI::plus<int>>(0, 99, 1, 7);
83-
testBoth<class A2, int, ONEAPI::plus<int>>(0, 99, 49, 49 * 5);
84+
testBoth<class A1, int, ONEAPI::plus<int>>(Q, 0, 99, 1, 7);
85+
testBoth<class A2, int, ONEAPI::plus<int>>(Q, 0, 99, 49, 49 * 5);
8486

8587
// Try some power-of-two work-group sizes.
86-
testBoth<class B1, int, ONEAPI::plus<int>>(0, 99, 1, 32);
87-
testBoth<class B2, int, std::multiplies<int>>(1, 99, 4, 32);
88-
testBoth<class B4, int, ONEAPI::bit_xor<int>>(0, 99, 16, 256);
89-
testBoth<class B5, int, ONEAPI::bit_and<int>>(~0, 99, 32, 256);
88+
testBoth<class B1, int, ONEAPI::plus<int>>(Q, 0, 99, 1, 32);
89+
testBoth<class B2, int, std::multiplies<int>>(Q, 1, 99, 4, 32);
90+
testBoth<class B4, int, ONEAPI::bit_xor<int>>(Q, 0, 99, 16, 256);
91+
testBoth<class B5, int, ONEAPI::bit_and<int>>(Q, ~0, 99, 32, 256);
9092
testBoth<class B6, int, ONEAPI::minimum<int>>(
91-
(std::numeric_limits<int>::max)(), -99, 64, 256);
93+
Q, (std::numeric_limits<int>::max)(), -99, 64, 256);
9294
testBoth<class B7, int, ONEAPI::maximum<int>>(
93-
(std::numeric_limits<int>::min)(), 99, 128, 256);
94-
testBoth<class B8, int, ONEAPI::plus<>>(0, 99, 256, 256);
95+
Q, (std::numeric_limits<int>::min)(), 99, 128, 256);
96+
testBoth<class B8, int, ONEAPI::plus<>>(Q, 0, 99, 256, 256);
9597

9698
// Check with various types.
97-
testBoth<class C1, float, std::multiplies<>>(1, 99, 8, 256);
98-
testBoth<class C2, short, ONEAPI::minimum<>>(0x7fff, -99, 8, 256);
99-
testBoth<class C3, unsigned char, ONEAPI::maximum<>>(0, 99, 8, 256);
99+
testBoth<class C1, float, std::multiplies<>>(Q, 1, 99, 8, 256);
100+
testBoth<class C2, short, ONEAPI::minimum<>>(Q, 0x7fff, -99, 8, 256);
101+
testBoth<class C3, unsigned char, ONEAPI::maximum<>>(Q, 0, 99, 8, 256);
100102

101103
// Check with CUSTOM type.
102104
testBoth<class D1, CustomVec<long long>, CustomVecPlus<long long>>(
103-
CustomVec<long long>(0), CustomVec<long long>(99), 8, 256);
105+
Q, CustomVec<long long>(0), CustomVec<long long>(99), 8, 256);
104106

105107
std::cout << "Test passed\n";
106108
return 0;

SYCL/Reduction/reduction_nd_s0_rw.cpp

Lines changed: 20 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ struct CustomAllocator : public sycl::buffer_allocator {};
1919
template <typename T, bool B> class KName;
2020

2121
template <typename Name, bool IsSYCL2020Mode, typename T, class BinaryOperation>
22-
void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
22+
void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
2323
buffer<T, 1> InBuf(NWItems);
2424
buffer<T, 1, CustomAllocator> OutBuf(1);
2525

@@ -34,7 +34,6 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
3434
(OutBuf.template get_access<access::mode::write>())[0] = Init;
3535

3636
// Compute.
37-
queue Q;
3837
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
3938
if constexpr (IsSYCL2020Mode) {
4039
Q.submit([&](handler &CGH) {
@@ -70,38 +69,40 @@ void test(T Identity, T Init, size_t WGSize, size_t NWItems) {
7069
}
7170

7271
template <typename Name, typename T, class BinaryOperation>
73-
void testBoth(T Identity, T Init, size_t WGSize, size_t NWItems) {
74-
test<KName<Name, false>, false, T, BinaryOperation>(Identity, Init, WGSize,
72+
void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) {
73+
test<KName<Name, false>, false, T, BinaryOperation>(Q, Identity, Init, WGSize,
7574
NWItems);
76-
test<KName<Name, true>, true, T, BinaryOperation>(Identity, Init, WGSize,
75+
test<KName<Name, true>, true, T, BinaryOperation>(Q, Identity, Init, WGSize,
7776
NWItems);
7877
}
7978

8079
int main() {
80+
queue Q;
81+
8182
// Check non power-of-two work-group sizes.
82-
testBoth<class A1, int, ONEAPI::plus<int>>(0, 99, 1, 7);
83-
testBoth<class A2, int, ONEAPI::plus<int>>(0, -99, 49, 49 * 5);
83+
testBoth<class A1, int, ONEAPI::plus<int>>(Q, 0, 99, 1, 7);
84+
testBoth<class A2, int, ONEAPI::plus<int>>(Q, 0, -99, 49, 49 * 5);
8485

8586
// Try some power-of-two work-group sizes.
86-
testBoth<class B1, int, ONEAPI::plus<>>(0, 99, 2, 32);
87-
testBoth<class B2, int, ONEAPI::plus<>>(0, 199, 32, 32);
88-
testBoth<class B3, int, ONEAPI::plus<>>(0, 299, 128, 256);
89-
testBoth<class B4, int, ONEAPI::plus<>>(0, 399, 256, 256);
87+
testBoth<class B1, int, ONEAPI::plus<>>(Q, 0, 99, 2, 32);
88+
testBoth<class B2, int, ONEAPI::plus<>>(Q, 0, 199, 32, 32);
89+
testBoth<class B3, int, ONEAPI::plus<>>(Q, 0, 299, 128, 256);
90+
testBoth<class B4, int, ONEAPI::plus<>>(Q, 0, 399, 256, 256);
9091

9192
// Check with various operations and types.
92-
testBoth<class C1, int, std::multiplies<int>>(1, 2, 8, 256);
93-
testBoth<class C2, float, std::multiplies<float>>(1, 1.2, 8, 32);
94-
testBoth<class C3, short, ONEAPI::bit_or<>>(0, 0x3400, 4, 32);
95-
testBoth<class C4, int, ONEAPI::bit_xor<int>>(0, 0x12340000, 4, 32);
96-
testBoth<class C5, char, ONEAPI::bit_and<>>(~0, ~0, 4, 16);
93+
testBoth<class C1, int, std::multiplies<int>>(Q, 1, 2, 8, 256);
94+
testBoth<class C2, float, std::multiplies<float>>(Q, 1, 1.2, 8, 32);
95+
testBoth<class C3, short, ONEAPI::bit_or<>>(Q, 0, 0x3400, 4, 32);
96+
testBoth<class C4, int, ONEAPI::bit_xor<int>>(Q, 0, 0x12340000, 4, 32);
97+
testBoth<class C5, char, ONEAPI::bit_and<>>(Q, ~0, ~0, 4, 16);
9798
testBoth<class C6, int, ONEAPI::minimum<int>>(
98-
(std::numeric_limits<int>::max)(), -99, 8, 256);
99+
Q, (std::numeric_limits<int>::max)(), -99, 8, 256);
99100
testBoth<class C7, int, ONEAPI::maximum<float>>(
100-
(std::numeric_limits<int>::min)(), 99, 8, 256);
101+
Q, (std::numeric_limits<int>::min)(), 99, 8, 256);
101102

102103
// Check with CUSTOM type.
103104
testBoth<class D1, CustomVec<long long>, CustomVecPlus<long long>>(
104-
CustomVec<long long>(0), CustomVec<long long>(199), 8, 256);
105+
Q, CustomVec<long long>(0), CustomVec<long long>(199), 8, 256);
105106

106107
std::cout << "Test passed\n";
107108
return 0;

0 commit comments

Comments
 (0)