Skip to content

Commit f1e2b91

Browse files
committed
[SYCL][Fusion] Take auxiliary resources from fused command groups
Reductions use auxiliary resources to handle reduction temporal buffers. Assign those auxiliary resources from each fused command group to the placeholder fusion event. This event will not be marked as completed after the fused reductions finish execution either if fusion is completed, cancelled or aborted. Test is updated to check every algorithm that is selected automatically by `sycl::reduction`, i.e., every supported algorithm. We also cover both cases (fusion taking and not taking place). Signed-off-by: Victor Perez <[email protected]>
1 parent 2c85e99 commit f1e2b91

File tree

9 files changed

+145
-68
lines changed

9 files changed

+145
-68
lines changed

sycl/source/detail/jit_compiler.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
667667
unsigned KernelIndex = 0;
668668
ParamList FusedParams;
669669
PromotionMap PromotedAccs;
670-
// TODO: Collect information about streams and auxiliary resources (which
671-
// contain reductions) and figure out how to fuse them.
670+
// TODO: Collect information about streams and figure out how
671+
// to fuse them.
672672
for (auto &RawCmd : InputKernels) {
673673
auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
674674
auto &CG = KernelCmd->getCG();

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1615,6 +1615,9 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
16151615
auto FusedKernelCmd =
16161616
std::make_unique<ExecCGCommand>(std::move(FusedCG), Queue);
16171617

1618+
// Inherit auxiliary resources from fused command groups
1619+
Scheduler::getInstance().takeAuxiliaryResources(FusedKernelCmd->getEvent(),
1620+
PlaceholderCmd->getEvent());
16181621
assert(PlaceholderCmd->MDeps.empty());
16191622
// Next, backwards iterate over all the commands in the fusion list and remove
16201623
// them from the graph to restore the state before starting fusion, so we can

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,11 @@ EventImplPtr Scheduler::addCG(
154154
for (const auto &StreamImplPtr : Streams) {
155155
StreamImplPtr->flush(NewEvent);
156156
}
157-
158-
if (!AuxiliaryResources.empty())
159-
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
160157
}
161158

159+
if (!AuxiliaryResources.empty())
160+
registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources));
161+
162162
return NewEvent;
163163
}
164164

@@ -558,10 +558,35 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
558558
}
559559
}
560560

561+
static void registerAuxiliaryResourcesNoLock(
562+
std::unordered_map<EventImplPtr, std::vector<std::shared_ptr<const void>>>
563+
&AuxiliaryResources,
564+
const EventImplPtr &Event,
565+
std::vector<std::shared_ptr<const void>> &&Resources) {
566+
std::vector<std::shared_ptr<const void>> &StoredResources =
567+
AuxiliaryResources[Event];
568+
StoredResources.insert(StoredResources.end(),
569+
std::make_move_iterator(Resources.begin()),
570+
std::make_move_iterator(Resources.end()));
571+
}
572+
573+
void Scheduler::takeAuxiliaryResources(const EventImplPtr &Dst,
574+
const EventImplPtr &Src) {
575+
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
576+
auto Iter = MAuxiliaryResources.find(Src);
577+
if (Iter == MAuxiliaryResources.end()) {
578+
return;
579+
}
580+
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Dst,
581+
std::move(Iter->second));
582+
MAuxiliaryResources.erase(Iter);
583+
}
584+
561585
void Scheduler::registerAuxiliaryResources(
562586
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources) {
563587
std::unique_lock<std::mutex> Lock{MAuxiliaryResourcesMutex};
564-
MAuxiliaryResources.insert({Event, std::move(Resources)});
588+
registerAuxiliaryResourcesNoLock(MAuxiliaryResources, Event,
589+
std::move(Resources));
565590
}
566591

567592
void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {

sycl/source/detail/scheduler/scheduler.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,8 @@ class Scheduler {
543543
bool ShouldEnqueue;
544544
};
545545

546+
/// Assign \p Src's auxiliary resources to \p Dst.
547+
void takeAuxiliaryResources(const EventImplPtr &Dst, const EventImplPtr &Src);
546548
void registerAuxiliaryResources(
547549
EventImplPtr &Event, std::vector<std::shared_ptr<const void>> Resources);
548550
void cleanupAuxiliaryResources(BlockingT Blocking);
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// RUN: %{build} -fsycl-embed-ir -o %t.out
2+
// RUN: %{run} %t.out
3+
// UNSUPPORTED: hip || cuda
4+
5+
#include "./reduction.hpp"
6+
7+
int main() {
8+
test<detail::reduction::strategy::group_reduce_and_last_wg_detection>();
9+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// RUN: %{build} -fsycl-embed-ir -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "./reduction.hpp"
5+
6+
int main() {
7+
test<detail::reduction::strategy::local_atomic_and_atomic_cross_wg>();
8+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// RUN: %{build} -fsycl-embed-ir -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "./reduction.hpp"
5+
6+
int main() { test<detail::reduction::strategy::range_basic>(); }

sycl/test-e2e/KernelFusion/Reduction/reduction.cpp

Lines changed: 0 additions & 62 deletions
This file was deleted.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Test fusion works with reductions. Some algorithms will lead to fusion being
2+
// cancelled in some devices. These should work properly anyway.
3+
4+
#include <sycl/sycl.hpp>
5+
6+
#include "../helpers.hpp"
7+
#include "sycl/detail/reduction_forward.hpp"
8+
9+
using namespace sycl;
10+
11+
constexpr inline size_t globalSize = 512;
12+
13+
template <detail::reduction::strategy Strategy> struct is_fusion_supported {
14+
constexpr static inline bool value =
15+
detail::reduction::strategy::group_reduce_and_last_wg_detection <=
16+
Strategy &&
17+
Strategy < detail::reduction::strategy::group_reduce_and_atomic_cross_wg;
18+
};
19+
20+
template <detail::reduction::strategy Strategy>
21+
constexpr inline bool is_fusion_supported_v =
22+
is_fusion_supported<Strategy>::value;
23+
24+
template <detail::reduction::strategy Strategy, bool Fuse>
25+
void test(nd_range<1> ndr) {
26+
static_assert(is_fusion_supported_v<Strategy>,
27+
"Testing unsupported algorithm");
28+
std::array<int, globalSize> data;
29+
int sumRes = 0;
30+
int maxRes = 0;
31+
32+
{
33+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
34+
35+
buffer<int> dataBuf{data};
36+
buffer<int> sumBuf{&sumRes, 1};
37+
buffer<int> maxBuf{&maxRes, 1};
38+
39+
ext::codeplay::experimental::fusion_wrapper fw{q};
40+
41+
fw.start_fusion();
42+
iota(q, dataBuf, 0);
43+
44+
q.submit([&](handler &cgh) {
45+
accessor in(dataBuf, cgh, read_only);
46+
auto sumRed = reduction(sumBuf, cgh, plus<>{},
47+
property::reduction::initialize_to_identity{});
48+
detail::reduction_parallel_for<detail::auto_name, Strategy>(
49+
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, sumRed,
50+
[=](nd_item<1> Item, auto &Red) {
51+
Red.combine(in[Item.get_global_id()]);
52+
});
53+
});
54+
55+
q.submit([&](handler &cgh) {
56+
accessor in(dataBuf, cgh, read_only);
57+
auto maxRed = reduction(maxBuf, cgh, maximum<>{},
58+
property::reduction::initialize_to_identity{});
59+
detail::reduction_parallel_for<detail::auto_name, Strategy>(
60+
cgh, ndr, ext::oneapi::experimental::empty_properties_t{}, maxRed,
61+
[=](nd_item<1> Item, auto &Red) {
62+
Red.combine(in[Item.get_global_id()]);
63+
});
64+
});
65+
66+
if constexpr (Fuse) {
67+
fw.complete_fusion(ext::codeplay::experimental::property::no_barriers{});
68+
} else {
69+
fw.cancel_fusion();
70+
}
71+
}
72+
73+
constexpr int expectedMax = globalSize - 1;
74+
constexpr int expectedSum = globalSize * expectedMax / 2;
75+
76+
assert(sumRes == expectedSum);
77+
assert(maxRes == expectedMax);
78+
}
79+
80+
template <detail::reduction::strategy Strategy> void test() {
81+
for (size_t localSize = 1; localSize <= globalSize; localSize *= 2) {
82+
nd_range<1> ndr{globalSize, localSize};
83+
test<Strategy, true>(ndr);
84+
test<Strategy, false>(ndr);
85+
}
86+
}

0 commit comments

Comments
 (0)