Skip to content

Commit 29e629e

Browse files
authored
[SYCL] Add tangle/opportunistic algorithms (#9220)
Enables the following functions to be used with tangle_group and opportunistic_group arguments: group_barrier, group_broadcast, any_of_group, all_of_group, none_of_group, reduce_over_group, exclusive_scan_over_group, and inclusive_scan_over_group.

A few quick notes to reviewers:

1) This implementation leverages the fact that it is undefined behavior to use a tangle group or opportunistic group in control flow that does not match the control flow at the point of construction, in order to avoid using a mask for most operations. I _think_ it is safe to call the NonUniform intrinsics directly, because they are already control-flow-aware.

2) In a few places, I've deliberately duplicated the implementation across tangle group and opportunistic group even though they're the same. I've done this primarily in an attempt to simplify @JackAKirk's efforts to merge in his CUDA implementation, because I expect that there may be some cases where the CUDA implementations of these groups _do_ diverge. If this turns out not to be true, we can tidy things up afterwards.

3) In general, tangle and opportunistic group are not the same thing. But I expect their behavior will be identical on all of the SPIR-V implementations that we're targeting.

Signed-off-by: John Pennycook <[email protected]>
1 parent df743a5 commit 29e629e

File tree

6 files changed

+395
-4
lines changed

6 files changed

+395
-4
lines changed

sycl/include/sycl/detail/spirv.hpp

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@ struct sub_group;
2727
namespace experimental {
2828
template <typename ParentGroup> class ballot_group;
2929
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
30+
template <typename ParentGroup> class tangle_group;
31+
class opportunistic_group;
3032
} // namespace experimental
3133
} // namespace oneapi
3234
} // namespace ext
@@ -72,6 +74,16 @@ struct group_scope<sycl::ext::oneapi::experimental::fixed_size_group<
7274
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
7375
};
7476

77+
template <typename ParentGroup>
78+
struct group_scope<sycl::ext::oneapi::experimental::tangle_group<ParentGroup>> {
79+
static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
80+
};
81+
82+
template <>
83+
struct group_scope<::sycl::ext::oneapi::experimental::opportunistic_group> {
84+
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
85+
};
86+
7587
// Generic shuffles and broadcasts may require multiple calls to
7688
// intrinsics, and should use the fewest broadcasts possible
7789
// - Loop over chunks until remaining bytes < chunk size
@@ -135,6 +147,16 @@ bool GroupAll(
135147
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
136148
static_cast<uint32_t>(pred), PartitionSize);
137149
}
150+
template <typename ParentGroup>
151+
bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
152+
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
153+
}
154+
template <typename Group>
155+
bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
156+
bool pred) {
157+
return __spirv_GroupNonUniformAll(
158+
group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
159+
}
138160

139161
template <typename Group> bool GroupAny(Group, bool pred) {
140162
return __spirv_GroupAny(group_scope<Group>::value, pred);
@@ -161,6 +183,15 @@ bool GroupAny(
161183
static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
162184
static_cast<uint32_t>(pred), PartitionSize);
163185
}
186+
template <typename ParentGroup>
187+
bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
188+
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
189+
}
190+
// GroupAny for opportunistic_group. The group always executes at sub-group
// scope, so the scope is fixed rather than derived from a parent group.
// `inline` added: this is a non-template function defined in a header, so
// without it every translation unit including this file would emit its own
// (conflicting) definition.
inline bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
                     bool pred) {
  return __spirv_GroupNonUniformAny(
      group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
}
164195

165196
// Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
166197
// FIXME: Do not special-case for half or vec once all backends support all data
@@ -281,6 +312,45 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
281312
return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
282313
OCLId);
283314
}
315+
template <typename ParentGroup, typename T, typename IdT>
316+
EnableIfNativeBroadcast<T, IdT>
317+
GroupBroadcast(ext::oneapi::experimental::tangle_group<ParentGroup> g, T x,
318+
IdT local_id) {
319+
// Remap local_id to its original numbering in ParentGroup.
320+
auto LocalId = detail::IdToMaskPosition(g, local_id);
321+
322+
// TODO: Refactor to avoid duplication after design settles.
323+
using GroupIdT = typename GroupId<ParentGroup>::type;
324+
GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
325+
using OCLT = detail::ConvertToOpenCLType_t<T>;
326+
using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
327+
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
328+
WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
329+
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
330+
331+
return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value, OCLX,
332+
OCLId);
333+
}
334+
template <typename T, typename IdT>
335+
EnableIfNativeBroadcast<T, IdT>
336+
GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
337+
IdT local_id) {
338+
// Remap local_id to its original numbering in sub-group
339+
auto LocalId = detail::IdToMaskPosition(g, local_id);
340+
341+
// TODO: Refactor to avoid duplication after design settles.
342+
using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
343+
GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
344+
using OCLT = detail::ConvertToOpenCLType_t<T>;
345+
using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
346+
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
347+
WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
348+
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
349+
350+
return __spirv_GroupNonUniformBroadcast(
351+
group_scope<ext::oneapi::experimental::opportunistic_group>::value, OCLX,
352+
OCLId);
353+
}
284354

285355
template <typename Group, typename T, typename IdT>
286356
EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
@@ -956,6 +1026,18 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
9561026
#endif
9571027
}
9581028

1029+
template <typename Group>
1030+
struct is_tangle_or_opportunistic_group : std::false_type {};
1031+
1032+
template <typename ParentGroup>
1033+
struct is_tangle_or_opportunistic_group<
1034+
sycl::ext::oneapi::experimental::tangle_group<ParentGroup>>
1035+
: std::true_type {};
1036+
1037+
template <>
1038+
struct is_tangle_or_opportunistic_group<
1039+
sycl::ext::oneapi::experimental::opportunistic_group> : std::true_type {};
1040+
9591041
// TODO: Refactor to avoid duplication after design settles
9601042
#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \
9611043
template <__spv::GroupOperation Op, typename Group, typename T> \
@@ -1037,6 +1119,24 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
10371119
} \
10381120
return tmp; \
10391121
} \
1122+
} \
1123+
template <__spv::GroupOperation Op, typename Group, typename T> \
1124+
inline typename std::enable_if_t< \
1125+
is_tangle_or_opportunistic_group<Group>::value, T> \
1126+
Group##Instruction(Group, T x) { \
1127+
using ConvertedT = detail::ConvertToOpenCLType_t<T>; \
1128+
\
1129+
using OCLT = std::conditional_t< \
1130+
std::is_same<ConvertedT, cl_char>() || \
1131+
std::is_same<ConvertedT, cl_short>(), \
1132+
cl_int, \
1133+
std::conditional_t<std::is_same<ConvertedT, cl_uchar>() || \
1134+
std::is_same<ConvertedT, cl_ushort>(), \
1135+
cl_uint, ConvertedT>>; \
1136+
OCLT Arg = x; \
1137+
OCLT Ret = __spirv_GroupNonUniform##Instruction( \
1138+
group_scope<Group>::value, static_cast<unsigned int>(Op), Arg); \
1139+
return Ret; \
10401140
}
10411141

10421142
__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)

sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ namespace ext::oneapi::experimental {
6464
// Forward declarations of non-uniform group types for algorithm definitions
6565
template <typename ParentGroup> class ballot_group;
6666
template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
67+
template <typename ParentGroup> class tangle_group;
68+
class opportunistic_group;
6769

6870
} // namespace ext::oneapi::experimental
6971

sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,16 @@ class opportunistic_group {
111111
#endif
112112
}
113113

114-
private:
114+
protected:
115115
sub_group_mask Mask;
116116

117-
protected:
118117
opportunistic_group(sub_group_mask m) : Mask(m) {}
119118

120119
friend opportunistic_group this_kernel::get_opportunistic_group();
120+
121+
friend uint32_t
122+
sycl::detail::IdToMaskPosition<opportunistic_group>(opportunistic_group Group,
123+
uint32_t Id);
121124
};
122125

123126
namespace this_kernel {
@@ -144,5 +147,10 @@ template <>
144147
struct is_user_constructed_group<opportunistic_group> : std::true_type {};
145148

146149
} // namespace ext::oneapi::experimental
150+
151+
template <>
152+
struct is_group<ext::oneapi::experimental::opportunistic_group>
153+
: std::true_type {};
154+
147155
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
148156
} // namespace sycl

sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,15 @@ template <typename ParentGroup> class tangle_group {
112112
#endif
113113
}
114114

115-
private:
115+
protected:
116116
sub_group_mask Mask;
117117

118-
protected:
119118
tangle_group(sub_group_mask m) : Mask(m) {}
120119

121120
friend tangle_group<ParentGroup> get_tangle_group<ParentGroup>(ParentGroup);
121+
122+
friend uint32_t sycl::detail::IdToMaskPosition<tangle_group<ParentGroup>>(
123+
tangle_group<ParentGroup> Group, uint32_t Id);
122124
};
123125

124126
template <typename Group>
@@ -149,5 +151,10 @@ template <typename ParentGroup>
149151
struct is_user_constructed_group<tangle_group<ParentGroup>> : std::true_type {};
150152

151153
} // namespace ext::oneapi::experimental
154+
155+
template <typename ParentGroup>
156+
struct is_group<ext::oneapi::experimental::tangle_group<ParentGroup>>
157+
: std::true_type {};
158+
152159
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
153160
} // namespace sycl
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
//
4+
// UNSUPPORTED: cpu || cuda || hip
5+
6+
#include <sycl/sycl.hpp>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>
8+
namespace syclex = sycl::ext::oneapi::experimental;
9+
10+
class TestKernel;
11+
12+
constexpr uint32_t SGSize = 32;
13+
constexpr uint32_t ArbitraryItem = 5;
14+
15+
int main() {
16+
sycl::queue Q;
17+
18+
auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
19+
if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) {
20+
std::cout << "Test skipped due to missing support for sub-group size 32."
21+
<< std::endl;
22+
return 0;
23+
}
24+
25+
sycl::buffer<size_t, 1> TmpBuf{sycl::range{SGSize}};
26+
sycl::buffer<bool, 1> BarrierBuf{sycl::range{SGSize}};
27+
sycl::buffer<bool, 1> BroadcastBuf{sycl::range{SGSize}};
28+
sycl::buffer<bool, 1> AnyBuf{sycl::range{SGSize}};
29+
sycl::buffer<bool, 1> AllBuf{sycl::range{SGSize}};
30+
sycl::buffer<bool, 1> NoneBuf{sycl::range{SGSize}};
31+
sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
32+
sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
33+
sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};
34+
35+
const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
36+
Q.submit([&](sycl::handler &CGH) {
37+
sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
38+
sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
39+
sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
40+
sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
41+
sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
42+
sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
43+
sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
44+
sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
45+
sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
46+
const auto KernelFunc =
47+
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
48+
auto WI = item.get_global_id();
49+
auto SG = item.get_sub_group();
50+
51+
uint32_t OriginalLID = SG.get_local_linear_id();
52+
53+
// Given the dynamic nature of opportunistic groups, the simplest
54+
// case we can reason about is a single work-item. This isn't a very
55+
// robust test, but choosing an arbitrary work-item (i.e. rather
56+
// than the leader) should test an implementation's ability to handle
57+
// arbitrary group membership.
58+
if (OriginalLID == ArbitraryItem) {
59+
auto OpportunisticGroup =
60+
syclex::this_kernel::get_opportunistic_group();
61+
62+
// This is trivial, but does test that group_barrier can be called.
63+
TmpAcc[WI] = 1;
64+
sycl::group_barrier(OpportunisticGroup);
65+
size_t Visible = TmpAcc[WI];
66+
BarrierAcc[WI] = (Visible == 1);
67+
68+
// Simple check of group algorithms.
69+
uint32_t LID = OpportunisticGroup.get_local_linear_id();
70+
71+
uint32_t BroadcastResult =
72+
sycl::group_broadcast(OpportunisticGroup, OriginalLID, 0);
73+
BroadcastAcc[WI] = (BroadcastResult == OriginalLID);
74+
75+
bool AnyResult = sycl::any_of_group(OpportunisticGroup, (LID == 0));
76+
AnyAcc[WI] = (AnyResult == true);
77+
78+
bool AllResult = sycl::all_of_group(OpportunisticGroup, (LID == 0));
79+
AllAcc[WI] = (AllResult == true);
80+
81+
bool NoneResult =
82+
sycl::none_of_group(OpportunisticGroup, (LID != 0));
83+
NoneAcc[WI] = (NoneResult == true);
84+
85+
uint32_t ReduceResult =
86+
sycl::reduce_over_group(OpportunisticGroup, 1, sycl::plus<>());
87+
ReduceAcc[WI] =
88+
(ReduceResult == OpportunisticGroup.get_local_linear_range());
89+
90+
uint32_t ExScanResult = sycl::exclusive_scan_over_group(
91+
OpportunisticGroup, 1, sycl::plus<>());
92+
ExScanAcc[WI] = (ExScanResult == LID);
93+
94+
uint32_t IncScanResult = sycl::inclusive_scan_over_group(
95+
OpportunisticGroup, 1, sycl::plus<>());
96+
IncScanAcc[WI] = (IncScanResult == LID + 1);
97+
} else {
98+
BarrierAcc[WI] = false;
99+
BroadcastAcc[WI] = false;
100+
AnyAcc[WI] = false;
101+
AllAcc[WI] = false;
102+
NoneAcc[WI] = false;
103+
ReduceAcc[WI] = false;
104+
ExScanAcc[WI] = false;
105+
IncScanAcc[WI] = false;
106+
}
107+
};
108+
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
109+
});
110+
111+
sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
112+
sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
113+
sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
114+
sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
115+
sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
116+
sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
117+
sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
118+
sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
119+
for (uint32_t WI = 0; WI < 32; ++WI) {
120+
bool ExpectedResult = (WI == ArbitraryItem);
121+
assert(BarrierAcc[WI] == ExpectedResult);
122+
assert(BroadcastAcc[WI] == ExpectedResult);
123+
assert(AnyAcc[WI] == ExpectedResult);
124+
assert(AllAcc[WI] == ExpectedResult);
125+
assert(NoneAcc[WI] == ExpectedResult);
126+
assert(ReduceAcc[WI] == ExpectedResult);
127+
assert(ExScanAcc[WI] == ExpectedResult);
128+
assert(IncScanAcc[WI] == ExpectedResult);
129+
}
130+
return 0;
131+
}

0 commit comments

Comments
 (0)