Skip to content

[SYCL][Fusion] Abort fusion on non-uniform work-group sizes ND-range #12077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions sycl-fusion/common/lib/NDRangesHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ bool jit_compiler::isHeterogeneousList(ArrayRef<NDRange> NDRanges) {
return any_of(NDRanges, [&ND](const auto &Other) { return ND != Other; });
}

static bool wouldYieldUniformWorkGroupSize(const Indices &LocalSize,
llvm::ArrayRef<NDRange> NDRanges) {
const auto GlobalSize = getMaximalGlobalSize(NDRanges);
return llvm::all_of(llvm::zip_equal(GlobalSize, LocalSize),
[](const std::tuple<std::size_t, std::size_t> &P) {
const auto &[GlobalSize, LocalSize] = P;
return GlobalSize % LocalSize == 0;
});
}

bool jit_compiler::isValidCombination(llvm::ArrayRef<NDRange> NDRanges) {
if (NDRanges.empty()) {
return false;
Expand All @@ -95,9 +105,14 @@ bool jit_compiler::isValidCombination(llvm::ArrayRef<NDRange> NDRanges) {
const auto &ND = FirstSpecifiedLocalSize == NDRanges.end()
? NDRanges.front()
: *FirstSpecifiedLocalSize;
return llvm::all_of(NDRanges, [&ND](const auto &Other) {
return compatibleRanges(ND, Other);
});
return llvm::all_of(NDRanges,
[&ND](const auto &Other) {
return compatibleRanges(ND, Other);
}) &&
// Either no local size is specified or the maximal global size is
// compatible with the specified local size.
(FirstSpecifiedLocalSize == NDRanges.end() ||
wouldYieldUniformWorkGroupSize(ND.getLocalSize(), NDRanges));
}

bool jit_compiler::requireIDRemapping(const NDRange &LHS, const NDRange &RHS) {
Expand Down
5 changes: 3 additions & 2 deletions sycl-fusion/jit-compiler/lib/KernelFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ FusionResult KernelFusion::fuseKernels(

if (!isValidCombination(NDRanges)) {
return FusionResult{
"Cannot fuse kernels with different offsets or local sizes or "
"different global sizes in dimensions [2, N) and non-zero offsets"};
"Cannot fuse kernels with different offsets or local sizes, or "
"different global sizes in dimensions [2, N) and non-zero offsets, "
"or those whose fusion would yield non-uniform work-groups sizes"};
}

bool IsHeterogeneousList = jit_compiler::isHeterogeneousList(NDRanges);
Expand Down
41 changes: 27 additions & 14 deletions sycl/test-e2e/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@ constexpr size_t dataSize = 512;

enum class Internalization { None, Local, Private };

template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
void performFusion(queue &q, range<Kernel1Dim> k1Global,
range<Kernel1Dim> k1Local) {
template <typename Range> size_t getSize(Range r);

template <> size_t getSize(range<1> r) { return r.size(); }
template <> size_t getSize(nd_range<1> r) {
return r.get_global_range().size();
}

template <typename Kernel1Name, typename Kernel2Name, typename Range1,
typename Range2>
void performFusion(queue &q, Range1 R1, Range2 R2) {
int in[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
Expand All @@ -37,19 +44,15 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,
q.submit([&](handler &cgh) {
auto accIn = bIn.get_access(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
[=](item<Kernel1Dim> i) {
auto LID = i.get_linear_id();
accTmp[LID] = accIn[LID] + 5;
});
cgh.parallel_for<Kernel1Name>(
R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; });
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}}, [=](id<1> i) {
accOut[i] = accTmp[i] * 2;
});
cgh.parallel_for<Kernel2Name>(
R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
Expand All @@ -60,7 +63,8 @@ void performFusion(queue &q, range<Kernel1Dim> k1Global,

// Check the results
size_t numErrors = 0;
for (size_t i = 0; i < k1Global.size(); ++i) {
size_t size = getSize(R1);
for (size_t i = 0; i < size; ++i) {
if (out[i] != ((i + 5) * 2)) {
++numErrors;
}
Expand Down Expand Up @@ -89,8 +93,9 @@ int main() {

// Scenario: Fusing two kernels with different local size should lead to
// fusion being aborted.
performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
range<1>{16});
performFusion<class Kernel1_3, class Kernel2_3>(
q, nd_range<1>{range<1>{dataSize}, range<1>{16}},
nd_range<1>{range<1>{dataSize}, range<1>{8}});
// CHECK: ERROR: JIT compilation for kernel fusion failed with message:
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
// CHECK: COMPUTATION OK
Expand All @@ -101,5 +106,13 @@ int main() {
// CHECK-NOT: Cannot fuse kernels with different offsets or local sizes
// CHECK: WARNING: Fusion list is empty

// Scenario: Fusing two kernels that would lead to non-uniform work-group
// sizes should lead to fusion being aborted.
performFusion<class Kernel1_4, class Kernel2_4>(
q, nd_range<1>{range<1>{9}, range<1>{3}}, range<1>{dataSize});
// CHECK: ERROR: JIT compilation for kernel fusion failed with message:
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
// CHECK: COMPUTATION OK

return 0;
}