Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][Fusion] Test fusion of kernels with different ND-ranges #1575

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 2 additions & 15 deletions SYCL/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,25 +80,12 @@ int main() {

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

// Scenario: Fusing two kernels with different dimensionality should lead to
// fusion being aborted.
performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
range<2>{1, 8});
// CHECK: WARNING: Cannot fuse kernels with different dimensionality
// CHECK-NEXT: COMPUTATION OK

// Scenario: Fusing two kernels with different global size should lead to
// fusion being aborted.
performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
range<1>{8});
// CHECK-NEXT: WARNING: Cannot fuse kerneles with different global size
// CHECK-NEXT: COMPUTATION OK

// Scenario: Fusing two kernels with different local size should lead to
// fusion being aborted.
performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
range<1>{16});
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
// CHECK: ERROR: JIT compilation for kernel fusion failed with message:
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
// CHECK-NEXT: COMPUTATION OK

return 0;
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/abort_internalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ int main() {
// CHECK-NEXT: Local(0), Local(8)
// CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
// CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
// CHECK-NEXT: COMPUTATION OK

// Scenario: Both accessor with local promotion, but the kernels specify
Expand All @@ -157,7 +156,8 @@ int main() {
performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
// CHECK-NEXT: Local(8), Local(16)
// CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
// CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
// CHECK-NEXT: ERROR: JIT compilation for kernel fusion failed with message:
// CHECK-NEXT: Cannot fuse kernels with different offsets or local sizes
// CHECK-NEXT: COMPUTATION OK

// Scenario: One accessor with local internalization, one with private
Expand Down
270 changes: 270 additions & 0 deletions SYCL/KernelFusion/different_nd_ranges.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test complete fusion of kernels with different ND-ranges.

// Kernels with different ND-ranges should be fused.
// CHECK-NOT: Cannot fuse kernels with different offsets or local sizes

#include <sycl/sycl.hpp>

#include <algorithm>

using namespace sycl;

////////////////////////////////////////////////////////////////////////////////
// Kernels
////////////////////////////////////////////////////////////////////////////////

using DataTy = vec<uint16_t, 3>;
using VecTy = std::vector<DataTy>;

template <std::size_t Dimensions> class FillBase {
public:
FillBase(accessor<DataTy, 1, access_mode::write> GS,
accessor<DataTy, 1, access_mode::write> LS,
accessor<DataTy, 1, access_mode::write> GrS,
accessor<DataTy, 1, access_mode::write> G,
accessor<DataTy, 1, access_mode::write> L,
accessor<DataTy, 1, access_mode::write> Gr)
: GS{GS}, LS{LS}, GrS{GrS}, G{G}, L{L}, Gr{Gr} {}

protected:
template <typename F> static DataTy getValue(F gen) {
DataTy x;
for (std::size_t i = 0; i < Dimensions; ++i) {
x[i] = gen(i);
}
return x;
}

accessor<DataTy, 1, access_mode::write> GS;
accessor<DataTy, 1, access_mode::write> LS;
accessor<DataTy, 1, access_mode::write> GrS;
accessor<DataTy, 1, access_mode::write> G;
accessor<DataTy, 1, access_mode::write> L;
accessor<DataTy, 1, access_mode::write> Gr;
};

template <int Dimensions> class Fill : public FillBase<Dimensions> {
public:
using FillBase<Dimensions>::FillBase;
using FillBase<Dimensions>::getValue;

void operator()(item<Dimensions> i) const {
const auto lid = i.get_linear_id();

FillBase<Dimensions>::GS[lid] =
getValue([i](int arg) { return i.get_range(arg); });
FillBase<Dimensions>::G[lid] =
getValue([i](int arg) { return i.get_id(arg); });
}
};

template <int Dimensions> class FillLS : public FillBase<Dimensions> {
public:
using FillBase<Dimensions>::FillBase;
using FillBase<Dimensions>::getValue;

void operator()(nd_item<Dimensions> nd) const {
const auto lid = nd.get_global_linear_id();
FillBase<Dimensions>::GS[lid] =
getValue([nd](int arg) { return nd.get_global_range(arg); });
FillBase<Dimensions>::LS[lid] =
getValue([nd](int arg) { return nd.get_local_range(arg); });
FillBase<Dimensions>::GrS[lid] =
getValue([nd](int arg) { return nd.get_group_range(arg); });
FillBase<Dimensions>::G[lid] =
getValue([nd](int arg) { return nd.get_global_id(arg); });
FillBase<Dimensions>::L[lid] =
getValue([nd](int arg) { return nd.get_local_id(arg); });
FillBase<Dimensions>::Gr[lid] =
getValue([nd](int arg) { return nd.get_group(arg); });
}
};

////////////////////////////////////////////////////////////////////////////////
// Range description
////////////////////////////////////////////////////////////////////////////////

struct RangeDesc {
using Indices = std::array<std::size_t, 3>;

constexpr RangeDesc(std::initializer_list<std::size_t> GS)
: Dimensions{static_cast<int>(GS.size())}, GS{init(GS)},
LS{std::nullopt} {}
constexpr RangeDesc(std::initializer_list<std::size_t> GS,
std::initializer_list<std::size_t> LS)
: Dimensions{static_cast<int>(GS.size())}, GS{init(GS)}, LS{init(LS)} {}

constexpr std::size_t num_work_items() const { return GS[0] * GS[1] * GS[2]; }

int Dimensions;
Indices GS;
std::optional<Indices> LS;

template <std::size_t D> range<D> get_range() const;
template <std::size_t D> nd_range<D> get_nd_range() const;

private:
static constexpr Indices init(std::initializer_list<std::size_t> sizes) {
Indices res{1, 1, 1};
std::copy(sizes.begin(), sizes.end(), res.begin());
return res;
}
};

template <> range<1> RangeDesc::get_range<1>() const { return {GS[0]}; }

template <> range<2> RangeDesc::get_range<2>() const { return {GS[0], GS[1]}; }

template <> range<3> RangeDesc::get_range<3>() const {
return {GS[0], GS[1], GS[2]};
}

template <> nd_range<1> RangeDesc::get_nd_range<1>() const {
return {get_range<1>(), {(*LS)[0]}};
}

template <> nd_range<2> RangeDesc::get_nd_range<2>() const {
return {get_range<2>(), {(*LS)[0], (*LS)[1]}};
}

template <> nd_range<3> RangeDesc::get_nd_range<3>() const {
return {get_range<3>(), {(*LS)[0], (*LS)[1], (*LS)[2]}};
}

////////////////////////////////////////////////////////////////////////////////
// Test
////////////////////////////////////////////////////////////////////////////////

using TestResult = std::vector<VecTy>;

TestResult run(const std::vector<RangeDesc> &sizes, bool fuse) {
const auto numWorkItems =
std::max_element(sizes.begin(), sizes.end(),
[](const auto &LHS, const auto &RHS) {
return LHS.num_work_items() < RHS.num_work_items();
})
->num_work_items();
TestResult res(6 * sizes.size(), VecTy(numWorkItems));

{
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
std::vector<buffer<DataTy>> Buffers;
Buffers.reserve(res.size());
for (auto &v : res) {
Buffers.emplace_back(v);
}
if (fuse) {
ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();
}
for (std::size_t i = 0; i < sizes.size(); ++i) {
q.submit([&](handler &cgh) {
const auto &size = *(sizes.begin() + i);
const auto j = i * 6;
accessor GS{Buffers[j], cgh, write_only};
accessor LS{Buffers[j + 1], cgh, write_only};
accessor GrS{Buffers[j + 2], cgh, write_only};
accessor G{Buffers[j + 3], cgh, write_only};
accessor L{Buffers[j + 4], cgh, write_only};
accessor Gr{Buffers[j + 5], cgh, write_only};
if (size.LS) {
switch (size.Dimensions) {
case 1:
cgh.parallel_for(size.template get_nd_range<1>(),
FillLS<1>{GS, LS, GrS, G, L, Gr});
break;
case 2:
cgh.parallel_for(size.template get_nd_range<2>(),
FillLS<2>{GS, LS, GrS, G, L, Gr});
break;
case 3:
cgh.parallel_for(size.template get_nd_range<3>(),
FillLS<3>{GS, LS, GrS, G, L, Gr});
break;
}
} else {
switch (size.Dimensions) {
case 1:
cgh.parallel_for(size.template get_range<1>(),
Fill<1>{GS, LS, GrS, G, L, Gr});
break;
case 2:
cgh.parallel_for(size.template get_range<2>(),
Fill<2>{GS, LS, GrS, G, L, Gr});
break;
case 3:
cgh.parallel_for(size.template get_range<3>(),
Fill<3>{GS, LS, GrS, G, L, Gr});
break;
}
}
});
}
if (fuse) {
ext::codeplay::experimental::fusion_wrapper fw{q};
assert(fw.is_in_fusion_mode() && "Fusion failed");
fw.complete_fusion(
{ext::codeplay::experimental::property::no_barriers{}});
assert(!fw.is_in_fusion_mode() && "Fusion failed");
}
}

return res;
}

void test(const std::vector<RangeDesc> &sizes) {
const auto res = run(sizes, /*fuse*/ false);
const auto fusedRes = run(sizes, /*fuse*/ true);
assert(std::equal(res.begin(), res.end(), fusedRes.begin(),
[](const auto &LHS, const auto &RHS) {
return std::equal(LHS.begin(), LHS.end(), RHS.begin(),
[](const auto &LHS, const auto &RHS) {
return all(LHS == RHS);
});
}) &&
"COMPUTATION ERROR");
}

int main() {
// 1-D kernels with different global sizes
test({RangeDesc{10}, RangeDesc{20}});
test({RangeDesc{10}, RangeDesc{20}, RangeDesc{30}});

// Two 1-D kernels with different global sizes and a 2-D kernel with more
// work-items.
test({RangeDesc{10}, RangeDesc{20}, RangeDesc{10, 10}});

// Two 1-D kernels with different global sizes and specified (equal) local
// size.
const auto R2 = {2ul};
test({RangeDesc{{10}, R2}, RangeDesc{{20}, R2}});

// Three 1-D kernels with different global sizes and specified (equal) local
// size.
const auto R5 = {5ul};
test({RangeDesc{{10}, R5}, RangeDesc{{20}, R5}, RangeDesc{{30}, R5}});

// Two 1-D kernels with different global sizes and a 2-D kernel with more
// work-items and specified (equal) local size.
test({RangeDesc{{10}, R2}, RangeDesc{{20}, R2}, RangeDesc{{10, 10}, {2, 1}}});

// Three 2-D kernels with different global sizes.
test({RangeDesc{{10, 15}, {2, 5}}, RangeDesc{{20, 10}, {2, 5}},
RangeDesc{{10, 5}, {2, 5}}});

// Three 3-D kernels with different global sizes.
test({RangeDesc{{10, 4, 2}, {5, 2, 1}}, RangeDesc{{20, 2, 4}, {5, 2, 1}},
RangeDesc{{10, 2, 4}, {5, 2, 1}}});

// 1-D, 2-D and 3-D kernels with different global sizes.
test({RangeDesc{{10}, R5}, RangeDesc{{10, 1}, {5, 1}},
RangeDesc{{10, 1, 1}, {5, 1, 1}}});
}