Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][Fusion] Test fusion scheduler integration #1416

Merged
merged 1 commit into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions SYCL/KernelFusion/cancel_fusion.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip

// Test cancel fusion
Expand Down
77 changes: 77 additions & 0 deletions SYCL/KernelFusion/event_wait_cancel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip

// Test validity of events after cancel_fusion.

#include "fusion_event_test_common.h"
#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

int *in1 = sycl::malloc_shared<int>(dataSize, q);
int *in2 = sycl::malloc_shared<int>(dataSize, q);
int *in3 = sycl::malloc_shared<int>(dataSize, q);
int *tmp = sycl::malloc_shared<int>(dataSize, q);
int *out = sycl::malloc_shared<int>(dataSize, q);

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

auto kernel1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { tmp[i] = in1[i] + in2[i]; });
});

auto kernel2 = q.submit([&](handler &cgh) {
cgh.depends_on(kernel1);
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { out[i] = tmp[i] * in3[i]; });
});

fw.cancel_fusion();

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");

kernel1.wait();
assert(isEventComplete(kernel1) && "Event should be complete");
// The event returned by submit while in fusion mode depends on both
// individual kernels to be executed.
assert(kernel1.get_wait_list().size() == 2);

kernel2.wait();
assert(isEventComplete(kernel2) && "Event should be complete");
// The event returned by submit while in fusion mode depends on both
// individual kernels to be executed.
assert(kernel2.get_wait_list().size() == 2);

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
}

sycl::free(in1, q);
sycl::free(in2, q);
sycl::free(in3, q);
sycl::free(tmp, q);
sycl::free(out, q);

return 0;
}
8 changes: 8 additions & 0 deletions SYCL/KernelFusion/fusion_event_test_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include <sycl/sycl.hpp>

using namespace sycl;

static bool isEventComplete(sycl::event &ev) {
return ev.get_info<info::event::command_execution_status>() ==
info::event_command_status::complete;
}
84 changes: 84 additions & 0 deletions SYCL/KernelFusion/sync_acc_mem_op.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip

// Test fusion cancellation on an explicit memory operation on an accessor
// happening before complete_fusion.

#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
int dst[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
dst[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp{tmp, range{dataSize}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});

// This explicit copy operation has an overlapping requirement with one of
// the kernels and therefore requires synchronization. This should lead to
// cancellation of the fusion.
auto copyEvt = q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
cgh.copy(accTmp, dst);
});

copyEvt.wait();

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
assert(dst[i] == (5 * i) && "Computation error");
}

return 0;
}

// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
76 changes: 76 additions & 0 deletions SYCL/KernelFusion/sync_buffer_destruction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip

// Test fusion cancellation on buffer destruction happening before
// complete_fusion.

#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bTmp{tmp, range{dataSize}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
{
buffer<int> bIn3{in3, range{dataSize}};

fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});
// Buffer bIn3, which is accessed by one of the kernels in the fusion list
// goes out scope, causing a blocking wait for one of the kernels in the
// fusion list. This should lead to cancellation of the fusion.
}
assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
}

return 0;
}

// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
75 changes: 75 additions & 0 deletions SYCL/KernelFusion/sync_event_wait.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip

// Test fusion cancellation on event::wait() happening before
// complete_fusion.

#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp{tmp, range{dataSize}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

auto kernel1Ev = q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access<access::mode::read>(cgh);
auto accIn2 = bIn2.get_access<access::mode::read>(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});

// This event::wait() causes a blocking wait for one of the kernels in the
// fusion list. This should lead to cancellation of the fusion.
kernel1Ev.wait();

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
}

return 0;
}

// CHECK: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested
Loading