This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][CUDA] Enables sub_group_sycl2020 for CUDA #316

Merged
Changes from all commits
SYCL/SubGroup/sub_groups_sycl2020.cpp: 84 changes (49 additions, 35 deletions)
@@ -1,55 +1,69 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-//
-// TODO enable test on CUDA once kernel_bundle is supported
-// UNSUPPORTED: cuda
 
 #include <sycl/sycl.hpp>
 
 class TestKernel;
+class TestKernelCUDA;
 
 int main() {
   sycl::queue Q;
   sycl::buffer<int, 3> Buf{sycl::range{3, 32, 32}};
 
-  sycl::kernel_id TestKernelID = sycl::get_kernel_id<TestKernel>();
-
-  sycl::kernel_bundle KernelBundle =
-      sycl::get_kernel_bundle<sycl::bundle_state::executable>(Q.get_context(),
-                                                              {TestKernelID});
-
-  Q.submit([&](sycl::handler &CGH) {
-    CGH.use_kernel_bundle(KernelBundle);
-    sycl::accessor Acc{Buf, CGH, sycl::write_only};
-    CGH.parallel_for<TestKernel>(
-        sycl::nd_range<1>(sycl::range{32}, sycl::range{32}),
-        [=](sycl::nd_item<1> item) {
-          auto SG = item.get_sub_group();
-          Acc[0][SG.get_group_linear_id()][SG.get_local_linear_id()] =
-              SG.leader();
-          Acc[1][SG.get_group_linear_id()][SG.get_local_linear_id()] =
-              SG.get_group_linear_range();
-          Acc[2][SG.get_group_linear_id()][SG.get_local_linear_id()] =
-              SG.get_local_linear_range();
-        });
-  });
-
-  sycl::host_accessor Acc{Buf, sycl::read_only};
-
-  sycl::kernel Kernel = KernelBundle.get_kernel(TestKernelID);
-
-  const size_t SubgroupSize =
-      Kernel.get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
-          Q.get_device(), sycl::range{32, 1, 1});
+  size_t SubgroupSize = 0;
+
+  sycl::accessor WriteAcc{Buf, sycl::write_only};
+  const auto KernelFunc = [=](sycl::nd_item<1> item) {
+    auto SG = item.get_sub_group();
+    WriteAcc[0][SG.get_group_linear_id()][SG.get_local_linear_id()] =
+        SG.leader();
+    WriteAcc[1][SG.get_group_linear_id()][SG.get_local_linear_id()] =
+        SG.get_group_linear_range();
+    WriteAcc[2][SG.get_group_linear_id()][SG.get_local_linear_id()] =
+        SG.get_local_linear_range();
+  };
+
+  if (Q.get_backend() != sycl::backend::cuda) {
+    sycl::kernel_id TestKernelID = sycl::get_kernel_id<TestKernel>();
+    sycl::kernel_bundle KernelBundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::executable>(Q.get_context(),
+                                                                {TestKernelID});
+
+    Q.submit([&](sycl::handler &CGH) {
+      CGH.use_kernel_bundle(KernelBundle);
+      CGH.require(WriteAcc);
+      CGH.parallel_for<TestKernel>(
+          sycl::nd_range<1>(sycl::range{32}, sycl::range{32}), KernelFunc);
+    });
+
+    sycl::kernel Kernel = KernelBundle.get_kernel(TestKernelID);
+    SubgroupSize =
+        Kernel.get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
+            Q.get_device(), sycl::range{32, 1, 1});
+  } else {
+    // CUDA sub-group size is 32 by default (size of a warp) so the kernel
+    // bundle is not strictly needed to do this test for the CUDA backend.
+    // TODO: Remove this special CUDA path once the CUDA backend supports kernel
+    // bundles.
+    SubgroupSize = 32;
+    Q.submit([&](sycl::handler &CGH) {
+      CGH.require(WriteAcc);
+      CGH.parallel_for<TestKernelCUDA>(
+          sycl::nd_range<1>(sycl::range{32}, sycl::range{32}), KernelFunc);
+    });
+  }
+
+  sycl::host_accessor HostAcc{Buf, sycl::read_only};
+
   const size_t MaxNumSubgroups = 32 / SubgroupSize;
 
   for (size_t SGNo = 0; SGNo < MaxNumSubgroups; SGNo++) {
     for (size_t WINo = 0; WINo < SubgroupSize; WINo++) {
       const int Leader = WINo == 0 ? 1 : 0;
-      assert(Acc[0][SGNo][WINo] == Leader);
-      assert(Acc[1][SGNo][WINo] == MaxNumSubgroups);
-      assert(Acc[2][SGNo][WINo] == SubgroupSize);
+      assert(HostAcc[0][SGNo][WINo] == Leader);
+      assert(HostAcc[1][SGNo][WINo] == MaxNumSubgroups);
+      assert(HostAcc[2][SGNo][WINo] == SubgroupSize);
     }
   }

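Note on the hard-coded value: the CUDA branch above sets SubgroupSize = 32 to match the warp size, per the in-code TODO. As a minimal sketch (not part of this change), the same value could instead be read from the standard SYCL 2020 device descriptor sycl::info::device::sub_group_sizes; the standalone main(), the choice of taking the largest reported size, and the assert message below are illustrative assumptions, not code from this PR.

// Sketch only: query the supported sub-group sizes instead of hardcoding 32.
#include <algorithm>
#include <cassert>
#include <vector>

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;

  // SYCL 2020 device descriptor: all sub-group sizes the device supports.
  const std::vector<size_t> Sizes =
      Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
  assert(!Sizes.empty() && "device reports no sub-group sizes");

  // Pick the largest supported size; on current CUDA devices this is
  // expected to be 32 (the warp size), matching the hard-coded value above.
  const size_t SubgroupSize = *std::max_element(Sizes.begin(), Sizes.end());

  return SubgroupSize > 0 ? 0 : 1;
}

Whether the test should rely on this descriptor or keep the explicit warp-size constant is left open by the TODO; the descriptor-based query works on any backend, while the constant makes the CUDA-specific assumption obvious.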