
Commit 9b36e07

[SYCL][CUDA] Enable sub-group load_store tests (intel/llvm-test-suite#90)
* Prevent out-of-bounds access in load_store. The test assumed that it was
  safe to load/store off the end of a buffer as long as the results of those
  accesses were ignored. Some backends (e.g. CUDA) will fail because of an
  out-of-bounds access, so it's safer to pad the buffers instead.

* Enable sub-group load_store tests

Signed-off-by: John Pennycook <[email protected]>
1 parent e49f4f4 commit 9b36e07
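
The padding fix described in the message can be sketched as a standalone
program. This is illustrative only, not part of the commit; it assumes the
same SYCL 1.2.1-style toolchain as the test (CL/sycl.hpp, cl::sycl) and uses
the info::device::sub_group_sizes query that the commit itself keeps disabled
behind #if 0:

// Illustrative only: pad a buffer by the worst-case sub-group access so a
// sub-group load/store of N elements can never run off the end, even when
// its result is ignored.
#include <CL/sycl.hpp>
#include <algorithm>
#include <cstddef>
#include <iostream>

using namespace cl::sycl;

int main() {
  queue Queue;
  constexpr int G = 1024; // global size, as in the test
  constexpr int N = 8;    // widest per-work-item element count the test uses

  // Largest sub-group size the device may choose. The commit hardcodes 128
  // instead, because this query is not yet reliable on some devices.
  auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
  size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());

  // One extra sub-group's worth of N-element accesses keeps every
  // speculative load/store in bounds.
  buffer<int> syclbuf(G + max_sg_size * N);
  std::cout << "padded size: " << (G + max_sg_size * N) << "\n";
  return 0;
}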


SYCL/SubGroup/load_store.cpp

Lines changed: 22 additions & 12 deletions
@@ -1,7 +1,4 @@
-
-// XFAIL: cuda
 // UNSUPPORTED: cpu
-// CUDA compilation and runtime do not yet support sub-groups.
 // #2252 Disable until all variants of built-ins are available in OpenCL CPU
 // runtime for every supported ISA
 //
@@ -27,9 +24,18 @@ using namespace cl::sycl;
 
 template <typename T, int N> void check(queue &Queue) {
   const int G = 1024, L = 128;
+
+  // Pad arrays based on sub-group size to ensure no out-of-bounds accesses
+  // Workaround for info::device::sub_group_sizes support on some devices
+  size_t max_sg_size = 128;
+#if 0
+  auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
+  size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
+#endif
+
   try {
     nd_range<1> NdRange(G, L);
-    buffer<T> syclbuf(G);
+    buffer<T> syclbuf(G + max_sg_size * N);
     buffer<size_t> sgsizebuf(1);
     {
       auto acc = syclbuf.template get_access<access::mode::read_write>();
@@ -42,7 +48,7 @@ template <typename T, int N> void check(queue &Queue) {
       auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
-          {L}, cgh);
+          {L + max_sg_size * N}, cgh);
       cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
         ONEAPI::sub_group SG = NdItem.get_sub_group();
         if (SG.get_group_id().get(0) % N == 0) {
@@ -161,13 +167,14 @@ template <typename T> void check(queue &Queue) {
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups") &&
-      !Queue.get_device().has_extension("cl_intel_subgroups_short") &&
-      !Queue.get_device().has_extension("cl_intel_subgroups_long")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
-  if (Queue.get_device().has_extension("cl_intel_subgroups")) {
+  std::string PlatformName =
+      Queue.get_device().get_platform().get_info<info::platform::name>();
+  if (Queue.get_device().has_extension("cl_intel_subgroups") ||
+      PlatformName.find("CUDA") != std::string::npos) {
     typedef bool aligned_char __attribute__((aligned(16)));
     check<aligned_char>(Queue);
     typedef int aligned_int __attribute__((aligned(16)));
@@ -189,14 +196,16 @@ int main() {
     check<aligned_float, 4>(Queue);
     check<aligned_float, 8>(Queue);
   }
-  if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
+  if (Queue.get_device().has_extension("cl_intel_subgroups_short") ||
+      PlatformName.find("CUDA") != std::string::npos) {
     typedef short aligned_short __attribute__((aligned(16)));
     check<aligned_short>(Queue);
     check<aligned_short, 1>(Queue);
     check<aligned_short, 2>(Queue);
     check<aligned_short, 4>(Queue);
     check<aligned_short, 8>(Queue);
-    if (Queue.get_device().has_extension("cl_khr_fp16")) {
+    if (Queue.get_device().has_extension("cl_khr_fp16") ||
+        PlatformName.find("CUDA") != std::string::npos) {
       typedef half aligned_half __attribute__((aligned(16)));
       check<aligned_half>(Queue);
       check<aligned_half, 1>(Queue);
@@ -205,7 +214,8 @@ int main() {
       check<aligned_half, 8>(Queue);
     }
   }
-  if (Queue.get_device().has_extension("cl_intel_subgroups_long")) {
+  if (Queue.get_device().has_extension("cl_intel_subgroups_long") ||
+      PlatformName.find("CUDA") != std::string::npos) {
     typedef long aligned_long __attribute__((aligned(16)));
     check<aligned_long>(Queue);
     check<aligned_long, 1>(Queue);
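
A note on the gating above: the CUDA backend does not report the
cl_intel_subgroups* OpenCL extensions, so the commit falls back to matching
the platform name. A minimal standalone sketch of that check, again
illustrative rather than part of the commit:

// Illustrative only: run a variant when the device reports the OpenCL
// extension, or when the platform is CUDA (which reports no cl_intel_*
// extensions). Assumes the same CL/sycl.hpp toolchain as the test.
#include <CL/sycl.hpp>
#include <iostream>
#include <string>

using namespace cl::sycl;

int main() {
  queue Queue;
  std::string PlatformName =
      Queue.get_device().get_platform().get_info<info::platform::name>();
  bool IsCUDA = PlatformName.find("CUDA") != std::string::npos;
  if (Queue.get_device().has_extension("cl_intel_subgroups") || IsCUDA) {
    std::cout << "char/int/float sub-group variants would run here\n";
  }
  return 0;
}

Like the hardcoded max_sg_size = 128, the platform-name match is a pragmatic
workaround rather than a capability query.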
