1
-
2
- // XFAIL: cuda
3
1
// UNSUPPORTED: cpu
4
- // CUDA compilation and runtime do not yet support sub-groups.
5
2
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
6
3
// runtime for every supported ISA
7
4
//
@@ -27,9 +24,18 @@ using namespace cl::sycl;
27
24
28
25
template <typename T, int N> void check (queue &Queue) {
29
26
const int G = 1024 , L = 128 ;
27
+
28
+ // Pad arrays based on sub-group size to ensure no out-of-bounds accesses
29
+ // Workaround for info::device::sub_group_sizes support on some devices
30
+ size_t max_sg_size = 128 ;
31
+ #if 0
32
+ auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
33
+ size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
34
+ #endif
35
+
30
36
try {
31
37
nd_range<1 > NdRange (G, L);
32
- buffer<T> syclbuf (G);
38
+ buffer<T> syclbuf (G + max_sg_size * N );
33
39
buffer<size_t > sgsizebuf (1 );
34
40
{
35
41
auto acc = syclbuf.template get_access <access::mode::read_write>();
@@ -42,7 +48,7 @@ template <typename T, int N> void check(queue &Queue) {
42
48
auto acc = syclbuf.template get_access <access::mode::read_write>(cgh);
43
49
auto sgsizeacc = sgsizebuf.get_access <access::mode::read_write>(cgh);
44
50
accessor<T, 1 , access::mode::read_write, access::target::local> LocalMem (
45
- {L}, cgh);
51
+ {L + max_sg_size * N }, cgh);
46
52
cgh.parallel_for <sycl_subgr<T, N>>(NdRange, [=](nd_item<1 > NdItem) {
47
53
ONEAPI::sub_group SG = NdItem.get_sub_group ();
48
54
if (SG.get_group_id ().get (0 ) % N == 0 ) {
@@ -161,13 +167,14 @@ template <typename T> void check(queue &Queue) {
161
167
162
168
int main () {
163
169
queue Queue;
164
- if (!Queue.get_device ().has_extension (" cl_intel_subgroups" ) &&
165
- !Queue.get_device ().has_extension (" cl_intel_subgroups_short" ) &&
166
- !Queue.get_device ().has_extension (" cl_intel_subgroups_long" )) {
170
+ if (Queue.get_device ().is_host ()) {
167
171
std::cout << " Skipping test\n " ;
168
172
return 0 ;
169
173
}
170
- if (Queue.get_device ().has_extension (" cl_intel_subgroups" )) {
174
+ std::string PlatformName =
175
+ Queue.get_device ().get_platform ().get_info <info::platform::name>();
176
+ if (Queue.get_device ().has_extension (" cl_intel_subgroups" ) ||
177
+ PlatformName.find (" CUDA" ) != std::string::npos) {
171
178
typedef bool aligned_char __attribute__ ((aligned (16 )));
172
179
check<aligned_char>(Queue);
173
180
typedef int aligned_int __attribute__ ((aligned (16 )));
@@ -189,14 +196,16 @@ int main() {
189
196
check<aligned_float, 4 >(Queue);
190
197
check<aligned_float, 8 >(Queue);
191
198
}
192
- if (Queue.get_device ().has_extension (" cl_intel_subgroups_short" )) {
199
+ if (Queue.get_device ().has_extension (" cl_intel_subgroups_short" ) ||
200
+ PlatformName.find (" CUDA" ) != std::string::npos) {
193
201
typedef short aligned_short __attribute__ ((aligned (16 )));
194
202
check<aligned_short>(Queue);
195
203
check<aligned_short, 1 >(Queue);
196
204
check<aligned_short, 2 >(Queue);
197
205
check<aligned_short, 4 >(Queue);
198
206
check<aligned_short, 8 >(Queue);
199
- if (Queue.get_device ().has_extension (" cl_khr_fp16" )) {
207
+ if (Queue.get_device ().has_extension (" cl_khr_fp16" ) ||
208
+ PlatformName.find (" CUDA" ) != std::string::npos) {
200
209
typedef half aligned_half __attribute__ ((aligned (16 )));
201
210
check<aligned_half>(Queue);
202
211
check<aligned_half, 1 >(Queue);
@@ -205,7 +214,8 @@ int main() {
205
214
check<aligned_half, 8 >(Queue);
206
215
}
207
216
}
208
- if (Queue.get_device ().has_extension (" cl_intel_subgroups_long" )) {
217
+ if (Queue.get_device ().has_extension (" cl_intel_subgroups_long" ) ||
218
+ PlatformName.find (" CUDA" ) != std::string::npos) {
209
219
typedef long aligned_long __attribute__ ((aligned (16 )));
210
220
check<aligned_long>(Queue);
211
221
check<aligned_long, 1 >(Queue);
0 commit comments