Skip to content

Commit 0d5266b

Browse files
jchlandanpmiller
andauthored
[SYCL] UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL on Cuda and HIP (#17137)
For HIP the value of sub group size can either be 32 or 64, it can be retrieved from `intel_reqd_sub_group_size` metadata node. Cuda only supports 32, which is enforced in the compiler, see [SemaSYCL::addIntelReqdSubGroupSizeAttr](https://github.com/intel/llvm/blob/sycl/clang/lib/Sema/SemaSYCLDeclAttr.cpp#L828). --------- Co-authored-by: Nicolas Miller <[email protected]>
1 parent c5c0be5 commit 0d5266b

File tree

12 files changed

+253
-144
lines changed

12 files changed

+253
-144
lines changed

llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,14 @@ PropSetRegTy computeModuleProperties(const Module &M,
314314
KernelReqdWorkGroupSize);
315315
}
316316

317+
if (auto ReqdSubGroupSize = getKernelSingleEltMetadata<uint32_t>(
318+
Func, "intel_reqd_sub_group_size")) {
319+
// intel_reqd_sub_group_size is stored as i32.
320+
MetadataNames.push_back(Func.getName().str() + "@reqd_sub_group_size");
321+
PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
322+
*ReqdSubGroupSize);
323+
}
324+
317325
if (auto WorkGroupNumDim = getKernelSingleEltMetadata<uint32_t>(
318326
Func, "work_group_num_dim")) {
319327
MetadataNames.push_back(Func.getName().str() + "@work_group_num_dim");
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
; This test checks that the sycl-post-link tool correctly handles
2+
; intel_reqd_sub_group_size metadata.
3+
4+
; RUN: sycl-post-link -properties -emit-program-metadata -device-globals -S < %s -o %t.files.table
5+
; RUN: FileCheck %s -input-file=%t.files.table --check-prefixes CHECK-TABLE
6+
; RUN: FileCheck %s -input-file=%t.files_0.prop --match-full-lines --check-prefixes CHECK-PROP
7+
8+
target triple = "amdgcn-amd-amdhsa"
9+
10+
!0 = !{i32 64}
11+
12+
define weak_odr amdgpu_kernel void @_ZTS7Kernel1(float %arg1) !intel_reqd_sub_group_size !0 {
13+
call void @foo(float %arg1)
14+
ret void
15+
}
16+
17+
declare void @foo(float)
18+
19+
; CHECK-PROP: [SYCL/program metadata]
20+
; CHECK-PROP-NEXT: _ZTS7Kernel1@reqd_sub_group_size=1|64
21+
22+
; CHECK-TABLE: [Code|Properties]
23+
; CHECK-TABLE-NEXT: {{.*}}files_0.prop
24+
; CHECK-TABLE-EMPTY:

sycl/test-e2e/SubGroup/attributes.cpp

Lines changed: 9 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
// TODO: Despite using a supported required subgroup size compile_sub_group_size
2-
// reports as 0 on cuda and hip
3-
// XFAIL: target-nvidia || target-amd
4-
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14357
1+
// UNSUPPORTED: target-amd, target-nvidia
2+
// UNSUPPORTED-INTENDED: This test is not meant to be run on CUDA/HIP. Instead
3+
// `attributes_cuda_hip.cpp` is designed to test those backends. This is needed
4+
// as the CI is set up such that it only builds a test once for all available
5+
// devices, this is not suitable, as GPU targets will compile-time-check the
6+
// sub-group size and error out if it is not correct.
57

68
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
79
// RUN: %{run} %t.out
10+
811
//==------- attributes.cpp - SYCL sub_group attributes test ----*- C++ -*---==//
912
//
1013
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -13,134 +16,6 @@
1316
//
1417
//===----------------------------------------------------------------------===//
1518

16-
#include "helper.hpp"
17-
18-
#define KERNEL_FUNCTOR_WITH_SIZE(SIZE) \
19-
class KernelFunctor##SIZE { \
20-
public: \
21-
[[sycl::reqd_sub_group_size(SIZE)]] void \
22-
operator()(sycl::nd_item<1> Item) const { \
23-
const auto GID = Item.get_global_id(); \
24-
} \
25-
};
26-
27-
KERNEL_FUNCTOR_WITH_SIZE(1);
28-
KERNEL_FUNCTOR_WITH_SIZE(2);
29-
KERNEL_FUNCTOR_WITH_SIZE(4);
30-
KERNEL_FUNCTOR_WITH_SIZE(8);
31-
KERNEL_FUNCTOR_WITH_SIZE(16);
32-
KERNEL_FUNCTOR_WITH_SIZE(32);
33-
KERNEL_FUNCTOR_WITH_SIZE(64);
34-
35-
#undef KERNEL_FUNCTOR_WITH_SIZE
36-
37-
inline uint32_t flp2(uint32_t X) {
38-
X = X | (X >> 1);
39-
X = X | (X >> 2);
40-
X = X | (X >> 4);
41-
X = X | (X >> 8);
42-
X = X | (X >> 16);
43-
return X - (X >> 1);
44-
}
45-
46-
template <typename Fn> inline void submit(sycl::queue &Q) {
47-
Q.submit([](sycl::handler &cgh) {
48-
Fn F;
49-
cgh.parallel_for(sycl::nd_range<1>{64, 16}, F);
50-
});
51-
}
52-
53-
int main() {
54-
queue Queue;
55-
device Device = Queue.get_device();
56-
57-
try {
58-
const auto SGSizes = Device.get_info<info::device::sub_group_sizes>();
59-
60-
for (const auto SGSize : SGSizes) {
61-
// Get the previous power of 2
62-
auto ReqdSize = flp2(SGSize);
63-
64-
std::cout << "Run for " << ReqdSize << " required workgroup size.\n";
65-
66-
// Store the `sycl::kernel` into a vector because `sycl::kernel`
67-
// doesn't have default constructor
68-
std::vector<sycl::kernel> TheKernel;
69-
70-
switch (ReqdSize) {
71-
case 64: {
72-
auto KernelID = sycl::get_kernel_id<KernelFunctor64>();
73-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
74-
Queue.get_context(), {KernelID});
75-
TheKernel.push_back(KB.get_kernel(KernelID));
76-
submit<KernelFunctor64>(Queue);
77-
break;
78-
}
79-
case 32: {
80-
auto KernelID = sycl::get_kernel_id<KernelFunctor32>();
81-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
82-
Queue.get_context(), {KernelID});
83-
TheKernel.push_back(KB.get_kernel(KernelID));
84-
submit<KernelFunctor32>(Queue);
85-
break;
86-
}
87-
case 16: {
88-
auto KernelID = sycl::get_kernel_id<KernelFunctor16>();
89-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
90-
Queue.get_context(), {KernelID});
91-
TheKernel.push_back(KB.get_kernel(KernelID));
92-
submit<KernelFunctor16>(Queue);
93-
break;
94-
}
95-
case 8: {
96-
auto KernelID = sycl::get_kernel_id<KernelFunctor8>();
97-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
98-
Queue.get_context(), {KernelID});
99-
TheKernel.push_back(KB.get_kernel(KernelID));
100-
submit<KernelFunctor8>(Queue);
101-
break;
102-
}
103-
case 4: {
104-
auto KernelID = sycl::get_kernel_id<KernelFunctor4>();
105-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
106-
Queue.get_context(), {KernelID});
107-
TheKernel.push_back(KB.get_kernel(KernelID));
108-
submit<KernelFunctor4>(Queue);
109-
break;
110-
}
111-
case 2: {
112-
auto KernelID = sycl::get_kernel_id<KernelFunctor2>();
113-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
114-
Queue.get_context(), {KernelID});
115-
TheKernel.push_back(KB.get_kernel(KernelID));
116-
submit<KernelFunctor2>(Queue);
117-
break;
118-
}
119-
case 1: {
120-
auto KernelID = sycl::get_kernel_id<KernelFunctor1>();
121-
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
122-
Queue.get_context(), {KernelID});
123-
TheKernel.push_back(KB.get_kernel(KernelID));
124-
submit<KernelFunctor1>(Queue);
125-
break;
126-
}
127-
default:
128-
throw sycl::exception(sycl::errc::feature_not_supported,
129-
"sub-group size is not supported");
130-
}
131-
132-
auto Kernel = TheKernel[0];
133-
134-
auto Res = Kernel.get_info<
135-
sycl::info::kernel_device_specific::compile_sub_group_size>(Device);
136-
137-
exit_if_not_equal<size_t>(Res, ReqdSize, "compile_sub_group_size");
138-
}
139-
} catch (exception e) {
140-
std::cout << "SYCL exception caught: " << e.what();
141-
return 1;
142-
}
19+
#include "attributes_helper.hpp"
14320

144-
std::cout << "Test passed.\n";
145-
return 0;
146-
}
21+
int main() { return runTests(); }
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// REQUIRES: cuda || hip
2+
// RUN: %{build} -DBUILD_FOR_GPU -fsycl-device-code-split=per_kernel -o %t.out
3+
// RUN: %{run} %t.out
4+
5+
//==- attributes_cuda_hip.cpp - SYCL sub_group attributes test -*- C++ -*---==//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
// See https://llvm.org/LICENSE.txt for license information.
9+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "attributes_helper.hpp"
14+
15+
int main() { return runTests(); }
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
//==- attributes_helper.hpp - SYCL sub_group attributes helper -*- C++ -*---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "helper.hpp"
10+
11+
#define KERNEL_FUNCTOR_WITH_SIZE(SIZE) \
12+
class KernelFunctor##SIZE { \
13+
public: \
14+
[[sycl::reqd_sub_group_size(SIZE)]] void \
15+
operator()(sycl::nd_item<1> Item) const { \
16+
const auto GID = Item.get_global_id(); \
17+
} \
18+
};
19+
20+
// Dummy kernel, so we get the types and can keep later code straight-lined.
21+
#define DUMMY_KERNEL_FUNCTOR(SIZE) \
22+
class KernelFunctor##SIZE { \
23+
public: \
24+
void operator()(sycl::nd_item<1> Item) const { \
25+
const auto GID = Item.get_global_id(); \
26+
} \
27+
};
28+
29+
#ifdef BUILD_FOR_GPU
30+
DUMMY_KERNEL_FUNCTOR(1);
31+
DUMMY_KERNEL_FUNCTOR(2);
32+
DUMMY_KERNEL_FUNCTOR(4);
33+
DUMMY_KERNEL_FUNCTOR(8);
34+
DUMMY_KERNEL_FUNCTOR(16);
35+
KERNEL_FUNCTOR_WITH_SIZE(32);
36+
DUMMY_KERNEL_FUNCTOR(64);
37+
#else
38+
KERNEL_FUNCTOR_WITH_SIZE(1);
39+
KERNEL_FUNCTOR_WITH_SIZE(2);
40+
KERNEL_FUNCTOR_WITH_SIZE(4);
41+
KERNEL_FUNCTOR_WITH_SIZE(8);
42+
KERNEL_FUNCTOR_WITH_SIZE(16);
43+
KERNEL_FUNCTOR_WITH_SIZE(32);
44+
KERNEL_FUNCTOR_WITH_SIZE(64);
45+
#endif
46+
47+
#undef KERNEL_FUNCTOR_WITH_SIZE
48+
49+
inline uint32_t flp2(uint32_t X) {
50+
X = X | (X >> 1);
51+
X = X | (X >> 2);
52+
X = X | (X >> 4);
53+
X = X | (X >> 8);
54+
X = X | (X >> 16);
55+
return X - (X >> 1);
56+
}
57+
58+
template <typename Fn> inline void submit(sycl::queue &Q) {
59+
Q.submit([](sycl::handler &cgh) {
60+
Fn F;
61+
cgh.parallel_for(sycl::nd_range<1>{64, 16}, F);
62+
});
63+
}
64+
65+
int runTests() {
66+
queue Queue;
67+
device Device = Queue.get_device();
68+
69+
try {
70+
const auto SGSizes = Device.get_info<info::device::sub_group_sizes>();
71+
72+
for (const auto SGSize : SGSizes) {
73+
// Get the previous power of 2
74+
auto ReqdSize = flp2(SGSize);
75+
76+
std::cout << "Run for " << ReqdSize << " required workgroup size.\n";
77+
78+
// Store the `sycl::kernel` into a vector because `sycl::kernel`
79+
// doesn't have default constructor
80+
std::vector<sycl::kernel> TheKernel;
81+
82+
switch (ReqdSize) {
83+
case 64: {
84+
auto KernelID = sycl::get_kernel_id<KernelFunctor64>();
85+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
86+
Queue.get_context(), {KernelID});
87+
TheKernel.push_back(KB.get_kernel(KernelID));
88+
submit<KernelFunctor64>(Queue);
89+
break;
90+
}
91+
case 32: {
92+
auto KernelID = sycl::get_kernel_id<KernelFunctor32>();
93+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
94+
Queue.get_context(), {KernelID});
95+
TheKernel.push_back(KB.get_kernel(KernelID));
96+
submit<KernelFunctor32>(Queue);
97+
break;
98+
}
99+
case 16: {
100+
auto KernelID = sycl::get_kernel_id<KernelFunctor16>();
101+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
102+
Queue.get_context(), {KernelID});
103+
TheKernel.push_back(KB.get_kernel(KernelID));
104+
submit<KernelFunctor16>(Queue);
105+
break;
106+
}
107+
case 8: {
108+
auto KernelID = sycl::get_kernel_id<KernelFunctor8>();
109+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
110+
Queue.get_context(), {KernelID});
111+
TheKernel.push_back(KB.get_kernel(KernelID));
112+
submit<KernelFunctor8>(Queue);
113+
break;
114+
}
115+
case 4: {
116+
auto KernelID = sycl::get_kernel_id<KernelFunctor4>();
117+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
118+
Queue.get_context(), {KernelID});
119+
TheKernel.push_back(KB.get_kernel(KernelID));
120+
submit<KernelFunctor4>(Queue);
121+
break;
122+
}
123+
case 2: {
124+
auto KernelID = sycl::get_kernel_id<KernelFunctor2>();
125+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
126+
Queue.get_context(), {KernelID});
127+
TheKernel.push_back(KB.get_kernel(KernelID));
128+
submit<KernelFunctor2>(Queue);
129+
break;
130+
}
131+
case 1: {
132+
auto KernelID = sycl::get_kernel_id<KernelFunctor1>();
133+
auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
134+
Queue.get_context(), {KernelID});
135+
TheKernel.push_back(KB.get_kernel(KernelID));
136+
submit<KernelFunctor1>(Queue);
137+
break;
138+
}
139+
default:
140+
throw sycl::exception(sycl::errc::feature_not_supported,
141+
"sub-group size is not supported");
142+
}
143+
144+
auto Kernel = TheKernel[0];
145+
146+
auto Res = Kernel.get_info<
147+
sycl::info::kernel_device_specific::compile_sub_group_size>(Device);
148+
149+
#ifdef BUILD_FOR_GPU
150+
// GPU targets only test this one size, override the value, so the check
151+
// passes and the code path don't diverge.
152+
if (ReqdSize != 32)
153+
ReqdSize = 0;
154+
#endif
155+
156+
exit_if_not_equal<size_t>(Res, ReqdSize, "compile_sub_group_size");
157+
}
158+
} catch (exception e) {
159+
std::cout << "SYCL exception caught: " << e.what();
160+
return 1;
161+
}
162+
163+
std::cout << "Test passed.\n";
164+
return 0;
165+
}

unified-runtime/source/adapters/cuda/kernel.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -339,10 +339,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
339339
return ReturnValue(0);
340340
}
341341
case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: {
342-
// Return value of 0 => unspecified or "auto" sub-group size
343-
// Correct for now, since warp size may be read from special register
344-
// TODO: Return warp size once default is primary sub-group size
345-
// TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
342+
const auto &KernelReqdSubGroupSizeMap =
343+
hKernel->getProgram()->KernelReqdSubGroupSizeMD;
344+
// If present, return the value of intel_reqd_sub_group_size metadata, if
345+
// not: 0, which stands for unspecified or auto sub-group size.
346+
if (auto KernelReqdSubGroupSize =
347+
KernelReqdSubGroupSizeMap.find(hKernel->getName());
348+
KernelReqdSubGroupSize != KernelReqdSubGroupSizeMap.end())
349+
return ReturnValue(KernelReqdSubGroupSize->second);
350+
346351
return ReturnValue(0);
347352
}
348353
default:

0 commit comments

Comments
 (0)