Skip to content

Commit 25c3666

Browse files
authored
[SYCL] Improve handling of large reqd_work_group_size values (#10620)
When a kernel with a large (unsupported) work-group size is submitted, the backend would fail upon trying build the program, and throw the wrong exception (compile_program_error with `errc::build`, but it should just be a `sycl::exception` with `errc::kernel_not_supported` according to [5.8.1. Kernel attributes](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:kernel.attributes)). https://github.com/intel/llvm/blob/b65969014f001f9730349a5caad5c2b85c9bc378/sycl/source/detail/program_manager/program_manager.cpp#L749 This PR adds a check before building to validate the `reqd_work_group_size`. Additionally, another bug in `sycl-post-link` was fixed. The bug occurred when a kernel was decorated with a multi-dimensional required work-group size and two dimensions had the same value. Due to how `sycl-post-link` worked, the attached metadata on the device image would include fewer dimensions than specified. (e.g. If the decorated with `reqd_work_group_size(16, 16)`, then the device image would appear as if decorated by `reqd_work_group_size(16)`.)
1 parent 3506325 commit 25c3666

File tree

6 files changed

+200
-82
lines changed

6 files changed

+200
-82
lines changed

llvm/include/llvm/Support/PropertySetIO.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ class PropertyValue {
8181

8282
PropertyValue(uint32_t Val) : Ty(UINT32), Val({Val}) {}
8383
PropertyValue(const byte *Data, SizeTy DataBitSize);
84-
template <typename T>
85-
PropertyValue(const std::vector<T> &Data)
84+
template <typename C, typename T = typename C::value_type>
85+
PropertyValue(const C &Data)
8686
: PropertyValue(reinterpret_cast<const byte *>(Data.data()),
8787
Data.size() * sizeof(T) * /* bits in one byte */ 8) {}
8888
PropertyValue(const llvm::StringRef &Str)

llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,19 @@
2525
; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP-AUTO-SPLIT-0
2626
; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1
2727

28+
; TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
29+
; stores its values as uint32_t, but this needed to be expanded to
30+
; uint64_t. However, this change did not happen in ABI-breaking
31+
; window, so we attach the required work-group size as the
32+
; reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
33+
; window, this can be changed back to reqd_work_group_size.
2834
; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements]
2935
; CHECK-PROP-AUTO-SPLIT-0-NEXT: aspects=2|AAAAAAAAAAA
30-
; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size=2|gAAAAAAAAAAQAAAA
36+
; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAQAAAAAAAAAA
3137

3238
; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements]
3339
; CHECK-PROP-AUTO-SPLIT-1-NEXT: aspects=2|AAAAAAAAAAA
34-
; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size=2|gAAAAAAAAAAIAAAA
40+
; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAIAAAAAAAAAA
3541

3642
; ModuleID = '/tmp/source-5f7d0d.bc'
3743
source_filename = "llvm-link"

llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,7 @@ void llvm::getSYCLDeviceRequirements(
4040
// Scan the module and if the metadata is present fill the corresponing
4141
// property with metadata's aspects
4242
constexpr std::pair<const char *, const char *> ReqdMDs[] = {
43-
{"sycl_used_aspects", "aspects"},
44-
{"sycl_fixed_targets", "fixed_target"},
45-
{"reqd_work_group_size", "reqd_work_group_size"}};
43+
{"sycl_used_aspects", "aspects"}, {"sycl_fixed_targets", "fixed_target"}};
4644

4745
for (const auto &[MDName, MappedName] : ReqdMDs) {
4846
std::set<uint32_t> Values;
@@ -69,6 +67,27 @@ void llvm::getSYCLDeviceRequirements(
6967
std::vector<uint32_t>(Values.begin(), Values.end());
7068
}
7169

70+
std::optional<llvm::SmallVector<uint64_t, 3>> ReqdWorkGroupSize;
71+
for (const Function &F : MD.getModule()) {
72+
if (const MDNode *MDN = F.getMetadata("reqd_work_group_size")) {
73+
llvm::SmallVector<size_t, 3> NewReqdWorkGroupSize;
74+
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
75+
NewReqdWorkGroupSize.push_back(
76+
ExtractUnsignedIntegerFromMDNodeOperand(MDN, I));
77+
if (!ReqdWorkGroupSize)
78+
ReqdWorkGroupSize = NewReqdWorkGroupSize;
79+
}
80+
}
81+
82+
// TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
83+
// stores its values as uint32_t, but this needed to be expanded to
84+
// uint64_t. However, this change did not happen in ABI-breaking
85+
// window, so we attach the required work-group size as the
86+
// reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
87+
// window, this can be changed back to reqd_work_group_size.
88+
if (ReqdWorkGroupSize)
89+
Requirements["reqd_work_group_size_uint64_t"] = *ReqdWorkGroupSize;
90+
7291
// There should only be at most one function with
7392
// intel_reqd_sub_group_size metadata when considering the entry
7493
// points of a module, but not necessarily when considering all the

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 103 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -630,61 +630,8 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram(
630630
getDeviceImage(KSId, Context, Device, JITCompilationIsRequired);
631631

632632
// Check that device supports all aspects used by the kernel
633-
const RTDeviceBinaryImage::PropertyRange &ARange =
634-
Img.getDeviceRequirements();
635-
636-
#define __SYCL_ASPECT(ASPECT, ID) \
637-
case aspect::ASPECT: \
638-
return #ASPECT;
639-
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
640-
// We don't need "case aspect::usm_allocator" here because it will duplicate
641-
// "case aspect::usm_system_allocations", therefore leave this macro empty
642-
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
643-
auto getAspectNameStr = [](aspect AspectNum) -> std::string {
644-
switch (AspectNum) {
645-
#include <sycl/info/aspects.def>
646-
#include <sycl/info/aspects_deprecated.def>
647-
}
648-
throw sycl::exception(errc::kernel_not_supported,
649-
"Unknown aspect " +
650-
std::to_string(static_cast<unsigned>(AspectNum)));
651-
};
652-
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
653-
#undef __SYCL_ASPECT_DEPRECATED
654-
#undef __SYCL_ASPECT
655-
656-
for (RTDeviceBinaryImage::PropertyRange::ConstIterator It : ARange) {
657-
using namespace std::literals;
658-
if ((*It)->Name == "aspects"sv) {
659-
ByteArray Aspects = DeviceBinaryProperty(*It).asByteArray();
660-
// 8 because we need to skip 64-bits of size of the byte array
661-
Aspects.dropBytes(8);
662-
while (!Aspects.empty()) {
663-
auto Aspect = static_cast<aspect>(Aspects.consume<int>());
664-
if (!Dev->has(Aspect))
665-
throw sycl::exception(errc::kernel_not_supported,
666-
"Required aspect " + getAspectNameStr(Aspect) +
667-
" is not supported on the device");
668-
}
669-
} else if ((*It)->Name == "reqd_sub_group_size"sv) {
670-
auto ReqdSubGroupSize = DeviceBinaryProperty(*It).asUint32();
671-
auto SupportedSubGroupSizes =
672-
Device.get_info<info::device::sub_group_sizes>();
673-
674-
// !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD,
675-
// as ESIMD images have a reqd-sub-group-size of 1, but currently
676-
// no backend currently includes 1 as a valid sub-group size.
677-
// This can be removed if backends add 1 as a valid sub-group size.
678-
if (!getUint32PropAsBool(Img, "isEsimdImage") &&
679-
std::none_of(SupportedSubGroupSizes.cbegin(),
680-
SupportedSubGroupSizes.cend(),
681-
[=](auto s) { return s == ReqdSubGroupSize; }))
682-
throw sycl::exception(errc::kernel_not_supported,
683-
"Sub-group size " +
684-
std::to_string(ReqdSubGroupSize) +
685-
" is not supported on the device");
686-
}
687-
}
633+
if (auto exception = checkDevSupportDeviceRequirements(Device, Img))
634+
throw *exception;
688635

689636
auto BuildF = [this, &Img, &Context, &ContextImpl, &Device, Prg, &CompileOpts,
690637
&LinkOpts, SpecConsts] {
@@ -2456,6 +2403,44 @@ ProgramManager::getOrCreateKernel(const context &Context,
24562403

24572404
bool doesDevSupportDeviceRequirements(const device &Dev,
24582405
const RTDeviceBinaryImage &Img) {
2406+
return !checkDevSupportDeviceRequirements(Dev, Img).has_value();
2407+
}
2408+
2409+
static std::string getAspectNameStr(sycl::aspect AspectNum) {
2410+
#define __SYCL_ASPECT(ASPECT, ID) \
2411+
case aspect::ASPECT: \
2412+
return #ASPECT;
2413+
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
2414+
// We don't need "case aspect::usm_allocator" here because it will duplicate
2415+
// "case aspect::usm_system_allocations", therefore leave this macro empty
2416+
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
2417+
switch (AspectNum) {
2418+
#include <sycl/info/aspects.def>
2419+
#include <sycl/info/aspects_deprecated.def>
2420+
}
2421+
throw sycl::exception(errc::kernel_not_supported,
2422+
"Unknown aspect " +
2423+
std::to_string(static_cast<unsigned>(AspectNum)));
2424+
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
2425+
#undef __SYCL_ASPECT_DEPRECATED
2426+
#undef __SYCL_ASPECT
2427+
}
2428+
2429+
// Check if the multiplication over unsigned integers overflows
2430+
template <typename T>
2431+
static std::enable_if_t<std::is_unsigned_v<T>, std::optional<T>>
2432+
multiply_with_overflow_check(T x, T y) {
2433+
if (y == 0)
2434+
return 0;
2435+
if (x > std::numeric_limits<T>::max() / y)
2436+
return {};
2437+
else
2438+
return x * y;
2439+
}
2440+
2441+
std::optional<sycl::exception>
2442+
checkDevSupportDeviceRequirements(const device &Dev,
2443+
const RTDeviceBinaryImage &Img) {
24592444
auto getPropIt = [&Img](const std::string &PropName) {
24602445
const RTDeviceBinaryImage::PropertyRange &PropRange =
24612446
Img.getDeviceRequirements();
@@ -2471,7 +2456,8 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
24712456
};
24722457

24732458
auto AspectsPropIt = getPropIt("aspects");
2474-
auto ReqdWGSizePropIt = getPropIt("reqd_work_group_size");
2459+
auto ReqdWGSizeUint32TPropIt = getPropIt("reqd_work_group_size");
2460+
auto ReqdWGSizeUint64TPropIt = getPropIt("reqd_work_group_size_uint64_t");
24752461
auto ReqdSubGroupSizePropIt = getPropIt("reqd_sub_group_size");
24762462

24772463
// Checking if device supports defined aspects
@@ -2483,28 +2469,54 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
24832469
while (!Aspects.empty()) {
24842470
aspect Aspect = Aspects.consume<aspect>();
24852471
if (!Dev.has(Aspect))
2486-
return false;
2472+
return sycl::exception(errc::kernel_not_supported,
2473+
"Required aspect " + getAspectNameStr(Aspect) +
2474+
" is not supported on the device");
24872475
}
24882476
}
24892477

24902478
// Checking if device supports defined required work group size
2491-
if (ReqdWGSizePropIt) {
2492-
ByteArray ReqdWGSize =
2493-
DeviceBinaryProperty(*(ReqdWGSizePropIt.value())).asByteArray();
2479+
if (ReqdWGSizeUint32TPropIt || ReqdWGSizeUint64TPropIt) {
2480+
/// TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
2481+
// stores its values as uint32_t, but this needed to be expanded to
2482+
// uint64_t. However, this change did not happen in ABI-breaking
2483+
// window, so we attach the required work-group size as the
2484+
// reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
2485+
// window, we can remove the logic for the 32 bit property.
2486+
bool usingUint64_t = ReqdWGSizeUint64TPropIt.has_value();
2487+
auto it = usingUint64_t ? ReqdWGSizeUint64TPropIt : ReqdWGSizeUint32TPropIt;
2488+
2489+
ByteArray ReqdWGSize = DeviceBinaryProperty(*(it.value())).asByteArray();
24942490
// Drop 8 bytes describing the size of the byte array.
24952491
ReqdWGSize.dropBytes(8);
2496-
int ReqdWGSizeAllDimsTotal = 1;
2497-
std::vector<int> ReqdWGSizeVec;
2492+
uint64_t ReqdWGSizeAllDimsTotal = 1;
2493+
std::vector<uint64_t> ReqdWGSizeVec;
24982494
int Dims = 0;
24992495
while (!ReqdWGSize.empty()) {
2500-
int SingleDimSize = ReqdWGSize.consume<int>();
2501-
ReqdWGSizeAllDimsTotal *= SingleDimSize;
2496+
uint64_t SingleDimSize = usingUint64_t ? ReqdWGSize.consume<uint64_t>()
2497+
: ReqdWGSize.consume<uint32_t>();
2498+
if (auto res = multiply_with_overflow_check(ReqdWGSizeAllDimsTotal,
2499+
SingleDimSize))
2500+
ReqdWGSizeAllDimsTotal = *res;
2501+
else
2502+
return sycl::exception(
2503+
sycl::errc::kernel_not_supported,
2504+
"Required work-group size is not supported"
2505+
" (total number of work-items requested can't fit into size_t)");
25022506
ReqdWGSizeVec.push_back(SingleDimSize);
25032507
Dims++;
25042508
}
2505-
if (static_cast<size_t>(ReqdWGSizeAllDimsTotal) >
2506-
Dev.get_info<info::device::max_work_group_size>())
2507-
return false;
2509+
2510+
// The SingleDimSize was computed in an uint64_t; size_t does not
2511+
// necessarily have to be the same uint64_t (but should fit in an
2512+
// uint64_t).
2513+
if (ReqdWGSizeAllDimsTotal >
2514+
Dev.get_info<info::device::max_work_group_size>() ||
2515+
ReqdWGSizeAllDimsTotal > std::numeric_limits<size_t>::max())
2516+
return sycl::exception(sycl::errc::kernel_not_supported,
2517+
"Required work-group size " +
2518+
std::to_string(ReqdWGSizeAllDimsTotal) +
2519+
" is not supported on the device");
25082520
// Creating std::variant to call max_work_item_sizes one time to avoid
25092521
// performance drop
25102522
std::variant<id<1>, id<2>, id<3>> MaxWorkItemSizesVariant;
@@ -2522,17 +2534,26 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
25222534
// issues after that
25232535
if (Dims == 1) {
25242536
// ReqdWGSizeVec is in reverse order compared to MaxWorkItemSizes
2525-
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
2537+
if (ReqdWGSizeVec[i] >
25262538
std::get<id<1>>(MaxWorkItemSizesVariant)[Dims - i - 1])
2527-
return false;
2539+
return sycl::exception(sycl::errc::kernel_not_supported,
2540+
"Required work-group size " +
2541+
std::to_string(ReqdWGSizeVec[i]) +
2542+
" is not supported");
25282543
} else if (Dims == 2) {
2529-
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
2544+
if (ReqdWGSizeVec[i] >
25302545
std::get<id<2>>(MaxWorkItemSizesVariant)[Dims - i - 1])
2531-
return false;
2546+
return sycl::exception(sycl::errc::kernel_not_supported,
2547+
"Required work-group size " +
2548+
std::to_string(ReqdWGSizeVec[i]) +
2549+
" is not supported");
25322550
} else // (Dims == 3)
2533-
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
2551+
if (ReqdWGSizeVec[i] >
25342552
std::get<id<3>>(MaxWorkItemSizesVariant)[Dims - i - 1])
2535-
return false;
2553+
return sycl::exception(sycl::errc::kernel_not_supported,
2554+
"Required work-group size " +
2555+
std::to_string(ReqdWGSizeVec[i]) +
2556+
" is not supported");
25362557
}
25372558
}
25382559

@@ -2541,14 +2562,21 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
25412562
auto ReqdSubGroupSize =
25422563
DeviceBinaryProperty(*(ReqdSubGroupSizePropIt.value())).asUint32();
25432564
auto SupportedSubGroupSizes = Dev.get_info<info::device::sub_group_sizes>();
2565+
// !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD,
2566+
// as ESIMD images have a reqd-sub-group-size of 1, but currently
2567+
// no backend currently includes 1 as a valid sub-group size.
2568+
// This can be removed if backends add 1 as a valid sub-group size.
25442569
if (!getUint32PropAsBool(Img, "isEsimdImage") &&
25452570
std::none_of(SupportedSubGroupSizes.cbegin(),
25462571
SupportedSubGroupSizes.cend(),
25472572
[=](auto s) { return s == ReqdSubGroupSize; }))
2548-
return false;
2573+
return sycl::exception(sycl::errc::kernel_not_supported,
2574+
"Sub-group size " +
2575+
std::to_string(ReqdSubGroupSize) +
2576+
" is not supported on the device");
25492577
}
25502578

2551-
return true;
2579+
return {};
25522580
}
25532581

25542582
} // namespace detail

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ namespace detail {
5151

5252
bool doesDevSupportDeviceRequirements(const device &Dev,
5353
const RTDeviceBinaryImage &BinImages);
54+
std::optional<sycl::exception>
55+
checkDevSupportDeviceRequirements(const device &Dev,
56+
const RTDeviceBinaryImage &BinImages);
5457

5558
// This value must be the same as in libdevice/device_itt.h.
5659
// See sycl/doc/design/ITTAnnotations.md for more info.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// UNSUPPORTED: hip
2+
// RUN: %{build} -o %t.out -fno-sycl-id-queries-fit-in-int
3+
// RUN: %{run} %t.out
4+
5+
#include <sycl/sycl.hpp>
6+
7+
using namespace sycl;
8+
queue q;
9+
int n_fail = 0;
10+
11+
template <typename FunctorT>
12+
void throws_kernel_not_supported(const char *test_name, FunctorT f) {
13+
try {
14+
f();
15+
} catch (const sycl::exception &e) {
16+
if (e.code() != errc::kernel_not_supported) {
17+
std::cout << "fail: " << test_name << "\n"
18+
<< "Caught wrong exception with error code " << e.code() << "\n"
19+
<< e.what() << "\n";
20+
++n_fail;
21+
return;
22+
} else {
23+
std::cout << "pass: " << test_name << "\n"
24+
<< "Caught right exception:\n"
25+
<< e.what() << "\n";
26+
return;
27+
}
28+
}
29+
std::cout << "fail: " << test_name << "\n"
30+
<< "No exception thrown\n";
31+
++n_fail;
32+
return;
33+
}
34+
35+
int main(int argc, char *argv[]) {
36+
throws_kernel_not_supported("nd_range<1>", [] {
37+
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
38+
q.parallel_for<class K0>(nd_range<1>(N, N),
39+
[=](auto) [[sycl::reqd_work_group_size(N)]] {});
40+
});
41+
42+
throws_kernel_not_supported("nd_range<2>", [] {
43+
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
44+
q.parallel_for<class K1>(nd_range<2>({N, N}, {N, N}),
45+
[=](auto) [[sycl::reqd_work_group_size(N, N)]] {});
46+
});
47+
48+
throws_kernel_not_supported("nd_range<3>", [] {
49+
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
50+
q.parallel_for<class K2>(nd_range<3>({N, N, N}, {N, N, N}),
51+
[=](auto)
52+
[[sycl::reqd_work_group_size(N, N, N)]] {});
53+
});
54+
55+
throws_kernel_not_supported("uint32_max+2", [] {
56+
constexpr uint64_t N = std::numeric_limits<uint32_t>::max() + uint64_t(2);
57+
q.parallel_for<class K3>(nd_range<1>(N, N),
58+
[=](auto) [[sycl::reqd_work_group_size(N)]] {});
59+
});
60+
61+
return n_fail;
62+
}

0 commit comments

Comments
 (0)