Skip to content

[SYCL] Improve handling of large reqd_work_group_size values #10620

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/include/llvm/Support/PropertySetIO.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ class PropertyValue {

PropertyValue(uint32_t Val) : Ty(UINT32), Val({Val}) {}
PropertyValue(const byte *Data, SizeTy DataBitSize);
template <typename T>
PropertyValue(const std::vector<T> &Data)
template <typename C, typename T = typename C::value_type>
PropertyValue(const C &Data)
: PropertyValue(reinterpret_cast<const byte *>(Data.data()),
Data.size() * sizeof(T) * /* bits in one byte */ 8) {}
PropertyValue(const llvm::StringRef &Str)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@
; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP-AUTO-SPLIT-0
; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1

; TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
; stores its values as uint32_t, but this needed to be expanded to
; uint64_t. However, this change did not happen in ABI-breaking
; window, so we attach the required work-group size as the
; reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
; window, this can be changed back to reqd_work_group_size.
; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements]
; CHECK-PROP-AUTO-SPLIT-0-NEXT: aspects=2|AAAAAAAAAAA
; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size=2|gAAAAAAAAAAQAAAA
; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAQAAAAAAAAAA

; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements]
; CHECK-PROP-AUTO-SPLIT-1-NEXT: aspects=2|AAAAAAAAAAA
; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size=2|gAAAAAAAAAAIAAAA
; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAIAAAAAAAAAA

; ModuleID = '/tmp/source-5f7d0d.bc'
source_filename = "llvm-link"
Expand Down
25 changes: 22 additions & 3 deletions llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ void llvm::getSYCLDeviceRequirements(
// Scan the module and if the metadata is present fill the corresponing
// property with metadata's aspects
constexpr std::pair<const char *, const char *> ReqdMDs[] = {
{"sycl_used_aspects", "aspects"},
{"sycl_fixed_targets", "fixed_target"},
{"reqd_work_group_size", "reqd_work_group_size"}};
{"sycl_used_aspects", "aspects"}, {"sycl_fixed_targets", "fixed_target"}};

for (const auto &[MDName, MappedName] : ReqdMDs) {
std::set<uint32_t> Values;
Expand All @@ -69,6 +67,27 @@ void llvm::getSYCLDeviceRequirements(
std::vector<uint32_t>(Values.begin(), Values.end());
}

std::optional<llvm::SmallVector<uint64_t, 3>> ReqdWorkGroupSize;
for (const Function &F : MD.getModule()) {
if (const MDNode *MDN = F.getMetadata("reqd_work_group_size")) {
llvm::SmallVector<size_t, 3> NewReqdWorkGroupSize;
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
NewReqdWorkGroupSize.push_back(
ExtractUnsignedIntegerFromMDNodeOperand(MDN, I));
if (!ReqdWorkGroupSize)
ReqdWorkGroupSize = NewReqdWorkGroupSize;
}
}

// TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
// stores its values as uint32_t, but this needed to be expanded to
// uint64_t. However, this change did not happen in ABI-breaking
// window, so we attach the required work-group size as the
// reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
// window, this can be changed back to reqd_work_group_size.
if (ReqdWorkGroupSize)
Requirements["reqd_work_group_size_uint64_t"] = *ReqdWorkGroupSize;

// There should only be at most one function with
// intel_reqd_sub_group_size metadata when considering the entry
// points of a module, but not necessarily when considering all the
Expand Down
178 changes: 103 additions & 75 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -630,61 +630,8 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram(
getDeviceImage(KSId, Context, Device, JITCompilationIsRequired);

// Check that device supports all aspects used by the kernel
const RTDeviceBinaryImage::PropertyRange &ARange =
Img.getDeviceRequirements();

#define __SYCL_ASPECT(ASPECT, ID) \
case aspect::ASPECT: \
return #ASPECT;
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
// We don't need "case aspect::usm_allocator" here because it will duplicate
// "case aspect::usm_system_allocations", therefore leave this macro empty
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
auto getAspectNameStr = [](aspect AspectNum) -> std::string {
switch (AspectNum) {
#include <sycl/info/aspects.def>
#include <sycl/info/aspects_deprecated.def>
}
throw sycl::exception(errc::kernel_not_supported,
"Unknown aspect " +
std::to_string(static_cast<unsigned>(AspectNum)));
};
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
#undef __SYCL_ASPECT_DEPRECATED
#undef __SYCL_ASPECT

for (RTDeviceBinaryImage::PropertyRange::ConstIterator It : ARange) {
using namespace std::literals;
if ((*It)->Name == "aspects"sv) {
ByteArray Aspects = DeviceBinaryProperty(*It).asByteArray();
// 8 because we need to skip 64-bits of size of the byte array
Aspects.dropBytes(8);
while (!Aspects.empty()) {
auto Aspect = static_cast<aspect>(Aspects.consume<int>());
if (!Dev->has(Aspect))
throw sycl::exception(errc::kernel_not_supported,
"Required aspect " + getAspectNameStr(Aspect) +
" is not supported on the device");
}
} else if ((*It)->Name == "reqd_sub_group_size"sv) {
auto ReqdSubGroupSize = DeviceBinaryProperty(*It).asUint32();
auto SupportedSubGroupSizes =
Device.get_info<info::device::sub_group_sizes>();

// !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD,
// as ESIMD images have a reqd-sub-group-size of 1, but currently
// no backend currently includes 1 as a valid sub-group size.
// This can be removed if backends add 1 as a valid sub-group size.
if (!getUint32PropAsBool(Img, "isEsimdImage") &&
std::none_of(SupportedSubGroupSizes.cbegin(),
SupportedSubGroupSizes.cend(),
[=](auto s) { return s == ReqdSubGroupSize; }))
throw sycl::exception(errc::kernel_not_supported,
"Sub-group size " +
std::to_string(ReqdSubGroupSize) +
" is not supported on the device");
}
}
if (auto exception = checkDevSupportDeviceRequirements(Device, Img))
throw *exception;

auto BuildF = [this, &Img, &Context, &ContextImpl, &Device, Prg, &CompileOpts,
&LinkOpts, SpecConsts] {
Expand Down Expand Up @@ -2456,6 +2403,44 @@ ProgramManager::getOrCreateKernel(const context &Context,

bool doesDevSupportDeviceRequirements(const device &Dev,
const RTDeviceBinaryImage &Img) {
return !checkDevSupportDeviceRequirements(Dev, Img).has_value();
}

static std::string getAspectNameStr(sycl::aspect AspectNum) {
#define __SYCL_ASPECT(ASPECT, ID) \
case aspect::ASPECT: \
return #ASPECT;
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
// We don't need "case aspect::usm_allocator" here because it will duplicate
// "case aspect::usm_system_allocations", therefore leave this macro empty
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
switch (AspectNum) {
#include <sycl/info/aspects.def>
#include <sycl/info/aspects_deprecated.def>
}
throw sycl::exception(errc::kernel_not_supported,
"Unknown aspect " +
std::to_string(static_cast<unsigned>(AspectNum)));
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
#undef __SYCL_ASPECT_DEPRECATED
#undef __SYCL_ASPECT
}

// Check if the multiplication over unsigned integers overflows
template <typename T>
static std::enable_if_t<std::is_unsigned_v<T>, std::optional<T>>
multiply_with_overflow_check(T x, T y) {
if (y == 0)
return 0;
if (x > std::numeric_limits<T>::max() / y)
return {};
else
return x * y;
}

std::optional<sycl::exception>
checkDevSupportDeviceRequirements(const device &Dev,
const RTDeviceBinaryImage &Img) {
auto getPropIt = [&Img](const std::string &PropName) {
const RTDeviceBinaryImage::PropertyRange &PropRange =
Img.getDeviceRequirements();
Expand All @@ -2471,7 +2456,8 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
};

auto AspectsPropIt = getPropIt("aspects");
auto ReqdWGSizePropIt = getPropIt("reqd_work_group_size");
auto ReqdWGSizeUint32TPropIt = getPropIt("reqd_work_group_size");
auto ReqdWGSizeUint64TPropIt = getPropIt("reqd_work_group_size_uint64_t");
auto ReqdSubGroupSizePropIt = getPropIt("reqd_sub_group_size");

// Checking if device supports defined aspects
Expand All @@ -2483,28 +2469,54 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
while (!Aspects.empty()) {
aspect Aspect = Aspects.consume<aspect>();
if (!Dev.has(Aspect))
return false;
return sycl::exception(errc::kernel_not_supported,
"Required aspect " + getAspectNameStr(Aspect) +
" is not supported on the device");
}
}

// Checking if device supports defined required work group size
if (ReqdWGSizePropIt) {
ByteArray ReqdWGSize =
DeviceBinaryProperty(*(ReqdWGSizePropIt.value())).asByteArray();
if (ReqdWGSizeUint32TPropIt || ReqdWGSizeUint64TPropIt) {
/// TODO: Before intel/llvm#10620, the reqd_work_group_size attribute
// stores its values as uint32_t, but this needed to be expanded to
// uint64_t. However, this change did not happen in ABI-breaking
// window, so we attach the required work-group size as the
// reqd_work_group_size_uint64_t attribute. At the next ABI-breaking
// window, we can remove the logic for the 32 bit property.
bool usingUint64_t = ReqdWGSizeUint64TPropIt.has_value();
auto it = usingUint64_t ? ReqdWGSizeUint64TPropIt : ReqdWGSizeUint32TPropIt;

ByteArray ReqdWGSize = DeviceBinaryProperty(*(it.value())).asByteArray();
// Drop 8 bytes describing the size of the byte array.
ReqdWGSize.dropBytes(8);
int ReqdWGSizeAllDimsTotal = 1;
std::vector<int> ReqdWGSizeVec;
uint64_t ReqdWGSizeAllDimsTotal = 1;
std::vector<uint64_t> ReqdWGSizeVec;
int Dims = 0;
while (!ReqdWGSize.empty()) {
int SingleDimSize = ReqdWGSize.consume<int>();
ReqdWGSizeAllDimsTotal *= SingleDimSize;
uint64_t SingleDimSize = usingUint64_t ? ReqdWGSize.consume<uint64_t>()
: ReqdWGSize.consume<uint32_t>();
if (auto res = multiply_with_overflow_check(ReqdWGSizeAllDimsTotal,
SingleDimSize))
ReqdWGSizeAllDimsTotal = *res;
else
return sycl::exception(
sycl::errc::kernel_not_supported,
"Required work-group size is not supported"
" (total number of work-items requested can't fit into size_t)");
ReqdWGSizeVec.push_back(SingleDimSize);
Dims++;
}
if (static_cast<size_t>(ReqdWGSizeAllDimsTotal) >
Dev.get_info<info::device::max_work_group_size>())
return false;

// The SingleDimSize was computed in an uint64_t; size_t does not
// necessarily have to be the same uint64_t (but should fit in an
// uint64_t).
if (ReqdWGSizeAllDimsTotal >
Dev.get_info<info::device::max_work_group_size>() ||
ReqdWGSizeAllDimsTotal > std::numeric_limits<size_t>::max())
return sycl::exception(sycl::errc::kernel_not_supported,
"Required work-group size " +
std::to_string(ReqdWGSizeAllDimsTotal) +
" is not supported on the device");
// Creating std::variant to call max_work_item_sizes one time to avoid
// performance drop
std::variant<id<1>, id<2>, id<3>> MaxWorkItemSizesVariant;
Expand All @@ -2522,17 +2534,26 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
// issues after that
if (Dims == 1) {
// ReqdWGSizeVec is in reverse order compared to MaxWorkItemSizes
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
if (ReqdWGSizeVec[i] >
std::get<id<1>>(MaxWorkItemSizesVariant)[Dims - i - 1])
return false;
return sycl::exception(sycl::errc::kernel_not_supported,
"Required work-group size " +
std::to_string(ReqdWGSizeVec[i]) +
" is not supported");
} else if (Dims == 2) {
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
if (ReqdWGSizeVec[i] >
std::get<id<2>>(MaxWorkItemSizesVariant)[Dims - i - 1])
return false;
return sycl::exception(sycl::errc::kernel_not_supported,
"Required work-group size " +
std::to_string(ReqdWGSizeVec[i]) +
" is not supported");
} else // (Dims == 3)
if (static_cast<size_t>(ReqdWGSizeVec[i]) >
if (ReqdWGSizeVec[i] >
std::get<id<3>>(MaxWorkItemSizesVariant)[Dims - i - 1])
return false;
return sycl::exception(sycl::errc::kernel_not_supported,
"Required work-group size " +
std::to_string(ReqdWGSizeVec[i]) +
" is not supported");
}
}

Expand All @@ -2541,14 +2562,21 @@ bool doesDevSupportDeviceRequirements(const device &Dev,
auto ReqdSubGroupSize =
DeviceBinaryProperty(*(ReqdSubGroupSizePropIt.value())).asUint32();
auto SupportedSubGroupSizes = Dev.get_info<info::device::sub_group_sizes>();
// !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD,
// as ESIMD images have a reqd-sub-group-size of 1, but currently
// no backend currently includes 1 as a valid sub-group size.
// This can be removed if backends add 1 as a valid sub-group size.
if (!getUint32PropAsBool(Img, "isEsimdImage") &&
std::none_of(SupportedSubGroupSizes.cbegin(),
SupportedSubGroupSizes.cend(),
[=](auto s) { return s == ReqdSubGroupSize; }))
return false;
return sycl::exception(sycl::errc::kernel_not_supported,
"Sub-group size " +
std::to_string(ReqdSubGroupSize) +
" is not supported on the device");
}

return true;
return {};
}

} // namespace detail
Expand Down
3 changes: 3 additions & 0 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ namespace detail {

bool doesDevSupportDeviceRequirements(const device &Dev,
const RTDeviceBinaryImage &BinImages);
std::optional<sycl::exception>
checkDevSupportDeviceRequirements(const device &Dev,
const RTDeviceBinaryImage &BinImages);

// This value must be the same as in libdevice/device_itt.h.
// See sycl/doc/design/ITTAnnotations.md for more info.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// UNSUPPORTED: hip
// RUN: %{build} -o %t.out -fno-sycl-id-queries-fit-in-int
// RUN: %{run} %t.out

#include <sycl/sycl.hpp>

using namespace sycl;
queue q;
int n_fail = 0;

template <typename FunctorT>
void throws_kernel_not_supported(const char *test_name, FunctorT f) {
try {
f();
} catch (const sycl::exception &e) {
if (e.code() != errc::kernel_not_supported) {
std::cout << "fail: " << test_name << "\n"
<< "Caught wrong exception with error code " << e.code() << "\n"
<< e.what() << "\n";
++n_fail;
return;
} else {
std::cout << "pass: " << test_name << "\n"
<< "Caught right exception:\n"
<< e.what() << "\n";
return;
}
}
std::cout << "fail: " << test_name << "\n"
<< "No exception thrown\n";
++n_fail;
return;
}

int main(int argc, char *argv[]) {
throws_kernel_not_supported("nd_range<1>", [] {
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
q.parallel_for<class K0>(nd_range<1>(N, N),
[=](auto) [[sycl::reqd_work_group_size(N)]] {});
});

throws_kernel_not_supported("nd_range<2>", [] {
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
q.parallel_for<class K1>(nd_range<2>({N, N}, {N, N}),
[=](auto) [[sycl::reqd_work_group_size(N, N)]] {});
});

throws_kernel_not_supported("nd_range<3>", [] {
constexpr uint32_t N = std::numeric_limits<uint32_t>::max();
q.parallel_for<class K2>(nd_range<3>({N, N, N}, {N, N, N}),
[=](auto)
[[sycl::reqd_work_group_size(N, N, N)]] {});
});

throws_kernel_not_supported("uint32_max+2", [] {
constexpr uint64_t N = std::numeric_limits<uint32_t>::max() + uint64_t(2);
q.parallel_for<class K3>(nd_range<1>(N, N),
[=](auto) [[sycl::reqd_work_group_size(N)]] {});
});

return n_fail;
}