Skip to content

[SYCL] Improve error reporting for kernel enqueue #860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ class HostKernel : public HostKernelBase {
for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::runtime_error("Invalid local size for global size");
throw sycl::nd_range_error("Invalid local size for global size");
GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

Expand Down Expand Up @@ -278,7 +278,7 @@ class HostKernel : public HostKernelBase {
for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::runtime_error("Invalid local size for global size");
throw sycl::nd_range_error("Invalid local size for global size");
NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

Expand Down Expand Up @@ -385,15 +385,6 @@ class CGExecKernel : public CG {
MStreams(std::move(Streams)) {
assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
"Wrong type of exec kernel CG.");

if (MNDRDesc.LocalSize.size() > 0) {
range<3> Excess = (MNDRDesc.GlobalSize % MNDRDesc.LocalSize);
for (int I = 0; I < 3; I++) {
if (Excess[I] != 0)
throw nd_range_error("Global size is not a multiple of local size",
CL_INVALID_WORK_GROUP_SIZE);
}
}
}

std::vector<ArgDesc> getArguments() const { return MArgs; }
Expand Down
7 changes: 5 additions & 2 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ typedef enum {
PI_INVALID_DEVICE = CL_INVALID_DEVICE,
PI_INVALID_BINARY = CL_INVALID_BINARY,
PI_MISALIGNED_SUB_BUFFER_OFFSET = CL_MISALIGNED_SUB_BUFFER_OFFSET,
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY,
PI_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE
} _pi_result;

typedef enum {
Expand All @@ -88,7 +89,9 @@ typedef enum {
PI_DEVICE_INFO_PARENT = CL_DEVICE_PARENT_DEVICE,
PI_DEVICE_INFO_PLATFORM = CL_DEVICE_PLATFORM,
PI_DEVICE_INFO_PARTITION_TYPE = CL_DEVICE_PARTITION_TYPE,
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME,
PI_DEVICE_VERSION = CL_DEVICE_VERSION,
PI_DEVICE_MAX_WORK_GROUP_SIZE = CL_DEVICE_MAX_WORK_GROUP_SIZE
} _pi_device_info;

// TODO: populate
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ set(SYCL_SOURCES
"detail/context_impl.cpp"
"detail/device_impl.cpp"
"detail/device_info.cpp"
"detail/error_handling/enqueue_kernel.cpp"
"detail/event_impl.cpp"
"detail/force_device.cpp"
"detail/helpers.cpp"
Expand Down
183 changes: 183 additions & 0 deletions sycl/source/detail/error_handling/enqueue_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
//===------------------- enqueue_kernel.cpp ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// SYCL error handling of enqueue kernel operations
//
//===----------------------------------------------------------------------===//

#include "error_handling.hpp"

#include <CL/sycl/detail/pi.hpp>

namespace cl {
namespace sycl {
namespace detail {

namespace enqueue_kernel_launch {

bool handleInvalidWorkGroupSize(pi_device Device, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);

size_t VerSize = 0;
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, 0, nullptr, &VerSize);
assert(VerSize >= 10 &&
"Unexpected device version string"); // strlen("OpenCL X.Y")
string_class VerStr(VerSize, '\0');
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, VerSize, &VerStr.front(),
nullptr);
const char *Ver = &VerStr[7]; // strlen("OpenCL ")

size_t CompileWGSize[3] = {0};
PI_CALL(piKernelGetGroupInfo)(Kernel, Device,
CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
sizeof(size_t) * 3, CompileWGSize, nullptr);

if (CompileWGSize[0] != 0) {
// OpenCL 1.x && 2.0:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the
// reqd_work_group_size attribute is used to declare the work-group size
// for kernel in the program source.
if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0')))
throw sycl::nd_range_error(
"OpenCL 1.x and 2.0 requires to pass local size argument even if "
"required work-group size was specified in the program source",
PI_INVALID_WORK_GROUP_SIZE);

// Any OpenCL version:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not
// match the required work-group size for kernel in the program source.
if (NDRDesc.LocalSize[0] != CompileWGSize[0] ||
NDRDesc.LocalSize[1] != CompileWGSize[1] ||
NDRDesc.LocalSize[2] != CompileWGSize[2])
throw sycl::nd_range_error(
"Specified local size doesn't match the required work-group size "
"specified in the program source",
PI_INVALID_WORK_GROUP_SIZE);
}

if (Ver[0] == '1') {
// OpenCL 1.x:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
// total number of work-items in the work-group computed as
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
// than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in
// table 4.3
size_t MaxWGSize = 0;
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), &MaxWGSize, nullptr);
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
if (TotalNumberOfWIs > MaxWGSize)
throw sycl::nd_range_error(
"Total number of work-items in a work-group cannot exceed "
"info::device::max_work_group_size which is equal to " +
std::to_string(MaxWGSize),
PI_INVALID_WORK_GROUP_SIZE);
} else {
// OpenCL 2.x:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
// total number of work-items in the work-group computed as
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
// than the value specified by CL_KERNEL_WORK_GROUP_SIZE in table 5.21.
size_t KernelWGSize = 0;
PI_CALL(piKernelGetGroupInfo)(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &KernelWGSize, nullptr);
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
if (TotalNumberOfWIs > KernelWGSize)
throw sycl::nd_range_error(
"Total number of work-items in a work-group cannot exceed "
"info::kernel_work_group::work_group_size which is equal to " +
std::to_string(KernelWGSize) + " for this kernel",
PI_INVALID_WORK_GROUP_SIZE);
}

if (HasLocalSize) {
const bool NonUniformWGs =
(NDRDesc.LocalSize[0] != 0 &&
NDRDesc.GlobalSize[0] % NDRDesc.LocalSize[0] != 0) ||
(NDRDesc.LocalSize[1] != 0 &&
NDRDesc.GlobalSize[1] % NDRDesc.LocalSize[1] != 0) ||
(NDRDesc.LocalSize[2] != 0 &&
NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0);

if (Ver[0] == '1') {
// OpenCL 1.x:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and
// number of workitems specified by global_work_size is not evenly
// divisible by size of work-group given by local_work_size

if (NonUniformWGs)
throw sycl::nd_range_error(
"Non-uniform work-groups are not supported by the target device",
PI_INVALID_WORK_GROUP_SIZE);
} else {
// OpenCL 2.x:
// CL_INVALID_WORK_GROUP_SIZE if the program was compiled with
// –cl-uniform-work-group-size and the number of work-items specified
// by global_work_size is not evenly divisible by size of work-group
// given by local_work_size

pi_program Program = nullptr;
PI_CALL(piKernelGetInfo)(Kernel, CL_KERNEL_PROGRAM, sizeof(pi_program),
&Program, nullptr);
size_t OptsSize = 0;
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
0, nullptr, &OptsSize);
string_class Opts(OptsSize, '\0');
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
OptsSize, &Opts.front(), nullptr);
if (NonUniformWGs) {
const bool HasStd20 = Opts.find("-cl-std=CL2.0") != string_class::npos;
if (!HasStd20)
throw sycl::nd_range_error(
"Non-uniform work-groups are not allowed by default. Underlying "
"OpenCL 2.x implementation supports this feature and to enable "
"it, build device program with -cl-std=CL2.0",
PI_INVALID_WORK_GROUP_SIZE);
else
throw sycl::nd_range_error(
"Non-uniform work-groups are not allowed by default. Underlying "
"OpenCL 2.x implementation supports this feature, but it is "
"disabled by -cl-uniform-work-group-size build flag",
PI_INVALID_WORK_GROUP_SIZE);
}
}
}

// TODO: required number of sub-groups, OpenCL 2.1:
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not
// consistent with the required number of sub-groups for kernel in the
// program source.

// Fallback
constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE;
throw runtime_error(
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
}

bool handleError(pi_result Error, pi_device Device, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
assert(Error != PI_SUCCESS &&
"Success is expected to be handled on caller side");
switch (Error) {
case PI_INVALID_WORK_GROUP_SIZE:
return handleInvalidWorkGroupSize(Device, Kernel, NDRDesc);
// TODO: Handle other error codes
default:
throw runtime_error(
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
}
}

} // namespace enqueue_kernel_launch

} // namespace detail
} // namespace sycl
} // namespace cl
32 changes: 32 additions & 0 deletions sycl/source/detail/error_handling/error_handling.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//===-------- error_handling.hpp - SYCL error handling ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <CL/sycl/detail/pi.h>
#include <CL/sycl/detail/cg.hpp>

namespace cl {
namespace sycl {
namespace detail {

namespace enqueue_kernel_launch {
/// Analyzes error code and arguments of piEnqueueKernelLaunch to emit
/// user-friendly exception describing the problem.
///
/// This function is expected to be called only for non-success error codes,
/// i.e. the first argument must not be equal to PI_SUCCESS.
///
/// This function actually never returns and always throws an exception with
/// error description.
bool handleError(pi_result, pi_device, pi_kernel, const NDRDescT &);
} // namespace enqueue_kernel_launch

} // namespace detail
} // namespace sycl
} // namespace cl
12 changes: 11 additions & 1 deletion sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//

#include <detail/error_handling/error_handling.hpp>

#include "CL/sycl/access/access.hpp"
#include <CL/cl.h>
#include <CL/sycl/detail/clusm.hpp>
Expand Down Expand Up @@ -940,11 +942,19 @@ cl_int ExecCGCommand::enqueueImp() {

ReverseRangeDimensionsForKernel(NDRDesc);

PI_CALL(piEnqueueKernelLaunch)(
pi_result Error = PI_CALL_NOCHECK(piEnqueueKernelLaunch)(
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);

if (PI_SUCCESS != Error) {
// If we have got non-success error code, let's analyze it to emit nice
// exception explaining what was wrong
pi_device Device =
detail::getSyclObjImpl(MQueue->get_device())->getHandleRef();
return detail::enqueue_kernel_launch::handleError(Error, Device, Kernel,
NDRDesc);
}
return PI_SUCCESS;
}
case CG::CGTYPE::COPY_USM: {
Expand Down
Loading