Skip to content

Commit ff116fc

Browse files
committed
[SYCL] Improve error reporting for kernel enqueue
The idea is to handle error code returned from PI and based on input arguments, emit meaningful exception with error explanation. One of the side-effects of this is that this patch effectively allows to use non-uniform work-groups if underlying OpenCL supports this functionality. Signed-off-by: Alexey Sachkov <[email protected]>
1 parent d442364 commit ff116fc

File tree

8 files changed

+869
-73
lines changed

8 files changed

+869
-73
lines changed

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class HostKernel : public HostKernelBase {
237237
for (int I = 0; I < Dims; ++I) {
238238
if (NDRDesc.LocalSize[I] == 0 ||
239239
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
240-
throw sycl::runtime_error("Invalid local size for global size");
240+
throw sycl::nd_range_error("Invalid local size for global size");
241241
GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
242242
}
243243

@@ -278,7 +278,7 @@ class HostKernel : public HostKernelBase {
278278
for (int I = 0; I < Dims; ++I) {
279279
if (NDRDesc.LocalSize[I] == 0 ||
280280
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
281-
throw sycl::runtime_error("Invalid local size for global size");
281+
throw sycl::nd_range_error("Invalid local size for global size");
282282
NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
283283
}
284284

@@ -385,15 +385,6 @@ class CGExecKernel : public CG {
385385
MStreams(std::move(Streams)) {
386386
assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
387387
"Wrong type of exec kernel CG.");
388-
389-
if (MNDRDesc.LocalSize.size() > 0) {
390-
range<3> Excess = (MNDRDesc.GlobalSize % MNDRDesc.LocalSize);
391-
for (int I = 0; I < 3; I++) {
392-
if (Excess[I] != 0)
393-
throw nd_range_error("Global size is not a multiple of local size",
394-
CL_INVALID_WORK_GROUP_SIZE);
395-
}
396-
}
397388
}
398389

399390
std::vector<ArgDesc> getArguments() const { return MArgs; }

sycl/include/CL/sycl/detail/pi.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ typedef enum {
6262
PI_INVALID_DEVICE = CL_INVALID_DEVICE,
6363
PI_INVALID_BINARY = CL_INVALID_BINARY,
6464
PI_MISALIGNED_SUB_BUFFER_OFFSET = CL_MISALIGNED_SUB_BUFFER_OFFSET,
65-
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY
65+
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY,
66+
PI_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE
6667
} _pi_result;
6768

6869
typedef enum {
@@ -88,7 +89,9 @@ typedef enum {
8889
PI_DEVICE_INFO_PARENT = CL_DEVICE_PARENT_DEVICE,
8990
PI_DEVICE_INFO_PLATFORM = CL_DEVICE_PLATFORM,
9091
PI_DEVICE_INFO_PARTITION_TYPE = CL_DEVICE_PARTITION_TYPE,
91-
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME
92+
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME,
93+
PI_DEVICE_VERSION = CL_DEVICE_VERSION,
94+
PI_DEVICE_MAX_WORK_GROUP_SIZE = CL_DEVICE_MAX_WORK_GROUP_SIZE
9295
} _pi_device_info;
9396

9497
// TODO: populate

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ set(SYCL_SOURCES
4646
"detail/context_impl.cpp"
4747
"detail/device_impl.cpp"
4848
"detail/device_info.cpp"
49+
"detail/error_handling/enqueue_kernel.cpp"
4950
"detail/event_impl.cpp"
5051
"detail/force_device.cpp"
5152
"detail/helpers.cpp"
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
//===------------------- enqueue_kernel.cpp ---------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// SYCL error handling of enqueue kernel operations
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "error_handling.hpp"
14+
15+
#include <CL/sycl/detail/pi.hpp>
16+
17+
namespace cl {
18+
namespace sycl {
19+
namespace detail {
20+
21+
namespace enqueue_kernel_launch {
22+
23+
bool handleInvalidWorkGroupSize(pi_device Device, pi_kernel Kernel,
24+
const NDRDescT &NDRDesc) {
25+
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
26+
27+
size_t VerSize = 0;
28+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, 0, nullptr, &VerSize);
29+
assert(VerSize >= 10 &&
30+
"Unexpected device version string"); // strlen("OpenCL X.Y")
31+
string_class VerStr(VerSize, '\0');
32+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, VerSize, &VerStr.front(),
33+
nullptr);
34+
const char *Ver = &VerStr[7]; // strlen("OpenCL ")
35+
36+
size_t CompileWGSize[3] = {0};
37+
PI_CALL(piKernelGetGroupInfo)(Kernel, Device,
38+
CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
39+
sizeof(size_t) * 3, CompileWGSize, nullptr);
40+
41+
if (CompileWGSize[0] != 0) {
42+
// OpenCL 1.x && 2.0:
43+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the
44+
// reqd_work_group_size attribute is used to declare the work-group size
45+
// for kernel in the program source.
46+
if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0')))
47+
throw sycl::nd_range_error(
48+
"OpenCL 1.x and 2.0 requires to pass local size argument even if "
49+
"required work-group size was specified in the program source",
50+
PI_INVALID_WORK_GROUP_SIZE);
51+
52+
// Any OpenCL version:
53+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not
54+
// match the required work-group size for kernel in the program source.
55+
if (NDRDesc.LocalSize[0] != CompileWGSize[0] ||
56+
NDRDesc.LocalSize[1] != CompileWGSize[1] ||
57+
NDRDesc.LocalSize[2] != CompileWGSize[2])
58+
throw sycl::nd_range_error(
59+
"Specified local size doesn't match the required work-group size "
60+
"specified in the program source",
61+
PI_INVALID_WORK_GROUP_SIZE);
62+
}
63+
64+
if (Ver[0] == '1') {
65+
// OpenCL 1.x:
66+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
67+
// total number of work-items in the work-group computed as
68+
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
69+
// than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in
70+
// table 4.3
71+
size_t MaxWGSize = 0;
72+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_MAX_WORK_GROUP_SIZE,
73+
sizeof(size_t), &MaxWGSize, nullptr);
74+
const size_t TotalNumberOfWIs =
75+
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
76+
if (TotalNumberOfWIs > MaxWGSize)
77+
throw sycl::nd_range_error(
78+
"Total number of work-items in a work-group cannot exceed "
79+
"info::device::max_work_group_size which is equal to " +
80+
std::to_string(MaxWGSize),
81+
PI_INVALID_WORK_GROUP_SIZE);
82+
} else {
83+
// OpenCL 2.x:
84+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
85+
// total number of work-items in the work-group computed as
86+
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
87+
// than the value specified by CL_KERNEL_WORK_GROUP_SIZE in table 5.21.
88+
size_t KernelWGSize = 0;
89+
PI_CALL(piKernelGetGroupInfo)(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE,
90+
sizeof(size_t), &KernelWGSize, nullptr);
91+
const size_t TotalNumberOfWIs =
92+
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
93+
if (TotalNumberOfWIs > KernelWGSize)
94+
throw sycl::nd_range_error(
95+
"Total number of work-items in a work-group cannot exceed "
96+
"info::kernel_work_group::work_group_size which is equal to " +
97+
std::to_string(KernelWGSize) + " for this kernel",
98+
PI_INVALID_WORK_GROUP_SIZE);
99+
}
100+
101+
if (HasLocalSize) {
102+
const bool NonUniformWGs =
103+
(NDRDesc.LocalSize[0] != 0 &&
104+
NDRDesc.GlobalSize[0] % NDRDesc.LocalSize[0] != 0) ||
105+
(NDRDesc.LocalSize[1] != 0 &&
106+
NDRDesc.GlobalSize[1] % NDRDesc.LocalSize[1] != 0) ||
107+
(NDRDesc.LocalSize[2] != 0 &&
108+
NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0);
109+
110+
if (Ver[0] == '1') {
111+
// OpenCL 1.x:
112+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and
113+
// number of workitems specified by global_work_size is not evenly
114+
// divisible by size of work-group given by local_work_size
115+
116+
if (NonUniformWGs)
117+
throw sycl::nd_range_error(
118+
"Non-uniform work-groups are not supported by the target device",
119+
PI_INVALID_WORK_GROUP_SIZE);
120+
} else {
121+
// OpenCL 2.x:
122+
// CL_INVALID_WORK_GROUP_SIZE if the program was compiled with
123+
// –cl-uniform-work-group-size and the number of work-items specified
124+
// by global_work_size is not evenly divisible by size of work-group
125+
// given by local_work_size
126+
127+
pi_program Program = nullptr;
128+
PI_CALL(piKernelGetInfo)(Kernel, CL_KERNEL_PROGRAM, sizeof(pi_program),
129+
&Program, nullptr);
130+
size_t OptsSize = 0;
131+
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
132+
0, nullptr, &OptsSize);
133+
string_class Opts(OptsSize, '\0');
134+
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
135+
OptsSize, &Opts.front(), nullptr);
136+
if (NonUniformWGs) {
137+
const bool HasStd20 = Opts.find("-cl-std=CL2.0") != string_class::npos;
138+
if (!HasStd20)
139+
throw sycl::nd_range_error(
140+
"Non-uniform work-groups are not allowed by default. Underlying "
141+
"OpenCL 2.x implementation supports this feature and to enable "
142+
"it, build device program with -cl-std=CL2.0",
143+
PI_INVALID_WORK_GROUP_SIZE);
144+
else
145+
throw sycl::nd_range_error(
146+
"Non-uniform work-groups are not allowed by default. Underlying "
147+
"OpenCL 2.x implementation supports this feature, but it is "
148+
"disabled by -cl-uniform-work-group-size build flag",
149+
PI_INVALID_WORK_GROUP_SIZE);
150+
}
151+
}
152+
}
153+
154+
// TODO: required number of sub-groups, OpenCL 2.1:
155+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not
156+
// consistent with the required number of sub-groups for kernel in the
157+
// program source.
158+
159+
// Fallback
160+
constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE;
161+
throw runtime_error(
162+
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
163+
}
164+
165+
bool handleError(pi_result Error, pi_device Device, pi_kernel Kernel,
166+
const NDRDescT &NDRDesc) {
167+
assert(Error != PI_SUCCESS &&
168+
"Success is expected to be handled on caller side");
169+
switch (Error) {
170+
case PI_INVALID_WORK_GROUP_SIZE:
171+
return handleInvalidWorkGroupSize(Device, Kernel, NDRDesc);
172+
// TODO: Handle other error codes
173+
default:
174+
throw runtime_error(
175+
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
176+
}
177+
}
178+
179+
} // namespace enqueue_kernel_launch
180+
181+
} // namespace detail
182+
} // namespace sycl
183+
} // namespace cl
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//===-------- error_handling.hpp - SYCL error handling ---------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <CL/sycl/detail/pi.h>
12+
#include <CL/sycl/detail/cg.hpp>
13+
14+
namespace cl {
15+
namespace sycl {
16+
namespace detail {
17+
18+
namespace enqueue_kernel_launch {
19+
/// Analyzes error code and arguments of piEnqueueKernelLaunch to emit
20+
/// user-friendly exception describing the problem.
21+
///
22+
/// This function is expected to be called only for non-success error codes,
23+
/// i.e. the first argument must not be equal to PI_SUCCESS.
24+
///
25+
/// This function actually never returns and always throws an exception with
26+
/// error description.
27+
bool handleError(pi_result, pi_device, pi_kernel, const NDRDescT &);
28+
} // namespace enqueue_kernel_launch
29+
30+
} // namespace detail
31+
} // namespace sycl
32+
} // namespace cl

sycl/source/detail/scheduler/commands.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <detail/error_handling/error_handling.hpp>
10+
911
#include "CL/sycl/access/access.hpp"
1012
#include <CL/cl.h>
1113
#include <CL/sycl/detail/clusm.hpp>
@@ -940,11 +942,19 @@ cl_int ExecCGCommand::enqueueImp() {
940942

941943
ReverseRangeDimensionsForKernel(NDRDesc);
942944

943-
PI_CALL(piEnqueueKernelLaunch)(
945+
pi_result Error = PI_CALL_NOCHECK(piEnqueueKernelLaunch)(
944946
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
945947
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
946948
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
947949

950+
if (PI_SUCCESS != Error) {
951+
// If we have got non-success error code, let's analyze it to emit nice
952+
// exception explaining what was wrong
953+
pi_device Device =
954+
detail::getSyclObjImpl(MQueue->get_device())->getHandleRef();
955+
return detail::enqueue_kernel_launch::handleError(Error, Device, Kernel,
956+
NDRDesc);
957+
}
948958
return PI_SUCCESS;
949959
}
950960
case CG::CGTYPE::COPY_USM: {

0 commit comments

Comments
 (0)