Skip to content

Commit d27cff2

Browse files
AlexeySachkovbader
authored andcommitted
[SYCL] Improve error reporting for kernel enqueue (#860)
The idea is to handle error code returned from PI and based on input arguments, emit meaningful exception with error explanation. One of the side-effects of this is that this patch effectively allows to use non-uniform work-groups if underlying OpenCL supports this functionality. Signed-off-by: Alexey Sachkov <[email protected]>
1 parent d442364 commit d27cff2

File tree

8 files changed

+869
-73
lines changed

8 files changed

+869
-73
lines changed

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class HostKernel : public HostKernelBase {
237237
for (int I = 0; I < Dims; ++I) {
238238
if (NDRDesc.LocalSize[I] == 0 ||
239239
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
240-
throw sycl::runtime_error("Invalid local size for global size");
240+
throw sycl::nd_range_error("Invalid local size for global size");
241241
GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
242242
}
243243

@@ -278,7 +278,7 @@ class HostKernel : public HostKernelBase {
278278
for (int I = 0; I < Dims; ++I) {
279279
if (NDRDesc.LocalSize[I] == 0 ||
280280
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
281-
throw sycl::runtime_error("Invalid local size for global size");
281+
throw sycl::nd_range_error("Invalid local size for global size");
282282
NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
283283
}
284284

@@ -385,15 +385,6 @@ class CGExecKernel : public CG {
385385
MStreams(std::move(Streams)) {
386386
assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
387387
"Wrong type of exec kernel CG.");
388-
389-
if (MNDRDesc.LocalSize.size() > 0) {
390-
range<3> Excess = (MNDRDesc.GlobalSize % MNDRDesc.LocalSize);
391-
for (int I = 0; I < 3; I++) {
392-
if (Excess[I] != 0)
393-
throw nd_range_error("Global size is not a multiple of local size",
394-
CL_INVALID_WORK_GROUP_SIZE);
395-
}
396-
}
397388
}
398389

399390
std::vector<ArgDesc> getArguments() const { return MArgs; }

sycl/include/CL/sycl/detail/pi.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ typedef enum {
6262
PI_INVALID_DEVICE = CL_INVALID_DEVICE,
6363
PI_INVALID_BINARY = CL_INVALID_BINARY,
6464
PI_MISALIGNED_SUB_BUFFER_OFFSET = CL_MISALIGNED_SUB_BUFFER_OFFSET,
65-
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY
65+
PI_OUT_OF_HOST_MEMORY = CL_OUT_OF_HOST_MEMORY,
66+
PI_INVALID_WORK_GROUP_SIZE = CL_INVALID_WORK_GROUP_SIZE
6667
} _pi_result;
6768

6869
typedef enum {
@@ -88,7 +89,9 @@ typedef enum {
8889
PI_DEVICE_INFO_PARENT = CL_DEVICE_PARENT_DEVICE,
8990
PI_DEVICE_INFO_PLATFORM = CL_DEVICE_PLATFORM,
9091
PI_DEVICE_INFO_PARTITION_TYPE = CL_DEVICE_PARTITION_TYPE,
91-
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME
92+
PI_DEVICE_INFO_NAME = CL_DEVICE_NAME,
93+
PI_DEVICE_VERSION = CL_DEVICE_VERSION,
94+
PI_DEVICE_MAX_WORK_GROUP_SIZE = CL_DEVICE_MAX_WORK_GROUP_SIZE
9295
} _pi_device_info;
9396

9497
// TODO: populate

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ set(SYCL_SOURCES
4646
"detail/context_impl.cpp"
4747
"detail/device_impl.cpp"
4848
"detail/device_info.cpp"
49+
"detail/error_handling/enqueue_kernel.cpp"
4950
"detail/event_impl.cpp"
5051
"detail/force_device.cpp"
5152
"detail/helpers.cpp"
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
//===------------------- enqueue_kernel.cpp ---------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// SYCL error handling of enqueue kernel operations
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "error_handling.hpp"
14+
15+
#include <CL/sycl/detail/pi.hpp>
16+
17+
namespace cl {
18+
namespace sycl {
19+
namespace detail {
20+
21+
namespace enqueue_kernel_launch {
22+
23+
bool handleInvalidWorkGroupSize(pi_device Device, pi_kernel Kernel,
24+
const NDRDescT &NDRDesc) {
25+
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
26+
27+
size_t VerSize = 0;
28+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, 0, nullptr, &VerSize);
29+
assert(VerSize >= 10 &&
30+
"Unexpected device version string"); // strlen("OpenCL X.Y")
31+
string_class VerStr(VerSize, '\0');
32+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_VERSION, VerSize, &VerStr.front(),
33+
nullptr);
34+
const char *Ver = &VerStr[7]; // strlen("OpenCL ")
35+
36+
size_t CompileWGSize[3] = {0};
37+
PI_CALL(piKernelGetGroupInfo)(Kernel, Device,
38+
CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
39+
sizeof(size_t) * 3, CompileWGSize, nullptr);
40+
41+
if (CompileWGSize[0] != 0) {
42+
// OpenCL 1.x && 2.0:
43+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is NULL and the
44+
// reqd_work_group_size attribute is used to declare the work-group size
45+
// for kernel in the program source.
46+
if (!HasLocalSize && (Ver[0] == '1' || (Ver[0] == '2' && Ver[2] == '0')))
47+
throw sycl::nd_range_error(
48+
"OpenCL 1.x and 2.0 requires to pass local size argument even if "
49+
"required work-group size was specified in the program source",
50+
PI_INVALID_WORK_GROUP_SIZE);
51+
52+
// Any OpenCL version:
53+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not
54+
// match the required work-group size for kernel in the program source.
55+
if (NDRDesc.LocalSize[0] != CompileWGSize[0] ||
56+
NDRDesc.LocalSize[1] != CompileWGSize[1] ||
57+
NDRDesc.LocalSize[2] != CompileWGSize[2])
58+
throw sycl::nd_range_error(
59+
"Specified local size doesn't match the required work-group size "
60+
"specified in the program source",
61+
PI_INVALID_WORK_GROUP_SIZE);
62+
}
63+
64+
if (Ver[0] == '1') {
65+
// OpenCL 1.x:
66+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
67+
// total number of work-items in the work-group computed as
68+
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
69+
// than the value specified by CL_DEVICE_MAX_WORK_GROUP_SIZE in
70+
// table 4.3
71+
size_t MaxWGSize = 0;
72+
PI_CALL(piDeviceGetInfo)(Device, PI_DEVICE_MAX_WORK_GROUP_SIZE,
73+
sizeof(size_t), &MaxWGSize, nullptr);
74+
const size_t TotalNumberOfWIs =
75+
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
76+
if (TotalNumberOfWIs > MaxWGSize)
77+
throw sycl::nd_range_error(
78+
"Total number of work-items in a work-group cannot exceed "
79+
"info::device::max_work_group_size which is equal to " +
80+
std::to_string(MaxWGSize),
81+
PI_INVALID_WORK_GROUP_SIZE);
82+
} else {
83+
// OpenCL 2.x:
84+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
85+
// total number of work-items in the work-group computed as
86+
// local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
87+
// than the value specified by CL_KERNEL_WORK_GROUP_SIZE in table 5.21.
88+
size_t KernelWGSize = 0;
89+
PI_CALL(piKernelGetGroupInfo)(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE,
90+
sizeof(size_t), &KernelWGSize, nullptr);
91+
const size_t TotalNumberOfWIs =
92+
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
93+
if (TotalNumberOfWIs > KernelWGSize)
94+
throw sycl::nd_range_error(
95+
"Total number of work-items in a work-group cannot exceed "
96+
"info::kernel_work_group::work_group_size which is equal to " +
97+
std::to_string(KernelWGSize) + " for this kernel",
98+
PI_INVALID_WORK_GROUP_SIZE);
99+
}
100+
101+
if (HasLocalSize) {
102+
const bool NonUniformWGs =
103+
(NDRDesc.LocalSize[0] != 0 &&
104+
NDRDesc.GlobalSize[0] % NDRDesc.LocalSize[0] != 0) ||
105+
(NDRDesc.LocalSize[1] != 0 &&
106+
NDRDesc.GlobalSize[1] % NDRDesc.LocalSize[1] != 0) ||
107+
(NDRDesc.LocalSize[2] != 0 &&
108+
NDRDesc.GlobalSize[2] % NDRDesc.LocalSize[2] != 0);
109+
110+
if (Ver[0] == '1') {
111+
// OpenCL 1.x:
112+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and
113+
// number of workitems specified by global_work_size is not evenly
114+
// divisible by size of work-group given by local_work_size
115+
116+
if (NonUniformWGs)
117+
throw sycl::nd_range_error(
118+
"Non-uniform work-groups are not supported by the target device",
119+
PI_INVALID_WORK_GROUP_SIZE);
120+
} else {
121+
// OpenCL 2.x:
122+
// CL_INVALID_WORK_GROUP_SIZE if the program was compiled with
123+
// –cl-uniform-work-group-size and the number of work-items specified
124+
// by global_work_size is not evenly divisible by size of work-group
125+
// given by local_work_size
126+
127+
pi_program Program = nullptr;
128+
PI_CALL(piKernelGetInfo)(Kernel, CL_KERNEL_PROGRAM, sizeof(pi_program),
129+
&Program, nullptr);
130+
size_t OptsSize = 0;
131+
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
132+
0, nullptr, &OptsSize);
133+
string_class Opts(OptsSize, '\0');
134+
PI_CALL(piProgramGetBuildInfo)(Program, Device, CL_PROGRAM_BUILD_OPTIONS,
135+
OptsSize, &Opts.front(), nullptr);
136+
if (NonUniformWGs) {
137+
const bool HasStd20 = Opts.find("-cl-std=CL2.0") != string_class::npos;
138+
if (!HasStd20)
139+
throw sycl::nd_range_error(
140+
"Non-uniform work-groups are not allowed by default. Underlying "
141+
"OpenCL 2.x implementation supports this feature and to enable "
142+
"it, build device program with -cl-std=CL2.0",
143+
PI_INVALID_WORK_GROUP_SIZE);
144+
else
145+
throw sycl::nd_range_error(
146+
"Non-uniform work-groups are not allowed by default. Underlying "
147+
"OpenCL 2.x implementation supports this feature, but it is "
148+
"disabled by -cl-uniform-work-group-size build flag",
149+
PI_INVALID_WORK_GROUP_SIZE);
150+
}
151+
}
152+
}
153+
154+
// TODO: required number of sub-groups, OpenCL 2.1:
155+
// CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not
156+
// consistent with the required number of sub-groups for kernel in the
157+
// program source.
158+
159+
// Fallback
160+
constexpr pi_result Error = PI_INVALID_WORK_GROUP_SIZE;
161+
throw runtime_error(
162+
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
163+
}
164+
165+
bool handleError(pi_result Error, pi_device Device, pi_kernel Kernel,
166+
const NDRDescT &NDRDesc) {
167+
assert(Error != PI_SUCCESS &&
168+
"Success is expected to be handled on caller side");
169+
switch (Error) {
170+
case PI_INVALID_WORK_GROUP_SIZE:
171+
return handleInvalidWorkGroupSize(Device, Kernel, NDRDesc);
172+
// TODO: Handle other error codes
173+
default:
174+
throw runtime_error(
175+
"OpenCL API failed. OpenCL API returns: " + codeToString(Error), Error);
176+
}
177+
}
178+
179+
} // namespace enqueue_kernel_launch
180+
181+
} // namespace detail
182+
} // namespace sycl
183+
} // namespace cl
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//===-------- error_handling.hpp - SYCL error handling ---------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <CL/sycl/detail/pi.h>
12+
#include <CL/sycl/detail/cg.hpp>
13+
14+
namespace cl {
15+
namespace sycl {
16+
namespace detail {
17+
18+
namespace enqueue_kernel_launch {
19+
/// Analyzes error code and arguments of piEnqueueKernelLaunch to emit
20+
/// user-friendly exception describing the problem.
21+
///
22+
/// This function is expected to be called only for non-success error codes,
23+
/// i.e. the first argument must not be equal to PI_SUCCESS.
24+
///
25+
/// This function actually never returns and always throws an exception with
26+
/// error description.
27+
bool handleError(pi_result, pi_device, pi_kernel, const NDRDescT &);
28+
} // namespace enqueue_kernel_launch
29+
30+
} // namespace detail
31+
} // namespace sycl
32+
} // namespace cl

sycl/source/detail/scheduler/commands.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <detail/error_handling/error_handling.hpp>
10+
911
#include "CL/sycl/access/access.hpp"
1012
#include <CL/cl.h>
1113
#include <CL/sycl/detail/clusm.hpp>
@@ -940,11 +942,19 @@ cl_int ExecCGCommand::enqueueImp() {
940942

941943
ReverseRangeDimensionsForKernel(NDRDesc);
942944

943-
PI_CALL(piEnqueueKernelLaunch)(
945+
pi_result Error = PI_CALL_NOCHECK(piEnqueueKernelLaunch)(
944946
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
945947
&NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
946948
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event);
947949

950+
if (PI_SUCCESS != Error) {
951+
// If we have got non-success error code, let's analyze it to emit nice
952+
// exception explaining what was wrong
953+
pi_device Device =
954+
detail::getSyclObjImpl(MQueue->get_device())->getHandleRef();
955+
return detail::enqueue_kernel_launch::handleError(Error, Device, Kernel,
956+
NDRDesc);
957+
}
948958
return PI_SUCCESS;
949959
}
950960
case CG::CGTYPE::COPY_USM: {

0 commit comments

Comments
 (0)