Skip to content

Commit 62dd13d

Browse files
[SYCL][L0] Support some kernel work sizes greater than UINT32_MAX (#7321)
Workaround for the issue described in #4255 Signed-off-by: Sergey V Maslov <[email protected]>
1 parent fdfb2e6 commit 62dd13d

File tree

1 file changed

+41
-6
lines changed

1 file changed

+41
-6
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5323,9 +5323,41 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
53235323
WG[1] = pi_cast<uint32_t>(LocalWorkSize[1]);
53245324
WG[2] = pi_cast<uint32_t>(LocalWorkSize[2]);
53255325
} else {
5326-
ZE_CALL(zeKernelSuggestGroupSize,
5327-
(Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
5328-
GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
5326+
// We can't call zeKernelSuggestGroupSize if the 64-bit GlobalWorkSize
5327+
// values do not fit into the 32 bits that the API currently supports.
5328+
bool SuggestGroupSize = true;
5329+
for (int I : {0, 1, 2}) {
5330+
if (GlobalWorkSize[I] > UINT32_MAX) {
5331+
SuggestGroupSize = false;
5332+
}
5333+
}
5334+
if (SuggestGroupSize) {
5335+
ZE_CALL(zeKernelSuggestGroupSize,
5336+
(Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1],
5337+
GlobalWorkSize[2], &WG[0], &WG[1], &WG[2]));
5338+
} else {
5339+
for (int I : {0, 1, 2}) {
5340+
// Try to find an I-dimension WG size that GlobalWorkSize[I] is
5341+
// fully divisible by. Start with the max possible size in
5342+
// each dimension.
5343+
uint32_t GroupSize[] = {
5344+
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
5345+
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
5346+
Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
5347+
GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]);
5348+
while (GlobalWorkSize[I] % GroupSize[I]) {
5349+
--GroupSize[I];
5350+
}
5351+
if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) {
5352+
zePrint("piEnqueueKernelLaunch: can't find a WG size "
5353+
"suitable for global work size > UINT32_MAX\n");
5354+
return PI_ERROR_INVALID_WORK_GROUP_SIZE;
5355+
}
5356+
WG[I] = GroupSize[I];
5357+
}
5358+
zePrint("piEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n",
5359+
WG[0], WG[1], WG[2]);
5360+
}
53295361
}
53305362

53315363
// TODO: assert if sizes do not fit into 32-bit?
@@ -5357,17 +5389,20 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
53575389
}
53585390

53595391
// Error handling for non-uniform group size case
5360-
if (GlobalWorkSize[0] != (ZeThreadGroupDimensions.groupCountX * WG[0])) {
5392+
if (GlobalWorkSize[0] !=
5393+
size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
53615394
zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
53625395
"multiple of the group size in the 1st dimension\n");
53635396
return PI_ERROR_INVALID_WORK_GROUP_SIZE;
53645397
}
5365-
if (GlobalWorkSize[1] != (ZeThreadGroupDimensions.groupCountY * WG[1])) {
5398+
if (GlobalWorkSize[1] !=
5399+
size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
53665400
zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
53675401
"multiple of the group size in the 2nd dimension\n");
53685402
return PI_ERROR_INVALID_WORK_GROUP_SIZE;
53695403
}
5370-
if (GlobalWorkSize[2] != (ZeThreadGroupDimensions.groupCountZ * WG[2])) {
5404+
if (GlobalWorkSize[2] !=
5405+
size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
53715406
zePrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a "
53725407
"multiple of the group size in the 3rd dimension\n");
53735408
return PI_ERROR_INVALID_WORK_GROUP_SIZE;

0 commit comments

Comments
 (0)