Skip to content

Commit a1c9e89

Browse files
AD2605 authored and callumfare committed
set attribute allowing cluster size greater than 8
- set property cudaFuncAttributeNonPortableClusterSizeAllowed only if cluster launch is used
- set has_property_cluster_launch only if cluster property is used
- fix cluster dimensions being set in accordance with grid dimensions
- fix ordering of cluster dims for workDim 2
- fix compilation errors
- review comments 1
- review comments 1
- increase cluster size upon launch to check CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED flag being added
1 parent ae136f4 commit a1c9e89

File tree

2 files changed

+42
-21
lines changed

2 files changed

+42
-21
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
530530
}
531531

532532
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
533+
534+
// Early exit for zero size kernel
535+
if (*pGlobalWorkSize == 0) {
536+
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
537+
phEventWaitList, phEvent);
538+
}
539+
540+
// Set the number of threads per block to the number of threads per warp
541+
// by default unless user has provided a better number
542+
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
543+
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
544+
545+
uint32_t LocalSize = hKernel->getLocalSize();
546+
CUfunction CuFunc = hKernel->get();
547+
533548
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
534549
switch (launchPropList[i].id) {
535550
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
540555

541556
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
542557
// Note that cuda orders from right to left wrt SYCL dimensional order.
543-
launch_attribute[i].value.clusterDim.x =
544-
launchPropList[i].value.clusterDim[2];
545-
launch_attribute[i].value.clusterDim.y =
546-
launchPropList[i].value.clusterDim[1];
547-
launch_attribute[i].value.clusterDim.z =
548-
launchPropList[i].value.clusterDim[0];
558+
if (workDim == 3) {
559+
launch_attribute[i].value.clusterDim.x =
560+
launchPropList[i].value.clusterDim[2];
561+
launch_attribute[i].value.clusterDim.y =
562+
launchPropList[i].value.clusterDim[1];
563+
launch_attribute[i].value.clusterDim.z =
564+
launchPropList[i].value.clusterDim[0];
565+
} else if (workDim == 2) {
566+
launch_attribute[i].value.clusterDim.x =
567+
launchPropList[i].value.clusterDim[1];
568+
launch_attribute[i].value.clusterDim.y =
569+
launchPropList[i].value.clusterDim[0];
570+
launch_attribute[i].value.clusterDim.z =
571+
launchPropList[i].value.clusterDim[2];
572+
} else {
573+
launch_attribute[i].value.clusterDim.x =
574+
launchPropList[i].value.clusterDim[0];
575+
launch_attribute[i].value.clusterDim.y =
576+
launchPropList[i].value.clusterDim[1];
577+
launch_attribute[i].value.clusterDim.z =
578+
launchPropList[i].value.clusterDim[2];
579+
}
580+
581+
UR_CHECK_ERROR(cuFuncSetAttribute(
582+
CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
583+
549584
break;
550585
}
551586
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
560595
}
561596
}
562597

563-
// Early exit for zero size kernel
564-
if (*pGlobalWorkSize == 0) {
565-
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
566-
phEventWaitList, phEvent);
567-
}
568-
569-
// Set the number of threads per block to the number of threads per warp
570-
// by default unless user has provided a better number
571-
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
572-
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
573-
574-
uint32_t LocalSize = hKernel->getLocalSize();
575-
CUfunction CuFunc = hKernel->get();
576-
577598
// This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
578599
// using the standard UR_CHECK_ERROR
579600
if (ur_result_t Ret =

test/conformance/exp_launch_properties/launch_properties.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
8383
if (cluster_launch_supported) {
8484
ur_exp_launch_property_t cluster_dims_prop;
8585
cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
86-
cluster_dims_prop.value.clusterDim[0] = 1;
86+
cluster_dims_prop.value.clusterDim[0] = 16;
8787
cluster_dims_prop.value.clusterDim[1] = 1;
8888
cluster_dims_prop.value.clusterDim[2] = 1;
8989

0 commit comments

Comments
 (0)