@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
530
530
}
531
531
532
532
std::vector<CUlaunchAttribute> launch_attribute (numPropsInLaunchPropList);
533
+
534
+ // Early exit for zero size kernel
535
+ if (*pGlobalWorkSize == 0 ) {
536
+ return urEnqueueEventsWaitWithBarrier (hQueue, numEventsInWaitList,
537
+ phEventWaitList, phEvent);
538
+ }
539
+
540
+ // Set the number of threads per block to the number of threads per warp
541
+ // by default unless user has provided a better number
542
+ size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
543
+ size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
544
+
545
+ uint32_t LocalSize = hKernel->getLocalSize ();
546
+ CUfunction CuFunc = hKernel->get ();
547
+
533
548
for (uint32_t i = 0 ; i < numPropsInLaunchPropList; i++) {
534
549
switch (launchPropList[i].id ) {
535
550
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
540
555
541
556
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
542
557
// Note that cuda orders from right to left wrt SYCL dimensional order.
543
- launch_attribute[i].value .clusterDim .x =
544
- launchPropList[i].value .clusterDim [2 ];
545
- launch_attribute[i].value .clusterDim .y =
546
- launchPropList[i].value .clusterDim [1 ];
547
- launch_attribute[i].value .clusterDim .z =
548
- launchPropList[i].value .clusterDim [0 ];
558
+ if (workDim == 3 ) {
559
+ launch_attribute[i].value .clusterDim .x =
560
+ launchPropList[i].value .clusterDim [2 ];
561
+ launch_attribute[i].value .clusterDim .y =
562
+ launchPropList[i].value .clusterDim [1 ];
563
+ launch_attribute[i].value .clusterDim .z =
564
+ launchPropList[i].value .clusterDim [0 ];
565
+ } else if (workDim == 2 ) {
566
+ launch_attribute[i].value .clusterDim .x =
567
+ launchPropList[i].value .clusterDim [1 ];
568
+ launch_attribute[i].value .clusterDim .y =
569
+ launchPropList[i].value .clusterDim [0 ];
570
+ launch_attribute[i].value .clusterDim .z =
571
+ launchPropList[i].value .clusterDim [2 ];
572
+ } else {
573
+ launch_attribute[i].value .clusterDim .x =
574
+ launchPropList[i].value .clusterDim [0 ];
575
+ launch_attribute[i].value .clusterDim .y =
576
+ launchPropList[i].value .clusterDim [1 ];
577
+ launch_attribute[i].value .clusterDim .z =
578
+ launchPropList[i].value .clusterDim [2 ];
579
+ }
580
+
581
+ UR_CHECK_ERROR (cuFuncSetAttribute (
582
+ CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1 ));
583
+
549
584
break ;
550
585
}
551
586
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
560
595
}
561
596
}
562
597
563
- // Early exit for zero size kernel
564
- if (*pGlobalWorkSize == 0 ) {
565
- return urEnqueueEventsWaitWithBarrier (hQueue, numEventsInWaitList,
566
- phEventWaitList, phEvent);
567
- }
568
-
569
- // Set the number of threads per block to the number of threads per warp
570
- // by default unless user has provided a better number
571
- size_t ThreadsPerBlock[3 ] = {32u , 1u , 1u };
572
- size_t BlocksPerGrid[3 ] = {1u , 1u , 1u };
573
-
574
- uint32_t LocalSize = hKernel->getLocalSize ();
575
- CUfunction CuFunc = hKernel->get ();
576
-
577
598
// This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
578
599
// using the standard UR_CHECK_ERROR
579
600
if (ur_result_t Ret =
0 commit comments