@@ -937,6 +937,42 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
937
937
case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: {
  // Half precision is not supported natively here; report vector width 0.
  return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
}
case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
  // Number of sub-groups = ceil(max block size / warp size), i.e. the max
  // block size divided by the warp size, rounded up for any remainder.
  int max_threads = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&max_threads,
                           CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                           device->get()) == CUDA_SUCCESS);
  int warpSize = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                           device->get()) == CUDA_SUCCESS);
  int maxWarps = (max_threads + warpSize - 1) / warpSize;
  return getInfo(param_value_size, param_value, param_value_size_ret,
                 static_cast<uint32_t>(maxWarps));
}
case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
  // Volta (compute capability 7.x) introduced independent thread
  // scheduling, which provides sub-group independent forward progress.
  // TODO: Revisit for previous generation GPUs.
  int major = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&major,
                           CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                           device->get()) == CUDA_SUCCESS);
  // Return a 32-bit boolean value: passing a C++ `bool` would make the
  // templated getInfo copy and report only sizeof(bool) == 1 byte, while
  // callers expect a cl_bool-sized (32-bit) result like the other queries.
  uint32_t ifp = (major >= 7) ? 1u : 0u;
  return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
}
case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
  // NVIDIA devices support exactly one sub-group size: the warp size.
  int warpSize = 0;
  cl::sycl::detail::pi::assertion(
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                           device->get()) == CUDA_SUCCESS);
  size_t sizes[1] = {static_cast<size_t>(warpSize)};
  return getInfoArray<size_t>(1, param_value_size, param_value,
                              param_value_size_ret, sizes);
}
940
976
case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
941
977
int clock_freq = 0 ;
942
978
cl::sycl::detail::pi::assertion (
@@ -3012,14 +3048,53 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
3012
3048
return PI_INVALID_KERNEL;
3013
3049
}
3014
3050
3015
- // / \TODO Untie from OpenCL
3016
- // / \TODO Not implemented
3017
3051
pi_result cuda_piKernelGetSubGroupInfo (
3018
- pi_kernel kernel, pi_device device, cl_kernel_sub_group_info param_name,
3052
+ pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
3019
3053
size_t input_value_size, const void *input_value, size_t param_value_size,
3020
3054
void *param_value, size_t *param_value_size_ret) {
3021
- cl::sycl::detail::pi::die (" cuda_piKernelGetSubGroupInfo not implemented" );
3022
- return {};
3055
+ if (kernel != nullptr ) {
3056
+ switch (param_name) {
3057
+ case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
3058
+ // Sub-group size is equivalent to warp size
3059
+ int warpSize = 0 ;
3060
+ cl::sycl::detail::pi::assertion (
3061
+ cuDeviceGetAttribute (&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
3062
+ device->get ()) == CUDA_SUCCESS);
3063
+ return getInfo (param_value_size, param_value, param_value_size_ret,
3064
+ static_cast <uint32_t >(warpSize));
3065
+ }
3066
+ case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
3067
+ // Number of sub-groups = max block size / warp size + possible remainder
3068
+ int max_threads = 0 ;
3069
+ cl::sycl::detail::pi::assertion (
3070
+ cuFuncGetAttribute (&max_threads,
3071
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
3072
+ kernel->get ()) == CUDA_SUCCESS);
3073
+ int warpSize = 0 ;
3074
+ cuda_piKernelGetSubGroupInfo (kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
3075
+ 0 , nullptr , sizeof (uint32_t ), &warpSize,
3076
+ nullptr );
3077
+ int maxWarps = (max_threads + warpSize - 1 ) / warpSize;
3078
+ return getInfo (param_value_size, param_value, param_value_size_ret,
3079
+ static_cast <uint32_t >(maxWarps));
3080
+ }
3081
+ case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: {
3082
+ // Return value of 0 => not specified
3083
+ // TODO: Revisit if PTX is generated for compile-time work-group sizes
3084
+ return getInfo (param_value_size, param_value, param_value_size_ret, 0 );
3085
+ }
3086
+ case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
3087
+ // Return value of 0 => unspecified or "auto" sub-group size
3088
+ // Correct for now, since warp size may be read from special register
3089
+ // TODO: Return warp size once default is primary sub-group size
3090
+ // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
3091
+ return getInfo (param_value_size, param_value, param_value_size_ret, 0 );
3092
+ }
3093
+ default :
3094
+ __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME (param_name);
3095
+ }
3096
+ }
3097
+ return PI_INVALID_KERNEL;
3023
3098
}
3024
3099
3025
3100
pi_result cuda_piKernelRetain (pi_kernel kernel) {
0 commit comments