
Commit c36fa65

[SYCL][CUDA] Add sub-group info queries (#2992)
Queries return 0 for compile-time information that is currently ignored when generating PTX, and assume that warps only guarantee independent forward progress on Volta and newer (compute capability >= 7).

Signed-off-by: John Pennycook <[email protected]>
1 parent d52d72e commit c36fa65
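For reference, a minimal SYCL-side sketch of how the new device-level queries might be exercised. The descriptor spellings below assume the SYCL 2020-style names exposed by DPC++ (max_num_sub_groups, sub_group_independent_forward_progress, sub_group_sizes); they are not part of this change, and the values come from the CUDA plugin only when an NVIDIA device is selected.

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  // Pick the default device; with the CUDA plugin selected this is an NVIDIA GPU.
  cl::sycl::device dev{cl::sycl::default_selector{}};

  // Served by the PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS case added in this commit.
  auto max_sub_groups =
      dev.get_info<cl::sycl::info::device::max_num_sub_groups>();

  // Served by PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS.
  bool ifp = dev.get_info<
      cl::sycl::info::device::sub_group_independent_forward_progress>();

  // Served by PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL; a single warp-sized entry
  // is expected on CUDA devices.
  auto sizes = dev.get_info<cl::sycl::info::device::sub_group_sizes>();

  std::cout << "max sub-groups per work-group: " << max_sub_groups << "\n"
            << "independent forward progress:  " << std::boolalpha << ifp << "\n"
            << "supported sub-group sizes:     ";
  for (auto s : sizes)
    std::cout << s << " ";
  std::cout << "\n";
  return 0;
}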

File tree

1 file changed (+80, -5)

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 80 additions & 5 deletions
@@ -937,6 +937,42 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
   case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: {
     return getInfo(param_value_size, param_value, param_value_size_ret, 0u);
   }
+  case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: {
+    // Number of sub-groups = max block size / warp size + possible remainder
+    int max_threads = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&max_threads,
+                             CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             device->get()) == CUDA_SUCCESS);
+    int warpSize = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                             device->get()) == CUDA_SUCCESS);
+    int maxWarps = (max_threads + warpSize - 1) / warpSize;
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<uint32_t>(maxWarps));
+  }
+  case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
+    // Volta provides independent thread scheduling
+    // TODO: Revisit for previous generation GPUs
+    int major = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&major,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             device->get()) == CUDA_SUCCESS);
+    bool ifp = (major >= 7);
+    return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
+  }
+  case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
+    // NVIDIA devices only support one sub-group size (the warp size)
+    int warpSize = 0;
+    cl::sycl::detail::pi::assertion(
+        cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                             device->get()) == CUDA_SUCCESS);
+    size_t sizes[1] = {static_cast<size_t>(warpSize)};
+    return getInfoArray<size_t>(1, param_value_size, param_value,
+                                param_value_size_ret, sizes);
+  }
   case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
     int clock_freq = 0;
     cl::sycl::detail::pi::assertion(
@@ -3012,14 +3048,53 @@ pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device,
   return PI_INVALID_KERNEL;
 }
 
-/// \TODO Untie from OpenCL
-/// \TODO Not implemented
 pi_result cuda_piKernelGetSubGroupInfo(
-    pi_kernel kernel, pi_device device, cl_kernel_sub_group_info param_name,
+    pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name,
     size_t input_value_size, const void *input_value, size_t param_value_size,
     void *param_value, size_t *param_value_size_ret) {
-  cl::sycl::detail::pi::die("cuda_piKernelGetSubGroupInfo not implemented");
-  return {};
+  if (kernel != nullptr) {
+    switch (param_name) {
+    case PI_KERNEL_MAX_SUB_GROUP_SIZE: {
+      // Sub-group size is equivalent to warp size
+      int warpSize = 0;
+      cl::sycl::detail::pi::assertion(
+          cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                               device->get()) == CUDA_SUCCESS);
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     static_cast<uint32_t>(warpSize));
+    }
+    case PI_KERNEL_MAX_NUM_SUB_GROUPS: {
+      // Number of sub-groups = max block size / warp size + possible remainder
+      int max_threads = 0;
+      cl::sycl::detail::pi::assertion(
+          cuFuncGetAttribute(&max_threads,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             kernel->get()) == CUDA_SUCCESS);
+      int warpSize = 0;
+      cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE,
+                                   0, nullptr, sizeof(uint32_t), &warpSize,
+                                   nullptr);
+      int maxWarps = (max_threads + warpSize - 1) / warpSize;
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     static_cast<uint32_t>(maxWarps));
+    }
+    case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: {
+      // Return value of 0 => not specified
+      // TODO: Revisit if PTX is generated for compile-time work-group sizes
+      return getInfo(param_value_size, param_value, param_value_size_ret, 0);
+    }
+    case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
+      // Return value of 0 => unspecified or "auto" sub-group size
+      // Correct for now, since warp size may be read from special register
+      // TODO: Return warp size once default is primary sub-group size
+      // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
+      return getInfo(param_value_size, param_value, param_value_size_ret, 0);
+    }
+    default:
+      __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+    }
+  }
+  return PI_INVALID_KERNEL;
 }
 
 pi_result cuda_piKernelRetain(pi_kernel kernel) {
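Both MAX_NUM_SUB_GROUPS cases above use the same rounding-up division noted in the comment ("max block size / warp size + possible remainder"). A standalone sketch of that arithmetic, with a hypothetical helper name, just to make the remainder handling explicit:

#include <cassert>

// Ceiling division: one extra (partial) sub-group when max_threads is not a
// multiple of warp_size, matching (max_threads + warpSize - 1) / warpSize.
static unsigned max_num_sub_groups(unsigned max_threads, unsigned warp_size) {
  return (max_threads + warp_size - 1) / warp_size;
}

int main() {
  assert(max_num_sub_groups(1024, 32) == 32); // evenly divisible, no remainder
  assert(max_num_sub_groups(1000, 32) == 32); // 31 full warps + 1 partial warp
  assert(max_num_sub_groups(96, 32) == 3);
  return 0;
}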
