
Commit 97d23f6

micmelesse authored and jeffdaily committed
[ROCM] Navi21 Enablement 9: Range and Multinomial Kernels (pytorch#73550)
Summary: This PR is a follow-up to pytorch#69942, pytorch#72682, pytorch#72809, pytorch#73543, pytorch#73545, pytorch#73546, pytorch#73548, and pytorch#73549. We are adding support for Navi21 GPUs, which have a warp size of 32. We cannot rely on a constant, so we have to look up the warp size dynamically when launching the kernel on the host side. Inside device functions this is not needed, and the compiler can detect the correct warp size to replace the C10_WARP_SIZE constant.

Pull Request resolved: pytorch#73550
Reviewed By: malfet
Differential Revision: D35444958
Pulled By: ngimel
fbshipit-source-id: c65f06d3227c23bb097a71fc6c86e3f884114e04
(cherry picked from commit 7f3ba52)
1 parent 73da8b8 commit 97d23f6
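
The host/device split the summary describes can be sketched with the plain CUDA runtime API. The following is an illustrative example only, not code from this PR: scale_kernel, the 1000-element problem size, and the 256-thread cap are assumptions; the actual change queries at::cuda::warp_size() on the host and keeps C10_WARP_SIZE in device code.

// Illustrative sketch of the host/device warp-size split (not the PR's code).
// Host code reads the warp size from the device properties at run time, so the
// same binary sizes its launches correctly on warp-32 (Navi21) and warp-64 GPUs.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale_kernel(float* data, int n) {
  // Inside device code the compiler targets a specific architecture, so a
  // compile-time warp-size constant (C10_WARP_SIZE in PyTorch) stays valid here.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) data[idx] *= 2.0f;
}

int main() {
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, 0);
  // Host side: query the warp size instead of hard-coding 32 or 64.
  // PyTorch wraps the same query as at::cuda::warp_size().
  const int warp_size = props.warpSize;

  const int n = 1000;                                            // assumed problem size
  int threads = warp_size * ((n + warp_size - 1) / warp_size);   // round up to whole warps
  if (threads > 256) threads = 256;                              // assumed block-size cap
  const int blocks = (n + threads - 1) / threads;

  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  scale_kernel<<<blocks, threads>>>(d, n);
  cudaDeviceSynchronize();
  cudaFree(d);
  printf("warp_size=%d threads=%d blocks=%d\n", warp_size, threads, blocks);
  return 0;
}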

2 files changed: +20 −10 lines

aten/src/ATen/native/cuda/MultinomialKernel.cu

Lines changed: 6 additions & 4 deletions
@@ -74,12 +74,13 @@ void renormRows(Tensor& t) {
   const int64_t maxThreads = std::min(
       props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads);
 
+  int warp_size = at::cuda::warp_size();
   dim3 grid(rows < numSM * 4 ? rows : numSM * 4);
-  dim3 block(std::min(maxThreads, C10_WARP_SIZE * ceil_div(cols, int64_t{C10_WARP_SIZE})));
+  dim3 block(std::min(maxThreads, warp_size * ceil_div(cols, int64_t{warp_size})));
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.scalar_type(), "renormRows_cuda", [&] {
     renormRowsL1<scalar_t>
-        <<<grid, block, (block.x / C10_WARP_SIZE) * sizeof(scalar_t),
+        <<<grid, block, (block.x / warp_size) * sizeof(scalar_t),
         at::cuda::getCurrentCUDAStream()>>>(t.data_ptr<scalar_t>(),
             rows, cols);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -335,8 +336,9 @@ void multinomial_with_replacement_kernel_impl(
   int maxThreads = props->maxThreadsPerBlock;
   int maxShared = props->sharedMemPerBlock;
 
-  int requiredWarps = at::ceil_div(numCategories, C10_WARP_SIZE);
-  int requiredThreads = std::min(maxThreads, requiredWarps * C10_WARP_SIZE);
+  int warp_size = at::cuda::warp_size();
+  int requiredWarps = at::ceil_div(numCategories, warp_size);
+  int requiredThreads = std::min(maxThreads, requiredWarps * warp_size);
   int requiredShared = requiredThreads * sizeof(accscalar_t);
 
   if (n_sample == 1 && maxShared >= requiredShared) {
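
To make the launch arithmetic above concrete, here is a worked example with assumed numbers (numCategories = 1000 and float inputs are not from the PR):

  requiredWarps   = ceil_div(1000, warp_size=32)   = 32
  requiredThreads = min(maxThreads, 32 * 32)       = 1024   (assuming maxThreads >= 1024)
  requiredShared  = 1024 * sizeof(accscalar_t=4B)  = 4096 bytes

On a warp-64 GPU the same category count gives 16 warps of 64 threads, so the totals come out the same; the dynamic warp_size lookup simply keeps this arithmetic correct on both kinds of device.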

aten/src/ATen/native/cuda/RangeFactories.cu

Lines changed: 14 additions & 6 deletions
@@ -12,16 +12,24 @@
 
 namespace {
 
-constexpr int num_threads = C10_WARP_SIZE * 2;
+#if defined(USE_ROCM)
+constexpr int num_threads() {
+  return 128;
+}
+#else
+constexpr int num_threads() {
+  return C10_WARP_SIZE * 2;
+}
+#endif
 constexpr int thread_work_size = 1;
-constexpr int block_work_size = thread_work_size * num_threads;
+constexpr int block_work_size = thread_work_size * num_threads();
 
 template<typename index_t, typename func_t>
-C10_LAUNCH_BOUNDS_1(num_threads)
+C10_LAUNCH_BOUNDS_1(num_threads())
 __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename function_traits<func_t>::result_type *data) {
   #pragma unroll
   for (int i = 0; i < thread_work_size; i++) {
-    index_t idx = block_work_size * blockIdx.x + num_threads * i + threadIdx.x;
+    index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x;
     if (idx < N) {
       data[idx] = f(idx);
     }
@@ -38,10 +46,10 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) {
   auto stream = at::cuda::getCurrentCUDAStream();
   using scalar_t = typename function_traits<func_t>::result_type;
   if (N <= std::numeric_limits<int>::max()) {
-    elementwise_kernel_with_index<int><<<grid, num_threads, 0, stream>>>(N, f, output.data_ptr<scalar_t>());
+    elementwise_kernel_with_index<int><<<grid, num_threads(), 0, stream>>>(N, f, output.data_ptr<scalar_t>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   } else {
-    elementwise_kernel_with_index<int64_t><<<grid, num_threads, 0, stream>>>(N, f, output.data_ptr<scalar_t>());
+    elementwise_kernel_with_index<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.data_ptr<scalar_t>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   }
 }
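
RangeFactories.cu takes a different route from the multinomial change: num_threads feeds C10_LAUNCH_BOUNDS_1, which requires a compile-time constant, so a runtime at::cuda::warp_size() call is not an option there. The commit instead turns num_threads into a constexpr function and pins the ROCm branch to 128 threads. Below is a minimal standalone reading of the two branches; the fallback #define and the static_assert are added for illustration and are not part of the PR.

// Standalone sketch of the num_threads() split introduced above.
#ifndef C10_WARP_SIZE
#define C10_WARP_SIZE 32   // fallback so this sketch compiles on its own; PyTorch's c10 headers define the real macro
#endif

#if defined(USE_ROCM)
constexpr int num_threads() { return 128; }                // fixed block size on ROCm, independent of warp width
#else
constexpr int num_threads() { return C10_WARP_SIZE * 2; }  // two warps per block on CUDA
#endif

// Either value is a whole number of 32- or 64-lane warps.
static_assert(num_threads() % 32 == 0, "block size must be a multiple of the warp size");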
