
Commit 08ceb35

jerrymannil authored and dnikolaev-amd committed
[ROCm] Improvements to non-vectorized elementwise kernels (#1875)
* Unroll loops manually to hide memory access latency
* Strided access for coalesced memory accesses

Co-authors: @akadutta @doru1004 @amd-hhashemi @carlobertolli

(cherry picked from commit 2e48656)
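The two ideas named in the message can be illustrated with a minimal standalone sketch. The kernel names add_unrolled and add_strided are hypothetical; nt and vt mirror the threads-per-block and elements-per-thread parameters used in the patch, everything else is assumed.

#include <cuda_runtime.h>

// Toy illustration only; not code from CUDALoops.cuh.
template <int nt, int vt>
__global__ void add_unrolled(int N, const float* a, const float* b, float* c) {
  int idx = nt * vt * blockIdx.x + threadIdx.x;
  float va[vt], vb[vt];
  // Manual unroll: issue all vt loads up front so the latency of one load
  // overlaps with the others instead of serializing behind each use.
#pragma unroll
  for (int i = 0; i < vt; i++) {
    int j = idx + nt * i;
    if (j < N) { va[i] = a[j]; vb[i] = b[j]; }
  }
#pragma unroll
  for (int i = 0; i < vt; i++) {
    int j = idx + nt * i;
    if (j < N) { c[j] = va[i] + vb[i]; }
  }
}

template <int nt, int vt>
__global__ void add_strided(int N, const float* a, const float* b, float* c) {
  // Grid-stride loop: consecutive threads always touch consecutive elements,
  // so each warp/wavefront issues coalesced accesses on every iteration.
  int idx = nt * vt * blockIdx.x + threadIdx.x;
  int step = nt * vt * gridDim.x;
  for (; idx < N; idx += step) {
#pragma unroll
    for (int i = 0; i < vt; i++) {
      int j = idx + nt * i;
      if (j < N) { c[j] = a[j] + b[j]; }
    }
  }
}

A launch such as add_strided<256, 4><<<8192, 256>>>(N, a, b, c); would mirror the fixed-grid strided configuration the patch adopts below.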
1 parent 6686d55 commit 08ceb35

1 file changed: aten/src/ATen/native/cuda/CUDALoops.cuh (+142 lines, -1 line)
@@ -445,6 +445,44 @@ __global__ void elementwise_kernel(int N, func_t f) {
   }
 }
 
+#ifdef USE_ROCM
+template <int nt, int vt, typename func_t>
+C10_LAUNCH_BOUNDS_2(nt, 4)
+__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
+  int tid = threadIdx.x;
+  int nv = nt * vt;
+  int idx = nv * blockIdx.x + tid;
+  if ((idx + nt * (vt - 1)) < N) {
+    f(idx, true);
+  } else {
+#pragma unroll
+    for (int i = 0; i < vt; i++) {
+      if (idx < N) {
+        f(idx, false);
+        idx += nt;
+      }
+    }
+  }
+}
+
+template <int nt, int vt, typename func_t>
+C10_LAUNCH_BOUNDS_2(nt, 4)
+__global__ void elementwise_kernel_strided(int N, func_t f) {
+  int tid = threadIdx.x;
+  int idx = nt * vt * blockIdx.x + tid;
+  int step = nt * vt * gridDim.x;
+  while (idx < N) {
+#pragma unroll
+    for (int i = 0; i < vt; i++) {
+      if ((idx + nt * i) < N) {
+        f(idx + nt * i);
+      }
+    }
+    idx += step;
+  }
+}
+#endif
+
 template <int nt, int vt, typename func_t>
 static void launch_legacy_kernel(int64_t N, const func_t& f) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
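A note on the contract of elementwise_kernel_manual_unroll above: the functor receives a second boolean argument. When it is true, the kernel has already verified that idx, idx + nt, ..., idx + nt*(vt-1) are all in range and the functor is expected to process all vt strided elements in one call; when it is false, it must handle only the single index idx. A condensed, self-contained sketch of a conforming functor follows; the launch-bounds macro is dropped, scale_inplace and op are hypothetical names, and the device lambda needs hipcc or nvcc with extended lambdas.

#include <cuda_runtime.h>

// Condensed copy of the kernel from the hunk above (launch bounds omitted).
template <int nt, int vt, typename func_t>
__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
  int idx = nt * vt * blockIdx.x + threadIdx.x;
  if ((idx + nt * (vt - 1)) < N) {
    f(idx, true);              // fast path: all vt strided elements in range
  } else {
#pragma unroll
    for (int i = 0; i < vt; i++) {
      if (idx < N) {
        f(idx, false);         // tail path: one element at a time
        idx += nt;
      }
    }
  }
}

// Hypothetical host wrapper: scale N floats in place.
void scale_inplace(float* d_x, int N, float s) {
  constexpr int nt = 128, vt = 4;
  auto op = [=] __device__ (int idx, bool full_tile) {
    if (full_tile) {
#pragma unroll
      for (int i = 0; i < vt; i++) d_x[idx + nt * i] *= s;  // vt elements per call
    } else {
      d_x[idx] *= s;                                        // single element
    }
  };
  int grid = (N + nt * vt - 1) / (nt * vt);
  elementwise_kernel_manual_unroll<nt, vt><<<grid, nt>>>(N, op);
}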
@@ -458,6 +496,37 @@ static void launch_legacy_kernel(int64_t N, const func_t& f) {
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
+#ifdef USE_ROCM
+template <int nt, int vt, typename func_t>
+static void launch_legacy_kernel_manual_unroll(int64_t N, const func_t& f) {
+  TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
+  if (N == 0) {
+    return;
+  }
+  dim3 block(nt);
+  dim3 grid((N + block.x * vt - 1) / (block.x * vt));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  elementwise_kernel_manual_unroll<nt, vt, func_t><<<grid, block, 0, stream>>>(N, f);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <int nt, int vt, typename func_t>
+static void launch_legacy_kernel_strided(int64_t N, const func_t& f) {
+  TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
+  if (N == 0) {
+    return;
+  }
+  dim3 block(nt);
+  dim3 grid(8192);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  int ub_idx = nt * vt;
+  ub_idx = ub_idx * (grid.x - 1) + (block.x - 1);
+  ub_idx = ub_idx + nt * vt;
+  elementwise_kernel_strided<nt, vt, func_t><<<grid, block, 0, stream>>>(N, f);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+#endif
+
 template <typename traits, typename func_t, typename index_t, size_t... INDEX>
 C10_HOST_DEVICE typename traits::result_type invoke_impl(
     const func_t& f,
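A quick sanity check on the two launch shapes above, with illustrative numbers that are not taken from the patch: launch_legacy_kernel_manual_unroll sizes the grid as the ceiling of N over the nt*vt tile, while launch_legacy_kernel_strided always launches 8192 blocks and relies on the grid-stride loop in elementwise_kernel_strided to cover whatever remains.

#include <cstdio>

int main() {
  // Assumed example values: nt = 128 threads/block, vt = 4 elements/thread.
  const long long N = 1'000'000, nt = 128, vt = 4;

  // launch_legacy_kernel_manual_unroll: one tile of nt*vt elements per block.
  long long tile = nt * vt;                  // 512
  long long grid = (N + tile - 1) / tile;    // ceil(1e6 / 512) = 1954 blocks
  printf("manual unroll: %lld blocks of %lld threads\n", grid, nt);

  // launch_legacy_kernel_strided: fixed 8192 blocks; one grid pass covers
  // 8192 * nt * vt = 4,194,304 indices, so N = 1e6 fits in a single pass,
  // while larger N simply takes more iterations of the while loop.
  long long per_pass = 8192 * tile;
  printf("strided: covers %lld indices per pass\n", per_pass);
  return 0;
}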
@@ -536,12 +605,84 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
     return launch_vectorized_kernel(numel, f, data);
   }
   auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
+#ifndef USE_ROCM
   constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 2 : 4;
   launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
     auto offsets = offset_calc.get(idx);
     arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
     *out = invoke(f, &data[1], &offsets[1], 1);
   });
+#else
+  constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 4 : 8;
+  constexpr int grp_sz = 128;
+  launch_legacy_kernel_manual_unroll<grp_sz, unroll_factor>(numel, [=] GPU_LAMBDA(int idx, bool unrl4x) {
+    if constexpr (unroll_factor == 4) {
+      if (unrl4x) {
+        auto offsets0 = offset_calc.get(idx);
+        auto offsets1 = offset_calc.get(idx + grp_sz);
+        auto offsets2 = offset_calc.get(idx + grp_sz * 2);
+        auto offsets3 = offset_calc.get(idx + grp_sz * 3);
+        arg0_t* out0 = (arg0_t*)(data[0] + offsets0[0]);
+        arg0_t* out1 = (arg0_t*)(data[0] + offsets1[0]);
+        arg0_t* out2 = (arg0_t*)(data[0] + offsets2[0]);
+        arg0_t* out3 = (arg0_t*)(data[0] + offsets3[0]);
+        auto tmp0 = invoke(f, &data[1], &offsets0[1], 1);
+        auto tmp1 = invoke(f, &data[1], &offsets1[1], 1);
+        auto tmp2 = invoke(f, &data[1], &offsets2[1], 1);
+        auto tmp3 = invoke(f, &data[1], &offsets3[1], 1);
+        *out0 = tmp0;
+        *out1 = tmp1;
+        *out2 = tmp2;
+        *out3 = tmp3;
+      }
+      else {
+        auto offsets = offset_calc.get(idx);
+        arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
+        *out = invoke(f, &data[1], &offsets[1], 1);
+      }
+    } else {
+      if (unrl4x) {
+        auto offsets0 = offset_calc.get(idx);
+        auto offsets1 = offset_calc.get(idx + grp_sz);
+        auto offsets2 = offset_calc.get(idx + grp_sz * 2);
+        auto offsets3 = offset_calc.get(idx + grp_sz * 3);
+        auto offsets4 = offset_calc.get(idx + grp_sz * 4);
+        auto offsets5 = offset_calc.get(idx + grp_sz * 5);
+        auto offsets6 = offset_calc.get(idx + grp_sz * 6);
+        auto offsets7 = offset_calc.get(idx + grp_sz * 7);
+        arg0_t* out0 = (arg0_t*)(data[0] + offsets0[0]);
+        arg0_t* out1 = (arg0_t*)(data[0] + offsets1[0]);
+        arg0_t* out2 = (arg0_t*)(data[0] + offsets2[0]);
+        arg0_t* out3 = (arg0_t*)(data[0] + offsets3[0]);
+        arg0_t* out4 = (arg0_t*)(data[0] + offsets4[0]);
+        arg0_t* out5 = (arg0_t*)(data[0] + offsets5[0]);
+        arg0_t* out6 = (arg0_t*)(data[0] + offsets6[0]);
+        arg0_t* out7 = (arg0_t*)(data[0] + offsets7[0]);
+        auto tmp0 = invoke(f, &data[1], &offsets0[1], 1);
+        auto tmp1 = invoke(f, &data[1], &offsets1[1], 1);
+        auto tmp2 = invoke(f, &data[1], &offsets2[1], 1);
+        auto tmp3 = invoke(f, &data[1], &offsets3[1], 1);
+        auto tmp4 = invoke(f, &data[1], &offsets4[1], 1);
+        auto tmp5 = invoke(f, &data[1], &offsets5[1], 1);
+        auto tmp6 = invoke(f, &data[1], &offsets6[1], 1);
+        auto tmp7 = invoke(f, &data[1], &offsets7[1], 1);
+        *out0 = tmp0;
+        *out1 = tmp1;
+        *out2 = tmp2;
+        *out3 = tmp3;
+        *out4 = tmp4;
+        *out5 = tmp5;
+        *out6 = tmp6;
+        *out7 = tmp7;
+      }
+      else {
+        auto offsets = offset_calc.get(idx);
+        arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
+        *out = invoke(f, &data[1], &offsets[1], 1);
+      }
+    }
+  });
+#endif
 }
 
 #ifdef USE_ROCM
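The unrolled fast paths above follow one structure: compute all unroll_factor offsets and results into registers first, and only then issue the stores, so the loads are not serialized behind earlier writes. A self-contained toy kernel showing the same shape (axpy_unrolled is a hypothetical name, not code from the patch):

#include <cuda_runtime.h>

// Toy kernel mirroring the compute-first, store-last structure of the
// ROCm branch's unrolled lambda.
template <int nt, int vt>
__global__ void axpy_unrolled(int N, float a, const float* x, float* y) {
  int idx = nt * vt * blockIdx.x + threadIdx.x;
  if (idx + nt * (vt - 1) < N) {        // fast path: full tile in range
    float tmp[vt];
#pragma unroll
    for (int i = 0; i < vt; i++)        // all loads and math first
      tmp[i] = a * x[idx + nt * i] + y[idx + nt * i];
#pragma unroll
    for (int i = 0; i < vt; i++)        // all stores last
      y[idx + nt * i] = tmp[i];
  } else {                              // tail: one element at a time
    for (int i = 0; i < vt && idx < N; i++, idx += nt)
      y[idx] = a * x[idx] + y[idx];
  }
}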
@@ -759,7 +900,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
       dtypes[i] = iter.dtype(i);
       strides[i] = inner_strides[i];
     }
-    launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
+    launch_legacy_kernel_strided<512, 4>(numel, [=]GPU_LAMBDA(int idx) {
       void* out = data[0] + strides[0] * idx;
      arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx);
      c10::cast_and_store<arg0_t>(dtypes[0], out, result);
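The one-line swap above routes the dynamic-casting fallback through the strided kernel with 512 threads per block and vt = 4. A small host-only replay of that kernel's index arithmetic (illustrative and standalone, with assumed N) shows that every index in [0, N) is visited exactly once:

#include <cassert>
#include <vector>

int main() {
  // Assumed launch shape from the diff: 512 threads/block, vt = 4, 8192 blocks.
  const int nt = 512, vt = 4, blocks = 8192;
  const int N = 3'000'000;                 // arbitrary test size
  std::vector<int> hits(N, 0);

  // Replay the index pattern of elementwise_kernel_strided on the CPU.
  for (int b = 0; b < blocks; b++) {
    for (int t = 0; t < nt; t++) {
      long long idx = (long long)nt * vt * b + t;
      long long step = (long long)nt * vt * blocks;
      while (idx < N) {
        for (int i = 0; i < vt; i++)
          if (idx + (long long)nt * i < N)
            hits[idx + nt * i]++;
        idx += step;
      }
    }
  }
  for (int i = 0; i < N; i++) assert(hits[i] == 1);  // each element exactly once
  return 0;
}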
