[OpenMP] Simplify parallel reductions (#70983)

jdoerfert · web-flow · commit e9a48f9e05c1 · 2023-11-02T15:50:05.000-07:00
A lot of the code was from a time when we had multiple parallel levels.
The new runtime is much simpler, the code can be simplified a lot which
should speed up reductions too.
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
-                                            uint64_t reduce_size,
-                                            void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct,
-                                            bool isSPMDExecutionMode, bool) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
-    BlockThreadId = 0;
+                                            InterWarpCopyFnTy cpyFct) {
   uint32_t NumThreads = omp_get_num_threads();
+  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
+
+  /*
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  uint32_t NumWarps =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
 
-  // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and
   // beyond that, always a multiple of 32. For the SPMD execution mode we may
   // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
+  gpu_regular_warp_reduce(reduce_data, shflFct);
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
     // Gather all the reduced values from each warp
     // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    cpyFct(reduce_data, NumWarps);
 
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
+    if (BlockThreadId < mapping::getWarpSize())
+      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
+  // In Generic and in SPMD mode block thread Id 0 is what we want.
+  // It's either the main thread in SPMD mode or the "acting" main thread in the
+  // parallel region.
+  return BlockThreadId == 0;
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
@@ -173,9 +99,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
-                                      shflFct, cpyFct, mapping::isSPMDMode(),
-                                      false);
+  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
 /// Mostly like _v2 but with the builtin assumption that we have less than
diff --git a/openmp/libomptarget/test/offloading/generic_reduction.c b/openmp/libomptarget/test/offloading/generic_reduction.c
@@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+__attribute__((optnone)) void optnone(void) {}
+
+int main() {
+  int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+  {
+    nt = 3 * omp_get_max_threads();
+    optnone();
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+  }
+  // CHECK: nt: [[NT:.*]]
+  // CHECK: sum: [[NT]]
+  printf("nt: %i\n", nt);
+  printf("sum: %i\n", sum);
+}