@@ -178,109 +178,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
                                      false);
}

-/// Mostly like _v2 but with the builtin assumption that we have less than
-/// num_of_records (by default 1024) teams.
-int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
-    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
-    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the main thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team main participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  static unsigned SHARED(ChunkTeamCount);
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMain = (ThreadId == 0);
-
-  if (IsMain) {
-    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //   1. do reduction within each warp.
-  //   2. do reduction across warps.
-  //   3. write the final result to the main reduction variable
-  //      by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  if (ChunkTeamCount != NumTeams - 1)
-    return 0;
-
-  // Last team processing.
-  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
-  if (ThreadId >= NumThreads)
-    return 0;
-
-  // Ensure we see the global memory writes by other teams
-  fence::kernel(atomic::aquire);
-
-  // Load from buffer and reduce.
-  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
-    glredFct(GlobalBuffer, i, reduce_data);
-
-  // Reduce across warps to the warp main.
-  gpu_regular_warp_reduce(reduce_data, shflFct);
-
-  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
-  uint32_t WarpsNeeded =
-      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  // Gather all the reduced values from each warp
-  // to the first warp.
-  cpyFct(reduce_data, WarpsNeeded);
-
-  if (mapping::getWarpIdInBlock() == 0)
-    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
-
-  return IsMain;
-}
-
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
-  // The first check is a compile time constant, the second one a runtime check.
-  // If the first one succeeds we will use the specialized version.
-  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
-       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
-       num_of_records == 1024) ||
-      (omp_get_num_teams() <= num_of_records))
-    return __kmpc_nvptx_teams_reduce_nowait_v3(
-        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
-        lgcpyFct, lgredFct, glcpyFct, glredFct);
-
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
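For context, not part of this commit: __kmpc_nvptx_teams_reduce_nowait_v2 is the device-runtime entry point clang uses for OpenMP teams reductions when offloading to a GPU. The hypothetical host-side sketch below shows the kind of construct that exercises this path, assuming a clang build with GPU offloading (e.g. -fopenmp --offload-arch=sm_80); it is an illustration, not code from the patch.

    // Hypothetical illustration: each team reduces its own chunk of the
    // iteration space, and the device runtime then combines the per-team
    // partial results through the global buffer handled in the diff above.
    #include <cstdio>

    int main() {
      const int N = 1 << 20;
      long long Sum = 0;
    // Scalars are firstprivate on target by default, so map the result back.
    #pragma omp target teams distribute parallel for reduction(+ : Sum) map(tofrom : Sum)
      for (int i = 0; i < N; ++i)
        Sum += i;
      std::printf("Sum = %lld\n", Sum);
      return 0;
    }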