[OpenMP] Simplify parallel reductions #70983
Conversation
A lot of the code was from a time when we had multiple parallel levels. The new runtime is much simpler, so the code can be simplified a lot, which should also speed up reductions.
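To make the surviving code path easier to follow, here is a minimal, self-contained CUDA sketch of the same two-stage scheme (warp-level shuffle reduce, then a warp-0 reduce over the per-warp partial results). It deliberately uses raw __shfl_down_sync, a hard-coded warp size of 32, and a shared-memory staging array instead of the runtime's shflFct/cpyFct callbacks and mapping:: helpers, so all names and details below are illustrative assumptions, not the DeviceRTL implementation.

// Assumes blockDim.x is a multiple of 32 and at most 1024, mirroring the
// Generic-mode guarantee mentioned in the patch.

// Stage 1: each warp reduces its own values with register shuffles.
__device__ int warpReduceSum(int Val) {
  for (int Offset = 16; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val; // lane 0 of every warp now holds that warp's partial sum
}

// Stage 2: warp leaders stage their partial sums in shared memory and
// warp 0 reduces them; only thread 0 ends up with the block-wide total.
__device__ int blockReduceSum(int Val) {
  __shared__ int WarpSums[32]; // at most 1024 threads / 32 lanes per warp
  int Lane = threadIdx.x % 32;
  int Warp = threadIdx.x / 32;
  int NumWarps = blockDim.x / 32;

  Val = warpReduceSum(Val);  // 1. reduce within a warp
  if (Lane == 0)
    WarpSums[Warp] = Val;    // 2. warp master copies its value to shared memory
  __syncthreads();

  if (Warp == 0) {
    Val = (Lane < NumWarps) ? WarpSums[Lane] : 0;
    Val = warpReduceSum(Val); // 3. warp 0 reduces the per-warp partial sums
  }
  return Val;                 // 4. the result is only valid in thread 0
}

__global__ void reduceKernel(const int *In, int *Out) {
  int Total = blockReduceSum(In[threadIdx.x]);
  if (threadIdx.x == 0)
    *Out = Total; // single writer, the analogue of "the thread that returns 1"
}

The deleted gpu_irregular_simd_reduce path only existed to handle dispersed lanes coming from nested (L2) parallel regions; with a single parallel level, the warp reduce plus the block step sketched above is always sufficient.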
@llvm/pr-subscribers-openmp

Author: Johannes Doerfert (jdoerfert)

Changes: A lot of the code was from a time when we had multiple parallel levels. The new runtime is much simpler, so the code can be simplified a lot, which should also speed up reductions.

Full diff: https://github.com/llvm/llvm-project/pull/70983.diff

2 Files Affected:
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index efa09cafa879ec1..0113fbbd4b1497c 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
-                                            uint64_t reduce_size,
-                                            void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct,
-                                            bool isSPMDExecutionMode, bool) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
-    BlockThreadId = 0;
+                                            InterWarpCopyFnTy cpyFct) {
   uint32_t NumThreads = omp_get_num_threads();
+  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
+
+  /*
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  uint32_t NumWarps =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-  // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and
   // beyond that, always a multiple of 32. For the SPMD execution mode we may
   // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
+  gpu_regular_warp_reduce(reduce_data, shflFct);
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    cpyFct(reduce_data, NumWarps);
 
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
+    if (BlockThreadId < mapping::getWarpSize())
+      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
   }
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
+  // In Generic and in SPMD mode block thread Id 0 is what we want.
+  // It's either the main thread in SPMD mode or the "acting" main thread in the
+  // parallel region.
+  return BlockThreadId == 0;
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
@@ -173,9 +99,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
-                                      shflFct, cpyFct, mapping::isSPMDMode(),
-                                      false);
+  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
diff --git a/openmp/libomptarget/test/offloading/generic_reduction.c b/openmp/libomptarget/test/offloading/generic_reduction.c
new file mode 100644
index 000000000000000..8b5ff0f067f9725
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/generic_reduction.c
@@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+__attribute__((optnone)) void optnone(void) {}
+
+int main() {
+  int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+  {
+    nt = 3 * omp_get_max_threads();
+    optnone();
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+  }
+  // CHECK: nt: [[NT:.*]]
+  // CHECK: sum: [[NT]]
+  printf("nt: %i\n", nt);
+  printf("sum: %i\n", sum);
+}
Tested on P100, sm_89, MI250, and some other architectures. Seems to work just fine (according to our tests).
#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
  {
    nt = 3 * omp_get_max_threads();
    optnone();
Is the intention here to prevent some ordering? Usually some inline assembly will accomplish that if you want nt not to be constant propagated.
It prevents SPMDization.
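For context, here is a rough sketch of the two idioms being contrasted, assuming a GCC/Clang-style toolchain; the helper name doNotOptimize is made up for illustration, and nothing below is part of the patch.

// (1) The inline-assembly suggestion: an empty asm statement that acts as an
//     optimization barrier. The "+r" constraint tells the compiler the value
//     may have been changed, so nt cannot be constant propagated across it.
static inline void doNotOptimize(int &X) { __asm__ volatile("" : "+r"(X)); }

// (2) What the test actually does: call an optnone function. The call is
//     opaque to the optimizer and, per the author, it is what keeps the
//     target region from being SPMD-ized.
__attribute__((optnone)) void optnone(void) {}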
LG
The patch seems to have regressed these 3 sollve 5.0 tests: test_loop_reduction_and_device.c, test_loop_reduction_bitand_device.c, and test_loop_reduction_multiply_device.c. A local build fails; after reverting #70983 they passed.
@shiltian has the fix for this; he was just working on a test.
Revert (breaks 3 sollve tests): [OpenMP] Simplify parallel reductions (llvm#70983). Change-Id: I6b4bd37c359f0227fd4ce63e99f46f2c3d3d60c1
This reverts commit e9a48f9 because it breaks 3 sollve 5.0 tests: test_loop_reduction_and_device.c, test_loop_reduction_bitand_device.c, test_loop_reduction_multiply_device.c.
This PR has been reverted since my patch cannot fix the three cases.