@@ -44,45 +44,119 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
44
44
}
45
45
}
46
46
47
- static int32_t nvptx_parallel_reduce_nowait (void *reduce_data,
47
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
48
+ static uint32_t gpu_irregular_simd_reduce (void *reduce_data,
49
+ ShuffleReductFnTy shflFct) {
50
+ uint32_t size, remote_id, physical_lane_id;
51
+ physical_lane_id = mapping::getThreadIdInBlock () % mapping::getWarpSize ();
52
+ __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT ();
53
+ __kmpc_impl_lanemask_t Liveness = mapping::activemask ();
54
+ uint32_t logical_lane_id = utils::popc (Liveness & lanemask_lt) * 2 ;
55
+ __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT ();
56
+ do {
57
+ Liveness = mapping::activemask ();
58
+ remote_id = utils::ffs (Liveness & lanemask_gt);
59
+ size = utils::popc (Liveness);
60
+ logical_lane_id /= 2 ;
61
+ shflFct (reduce_data, /* LaneId =*/ logical_lane_id,
62
+ /* Offset=*/ remote_id - 1 - physical_lane_id, /* AlgoVersion=*/ 2 );
63
+ } while (logical_lane_id % 2 == 0 && size > 1 );
64
+ return (logical_lane_id == 0 );
65
+ }
66
+ #endif
67
+
68
+ static int32_t nvptx_parallel_reduce_nowait (int32_t TId, int32_t num_vars,
69
+ uint64_t reduce_size,
70
+ void *reduce_data,
48
71
ShuffleReductFnTy shflFct,
49
- InterWarpCopyFnTy cpyFct) {
72
+ InterWarpCopyFnTy cpyFct,
73
+ bool isSPMDExecutionMode, bool ) {
74
+ uint32_t BlockThreadId = mapping::getThreadIdInBlock ();
75
+ if (mapping::isMainThreadInGenericMode (/* IsSPMD */ false ))
76
+ BlockThreadId = 0 ;
50
77
uint32_t NumThreads = omp_get_num_threads ();
51
- // Handle degenerated parallel regions, including all nested ones, first.
52
78
if (NumThreads == 1 )
53
79
return 1 ;
54
-
55
- /*
56
- * 1. Reduce within a warp.
57
- * 2. Warp master copies value to warp 0 via shared memory.
58
- * 3. Warp 0 reduces to a single value.
59
- * 4. The reduced value is available in the thread that returns 1.
60
- */
61
-
62
- uint32_t BlockThreadId = mapping::getThreadIdInBlock ();
63
- uint32_t NumWarps =
80
+ /*
81
+ * This reduce function handles reduction within a team. It handles
82
+ * parallel regions in both L1 and L2 parallelism levels. It also
83
+ * supports Generic, SPMD, and NoOMP modes.
84
+ *
85
+ * 1. Reduce within a warp.
86
+ * 2. Warp master copies value to warp 0 via shared memory.
87
+ * 3. Warp 0 reduces to a single value.
88
+ * 4. The reduced value is available in the thread that returns 1.
89
+ */
90
+
91
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
92
+ uint32_t WarpsNeeded =
64
93
(NumThreads + mapping::getWarpSize () - 1 ) / mapping::getWarpSize ();
94
+ uint32_t WarpId = mapping::getWarpIdInBlock ();
65
95
96
+ // Volta execution model:
66
97
// For the Generic execution mode a parallel region either has 1 thread and
67
98
// beyond that, always a multiple of 32. For the SPMD execution mode we may
68
99
// have any number of threads.
69
- gpu_regular_warp_reduce (reduce_data, shflFct);
100
+ if ((NumThreads % mapping::getWarpSize () == 0 ) || (WarpId < WarpsNeeded - 1 ))
101
+ gpu_regular_warp_reduce (reduce_data, shflFct);
102
+ else if (NumThreads > 1 ) // Only SPMD execution mode comes thru this case.
103
+ gpu_irregular_warp_reduce (reduce_data, shflFct,
104
+ /* LaneCount=*/ NumThreads % mapping::getWarpSize (),
105
+ /* LaneId=*/ mapping::getThreadIdInBlock () %
106
+ mapping::getWarpSize ());
70
107
71
108
// When we have more than [mapping::getWarpSize()] number of threads
72
109
// a block reduction is performed here.
110
+ //
111
+ // Only L1 parallel region can enter this if condition.
73
112
if (NumThreads > mapping::getWarpSize ()) {
74
113
// Gather all the reduced values from each warp
75
114
// to the first warp.
76
- cpyFct (reduce_data, NumWarps );
115
+ cpyFct (reduce_data, WarpsNeeded );
77
116
78
- if (BlockThreadId < mapping::getWarpSize ())
79
- gpu_irregular_warp_reduce (reduce_data, shflFct, NumWarps, BlockThreadId);
117
+ if (WarpId == 0 )
118
+ gpu_irregular_warp_reduce (reduce_data, shflFct, WarpsNeeded,
119
+ BlockThreadId);
80
120
}
81
-
82
- // In Generic and in SPMD mode block thread Id 0 is what we want.
83
- // It's either the main thread in SPMD mode or the "acting" main thread in the
84
- // parallel region.
85
121
return BlockThreadId == 0 ;
122
+ #else
123
+ __kmpc_impl_lanemask_t Liveness = mapping::activemask ();
124
+ if (Liveness == lanes::All) // Full warp
125
+ gpu_regular_warp_reduce (reduce_data, shflFct);
126
+ else if (!(Liveness & (Liveness + 1 ))) // Partial warp but contiguous lanes
127
+ gpu_irregular_warp_reduce (reduce_data, shflFct,
128
+ /* LaneCount=*/ utils::popc (Liveness),
129
+ /* LaneId=*/ mapping::getThreadIdInBlock () %
130
+ mapping::getWarpSize ());
131
+ else { // Dispersed lanes. Only threads in L2
132
+ // parallel region may enter here; return
133
+ // early.
134
+ return gpu_irregular_simd_reduce (reduce_data, shflFct);
135
+ }
136
+
137
+ // When we have more than [mapping::getWarpSize()] number of threads
138
+ // a block reduction is performed here.
139
+ //
140
+ // Only L1 parallel region can enter this if condition.
141
+ if (NumThreads > mapping::getWarpSize ()) {
142
+ uint32_t WarpsNeeded =
143
+ (NumThreads + mapping::getWarpSize () - 1 ) / mapping::getWarpSize ();
144
+ // Gather all the reduced values from each warp
145
+ // to the first warp.
146
+ cpyFct (reduce_data, WarpsNeeded);
147
+
148
+ uint32_t WarpId = BlockThreadId / mapping::getWarpSize ();
149
+ if (WarpId == 0 )
150
+ gpu_irregular_warp_reduce (reduce_data, shflFct, WarpsNeeded,
151
+ BlockThreadId);
152
+
153
+ return BlockThreadId == 0 ;
154
+ }
155
+
156
+ // Get the OMP thread Id. This is different from BlockThreadId in the case of
157
+ // an L2 parallel region.
158
+ return TId == 0 ;
159
+ #endif // __CUDA_ARCH__ >= 700
86
160
}
87
161
88
162
uint32_t roundToWarpsize (uint32_t s) {
@@ -99,7 +173,9 @@ extern "C" {
99
173
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2 (
100
174
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
101
175
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
102
- return nvptx_parallel_reduce_nowait (reduce_data, shflFct, cpyFct);
176
+ return nvptx_parallel_reduce_nowait (TId, num_vars, reduce_size, reduce_data,
177
+ shflFct, cpyFct, mapping::isSPMDMode (),
178
+ false );
103
179
}
104
180
105
181
// / Mostly like _v2 but with the builtin assumption that we have less than
0 commit comments