Skip to content

Commit 53bcd1e

Browse files
[libomptarget][nfc] Wrap cuda min() in target_impl
Summary: [libomptarget][nfc] Wrap cuda min() in target_impl. nvptx forwards to CUDA min; amdgcn implements it directly. This is sufficient to build parallel.cu for amdgcn, so it is added to CMakeLists. All call sites are homogeneous except one that passes a uint32_t and an int32_t. This could be smoothed over by taking two type parameters and taking some care over the return type, but overall I think the inline uint32_t conversion, which calls attention to what was an implicit sign conversion, is cleaner. Reviewers: ABataev, jdoerfert. Reviewed By: jdoerfert. Subscribers: jvesely, mgorny, openmp-commits. Tags: #openmp. Differential Revision: https://reviews.llvm.org/D71580
1 parent 7a31678 commit 53bcd1e

File tree

5 files changed

+14
-5
lines changed

5 files changed

+14
-5
lines changed

openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ set(cuda_sources
5959
${devicertl_base_directory}/common/src/critical.cu
6060
${devicertl_base_directory}/common/src/loop.cu
6161
${devicertl_base_directory}/common/src/omptarget.cu
62+
${devicertl_base_directory}/common/src/parallel.cu
6263
${devicertl_base_directory}/common/src/sync.cu
6364
${devicertl_base_directory}/common/src/task.cu)
6465

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
109109

110110
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
111111

112+
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
113+
return x < y ? x : y;
114+
}
115+
112116
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
113117
return __ballot64(1);
114118
}

openmp/libomptarget/deviceRTLs/common/src/parallel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
7272

7373
// We cannot have more than the # of convergent threads.
7474
if (SimdLimitSource > 0)
75-
*NumLanes = min(ConvergentSize, SimdLimitSource);
75+
*NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
7676
else
7777
*NumLanes = ConvergentSize;
7878
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
@@ -149,7 +149,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
149149
// We cannot have more than the # of convergent threads.
150150
uint16_t NumThreads;
151151
if (NumThreadsSource > 0)
152-
NumThreads = min(ConvergentSize, NumThreadsSource);
152+
NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
153153
else
154154
NumThreads = ConvergentSize;
155155
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",

openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -480,14 +480,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
480480
// by returning 1 in the thread holding the reduction result.
481481

482482
// Check if this is the very last team.
483-
unsigned NumRecs = min(NumTeams, num_of_records);
483+
unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
484484
if (ChunkTeamCount == NumTeams - Bound - 1) {
485485
//
486486
// Last team processing.
487487
//
488488
if (ThreadId >= NumRecs)
489489
return 0;
490-
NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
490+
NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
491491
if (ThreadId >= NumThreads)
492492
return 0;
493493

@@ -502,7 +502,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
502502

503503
// When we have more than [warpsize] number of threads
504504
// a block reduction is performed here.
505-
uint32_t ActiveThreads = min(NumRecs, NumThreads);
505+
uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
506506
if (ActiveThreads > WARPSIZE) {
507507
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
508508
// Gather all the reduced values from each warp

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
104104

105105
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
106106

107+
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
108+
return min(x, y);
109+
}
110+
107111
#ifndef CUDA_VERSION
108112
#error CUDA_VERSION macro is undefined, something wrong with cuda.
109113
#endif

0 commit comments

Comments
 (0)