Skip to content

Commit cb493d2

Browse files
authored
[OpenMP] Replace utilities with 'gpuintrin.h' definitions (llvm#131644)
Summary: Port more instructions. AMD version is at https://gist.github.com/jhuber6/235d7ee95f747c75f9a3cfd8eedac6aa
1 parent e53bea5 commit cb493d2

File tree

2 files changed

+25
-121
lines changed

2 files changed

+25
-121
lines changed

offload/DeviceRTL/src/DeviceUtils.cpp

Lines changed: 15 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -14,117 +14,48 @@
1414
#include "Debug.h"
1515
#include "Interface.h"
1616
#include "Mapping.h"
17+
#include "gpuintrin.h"
1718

1819
using namespace ompx;
1920

20-
namespace impl {
21-
22-
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
23-
static_assert(sizeof(unsigned long) == 8, "");
24-
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
25-
*HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
26-
}
27-
28-
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
29-
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
30-
}
31-
32-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
33-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
34-
int32_t Width);
35-
36-
uint64_t ballotSync(uint64_t Mask, int32_t Pred);
37-
38-
/// AMDGCN Implementation
39-
///
40-
///{
41-
#ifdef __AMDGPU__
42-
43-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
44-
int Self = mapping::getThreadIdInWarp();
45-
int Index = SrcLane + (Self & ~(Width - 1));
46-
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
47-
}
48-
49-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
50-
int32_t Width) {
51-
int Self = mapping::getThreadIdInWarp();
52-
int Index = Self + LaneDelta;
53-
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
54-
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
55-
}
56-
57-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
58-
return Mask & __builtin_amdgcn_ballot_w64(Pred);
59-
}
60-
61-
bool isSharedMemPtr(const void *Ptr) {
62-
return __builtin_amdgcn_is_shared(
63-
(const __attribute__((address_space(0))) void *)Ptr);
64-
}
65-
#endif
66-
///}
67-
68-
/// NVPTX Implementation
69-
///
70-
///{
71-
#ifdef __NVPTX__
72-
73-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
74-
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
75-
}
76-
77-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
78-
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
79-
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
80-
}
81-
82-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
83-
return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
84-
}
85-
86-
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
87-
88-
#endif
89-
///}
90-
} // namespace impl
91-
9221
/// Combine two 32-bit halves into a single 64-bit value: \p HighBits is
/// placed in bits 63..32 and \p LowBits in bits 31..0.
/// In this diff the removed line delegated to impl::Pack; the added line
/// computes the result inline.
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
93-
return impl::Pack(LowBits, HighBits);
22+
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
9423
}
9524

9625
/// Split the 64-bit \p Val into two 32-bit halves, written through the
/// reference parameters: \p LowBits receives bits 31..0 and \p HighBits
/// receives bits 63..32.
/// In this diff the removed line delegated to impl::Unpack; the added lines
/// perform the split inline.
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
97-
impl::Unpack(Val, &LowBits, &HighBits);
// The masks below use the `UL` suffix, so the code requires `unsigned long`
// to be 8 bytes wide; the static_assert enforces that at compile time.
26+
static_assert(sizeof(unsigned long) == 8, "");
27+
LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
28+
HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
9829
}
9930

10031
/// Indexed shuffle of a 32-bit value. The added line forwards to
/// __gpu_shuffle_idx_u32 from "gpuintrin.h", replacing the per-target
/// impl::shuffle helpers that this diff removes.
/// Note the argument order: this wrapper takes (Mask, Var, SrcLane, Width)
/// but passes (Mask, SrcLane, Var, Width) to the intrinsic.
/// NOTE(review): presumably returns \p Var as held by lane \p SrcLane within
/// the caller's Width-sized group -- confirm against gpuintrin.h.
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
10132
int32_t Width) {
102-
return impl::shuffle(Mask, Var, SrcLane, Width);
33+
return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
10334
}
10435

10536
/// Shuffle-down of a 32-bit value, expressed as an indexed shuffle.
/// Each lane computes the lane it reads from: its own id plus \p Delta,
/// unless that would step past the end of its Width-sized group, in which
/// case the lane reads from itself.
/// \param Mask  lane mask forwarded to the shuffle intrinsic.
/// \param Var   the 32-bit value contributed by the calling lane.
/// \param Delta distance (in lanes) to the source lane.
/// \param Width group size; `Self & (Width - 1)` is used as the position
///              inside the group, which assumes Width is a power of two
///              (TODO confirm with callers).
/// \returns the value produced by the shuffle intrinsic for the chosen lane.
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
10637
int32_t Width) {
107-
return impl::shuffleDown(Mask, Var, Delta, Width);
38+
int32_t Self = mapping::getThreadIdInWarp();
// Delta is unsigned, so the sum and the comparison against Width are done in
// unsigned arithmetic.
39+
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
// The payload is 32 bits wide, so use the 32-bit shuffle (as utils::shuffle
// does). The 64-bit variant would widen Var and shuffle an unused upper half
// only for the result to be truncated back to int32_t.
40+
return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
10841
}
10942

11043
/// Shuffle-down of a 64-bit value, expressed as an indexed shuffle.
/// The removed lines split \p Var into two 32-bit halves, shuffled each half
/// with impl::shuffleDown and re-packed them; the added lines compute the
/// source lane once and call the 64-bit intrinsic __gpu_shuffle_idx_u64.
/// `Self & (Width - 1)` is used as the lane's position inside its group,
/// which assumes Width is a power of two (TODO confirm with callers).
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
11144
int32_t Width) {
112-
uint32_t Lo, Hi;
113-
utils::unpack(Var, Lo, Hi);
114-
Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
115-
Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
116-
return utils::pack(Lo, Hi);
45+
int32_t Self = mapping::getThreadIdInWarp();
// Source lane: Self + Delta, unless Delta plus the position in the group
// reaches Width, in which case the lane reads from itself.
46+
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
47+
return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
11748
}
11849

11950
/// Forward \p Mask and \p Pred to __gpu_ballot from "gpuintrin.h" and return
/// its result. This replaces the per-target impl::ballotSync helpers removed
/// by this diff.
/// NOTE(review): presumably the result is a bitmask of the lanes in \p Mask
/// whose \p Pred is non-zero -- confirm against gpuintrin.h.
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
120-
return impl::ballotSync(Mask, Pred);
51+
return __gpu_ballot(Mask, Pred);
12152
}
12253

123-
/// Return the result of __gpu_is_ptr_local(Ptr) from "gpuintrin.h". The
/// removed line delegated to impl::isSharedMemPtr, whose removed per-target
/// bodies called __builtin_amdgcn_is_shared and __nvvm_isspacep_shared.
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
54+
bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
12455

12556
extern "C" {
12657
/// C entry point for a 32-bit shuffle-down across all lanes (lanes::All).
/// After this diff it calls utils::shuffleDown instead of impl::shuffleDown.
/// NOTE(review): the third parameter is named SrcLane but is passed in the
/// Width position of shuffleDown; the 64-bit sibling names it Width.
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
127-
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
58+
return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
12859
}
12960

13061
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {

offload/DeviceRTL/src/Misc.cpp

Lines changed: 10 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -20,41 +20,6 @@
2020
namespace ompx {
2121
namespace impl {
2222

23-
/// AMDGCN Implementation
24-
///
25-
///{
26-
#ifdef __AMDGPU__
27-
28-
double getWTick() {
29-
// The number of ticks per second for the AMDGPU clock varies by card and can
30-
// only be retrieved by querying the driver. We rely on the device environment
31-
// to inform us what the proper frequency is.
32-
return 1.0 / config::getClockFrequency();
33-
}
34-
35-
double getWTime() {
36-
return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
37-
}
38-
39-
#endif
40-
41-
/// NVPTX Implementation
42-
///
43-
///{
44-
#ifdef __NVPTX__
45-
46-
double getWTick() {
47-
// Timer precision is 1ns
48-
return ((double)1E-9);
49-
}
50-
51-
double getWTime() {
52-
uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
53-
return static_cast<double>(nsecs) * getWTick();
54-
}
55-
56-
#endif
57-
5823
/// Lookup a device-side function using a host pointer /p HstPtr using the table
5924
/// provided by the device plugin. The table is an ordered pair of host and
6025
/// device pointers sorted on the value of the host pointer.
@@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
11277

11378
/// Ignores all three (unnamed) arguments and always returns 0.
int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
11479

115-
/// Return the timer tick length as the reciprocal of the clock frequency
/// reported by ompx::config::getClockFrequency(). The removed line delegated
/// to the per-target ompx::impl::getWTick; the added lines use one shared
/// implementation for both targets.
/// NOTE(review): the result is not finite if getClockFrequency() returns 0 --
/// confirm the device environment always sets it.
double omp_get_wtick(void) { return ompx::impl::getWTick(); }
80+
double omp_get_wtick(void) {
81+
// The number of ticks per second for the AMDGPU clock varies by card and can
82+
// only be retrieved by querying the driver. We rely on the device environment
83+
// to inform us what the proper frequency is. NVPTX uses a nanosecond
84+
// resolution, we could omit the global read but this makes it consistent.
85+
return 1.0 / ompx::config::getClockFrequency();
86+
}
11687

117-
/// Return the current value of __builtin_readsteadycounter(), converted to
/// double and multiplied by omp_get_wtick(). The removed line delegated to
/// the per-target ompx::impl::getWTime; the removed NVPTX body read
/// __nvvm_read_ptx_sreg_globaltimer instead.
double omp_get_wtime(void) { return ompx::impl::getWTime(); }
88+
double omp_get_wtime(void) {
89+
return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
90+
}
11891

11992
void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
12093
return ompx::impl::indirectCallLookup(HstPtr);

0 commit comments

Comments
 (0)