Skip to content

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 15 additions & 84 deletions offload/DeviceRTL/src/DeviceUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,117 +14,48 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "gpuintrin.h"

using namespace ompx;

namespace impl {

/// Split the 64-bit value \p Val into two 32-bit halves.
///
/// \param Val      Value to split.
/// \param LowBits  Written with bits [31:0] of \p Val; is dereferenced, so it
///                 must point to writable storage.
/// \param HighBits Written with bits [63:32] of \p Val; is dereferenced, so it
///                 must point to writable storage.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // Truncation and a shift on the fixed-width type select the halves without
  // `UL` literals, so nothing here depends on the width of `unsigned long`.
  *LowBits = static_cast<uint32_t>(Val);
  *HighBits = static_cast<uint32_t>(Val >> 32);
}

/// Combine two 32-bit halves into one 64-bit value: \p HighBits becomes bits
/// [63:32] and \p LowBits becomes bits [31:0] of the result.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  const uint64_t Upper = static_cast<uint64_t>(HighBits) << 32;
  const uint64_t Lower = static_cast<uint64_t>(LowBits);
  return Upper | Lower;
}

// Forward declarations of the lane-level helpers. The definitions are supplied
// by the target-specific sections that follow (guarded by __AMDGPU__ and
// __NVPTX__).

/// Returns \p Var as read from another lane, selected by \p SrcLane and
/// \p Width.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
/// Returns \p Var as read from a lane offset by \p LaneDelta from the caller.
/// In the AMDGPU definition the caller's own lane is used when the offset
/// would step past the caller's \p Width-sized group.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);

/// Returns the result of the target's ballot builtin for \p Pred, restricted
/// by \p Mask.
uint64_t ballotSync(uint64_t Mask, int32_t Pred);

/// AMDGCN Implementation
///
///{
#ifdef __AMDGPU__

/// AMDGPU: read \p Var from lane \p SrcLane, counted from the first lane of the
/// \p Width-sized group that contains the calling lane. \p Mask is not used.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
  const int Lane = mapping::getThreadIdInWarp();
  // Clearing the low bits yields the group's first lane; this presumably
  // relies on Width being a power of two — TODO confirm.
  const int GroupBase = Lane & ~(Width - 1);
  const int Source = SrcLane + GroupBase;
  // The builtin is handed the lane index scaled by four.
  return __builtin_amdgcn_ds_bpermute(Source << 2, Var);
}

/// AMDGPU: read \p Var from the lane \p LaneDelta above the calling lane. If
/// that would step past the caller's \p Width-sized group, the caller's own
/// lane is used as the source. \p Mask is not used.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                    int32_t Width) {
  const int Lane = mapping::getThreadIdInWarp();
  const int Shifted = static_cast<int>(Lane + LaneDelta);
  // Offset of the requested source lane measured from the start of the group.
  const int GroupOffset = static_cast<int>(LaneDelta + (Lane & (Width - 1)));
  const int Source = GroupOffset >= Width ? Lane : Shifted;
  // The builtin is handed the lane index scaled by four.
  return __builtin_amdgcn_ds_bpermute(Source << 2, Var);
}

/// AMDGPU: evaluate the 64-bit ballot builtin on \p Pred and keep only the bits
/// that are also set in \p Mask.
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
  const uint64_t Votes = __builtin_amdgcn_ballot_w64(Pred);
  return Mask & Votes;
}

/// AMDGPU: report whether \p Ptr is classified as shared by
/// __builtin_amdgcn_is_shared.
bool isSharedMemPtr(const void *Ptr) {
  // The builtin receives the pointer cast to an explicit address_space(0)
  // pointer type.
  const __attribute__((address_space(0))) void *Generic =
      (const __attribute__((address_space(0))) void *)Ptr;
  return __builtin_amdgcn_is_shared(Generic);
}
#endif
///}

/// NVPTX Implementation
///
///{
#ifdef __NVPTX__

/// NVPTX: forward to the indexed sync-shuffle builtin, passing \p Width - 1 as
/// its final operand.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
  const int32_t LastOperand = Width - 1;
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, LastOperand);
}

/// NVPTX: forward to the down sync-shuffle builtin. The final operand packs
/// (warp size - \p Width) into the bits above bit 7 and 0x1f into the low bits.
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
  const auto UpperField = (mapping::getWarpSize() - Width) << 8;
  const int32_t Control = UpperField | 0x1f;
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, Control);
}

/// NVPTX: run the sync ballot builtin on \p Pred. Only the low 32 bits of
/// \p Mask are passed on.
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
  const uint32_t Mask32 = static_cast<uint32_t>(Mask);
  return __nvvm_vote_ballot_sync(Mask32, Pred);
}

/// NVPTX: report whether \p Ptr is classified as shared by
/// __nvvm_isspacep_shared.
bool isSharedMemPtr(const void *Ptr) {
  const bool InShared = __nvvm_isspacep_shared(Ptr);
  return InShared;
}

#endif
///}
} // namespace impl

/// Combine \p LowBits (bits [31:0]) and \p HighBits (bits [63:32]) into one
/// 64-bit value.
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
// NOTE(review): two returns in a row — this looks like a diff view showing the
// removed line followed by its replacement. As written only the first return
// executes; both compute the same value.
return impl::Pack(LowBits, HighBits);
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}

/// Split \p Val into its low 32 bits (\p LowBits) and high 32 bits
/// (\p HighBits).
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
// NOTE(review): the helper call and the inline computation below store the
// same values — this looks like a diff view showing the removed line followed
// by its replacement; confirm which version is intended.
impl::Unpack(Val, &LowBits, &HighBits);
// The UL-suffixed masks below are only 64 bits wide if unsigned long is.
static_assert(sizeof(unsigned long) == 8, "");
LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}

/// Returns \p Var as read from the lane selected by \p SrcLane and \p Width.
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
int32_t Width) {
// NOTE(review): two returns in a row — this looks like a diff view showing the
// removed line followed by its replacement. As written only the first return
// executes.
return impl::shuffle(Mask, Var, SrcLane, Width);
// The lane and value arguments are in the opposite order from impl::shuffle.
return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
}

int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width) {
return impl::shuffleDown(Mask, Var, Delta, Width);
int32_t Self = mapping::getThreadIdInWarp();
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}

/// 64-bit overload: returns \p Var as read from a lane offset by \p Delta from
/// the caller.
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
int32_t Width) {
// NOTE(review): this body appears to interleave the removed implementation
// (split into halves, shuffle each, re-pack) with its replacement (one 64-bit
// indexed shuffle). As written, everything after the first return is
// unreachable.
uint32_t Lo, Hi;
utils::unpack(Var, Lo, Hi);
Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
return utils::pack(Lo, Hi);
int32_t Self = mapping::getThreadIdInWarp();
// Use the caller's own lane when Delta would step past its Width-sized group.
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}

/// Returns the ballot of \p Pred for the lanes described by \p Mask.
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
// NOTE(review): two returns in a row — this looks like a diff view showing the
// removed line followed by its replacement. As written only the first return
// executes.
return impl::ballotSync(Mask, Pred);
return __gpu_ballot(Mask, Pred);
}

// NOTE(review): the same function is defined on two consecutive lines — this
// looks like a diff view showing the removed definition followed by its
// replacement; only one of them can remain in a real translation unit.
/// Reports whether \p Ptr is classified as a shared/local-memory pointer by
/// the underlying helper.
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }

extern "C" {
/// Runtime entry point: shuffle-down of the 32-bit value \p Val by \p Delta
/// lanes across all lanes (lanes::All).
/// NOTE(review): the third parameter is named SrcLane but is forwarded in the
/// Width position of shuffleDown — confirm the intended name.
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
// NOTE(review): two returns in a row — this looks like a diff view showing the
// removed line followed by its replacement. As written only the first return
// executes.
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
}

int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
Expand Down
47 changes: 10 additions & 37 deletions offload/DeviceRTL/src/Misc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,41 +20,6 @@
namespace ompx {
namespace impl {

/// AMDGCN Implementation
///
///{
#ifdef __AMDGPU__

/// AMDGPU: seconds per clock tick, computed as the reciprocal of the frequency
/// reported by config::getClockFrequency().
double getWTick() {
  // AMDGPU clock rates differ from card to card and are only known to the
  // driver, so the device environment is trusted to carry the right frequency.
  const auto TicksPerSecond = config::getClockFrequency();
  return 1.0 / TicksPerSecond;
}

/// AMDGPU: the steady counter reading converted to seconds by multiplying with
/// getWTick().
double getWTime() {
  const auto Ticks = __builtin_readsteadycounter();
  return static_cast<double>(Ticks) * getWTick();
}

#endif

/// NVPTX Implementation
///
///{
#ifdef __NVPTX__

/// NVPTX: seconds per timer tick; the timer precision is one nanosecond.
double getWTick() {
  constexpr double SecondsPerTick = 1E-9;
  return SecondsPerTick;
}

/// NVPTX: the global timer register reading converted to seconds by
/// multiplying with getWTick().
double getWTime() {
  // Go through uint64_t first so the reading is treated as unsigned before it
  // becomes a double.
  const double Ticks = static_cast<double>(
      static_cast<uint64_t>(__nvvm_read_ptx_sreg_globaltimer()));
  return Ticks * getWTick();
}

#endif

/// Look up a device-side function from the host pointer \p HstPtr using the
/// table provided by the device plugin. The table is an ordered pair of host
/// and device pointers sorted on the value of the host pointer.
Expand Down Expand Up @@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }

/// Ignores all three arguments and unconditionally returns 0.
int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) {
  constexpr int32_t Result = 0;
  return Result;
}

// NOTE(review): omp_get_wtick is defined twice here — this looks like a diff
// view showing the removed one-line definition followed by its replacement;
// only one of them can remain in a real translation unit.
/// Returns the number of seconds per clock tick.
double omp_get_wtick(void) { return ompx::impl::getWTick(); }
double omp_get_wtick(void) {
// The number of ticks per second for the AMDGPU clock varies by card and can
// only be retrieved by querying the driver, so we rely on the device
// environment to tell us the proper frequency. NVPTX uses a nanosecond
// resolution; the global read could be omitted there, but doing it on both
// targets keeps them consistent.
return 1.0 / ompx::config::getClockFrequency();
}

// NOTE(review): omp_get_wtime is defined twice here — this looks like a diff
// view showing the removed one-line definition followed by its replacement;
// only one of them can remain in a real translation unit.
/// Returns the current time in seconds.
double omp_get_wtime(void) { return ompx::impl::getWTime(); }
double omp_get_wtime(void) {
// Steady counter reading scaled by the seconds-per-tick value.
return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
}

void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
return ompx::impl::indirectCallLookup(HstPtr);
Expand Down