Skip to content

[openmp][nfc] Use clang gpuintrin for some dispatch to target intrinsics #131907

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 18 additions & 68 deletions offload/DeviceRTL/src/Mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "State.h"

#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "clang/lib/Headers/gpuintrin.h"

using namespace ompx;

Expand All @@ -27,22 +28,6 @@ namespace impl {
///{
#ifdef __AMDGPU__

uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }

uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_workgroup_size_x();
case 1:
return __builtin_amdgcn_workgroup_size_y();
case 2:
return __builtin_amdgcn_workgroup_size_z();
};
UNREACHABLE("Dim outside range!");
}

LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }

LaneMaskTy lanemaskLT() {
uint32_t Lane = mapping::getThreadIdInWarp();
int64_t Ballot = mapping::activemask();
Expand All @@ -59,22 +44,6 @@ LaneMaskTy lanemaskGT() {
return Mask & Ballot;
}

uint32_t getThreadIdInWarp() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __builtin_amdgcn_workitem_id_x();
case 1:
return __builtin_amdgcn_workitem_id_y();
case 2:
return __builtin_amdgcn_workitem_id_z();
};
UNREACHABLE("Dim outside range!");
}

uint32_t getNumberOfThreadsInKernel() {
return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
__builtin_amdgcn_grid_size_z();
Expand Down Expand Up @@ -120,40 +89,10 @@ uint32_t getNumberOfWarpsInBlock() {
///{
#ifdef __NVPTX__

uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_ntid_x();
case 1:
return __nvvm_read_ptx_sreg_ntid_y();
case 2:
return __nvvm_read_ptx_sreg_ntid_z();
};
UNREACHABLE("Dim outside range!");
}

uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }

LaneMaskTy activemask() { return __nvvm_activemask(); }

LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }

LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }

uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
case 0:
return __nvvm_read_ptx_sreg_tid_x();
case 1:
return __nvvm_read_ptx_sreg_tid_y();
case 2:
return __nvvm_read_ptx_sreg_tid_z();
};
UNREACHABLE("Dim outside range!");
}

uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }

uint32_t getBlockIdInKernel(int32_t Dim) {
switch (Dim) {
case 0:
Expand Down Expand Up @@ -236,24 +175,29 @@ bool mapping::isLeaderInWarp() {
return utils::popc(Active & LaneMaskLT) == 0;
}

LaneMaskTy mapping::activemask() { return impl::activemask(); }
LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }

LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }

LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }

uint32_t mapping::getThreadIdInWarp() {
uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
uint32_t ThreadIdInWarp = __gpu_lane_id();
ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
return ThreadIdInWarp;
}

uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
return ThreadIdInBlock;
switch (Dim) {
case 0:
case 1:
case 2:
return __gpu_thread_id(Dim);
};
UNREACHABLE("Dim outside range!");
}

uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }

uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
Expand All @@ -265,7 +209,13 @@ uint32_t mapping::getMaxTeamThreads() {
}

uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
return impl::getNumberOfThreadsInBlock(Dim);
switch (Dim) {
case 0:
case 1:
case 2:
return __gpu_num_threads(Dim);
};
UNREACHABLE("Dim outside range!");
}

uint32_t mapping::getNumberOfThreadsInKernel() {
Expand Down
Loading