-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[openmp][nfc] Use clang gpuintrin for some dispatch to target intrinsics #131907
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
JonChesterfield
wants to merge
1
commit into
llvm:main
from
JonChesterfield:jc_openmp_some_gpuintrin
Closed
[openmp][nfc] Use clang gpuintrin for some dispatch to target intrinsics #131907
JonChesterfield
wants to merge
1
commit into
llvm:main
from
JonChesterfield:jc_openmp_some_gpuintrin
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-offload Author: Jon Chesterfield (JonChesterfield) ChangesPicked a few straightforward ones to get the ball moving, left the UNREACHABLE path unchanged. Full diff: https://github.com/llvm/llvm-project/pull/131907.diff 1 Files Affected:
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..53031cbeaa696 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -16,6 +16,7 @@
#include "State.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "clang/lib/Headers/gpuintrin.h"
using namespace ompx;
@@ -27,22 +28,6 @@ namespace impl {
///{
#ifdef __AMDGPU__
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
LaneMaskTy lanemaskLT() {
uint32_t Lane = mapping::getThreadIdInWarp();
int64_t Ballot = mapping::activemask();
@@ -59,22 +44,6 @@ LaneMaskTy lanemaskGT() {
return Mask & Ballot;
}
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
uint32_t getNumberOfThreadsInKernel() {
return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
__builtin_amdgcn_grid_size_z();
@@ -120,40 +89,10 @@ uint32_t getNumberOfWarpsInBlock() {
///{
#ifdef __NVPTX__
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
uint32_t getBlockIdInKernel(int32_t Dim) {
switch (Dim) {
case 0:
@@ -236,24 +175,29 @@ bool mapping::isLeaderInWarp() {
return utils::popc(Active & LaneMaskLT) == 0;
}
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
+ uint32_t ThreadIdInWarp = __gpu_lane_id();
ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
return ThreadIdInWarp;
}
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
- return ThreadIdInBlock;
+ switch (Dim) {
+ case 0:
+ case 1:
+ case 2:
+ return __gpu_thread_id(Dim);
+ };
+ UNREACHABLE("Dim outside range!");
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
@@ -265,7 +209,13 @@ uint32_t mapping::getMaxTeamThreads() {
}
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return impl::getNumberOfThreadsInBlock(Dim);
+ switch (Dim) {
+ case 0:
+ case 1:
+ case 2:
+ return __gpu_num_threads(Dim);
+ };
+ UNREACHABLE("Dim outside range!");
}
uint32_t mapping::getNumberOfThreadsInKernel() {
|
You can test this locally with the following command:git-clang-format --diff 7d7b58bc5d2bacc3d76463d2ee06a13d2a08b084 4d29d2e8d6f264adc82779071b7d09d552873460 --extensions cpp -- offload/DeviceRTL/src/Mapping.cpp View the diff from clang-format here.diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 53031cbeaa..af94d96251 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,8 +15,8 @@
#include "Interface.h"
#include "State.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "clang/lib/Headers/gpuintrin.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
|
Good stuff. It's a superset of this one, let's ship the bigger one then :) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Picked a few straightforward ones to get the ball moving, left the UNREACHABLE path unchanged.