[Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h
#127504
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-x86

Author: Joseph Huber (jhuber6)

Changes

Summary: These helpers are very useful but currently absent. They allow the user to get a bitmask representing the matches within the warp. I have made an executive decision to drop the `predicate` return from `match_all` because it's easily testable with `match_all() == __activemask()`.

Full diff: https://github.com/llvm/llvm-project/pull/127504.diff

5 Files Affected:

- clang/lib/Headers/amdgpuintrin.h
- clang/lib/Headers/nvptxintrin.h
- libc/src/__support/GPU/utils.h
- libc/test/integration/src/__support/GPU/CMakeLists.txt
- libc/test/integration/src/__support/GPU/match.cpp
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 9dad99ffe9439..355e75d0b2d42 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ // Repeatedly broadcast the first active lane's value and peel off the
+ // lanes that match it until every lane has found its group.
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..f857a87b5f4c7 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,6 +13,10 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
#include <stdint.h>
#if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i32(__lane_mask, __x);
+
+ uint32_t __match_mask = 0;
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i64(__lane_mask, __x);
+
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to CUDA 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __nvvm_isspacep_shared(ptr);
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 323c003f1ff07..0fd3a6498b865 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
+LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_any_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_all_u32(lane_mask, x);
+}
+
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 68bbc3849bc7e..e066830f6cc0d 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -18,3 +18,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)
+
+add_integration_test(
+ match_test
+ SUITE libc-support-gpu-tests
+ SRCS
+ match.cpp
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp
new file mode 100644
index 0000000000000..225078022cdc3
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/match.cpp
@@ -0,0 +1,32 @@
+//===-- Test for the match operations on the GPU --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to ensure that match any / match all work.
+static void test_match() {
+ uint64_t mask = gpu::get_lane_mask();
+ // Each lane passes a unique value, so every lane matches only itself.
+ EXPECT_EQ(1ull << gpu::get_lane_id(),
+           gpu::match_any(mask, gpu::get_lane_id()));
+ EXPECT_EQ(mask, gpu::match_any(mask, 1));
+ // Lanes 0-15 pass true and the rest pass false, splitting the warp.
+ uint64_t expected = gpu::get_lane_id() < 16 ? 0xffffull : mask & ~0xffffull;
+ EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
+ EXPECT_EQ(mask, gpu::match_all(mask, 1));
+ // The lanes disagree, so match_all returns the empty mask.
+ EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
+ test_match();
+
+ return 0;
+}
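To make the semantics of the new `match_any`/`match_all` helpers concrete, here is a minimal host-side sketch that emulates the two operations across a simulated 32-lane warp. The `emulate_*` helpers and the `NUM_LANES` constant are illustrative assumptions, not part of `gpuintrin.h`; the real builtins execute per-lane on the GPU.

```c
#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 32 // simulated warp width

// match_any: bitmask of every lane holding the same value as this lane.
static uint64_t emulate_match_any(const uint32_t *vals, unsigned lane) {
  uint64_t mask = 0;
  for (unsigned i = 0; i < NUM_LANES; ++i)
    if (vals[i] == vals[lane])
      mask |= 1ull << i;
  return mask;
}

// match_all: the full lane mask if every lane agrees, otherwise 0.
static uint64_t emulate_match_all(const uint32_t *vals, unsigned lane) {
  for (unsigned i = 0; i < NUM_LANES; ++i)
    if (vals[i] != vals[lane])
      return 0;
  return ~0ull >> (64 - NUM_LANES);
}

int main(void) {
  uint32_t vals[NUM_LANES];
  for (unsigned i = 0; i < NUM_LANES; ++i)
    vals[i] = i < 16; // lanes 0-15 hold 1, lanes 16-31 hold 0
  printf("lane 0  match_any = %#llx\n", // 0xffff
         (unsigned long long)emulate_match_any(vals, 0));
  printf("lane 16 match_any = %#llx\n", // 0xffff0000
         (unsigned long long)emulate_match_any(vals, 16));
  printf("match_all = %#llx\n", // 0, because the lanes disagree
         (unsigned long long)emulate_match_all(vals, 0));
  return 0;
}
```

This also shows why dropping the `predicate` output loses nothing: under these semantics, `__gpu_match_all_u32(mask, x) == __gpu_lane_mask()` holds exactly when every lane agrees, which is the predicate the CUDA intrinsic would have reported.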
return __gpu_match_any_u32(lane_mask, x);
}

LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
I suppose this is a C++ interface function instead of a C one?
Yes, this is just a wrapper used internally by the libc.
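For reference, a minimal sketch of how such a wrapper might be used inside the libc; the `elect_group_leader` function and its setup are assumptions for illustration, not code from this patch:

```cpp
#include <stdint.h>

#include "src/__support/GPU/utils.h"

using namespace LIBC_NAMESPACE;

// Illustrative only: pick one leader lane per group of lanes sharing the
// same key -- a common warp-aggregation pattern built on match_any.
static bool elect_group_leader(uint64_t lane_mask, uint32_t key) {
  // Bitmask of the lanes in this warp whose key equals ours; it always
  // contains at least our own bit.
  uint64_t peers = gpu::match_any(lane_mask, key);
  // The lowest matching lane acts as the leader for the group.
  return gpu::get_lane_id() == (uint32_t)__builtin_ctzll(peers);
}
```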
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
I still think these should be in terms of a lanemask_t defined somewhere
Honestly I got confused myself whether or not the return value was a number or a mask. I think that having a `typedef uint64_t __lanemask_t` would be helpful for keeping that unambiguous, but I think that it's best just to keep it `uint64_t` for all targets to simplify code and keep it portable. I might do that in a follow-up patch.
The typedef lets it be portable without forcing it to u64 everywhere.
The typedef would be dependent on the `__AMDGCN_WAVESIZE` thing, which is being deprecated. But it would work between NVPTX and AMDGPU I suppose.
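A minimal sketch of the typedef idea under discussion, assuming the AMDGPU wavefront-size macro (spelled `__AMDGCN_WAVEFRONT_SIZE__` in current Clang, and itself slated for deprecation) as the selector; this only illustrates the trade-off and is not part of the patch:

```c
#include <stdint.h>

// Illustrative only: a lane-mask type sized to the target's wave width.
#if defined(__AMDGPU__) && defined(__AMDGCN_WAVEFRONT_SIZE__) &&               \
    __AMDGCN_WAVEFRONT_SIZE__ == 32
typedef uint32_t __gpu_lanemask_t; // wave32 AMDGPU targets
#else
typedef uint64_t __gpu_lanemask_t; // wave64 AMDGPU, NVPTX, and the default
#endif
```

Keeping plain `uint64_t` everywhere, as the patch does, avoids tying the interface to that macro at the cost of a wider-than-needed mask on wave32 targets.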
/cherry-pick 9a584b0
/pull-request #127704