Skip to content

Commit 9a584b0

Browse files
authored
[Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h (#127504)
Summary: These helpers are very useful but currently absent. They allow the user to get a bitmask representing the matches within the warp. I have made an executive decision to drop the `predicate` return from `match_all` because it's easily testable with `match_all() == __activemask()`.
1 parent a7a3568 commit 9a584b0

File tree

5 files changed

+182
-0
lines changed

5 files changed

+182
-0
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
162162
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
163163
}
164164

165+
// Returns a bitmask marking all lanes that have the same value of __x.
166+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
167+
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
168+
uint32_t __match_mask = 0;
169+
170+
bool __done = 0;
171+
while (__gpu_ballot(__lane_mask, !__done)) {
172+
if (!__done) {
173+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
174+
if (__first == __x) {
175+
__match_mask = __gpu_lane_mask();
176+
__done = 1;
177+
}
178+
}
179+
}
180+
__gpu_sync_lane(__lane_mask);
181+
return __match_mask;
182+
}
183+
184+
// Returns a bitmask marking all lanes that have the same value of __x.
185+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
186+
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
187+
uint64_t __match_mask = 0;
188+
189+
bool __done = 0;
190+
while (__gpu_ballot(__lane_mask, __done)) {
191+
if (!__done) {
192+
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
193+
if (__first == __x) {
194+
__match_mask = __gpu_lane_mask();
195+
__done = 1;
196+
}
197+
}
198+
}
199+
__gpu_sync_lane(__lane_mask);
200+
return __match_mask;
201+
}
202+
203+
// Returns the current lane mask if every lane contains __x.
204+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
205+
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
206+
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
207+
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
208+
__gpu_sync_lane(__lane_mask);
209+
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
210+
}
211+
212+
// Returns the current lane mask if every lane contains __x.
213+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
214+
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
215+
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
216+
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
217+
__gpu_sync_lane(__lane_mask);
218+
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
219+
}
220+
165221
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
166222
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
167223
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((

clang/lib/Headers/nvptxintrin.h

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
#error "This file is intended for NVPTX targets or offloading to NVPTX"
1414
#endif
1515

16+
#ifndef __CUDA_ARCH__
17+
#define __CUDA_ARCH__ 0
18+
#endif
19+
1620
#include <stdint.h>
1721

1822
#if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
168172
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
169173
}
170174

175+
// Returns a bitmask marking all lanes that have the same value of __x.
176+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
177+
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
178+
// Newer targets can use the dedicated CUDA support.
179+
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
180+
return __nvvm_match_any_sync_i32(__lane_mask, __x);
181+
182+
uint32_t __match_mask = 0;
183+
bool __done = 0;
184+
while (__gpu_ballot(__lane_mask, !__done)) {
185+
if (!__done) {
186+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
187+
if (__first == __x) {
188+
__match_mask = __gpu_lane_mask();
189+
__done = 1;
190+
}
191+
}
192+
}
193+
return __match_mask;
194+
}
195+
196+
// Returns a bitmask marking all lanes that have the same value of __x.
197+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
198+
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
199+
// Newer targets can use the dedicated CUDA support.
200+
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
201+
return __nvvm_match_any_sync_i64(__lane_mask, __x);
202+
203+
uint64_t __match_mask = 0;
204+
205+
bool __done = 0;
206+
while (__gpu_ballot(__lane_mask, __done)) {
207+
if (!__done) {
208+
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
209+
if (__first == __x) {
210+
__match_mask = __gpu_lane_mask();
211+
__done = 1;
212+
}
213+
}
214+
}
215+
__gpu_sync_lane(__lane_mask);
216+
return __match_mask;
217+
}
218+
219+
// Returns the current lane mask if every lane contains __x.
220+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
221+
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
222+
// Newer targets can use the dedicated CUDA support.
223+
int predicate;
224+
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
225+
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
226+
227+
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
228+
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
229+
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
230+
}
231+
232+
// Returns the current lane mask if every lane contains __x.
233+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
234+
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
235+
// Newer targets can use the dedicated CUDA support.
236+
int predicate;
237+
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
238+
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
239+
240+
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
241+
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
242+
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
243+
}
244+
171245
// Returns true if the flat pointer points to CUDA 'shared' memory.
172246
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
173247
return __nvvm_isspacep_shared(ptr);

libc/src/__support/GPU/utils.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
9292
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
9393
}
9494

95+
// Forwards to __gpu_match_any_u32 and returns the lane bitmask it produces
// for the 32-bit value 'x'.
LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
  uint64_t matching_lanes = __gpu_match_any_u32(lane_mask, x);
  return matching_lanes;
}
98+
99+
// Forwards to __gpu_match_all_u32 and returns the lane bitmask it produces
// for the 32-bit value 'x'.
LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
  uint64_t matching_lanes = __gpu_match_all_u32(lane_mask, x);
  return matching_lanes;
}
102+
95103
// Ends the program by calling __gpu_exit(); declared [[noreturn]].
[[noreturn]] LIBC_INLINE void end_program() {
  __gpu_exit();
}
96104

97105
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {

libc/test/integration/src/__support/GPU/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,12 @@ add_integration_test(
1818
LOADER_ARGS
1919
--threads 64
2020
)
21+
22+
add_integration_test(
23+
match_test
24+
SUITE libc-support-gpu-tests
25+
SRCS
26+
match.cpp
27+
LOADER_ARGS
28+
--threads 64
29+
)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
//===-- Test for the match operations on the GPU --------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/CPP/bit.h"
10+
#include "src/__support/GPU/utils.h"
11+
#include "test/IntegrationTest/test.h"
12+
13+
using namespace LIBC_NAMESPACE;
14+
15+
// Test to ensure that match any / match all work.
16+
static void test_match() {
17+
uint64_t mask = gpu::get_lane_mask();
18+
EXPECT_EQ(1ull << gpu::get_lane_id(),
19+
gpu::match_any(mask, gpu::get_lane_id()));
20+
EXPECT_EQ(mask, gpu::match_any(mask, 1));
21+
22+
uint64_t expected = gpu::get_lane_id() < 16 ? 0xffff : 0xffff0000;
23+
EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
24+
EXPECT_EQ(mask, gpu::match_all(mask, 1));
25+
EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
26+
}
27+
28+
TEST_MAIN(int argc, char **argv, char **envp) {
29+
if (gpu::get_thread_id() >= gpu::get_lane_size())
30+
return 0;
31+
32+
test_match();
33+
34+
return 0;
35+
}

0 commit comments

Comments
 (0)