[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged · 1 commit · Mar 13, 2025
44 changes: 4 additions & 40 deletions clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
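Note on why this compiles: gpuintrin.h forward-declares the `_impl` helpers before including this target header and defines them afterwards, so the `static __inline__` wrappers above call functions whose bodies appear later in the same translation unit, and the extra hop inlines away. A minimal single-file sketch of that pattern, with hypothetical names standing in for the real headers:

```c
/* Sketch of the forward-declare / include / define layering in one
 * translation unit. All names are illustrative, not the real headers. */
#include <stdint.h>

/* 1. The umbrella header forward-declares the shared implementation. */
static inline uint64_t __match_impl(uint64_t __mask, uint32_t __x);

/* 2. The target header, textually included at this point, defines a
 *    wrapper that calls the helper before its body has been seen. */
static inline uint64_t __gpu_match(uint64_t __mask, uint32_t __x) {
  return __match_impl(__mask, __x);
}

/* 3. The umbrella header then supplies the body; every translation unit
 *    that includes it sees the definition, so the call can inline. */
static inline uint64_t __match_impl(uint64_t __mask, uint32_t __x) {
  return __mask & (uint64_t)__x; /* placeholder, not the real loop */
}
```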
82 changes: 81 additions & 1 deletion clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,30 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +139,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
@@ -234,6 +258,62 @@ __DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
 __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
 #undef __DO_LANE_SUM
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
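The match logic now lives once in gpuintrin.h. The `_impl` loop peels off one distinct value per iteration: the first still-active lane broadcasts its value, every lane holding that value records the current convergence mask and retires, and the ballot keeps the loop alive until no lane remains. A host-side model of the resulting semantics for a hypothetical 4-lane wave with all lanes active (illustrative only, not part of the headers):

```c
#include <stdint.h>
#include <stdio.h>

/* Model of __gpu_match_any_u32: bitmask of lanes sharing this lane's value. */
static uint64_t match_any_model(const uint32_t *vals, int lanes, int lane) {
  uint64_t mask = 0;
  for (int i = 0; i < lanes; ++i)
    if (vals[i] == vals[lane])
      mask |= 1ull << i;
  return mask;
}

/* Model of __gpu_match_all_u32: full mask only when every lane agrees. */
static uint64_t match_all_model(const uint32_t *vals, int lanes, int lane) {
  for (int i = 0; i < lanes; ++i)
    if (vals[i] != vals[lane])
      return 0;
  return (1ull << lanes) - 1;
}

int main(void) {
  uint32_t vals[4] = {7, 3, 7, 3};
  for (int lane = 0; lane < 4; ++lane)
    printf("lane %d: any=0x%llx all=0x%llx\n", lane,
           (unsigned long long)match_any_model(vals, 4, lane),
           (unsigned long long)match_all_model(vals, 4, lane));
  /* Prints any=0x5 for lanes 0 and 2, any=0xa for lanes 1 and 3,
   * and all=0 everywhere since the lanes disagree. */
  return 0;
}
```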
48 changes: 8 additions & 40 deletions clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
@@ -178,22 +163,9 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#else
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 #endif
-
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -203,11 +175,9 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+#else
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 #endif
-
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -217,11 +187,9 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+#else
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 #endif
-
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
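On sm_70 and newer the NVPTX wrappers lower to the dedicated `match.sync` builtins; older architectures now share the software loop from gpuintrin.h instead of carrying a duplicate. A sketch of a caller using the portable interface, assuming a single fully-active wave and hypothetical kernel and buffer names:

```c
#include <gpuintrin.h>

/* Hypothetical kernel: each lane counts how many lanes of its wave hold
 * the same input value, via the portable match interface. */
__gpu_kernel void count_matches(const uint32_t *__in, uint32_t *__out) {
  uint32_t __id = __gpu_lane_id();
  uint64_t __mask = __gpu_match_any_u32(__gpu_lane_mask(), __in[__id]);
  __out[__id] = (uint32_t)__builtin_popcountll(__mask);
}
```

The same `__gpu_match_any_u32` call compiles to `__nvvm_match_any_sync_i32` on sm_70+ and to the shared `__gpu_match_any_u32_impl` loop on older NVPTX targets and on AMDGPU.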