[Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h #127504

Merged: 1 commit merged into llvm:main from jhuber6:Match on Feb 17, 2025

Conversation

jhuber6 (Contributor) commented Feb 17, 2025

Summary:
These helpers are very useful but currently absent. They allow the user
to get a bitmask representing the matches within the warp. I have made
an executive decision to drop the predicate return from match_all
because it's easily testable with match_all() == __activemask().
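
For illustration, a minimal sketch (not from the patch) of how a caller can recover the dropped predicate using the __gpu_match_all_u32 and __gpu_lane_mask intrinsics from this change; the wrapper function name is hypothetical:

  // Sketch: the old match_all predicate is just "did every active lane
  // match?", which the returned mask already encodes.
  static inline bool all_lanes_match(uint32_t x) {
    uint64_t mask = __gpu_lane_mask();
    return __gpu_match_all_u32(mask, x) == mask;
  }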

llvmbot added the clang, backend:AMDGPU, backend:X86, clang:headers, and libc labels on Feb 17, 2025
llvmbot (Member) commented Feb 17, 2025

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Summary:
These helpers are very useful but currently absent. They allow the user
to get a bitmask representing the matches within the warp. I have made
an executive decision to drop the predicate return from match_all
because it's easily testable with match_all() == __activemask().


Full diff: https://github.com/llvm/llvm-project/pull/127504.diff

5 Files Affected:

  • (modified) clang/lib/Headers/amdgpuintrin.h (+56)
  • (modified) clang/lib/Headers/nvptxintrin.h (+74)
  • (modified) libc/src/__support/GPU/utils.h (+8)
  • (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9)
  • (added) libc/test/integration/src/__support/GPU/match.cpp (+32)
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 9dad99ffe9439..355e75d0b2d42 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
          ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 }
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
   return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..f857a87b5f4c7 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,6 +13,10 @@
 #error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
 #include <stdint.h>
 
 #if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
          ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_any_sync_i32(__lane_mask, __x);
+
+  uint32_t __match_mask = 0;
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_any_sync_i64(__lane_mask, __x);
+
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  int predicate;
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  int predicate;
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 // Returns true if the flat pointer points to CUDA 'shared' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
   return __nvvm_isspacep_shared(ptr);
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 323c003f1ff07..0fd3a6498b865 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
   return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 }
 
+LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
+  return __gpu_match_any_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
+  return __gpu_match_all_u32(lane_mask, x);
+}
+
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
 
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 68bbc3849bc7e..e066830f6cc0d 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -18,3 +18,12 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  match_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    match.cpp
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp
new file mode 100644
index 0000000000000..225078022cdc3
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/match.cpp
@@ -0,0 +1,32 @@
+//===-- Test for the match operations on the GPU --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to ensure that match any / match all work.
+static void test_match() {
+  uint64_t mask = gpu::get_lane_mask();
+  EXPECT_EQ(1ull << gpu::get_lane_id(),
+            gpu::match_any(mask, gpu::get_lane_id()));
+  EXPECT_EQ(mask, gpu::match_any(mask, 1));
+  uint64_t expected = gpu::get_lane_id() < 16 ? 0xffffull : mask & ~0xffffull;
+  EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
+  EXPECT_EQ(mask, gpu::match_all(mask, 1));
+  EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
+  test_match();
+
+  return 0;
+}
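
For illustration, a hedged usage sketch of the new intrinsics (the helper below is hypothetical, not part of the patch; it uses only __gpu_* functions from gpuintrin.h):

  // Sketch: partition the active lanes into groups sharing the same key,
  // elect the lowest active lane of each group as leader, and count the
  // number of distinct groups.
  static inline uint32_t count_distinct_keys(uint32_t key) {
    uint64_t mask = __gpu_lane_mask();
    uint64_t group = __gpu_match_any_u32(mask, key); // lanes with equal key
    bool leader = __gpu_lane_id() == (uint32_t)__builtin_ctzll(group);
    return (uint32_t)__builtin_popcountll(__gpu_ballot(mask, leader));
  }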

llvmbot (Member) commented Feb 17, 2025

@llvm/pr-subscribers-backend-x86

Review thread on libc/src/__support/GPU/utils.h:

    return __gpu_match_any_u32(lane_mask, x);
  }

  LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
Contributor: I suppose this is a C++ interface function instead of a C one?

jhuber6 (Author): Yes, this is just a wrapper used internally by the libc.

Review thread on clang/lib/Headers/amdgpuintrin.h:

  // Returns a bitmask marking all lanes that have the same value of __x.
  _DEFAULT_FN_ATTRS static __inline__ uint64_t
Contributor: I still think these should be in terms of a lanemask_t defined somewhere.

jhuber6 (Author): Honestly, I got confused myself about whether the return value was a number or a mask. Having a typedef uint64_t __lanemask_t would help keep that unambiguous, but I think it's best to keep it uint64_t for all targets to simplify the code and keep it portable. I might do that in a follow-up patch.

Contributor: The typedef lets it be portable without forcing it to u64 everywhere.

jhuber6 (Author): The typedef would depend on the __AMDGCN_WAVESIZE thing, which is being deprecated. But it would work between NVPTX and AMDGPU, I suppose.
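
For reference, a hypothetical sketch of the typedef under discussion (not part of this patch), assuming the deprecated __AMDGCN_WAVEFRONT_SIZE macro as the wave-size source:

  // Hypothetical lane-mask typedef, for illustration only. The
  // __AMDGCN_WAVEFRONT_SIZE macro is deprecated, which is the concern
  // raised above.
  #if defined(__AMDGPU__) && __AMDGCN_WAVEFRONT_SIZE == 32
  typedef uint32_t __gpu_lanemask_t;
  #else
  typedef uint64_t __gpu_lanemask_t;
  #endif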

jhuber6 merged commit 9a584b0 into llvm:main on Feb 17, 2025
12 of 14 checks passed
jhuber6 deleted the Match branch on February 17, 2025 at 20:06
jhuber6 added this to the LLVM 20.X Release milestone on Feb 18, 2025
jhuber6 (Author) commented Feb 18, 2025

/cherry-pick 9a584b0

llvmbot (Member) commented Feb 18, 2025

/pull-request #127704
