Skip to content

Commit b085f16

Browse files
committed
[libc] Remove remaining GPU architecture dependent instructions
Summary: Recent patches have added solutions to the remaining sources of divergence. This patch simply removes the last occurrences of things like `has_builtin`, `ifdef` or builtins with feature requirements. The one exception here is `nanosleep`, but I made changes in the `__nvvm_reflect` pass to make usage like this actually work at O0. Depends on #81331
1 parent 1dacfd1 commit b085f16

File tree

5 files changed

+14
-32
lines changed

5 files changed

+14
-32
lines changed

libc/src/__support/GPU/amdgpu/utils.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -152,14 +152,7 @@ LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
152152
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
153153
/// the card and can only be queried via the driver.
154154
LIBC_INLINE uint64_t fixed_frequency_clock() {
155-
if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
156-
return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
157-
else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
158-
return __builtin_amdgcn_s_memrealtime();
159-
else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
160-
return __builtin_amdgcn_s_memtime();
161-
else
162-
return 0;
155+
return __builtin_readsteadycounter();
163156
}
164157

165158
/// Terminates execution of the associated wavefront.

libc/src/__support/GPU/nvptx/utils.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,11 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
135135
}
136136

137137
/// Returns the current value of the GPU's processor clock.
138-
LIBC_INLINE uint64_t processor_clock() {
139-
return __nvvm_read_ptx_sreg_clock64();
140-
}
138+
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
141139

142140
/// Returns a global fixed-frequency timer at nanosecond frequency.
143141
LIBC_INLINE uint64_t fixed_frequency_clock() {
144-
return __nvvm_read_ptx_sreg_globaltimer();
142+
return __builtin_readsteadycounter();
145143
}
146144

147145
/// Terminates execution of the calling thread.

libc/src/__support/RPC/rpc_util.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ namespace rpc {
2121

2222
/// Suspend the thread briefly to assist the thread scheduler during busy loops.
2323
LIBC_INLINE void sleep_briefly() {
24-
#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
25-
__nvvm_nanosleep(64);
24+
#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
25+
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
26+
LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
2627
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
2728
__builtin_amdgcn_s_sleep(2);
2829
#elif defined(LIBC_TARGET_ARCH_IS_X86)

libc/src/time/gpu/nanosleep.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,15 @@ LLVM_LIBC_FUNCTION(int, nanosleep,
2323
uint64_t tick_rate = TICKS_PER_SEC / GPU_CLOCKS_PER_SEC;
2424

2525
uint64_t start = gpu::fixed_frequency_clock();
26-
#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
26+
#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
2727
uint64_t end = start + (nsecs + tick_rate - 1) / tick_rate;
2828
uint64_t cur = gpu::fixed_frequency_clock();
2929
// The NVPTX architecture supports sleeping and guarantees the actual time
3030
// slept will be somewhere between zero and twice the requested amount. Here
3131
// we will sleep again if we undershot the time.
3232
while (cur < end) {
33-
__nvvm_nanosleep(static_cast<uint32_t>(nsecs));
33+
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
34+
LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs));
3435
cur = gpu::fixed_frequency_clock();
3536
nsecs -= nsecs > cur - start ? cur - start : 0;
3637
}

libc/src/time/gpu/time_utils.h

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,13 @@ namespace LIBC_NAMESPACE {
1515

1616
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
1717
// AMDGPU does not have a single set frequency. Different architectures and
18-
// cards can have vary values. Here we default to a few known values, but for
19-
// complete support the frequency needs to be read from the kernel driver.
20-
#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) || \
21-
defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
22-
// These architectures use a 100 MHz fixed frequency clock.
23-
constexpr uint64_t clock_freq = 100000000;
24-
#elif defined(__GFX9__)
25-
// These architectures use a 25 MHz fixed frequency clock expect for Vega 10
26-
// which is actually 27 Mhz. We default to 25 MHz in all cases anyway.
27-
constexpr uint64_t clock_freq = 25000000;
28-
#else
29-
// The frequency for these architecture is unknown. We simply default to zero.
30-
constexpr uint64_t clock_freq = 0;
31-
#endif
18+
// cards can have different values. The actual frequency needs to be read from
19+
// the kernel driver and will be between 25 MHz and 100 MHz on most cards. All
20+
// cards following the GFX9 ISAs use a 100 MHz clock so we will default to that.
21+
constexpr uint64_t clock_freq = 100000000UL;
3222

3323
// We provide an externally visible symbol such that the runtime can set
34-
// this to the correct value. If it is not set we try to default to the
35-
// known values.
24+
// this to the correct value.
3625
extern "C" [[gnu::visibility("protected")]] uint64_t
3726
[[clang::address_space(4)]] __llvm_libc_clock_freq;
3827
#define GPU_CLOCKS_PER_SEC static_cast<clock_t>(__llvm_libc_clock_freq)

0 commit comments

Comments
 (0)