Skip to content

Commit eb66e31

Browse files
authored
[libc] Add Timing Utils for AMDGPU (llvm#96828)
PR for adding AMDGPU timing utils for benchmarking. I was not able to test this code since I do not have an AMD GPU, but I was able to compile it successfully by passing -DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_ARCHITECTURE=gfx90a -DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=echo -DRUNTIMES_amdgcn_amd-amdhsa_LIBC_GPU_TARGET_ARCHITECTURE=gfx90a, which forces the code to compile without an AMD GPU on my machine. @jhuber6
1 parent dd3aa5e commit eb66e31

File tree

3 files changed

+120
-1
lines changed

3 files changed

+120
-1
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Header-only library exposing the AMDGPU timing helpers in timing.h.
# DEPENDS lists a target for every project header that timing.h includes
# (common.h, CPP/type_traits.h, GPU/utils.h, macros/attributes.h and
# macros/config.h), not just the first one.
add_header_library(
  amdgpu_timing
  HDRS
    timing.h
  DEPENDS
    libc.src.__support.common
    libc.src.__support.CPP.type_traits
    libc.src.__support.GPU.utils
    libc.src.__support.macros.attributes
    libc.src.__support.macros.config
)
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
10+
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
11+
12+
#include "src/__support/CPP/type_traits.h"
13+
#include "src/__support/GPU/utils.h"
14+
#include "src/__support/common.h"
15+
#include "src/__support/macros/attributes.h"
16+
#include "src/__support/macros/config.h"
17+
18+
#include <stdint.h>
19+
20+
// AMDGPU does not support input register constraints for i1 and i8, so
// single-byte values (bool and every flavour of char, which also covers
// int8_t/uint8_t) are cast to uint16_t before being bound to a register with
// the "v" constraint. The asm template is empty, so the statement contributes
// no instruction text of its own; it only makes VARIABLE an input operand at
// this point in the function.
//
// The expansion is wrapped in do { ... } while (false) so that it is exactly
// one statement: the macro's `else` can never pair with an `if` at the call
// site, and `FORCE_TO_REGISTER(T, x);` stays valid inside unbraced branches.
#define FORCE_TO_REGISTER(TYPE, VARIABLE)                                      \
  do {                                                                         \
    if constexpr (cpp::is_same_v<TYPE, char> ||                                \
                  cpp::is_same_v<TYPE, signed char> ||                         \
                  cpp::is_same_v<TYPE, unsigned char> ||                       \
                  cpp::is_same_v<TYPE, bool>)                                  \
      asm("" ::"v"(static_cast<uint16_t>(VARIABLE)));                          \
    else                                                                       \
      asm("" ::"v"(VARIABLE));                                                 \
  } while (false)
27+
28+
namespace LIBC_NAMESPACE {
29+
30+
// Returns the overhead associated with calling the profiling region. This
31+
// allows us to substract the constant-time overhead from the latency to
32+
// obtain a true result. This can vary with system load.
33+
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
34+
gpu::memory_fence();
35+
uint64_t start = gpu::processor_clock();
36+
uint32_t result = 0.0;
37+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
38+
asm("" ::"s"(start));
39+
uint64_t stop = gpu::processor_clock();
40+
return stop - start;
41+
}
42+
43+
// Profile a simple function and obtain its latency in clock cycles on the
44+
// system. This function cannot be inlined or else it will disturb the very
45+
// delicate balance of hard-coded dependencies.
46+
template <typename F, typename T>
47+
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
48+
// We need to store the input somewhere to guarantee that the compiler
49+
// will not constant propagate it and remove the profiling region.
50+
volatile T storage = t;
51+
T arg = storage;
52+
53+
FORCE_TO_REGISTER(T, arg);
54+
55+
// The AMDGPU architecture needs to wait on pending results.
56+
gpu::memory_fence();
57+
// Get the current timestamp from the clock.
58+
uint64_t start = gpu::processor_clock();
59+
60+
// This forces the compiler to load the input argument and run the clock
61+
// cycle counter before the profiling region.
62+
FORCE_TO_REGISTER(T, arg);
63+
asm("" ::"s"(start));
64+
65+
// Run the function under test and return its value.
66+
auto result = f(arg);
67+
68+
// This inline assembly performs a no-op which forces the result to both
69+
// be used and prevents us from exiting this region before it's complete.
70+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
71+
72+
// Obtain the current timestamp after running the calculation and force
73+
// ordering.
74+
uint64_t stop = gpu::processor_clock();
75+
asm("" ::"s"(stop));
76+
gpu::memory_fence();
77+
78+
// Return the time elapsed.
79+
return stop - start;
80+
}
81+
82+
template <typename F, typename T1, typename T2>
83+
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
84+
volatile T1 storage1 = t1;
85+
volatile T2 storage2 = t2;
86+
T1 arg1 = storage1;
87+
T2 arg2 = storage2;
88+
89+
FORCE_TO_REGISTER(T1, arg1);
90+
FORCE_TO_REGISTER(T2, arg2);
91+
92+
gpu::memory_fence();
93+
uint64_t start = gpu::processor_clock();
94+
95+
FORCE_TO_REGISTER(T1, arg1);
96+
FORCE_TO_REGISTER(T2, arg2);
97+
asm("" ::"s"(start));
98+
99+
auto result = f(arg1, arg2);
100+
101+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
102+
103+
uint64_t stop = gpu::processor_clock();
104+
asm("" ::"s"(stop));
105+
gpu::memory_fence();
106+
107+
return stop - start;
108+
}
109+
110+
} // namespace LIBC_NAMESPACE
111+
112+
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

libc/benchmarks/gpu/timing/timing.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include "src/__support/macros/properties/architectures.h"
1313

1414
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
15-
#error "amdgpu not yet supported"
15+
#include "amdgpu/timing.h"
1616
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
1717
#include "nvptx/timing.h"
1818
#else

0 commit comments

Comments
 (0)