Skip to content

Commit 9a070d6

Browse files
authored
[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (#101917)
This PR implements lntue@2a15842 to provide better throughput benchmarking for libc `sin()` and `__nv_sin()`. These changes have not been tested on AMDGPU yet, only compiled.
1 parent 8334d2b commit 9a070d6

File tree

6 files changed

+128
-80
lines changed

6 files changed

+128
-80
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ namespace benchmarks {
2121

2222
struct BenchmarkOptions {
2323
uint32_t initial_iterations = 1;
24-
uint32_t min_iterations = 50;
24+
uint32_t min_iterations = 1;
2525
uint32_t max_iterations = 10000000;
2626
uint32_t min_samples = 4;
2727
uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
111111
};
112112

113113
// We want our random values to be approximately
114-
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
115-
// 2^(max_exponent + 1)
116-
template <typename T> static T get_rand_input() {
114+
// Output: a random number with the exponent field between min_exp and max_exp,
115+
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
116+
// Caveats:
117+
// -EXP_BIAS corresponds to denormal values,
118+
// EXP_BIAS + 1 corresponds to inf or nan.
119+
template <typename T>
120+
static T
121+
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
122+
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
117123
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
118124

119125
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
125131
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
126132
else
127133
bits = LIBC_NAMESPACE::rand();
128-
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
134+
double scale =
135+
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
129136
FPBits fp(bits);
130137
fp.set_biased_exponent(
131-
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
138+
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
132139
return fp.get_val();
133140
}
134141

@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
141148
public:
142149
typedef T Func(T);
143150

144-
static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
145-
StorageType ending_bit, StorageType step) {
146-
uint64_t total_time = 0;
147-
if (step <= 0)
148-
step = 1;
149-
volatile T result;
150-
for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
151-
T x = FPBits(bits).get_val();
152-
total_time += LIBC_NAMESPACE::latency(f, x);
153-
}
154-
StorageType num_runs = (ending_bit - starting_bit) / step + 1;
155-
156-
return total_time / num_runs;
151+
template <size_t N = 1>
152+
static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
153+
cpp::array<T, N> inputs;
154+
for (size_t i = 0; i < N; ++i)
155+
inputs[i] = get_rand_input<T>(min_exp, max_exp);
156+
157+
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
158+
159+
return total_time / N;
157160
}
158161
};
159162

@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
176179
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
177180
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
178181
LIBC_NAMESPACE::gpu::get_lane_size())
179-
180182
#endif

libc/benchmarks/gpu/src/math/sin_benchmark.cpp

Lines changed: 25 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,51 +15,41 @@
1515
#include "src/math/amdgpu/declarations.h"
1616
#endif
1717

18-
constexpr double M_PI = 3.14159265358979323846;
19-
uint64_t get_bits(double x) {
20-
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
21-
}
22-
2318
// BENCHMARK() expects a function with no parameters that returns a
2419
// uint64_t representing the latency. Each benchmark is defined using a macro that
2520
// expands to a lambda to allow us to switch the implementation of `sin()` to
2621
// easily register NVPTX benchmarks.
27-
#define BM_RANDOM_INPUT(Func) \
28-
[]() { \
29-
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
30-
return LIBC_NAMESPACE::latency(Func, x); \
31-
}
32-
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
33-
34-
#define BM_TWO_PI(Func) \
22+
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
3523
[]() { \
36-
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
37-
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
24+
return LIBC_NAMESPACE::benchmarks::MathPerf< \
25+
double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP); \
3826
}
39-
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
4027

41-
#define BM_LARGE_INT(Func) \
42-
[]() { \
43-
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
44-
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
45-
}
46-
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
47-
BM_LARGE_INT(LIBC_NAMESPACE::sin));
28+
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
29+
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
30+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
31+
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
32+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
33+
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
34+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
35+
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
36+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
37+
38+
BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
39+
BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
40+
BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
41+
BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
4842

4943
#ifdef NVPTX_MATH_FOUND
50-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
51-
BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
52-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
53-
BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
54-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
55-
BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
44+
BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
45+
BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
46+
BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
47+
BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
5648
#endif
5749

5850
#ifdef AMDGPU_MATH_FOUND
59-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
60-
BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
61-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
62-
BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
63-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
64-
BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
51+
BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
52+
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
53+
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
54+
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
6555
#endif

libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,8 @@ add_header_library(
44
timing.h
55
DEPENDS
66
libc.src.__support.common
7+
libc.src.__support.macros.config
8+
libc.src.__support.macros.attributes
9+
libc.src.__support.CPP.type_traits
10+
libc.src.__support.CPP.array
711
)

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1111

12+
#include "src/__support/CPP/array.h"
1213
#include "src/__support/CPP/type_traits.h"
1314
#include "src/__support/GPU/utils.h"
1415
#include "src/__support/common.h"
@@ -17,14 +18,6 @@
1718

1819
#include <stdint.h>
1920

20-
// AMDGPU does not support input register constraints for i1 and i8, so we must
21-
// cast them to uint16_t's before loading them into registers.
22-
#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
23-
if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
24-
asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
25-
else \
26-
asm("" ::"v"(VARIABLE))
27-
2821
namespace LIBC_NAMESPACE_DECL {
2922

3023
// Returns the overhead associated with calling the profiling region. This
@@ -50,25 +43,29 @@ template <typename F, typename T>
5043
volatile T storage = t;
5144
T arg = storage;
5245

53-
FORCE_TO_REGISTER(T, arg);
54-
5546
// The AMDGPU architecture needs to wait on pending results.
5647
gpu::memory_fence();
5748
// Get the current timestamp from the clock.
5849
uint64_t start = gpu::processor_clock();
5950

6051
// This forces the compiler to load the input argument and run the clock
6152
// cycle counter before the profiling region.
62-
FORCE_TO_REGISTER(T, arg);
6353
asm("" ::"s"(start));
6454

6555
// Run the function under test and return its value.
6656
auto result = f(arg);
6757

6858
// This inline assembly performs a no-op which forces the result to both
6959
// be used and prevents us from exiting this region before it's complete.
70-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
71-
static_cast<uint32_t>(result)));
60+
if constexpr (cpp::is_same_v<decltype(result), char> ||
61+
cpp::is_same_v<decltype(result), bool>)
62+
// AMDGPU does not support input register constraints for i1 and i8, so we
63+
// cast it to a 32-bit integer. This does not add an additional assembly
64+
// instruction (https://godbolt.org/z/zxGqv8G91).
65+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
66+
static_cast<uint32_t>(result)));
67+
else
68+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
7269

7370
// Obtain the current timestamp after running the calculation and force
7471
// ordering.
@@ -87,20 +84,19 @@ template <typename F, typename T1, typename T2>
8784
T1 arg1 = storage1;
8885
T2 arg2 = storage2;
8986

90-
FORCE_TO_REGISTER(T1, arg1);
91-
FORCE_TO_REGISTER(T2, arg2);
92-
9387
gpu::memory_fence();
9488
uint64_t start = gpu::processor_clock();
9589

96-
FORCE_TO_REGISTER(T1, arg1);
97-
FORCE_TO_REGISTER(T2, arg2);
9890
asm("" ::"s"(start));
9991

10092
auto result = f(arg1, arg2);
10193

102-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
103-
static_cast<uint32_t>(result)));
94+
if constexpr (cpp::is_same_v<decltype(result), char> ||
95+
cpp::is_same_v<decltype(result), bool>)
96+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
97+
static_cast<uint32_t>(result)));
98+
else
99+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
104100

105101
uint64_t stop = gpu::processor_clock();
106102
asm("" ::"s"(stop));
@@ -109,6 +105,31 @@ template <typename F, typename T1, typename T2>
109105
return stop - start;
110106
}
111107

108+
// Provides throughput benchmarking.
109+
template <typename F, typename T, size_t N>
110+
[[gnu::noinline]] static LIBC_INLINE uint64_t
111+
throughput(F f, const cpp::array<T, N> &inputs) {
112+
asm("" ::"v"(&inputs));
113+
114+
gpu::memory_fence();
115+
uint64_t start = gpu::processor_clock();
116+
117+
asm("" ::"s"(start));
118+
119+
for (auto input : inputs) {
120+
auto result = f(input);
121+
122+
asm("" ::"v"(result));
123+
}
124+
125+
uint64_t stop = gpu::processor_clock();
126+
asm("" ::"s"(stop));
127+
gpu::memory_fence();
128+
129+
// Return the time elapsed.
130+
return stop - start;
131+
}
132+
112133
} // namespace LIBC_NAMESPACE_DECL
113134

114135
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,8 @@ add_header_library(
44
timing.h
55
DEPENDS
66
libc.src.__support.common
7+
libc.src.__support.macros.config
8+
libc.src.__support.macros.attributes
9+
libc.src.__support.CPP.type_traits
10+
libc.src.__support.CPP.array
711
)

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
1111

12+
#include "src/__support/CPP/array.h"
13+
#include "src/__support/CPP/type_traits.h"
1214
#include "src/__support/GPU/utils.h"
1315
#include "src/__support/common.h"
1416
#include "src/__support/macros/attributes.h"
@@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
2527
volatile uint32_t x = 1;
2628
uint32_t y = x;
2729
uint64_t start = gpu::processor_clock();
28-
asm("" ::"r"(y), "llr"(start));
30+
asm("" ::"llr"(start));
2931
uint32_t result = y;
3032
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
3133
uint64_t stop = gpu::processor_clock();
@@ -42,15 +44,14 @@ template <typename F, typename T>
4244
// not constant propagate it and remove the profiling region.
4345
volatile T storage = t;
4446
T arg = storage;
45-
asm("" ::"r"(arg));
4647

4748
// Get the current timestamp from the clock.
4849
gpu::memory_fence();
4950
uint64_t start = gpu::processor_clock();
5051

5152
// This forces the compiler to load the input argument and run the clock cycle
5253
// counter before the profiling region.
53-
asm("" ::"r"(arg), "llr"(start));
54+
asm("" ::"llr"(start));
5455

5556
// Run the function under test and return its value.
5657
auto result = f(arg);
@@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
7677
volatile T2 storage2 = t2;
7778
T1 arg = storage;
7879
T2 arg2 = storage2;
79-
asm("" ::"r"(arg), "r"(arg2));
8080

8181
gpu::memory_fence();
8282
uint64_t start = gpu::processor_clock();
8383

84-
asm("" ::"r"(arg), "r"(arg2), "llr"(start));
84+
asm("" ::"llr"(start));
8585

8686
auto result = f(arg, arg2);
8787

@@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
9494

9595
return stop - start;
9696
}
97+
98+
// Provides throughput benchmarking.
99+
template <typename F, typename T, size_t N>
100+
[[gnu::noinline]] static LIBC_INLINE uint64_t
101+
throughput(F f, const cpp::array<T, N> &inputs) {
102+
asm("" ::"r"(&inputs));
103+
104+
gpu::memory_fence();
105+
uint64_t start = gpu::processor_clock();
106+
107+
asm("" ::"llr"(start));
108+
109+
uint64_t result;
110+
for (auto input : inputs) {
111+
asm("" ::"r"(input));
112+
result = f(input);
113+
asm("" ::"r"(result));
114+
}
115+
116+
uint64_t stop = gpu::processor_clock();
117+
gpu::memory_fence();
118+
asm("" ::"r"(stop));
119+
volatile auto output = result;
120+
121+
// Return the time elapsed.
122+
return stop - start;
123+
}
97124
} // namespace LIBC_NAMESPACE_DECL
98125

99126
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

0 commit comments

Comments
 (0)