Skip to content

Commit 2a15842

Browse files
committed
[libc][gpu] More benchmark for GPUs.
1 parent ca26ea2 commit 2a15842

File tree

3 files changed

+62
-67
lines changed

3 files changed

+62
-67
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,15 @@ class Benchmark {
111111
};
112112

113113
// We want our random values to be approximately
114-
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
115-
// 2^(max_exponent + 1)
116-
template <typename T> static T get_rand_input() {
114+
// Output: a random number with the exponent field between min_exp and max_exp,
115+
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
116+
// Caveats:
117+
//   -EXP_BIAS corresponds to denormal values,
118+
//   EXP_BIAS + 1 corresponds to inf or nan.
119+
template <typename T>
120+
static T
121+
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
122+
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
117123
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
118124

119125
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
125131
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
126132
else
127133
bits = LIBC_NAMESPACE::rand();
128-
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
134+
double scale =
135+
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
129136
FPBits fp(bits);
130137
fp.set_biased_exponent(
131-
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
138+
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
132139
return fp.get_val();
133140
}
134141

@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
141148
public:
142149
typedef T Func(T);
143150

144-
static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
145-
StorageType ending_bit, StorageType step) {
146-
uint64_t total_time = 0;
147-
if (step <= 0)
148-
step = 1;
149-
volatile T result;
150-
for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
151-
T x = FPBits(bits).get_val();
152-
total_time += LIBC_NAMESPACE::latency(f, x);
153-
}
154-
StorageType num_runs = (ending_bit - starting_bit) / step + 1;
155-
156-
return total_time / num_runs;
151+
template <size_t N = 1>
152+
static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
153+
cpp::array<T, N> inputs;
154+
for (size_t i = 0; i < N; ++i)
155+
inputs[i] = get_rand_input<T>(min_exp, max_exp);
156+
157+
uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
158+
159+
return total_time / N;
157160
}
158161
};
159162

libc/benchmarks/gpu/src/math/sin_benchmark.cpp

Lines changed: 25 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,51 +15,41 @@
1515
#include "src/math/amdgpu/declarations.h"
1616
#endif
1717

18-
constexpr double M_PI = 3.14159265358979323846;
19-
uint64_t get_bits(double x) {
20-
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
21-
}
22-
2318
// BENCHMARK() expects a function with no parameters that returns a
2419
// uint64_t representing the latency. Defining each benchmark using macro that
2520
// expands to a lambda to allow us to switch the implementation of `sin()` to
2621
// easily register NVPTX benchmarks.
27-
#define BM_RANDOM_INPUT(Func) \
28-
[]() { \
29-
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
30-
return LIBC_NAMESPACE::latency(Func, x); \
31-
}
32-
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
33-
34-
#define BM_TWO_PI(Func) \
22+
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
3523
[]() { \
36-
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
37-
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
24+
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
25+
Func, MIN_EXP, MAX_EXP); \
3826
}
39-
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
4027

41-
#define BM_LARGE_INT(Func) \
42-
[]() { \
43-
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
44-
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
45-
}
46-
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
47-
BM_LARGE_INT(LIBC_NAMESPACE::sin));
28+
// Registers four benchmarks for Func in the LlvmLibcSinGpuBenchmark suite:
// Name_1, Name_128, Name_1024 and Name_4096. The numeric suffix is the input
// count N handed to BM_RANDOM_INPUT; MIN_EXP and MAX_EXP are forwarded to it
// unchanged.
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
29+
BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
30+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
31+
BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
32+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
33+
BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
34+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
35+
BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
36+
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
37+
38+
BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
39+
BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
40+
BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
41+
BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
4842

4943
#ifdef NVPTX_MATH_FOUND
50-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
51-
BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
52-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
53-
BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
54-
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
55-
BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
44+
BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
45+
BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
46+
BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
47+
BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
5648
#endif
5749

5850
#ifdef AMDGPU_MATH_FOUND
59-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
60-
BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
61-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
62-
BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
63-
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
64-
BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
51+
BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
52+
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
53+
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
54+
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
6555
#endif

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1111

12+
#include "src/__support/CPP/array.h"
1213
#include "src/__support/CPP/type_traits.h"
1314
#include "src/__support/GPU/utils.h"
1415
#include "src/__support/common.h"
@@ -43,14 +44,13 @@ namespace LIBC_NAMESPACE_DECL {
4344
// Profile a simple function and obtain its latency in clock cycles on the
4445
// system. This function cannot be inlined or else it will disturb the very
4546
// delicate balance of hard-coded dependencies.
46-
template <typename F, typename T>
47-
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
48-
// We need to store the input somewhere to guarantee that the compiler
49-
// will not constant propagate it and remove the profiling region.
50-
volatile T storage = t;
51-
T arg = storage;
47+
template <typename F, typename T, size_t N>
48+
[[gnu::noinline]] static LIBC_INLINE uint64_t
49+
latency(F f, const cpp::array<T, N> &inputs) {
50+
// We need to store the input somewhere to guarantee that the compiler
51+
// will not constant propagate it and remove the profiling region.
5252

53-
FORCE_TO_REGISTER(T, arg);
53+
FORCE_TO_REGISTER(decltype(inputs), inputs);
5454

5555
// The AMDGPU architecture needs to wait on pending results.
5656
gpu::memory_fence();
@@ -59,16 +59,18 @@ template <typename F, typename T>
5959

6060
// This forces the compiler to load the input argument and run the clock
6161
// cycle counter before the profiling region.
62-
FORCE_TO_REGISTER(T, arg);
62+
FORCE_TO_REGISTER(decltype(inputs), inputs);
6363
asm("" ::"s"(start));
6464

6565
// Run the function under test and return its value.
66-
auto result = f(arg);
67-
68-
// This inline assembly performs a no-op which forces the result to both
69-
// be used and prevents us from exiting this region before it's complete.
70-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
71-
static_cast<uint32_t>(result)));
66+
for (auto input : inputs) {
67+
auto result = f(input);
68+
69+
// This inline assembly performs a no-op which forces the result to both
70+
// be used and prevents us from exiting this region before it's complete.
71+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
72+
static_cast<uint32_t>(result)));
73+
}
7274

7375
// Obtain the current timestamp after running the calculation and force
7476
// ordering.

0 commit comments

Comments
 (0)