Skip to content

Commit 677796c

Browse files
authored
[libc] Add Generic and NVPTX Sin Benchmark (#99795)
This PR adds sin benchmarking for a range of values and on a pregenerated random distribution.
1 parent 6d02f62 commit 677796c

File tree

8 files changed

+156
-7
lines changed

8 files changed

+156
-7
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ function(add_benchmark benchmark_name)
2222
${BENCHMARK_LINK_LIBRARIES}
2323
DEPENDS
2424
libc.src.stdio.printf
25+
libc.src.stdlib.srand
26+
libc.src.stdlib.rand
2527
${BENCHMARK_DEPENDS}
2628
${BENCHMARK_UNPARSED_ARGUMENTS}
2729
COMPILE_OPTIONS
@@ -52,13 +54,17 @@ add_unittest_framework_library(
5254
libc.src.__support.CPP.limits
5355
libc.src.__support.CPP.algorithm
5456
libc.src.__support.CPP.atomic
57+
libc.src.__support.CPP.array
5558
libc.src.__support.fixed_point.fx_rep
5659
libc.src.__support.macros.properties.types
5760
libc.src.__support.OSUtil.osutil
5861
libc.src.__support.uint128
62+
libc.src.__support.FPUtil.fp_bits
5963
libc.src.__support.FPUtil.sqrt
6064
libc.src.__support.fixedvector
6165
libc.src.time.clock
66+
libc.src.stdlib.rand
67+
libc.src.stdlib.srand
6268
libc.benchmarks.gpu.timing.timing
6369
libc.src.stdio.printf
6470
)

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "src/__support/fixedvector.h"
99
#include "src/__support/macros/config.h"
1010
#include "src/stdio/printf.h"
11+
#include "src/stdlib/srand.h"
1112
#include "src/time/gpu/time_utils.h"
1213

1314
namespace LIBC_NAMESPACE_DECL {
@@ -136,8 +137,10 @@ void print_header() {
136137
void Benchmark::run_benchmarks() {
137138
uint64_t id = gpu::get_thread_id();
138139

139-
if (id == 0)
140+
if (id == 0) {
140141
print_header();
142+
LIBC_NAMESPACE::srand(gpu::processor_clock());
143+
}
141144

142145
gpu::sync_threads();
143146

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33

44
#include "benchmarks/gpu/BenchmarkLogger.h"
55
#include "benchmarks/gpu/timing/timing.h"
6+
#include "src/__support/CPP/array.h"
67
#include "src/__support/CPP/functional.h"
78
#include "src/__support/CPP/limits.h"
89
#include "src/__support/CPP/string_view.h"
10+
#include "src/__support/CPP/type_traits.h"
11+
#include "src/__support/FPUtil/FPBits.h"
912
#include "src/__support/macros/config.h"
13+
#include "src/stdlib/rand.h"
1014
#include "src/time/clock.h"
1115

1216
#include <stdint.h>
@@ -105,6 +109,54 @@ class Benchmark {
105109
return benchmark(options, func);
106110
}
107111
};
112+
113+
// We want our random values to be approximately
114+
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
115+
// 2^(max_exponent + 1)
116+
template <typename T> static T get_rand_input() {
117+
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
118+
119+
// Required to correctly instantiate FPBits for floats and doubles.
120+
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
121+
uint64_t, uint32_t>;
122+
RandType bits;
123+
if constexpr (cpp::is_same_v<T, uint64_t>)
124+
bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
125+
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
126+
else
127+
bits = LIBC_NAMESPACE::rand();
128+
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
129+
FPBits fp(bits);
130+
fp.set_biased_exponent(
131+
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
132+
return fp.get_val();
133+
}
134+
135+
template <typename T> class MathPerf {
136+
using FPBits = fputil::FPBits<T>;
137+
using StorageType = typename FPBits::StorageType;
138+
static constexpr StorageType UIntMax =
139+
cpp::numeric_limits<StorageType>::max();
140+
141+
public:
142+
typedef T Func(T);
143+
144+
static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
145+
StorageType ending_bit, StorageType step) {
146+
uint64_t total_time = 0;
147+
if (step <= 0)
148+
step = 1;
149+
volatile T result;
150+
for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
151+
T x = FPBits(bits).get_val();
152+
total_time += LIBC_NAMESPACE::latency(f, x);
153+
}
154+
StorageType num_runs = (ending_bit - starting_bit) / step + 1;
155+
156+
return total_time / num_runs;
157+
}
158+
};
159+
108160
} // namespace benchmarks
109161
} // namespace LIBC_NAMESPACE_DECL
110162

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
add_subdirectory(ctype)
2+
add_subdirectory(math)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Aggregate target; the sin benchmark below is registered under it via SUITE.
add_custom_target(libc-gpu-math-benchmarks)
2+
3+
# When the CUDA toolkit was found, look for the libdevice bitcode relative to
# the toolkit's bin directory. If the file exists, prepare the flags that link
# it in as builtin bitcode and the define that enables the NVPTX benchmarks.
if(CUDAToolkit_FOUND)
4+
set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
5+
if (EXISTS ${libdevice_path})
6+
set(nvptx_bitcode_link_flags
7+
"SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
8+
# Compile definition needed so the benchmark knows to register
9+
# NVPTX benchmarks.
10+
set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
11+
endif()
12+
endif()
13+
14+
# Register the sin benchmark. The two NVPTX variables are only set above when
# libdevice was located, so they expand to nothing otherwise.
add_benchmark(
15+
sin_benchmark
16+
SUITE
17+
libc-gpu-math-benchmarks
18+
SRCS
19+
sin_benchmark.cpp
20+
DEPENDS
21+
libc.src.math.sin
22+
libc.src.stdlib.srand
23+
libc.src.stdlib.rand
24+
libc.src.__support.FPUtil.fp_bits
25+
libc.src.__support.CPP.bit
26+
libc.src.__support.CPP.array
27+
COMPILE_OPTIONS
28+
${nvptx_math_found}
29+
${nvptx_bitcode_link_flags}
30+
LOADER_ARGS
31+
--threads 64
32+
)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#include "benchmarks/gpu/LibcGpuBenchmark.h"
2+
3+
#include "src/__support/CPP/array.h"
4+
#include "src/__support/CPP/bit.h"
5+
#include "src/__support/CPP/functional.h"
6+
#include "src/__support/FPUtil/FPBits.h"
7+
#include "src/math/sin.h"
8+
#include "src/stdlib/rand.h"
9+
#include "src/stdlib/srand.h"
10+
11+
#ifdef NVPTX_MATH_FOUND
12+
#include "src/math/nvptx/declarations.h"
13+
#endif
14+
15+
constexpr double M_PI = 3.14159265358979323846;
16+
uint64_t get_bits(double x) {
17+
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
18+
}
19+
20+
// BENCHMARK() expects a function that with no parameters that returns a
21+
// uint64_t representing the latency. Defining each benchmark using macro that
22+
// expands to a lambda to allow us to switch the implementation of `sin()` to
23+
// easily register NVPTX benchmarks.
24+
#define BM_RANDOM_INPUT(Func) \
25+
[]() { \
26+
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
27+
return LIBC_NAMESPACE::latency(Func, x); \
28+
}
29+
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
30+
31+
#define BM_TWO_PI(Func) \
32+
[]() { \
33+
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
34+
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
35+
}
36+
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
37+
38+
#define BM_LARGE_INT(Func) \
39+
[]() { \
40+
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
41+
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
42+
}
43+
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
44+
BM_LARGE_INT(LIBC_NAMESPACE::sin));
45+
46+
// Register the same three benchmarks against `__nv_sin`. This section is only
// compiled when NVPTX_MATH_FOUND is defined.
#if defined(NVPTX_MATH_FOUND)
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
#endif // defined(NVPTX_MATH_FOUND)

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ namespace LIBC_NAMESPACE_DECL {
3434
gpu::memory_fence();
3535
uint64_t start = gpu::processor_clock();
3636
uint32_t result = 0.0;
37-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
37+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
3838
asm("" ::"s"(start));
3939
uint64_t stop = gpu::processor_clock();
4040
return stop - start;
@@ -67,7 +67,8 @@ template <typename F, typename T>
6767

6868
// This inline assembly performs a no-op which forces the result to both
6969
// be used and prevents us from exiting this region before it's complete.
70-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
70+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
71+
static_cast<uint32_t>(result)));
7172

7273
// Obtain the current timestamp after running the calculation and force
7374
// ordering.
@@ -98,7 +99,8 @@ template <typename F, typename T1, typename T2>
9899

99100
auto result = f(arg1, arg2);
100101

101-
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
102+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
103+
static_cast<uint32_t>(result)));
102104

103105
uint64_t stop = gpu::processor_clock();
104106
asm("" ::"s"(stop));

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
2727
uint64_t start = gpu::processor_clock();
2828
asm("" ::"r"(y), "llr"(start));
2929
uint32_t result = y;
30-
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
30+
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
3131
uint64_t stop = gpu::processor_clock();
3232
volatile auto storage = result;
3333
return stop - start;
@@ -57,7 +57,7 @@ template <typename F, typename T>
5757

5858
// This inline assembly performs a no-op which forces the result to both be
5959
// used and prevents us from exiting this region before it's complete.
60-
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
60+
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
6161

6262
// Obtain the current timestamp after running the calculation and force
6363
// ordering.
@@ -85,7 +85,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
8585

8686
auto result = f(arg, arg2);
8787

88-
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
88+
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
8989

9090
uint64_t stop = gpu::processor_clock();
9191
gpu::memory_fence();

0 commit comments

Comments
 (0)