Skip to content

[libc] Add Generic and NVPTX Sin Benchmark #99795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions libc/benchmarks/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ function(add_benchmark benchmark_name)
${BENCHMARK_LINK_LIBRARIES}
DEPENDS
libc.src.stdio.printf
libc.src.stdlib.srand
libc.src.stdlib.rand
${BENCHMARK_DEPENDS}
${BENCHMARK_UNPARSED_ARGUMENTS}
COMPILE_OPTIONS
Expand Down Expand Up @@ -52,13 +54,17 @@ add_unittest_framework_library(
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
libc.src.stdlib.rand
libc.src.stdlib.srand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
Expand Down
5 changes: 4 additions & 1 deletion libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/stdio/printf.h"
#include "src/stdlib/srand.h"
#include "src/time/gpu/time_utils.h"

namespace LIBC_NAMESPACE_DECL {
Expand Down Expand Up @@ -136,8 +137,10 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();

if (id == 0)
if (id == 0) {
print_header();
LIBC_NAMESPACE::srand(gpu::processor_clock());
}

gpu::sync_threads();

Expand Down
52 changes: 52 additions & 0 deletions libc/benchmarks/gpu/LibcGpuBenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@

#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/rand.h"
#include "src/time/clock.h"

#include <stdint.h>
Expand Down Expand Up @@ -105,6 +109,54 @@ class Benchmark {
return benchmark(options, func);
}
};

// We want our random values to be approximately
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
// 2^(max_exponent + 1)
template <typename T> static T get_rand_input() {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;

// Required to correctly instantiate FPBits for floats and doubles.
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
uint64_t, uint32_t>;
RandType bits;
if constexpr (cpp::is_same_v<T, uint64_t>)
bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
FPBits fp(bits);
fp.set_biased_exponent(
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
return fp.get_val();
}

// Helper for measuring the latency of a unary math function `T f(T)` over a
// sweep of inputs described by their raw bit patterns.
template <typename T> class MathPerf {
  using FPBits = fputil::FPBits<T>;
  using StorageType = typename FPBits::StorageType;
  static constexpr StorageType UIntMax =
      cpp::numeric_limits<StorageType>::max();

public:
  typedef T Func(T);

  // Calls LIBC_NAMESPACE::latency(f, x) for every x whose bit pattern is
  // starting_bit, starting_bit + step, ... while the pattern is strictly less
  // than ending_bit, and returns the mean of the reported latencies.
  //
  // A `step` of zero is treated as one. Returns 0 when the range is empty
  // (starting_bit >= ending_bit).
  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
                                    StorageType ending_bit, StorageType step) {
    if (step <= 0)
      step = 1;

    uint64_t total_time = 0;
    // Count the samples actually taken instead of deriving the count from
    // (ending_bit - starting_bit) / step + 1, which overcounts by one when the
    // range is an exact multiple of `step` and underflows for an empty range.
    uint64_t num_runs = 0;
    StorageType bits = starting_bit;
    while (bits < ending_bit) {
      T x = FPBits(bits).get_val();
      total_time += LIBC_NAMESPACE::latency(f, x);
      ++num_runs;
      // The next pattern would reach or pass ending_bit; stop here so that
      // `bits + step` can never wrap around the storage type.
      if (ending_bit - bits <= step)
        break;
      bits += step;
    }

    if (num_runs == 0)
      return 0;
    return total_time / num_runs;
  }
};

} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL

Expand Down
1 change: 1 addition & 0 deletions libc/benchmarks/gpu/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
add_subdirectory(ctype)
add_subdirectory(math)
32 changes: 32 additions & 0 deletions libc/benchmarks/gpu/src/math/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Target for the GPU math benchmarks; used as the SUITE of the benchmark
# registered below.
add_custom_target(libc-gpu-math-benchmarks)

# When the CUDA toolkit was found, look for the libdevice bitcode relative to
# the toolkit's bin directory. The two variables set here stay unset otherwise,
# so they expand to nothing in COMPILE_OPTIONS below.
if(CUDAToolkit_FOUND)
set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
if (EXISTS ${libdevice_path})
# Flags that hand the libdevice bitcode file to clang via
# -mlink-builtin-bitcode.
set(nvptx_bitcode_link_flags
"SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
# Compile definition needed so the benchmark knows to register
# NVPTX benchmarks.
set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
endif()
endif()

# Benchmark for sin(), built from sin_benchmark.cpp and run by the loader with
# `--threads 64`.
add_benchmark(
sin_benchmark
SUITE
libc-gpu-math-benchmarks
SRCS
sin_benchmark.cpp
DEPENDS
libc.src.math.sin
libc.src.stdlib.srand
libc.src.stdlib.rand
libc.src.__support.FPUtil.fp_bits
libc.src.__support.CPP.bit
libc.src.__support.CPP.array
COMPILE_OPTIONS
${nvptx_math_found}
${nvptx_bitcode_link_flags}
LOADER_ARGS
--threads 64
)
53 changes: 53 additions & 0 deletions libc/benchmarks/gpu/src/math/sin_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/math/sin.h"
#include "src/stdlib/rand.h"
#include "src/stdlib/srand.h"

#ifdef NVPTX_MATH_FOUND
#include "src/math/nvptx/declarations.h"
#endif

// Pi as a double; used to build the sweep bounds of the benchmarks below.
constexpr double M_PI = 3.14159265358979323846;

// Returns the 64-bit encoding of `x`, i.e. its bits reinterpreted as uint64_t.
uint64_t get_bits(double x) {
  const uint64_t encoding = LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
  return encoding;
}

// BENCHMARK() expects a function with no parameters that returns a uint64_t
// representing the latency. Each benchmark is defined through a macro that
// expands to a lambda, which lets us switch the implementation of `sin()` and
// easily register NVPTX benchmarks.
//
// BM_RANDOM_INPUT(Func): draws one double from get_rand_input<double>() and
// returns LIBC_NAMESPACE::latency(Func, x) for it.
#define BM_RANDOM_INPUT(Func) \
[]() { \
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
return LIBC_NAMESPACE::latency(Func, x); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));

// BM_TWO_PI(Func): returns MathPerf<double>::run_perf_in_range(Func, ...) with
// a start pattern of 0, an end pattern equal to the encoding of 2 * M_PI and a
// step equal to the encoding of M_PI / 64.
// NOTE(review): all three arguments are raw bit patterns, so the step is added
// to the encoding, not to the value; the sweep is therefore not a uniform walk
// over [0, 2*pi) in steps of pi/64 -- confirm this is intended.
#define BM_TWO_PI(Func) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));

// BM_LARGE_INT(Func): returns MathPerf<double>::run_perf_in_range(Func, ...)
// with a start pattern of 0, an end pattern equal to the encoding of
// double(1 << 30) and a step equal to the encoding of double(1 << 4). The int
// arguments are converted to double by get_bits's parameter before their
// encoding is taken.
// NOTE(review): as with BM_TWO_PI, the step is a raw bit pattern rather than a
// value increment -- confirm the resulting set of inputs is the intended one.
#define BM_LARGE_INT(Func) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::sin));

// Register the same three benchmarks against LIBC_NAMESPACE::__nv_sin. These
// are only compiled when NVPTX_MATH_FOUND is defined.
#ifdef NVPTX_MATH_FOUND
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
#endif
8 changes: 5 additions & 3 deletions libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ namespace LIBC_NAMESPACE_DECL {
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
uint32_t result = 0.0;
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
asm("" ::"s"(start));
uint64_t stop = gpu::processor_clock();
return stop - start;
Expand Down Expand Up @@ -67,7 +67,8 @@ template <typename F, typename T>

// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));

// Obtain the current timestamp after running the calculation and force
// ordering.
Expand Down Expand Up @@ -98,7 +99,8 @@ template <typename F, typename T1, typename T2>

auto result = f(arg1, arg2);

asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));

uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
Expand Down
6 changes: 3 additions & 3 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
uint64_t start = gpu::processor_clock();
asm("" ::"r"(y), "llr"(start));
uint32_t result = y;
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
volatile auto storage = result;
return stop - start;
Expand Down Expand Up @@ -57,7 +57,7 @@ template <typename F, typename T>

// This inline assembly performs a no-op which forces the result to both be
// used and prevents us from exiting this region before it's complete.
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));

// Obtain the current timestamp after running the calculation and force
// ordering.
Expand Down Expand Up @@ -85,7 +85,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {

auto result = f(arg, arg2);

asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));

uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
Expand Down