Skip to content

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions libc/benchmarks/gpu/LibcGpuBenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace benchmarks {

struct BenchmarkOptions {
uint32_t initial_iterations = 1;
uint32_t min_iterations = 50;
uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
Expand Down Expand Up @@ -111,9 +111,15 @@ class Benchmark {
};

// We want our random values to be approximately
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
// 2^(max_exponent + 1)
template <typename T> static T get_rand_input() {
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
// Caveats:
// -EXP_BIAS corresponds to denormal values,
// EXP_BIAS + 1 corresponds to inf or nan.
template <typename T>
static T
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;

// Required to correctly instantiate FPBits for floats and doubles.
Expand All @@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
double scale =
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}

Expand All @@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);

static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
StorageType ending_bit, StorageType step) {
uint64_t total_time = 0;
if (step <= 0)
step = 1;
volatile T result;
for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
T x = FPBits(bits).get_val();
total_time += LIBC_NAMESPACE::latency(f, x);
}
StorageType num_runs = (ending_bit - starting_bit) / step + 1;

return total_time / num_runs;
// Times `f` over N random inputs and returns the average time per call.
//
// Template parameters:
//   N       - number of random inputs to generate and run; must be > 0.
// Parameters:
//   f       - the function under test.
//   min_exp - lower exponent bound forwarded to get_rand_input.
//   max_exp - upper exponent bound forwarded to get_rand_input.
// Returns the value reported by LIBC_NAMESPACE::throughput for all N calls,
// divided by N (integer division).
template <size_t N = 1>
static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
  // Guards the division by N below.
  static_assert(N > 0, "run_throughput_in_range requires at least one input");

  cpp::array<T, N> inputs;
  // get_rand_input is declared as (max_exp, min_exp), in that order, so the
  // bounds must be passed upper-first.
  for (size_t i = 0; i < N; ++i)
    inputs[i] = get_rand_input<T>(max_exp, min_exp);

  uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);

  return total_time / N;
}
};

Expand All @@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())

#endif
60 changes: 25 additions & 35 deletions libc/benchmarks/gpu/src/math/sin_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif

constexpr double M_PI = 3.14159265358979323846;
uint64_t get_bits(double x) {
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
}

// BENCHMARK() expects a function with no parameters that returns a uint64_t
// representing the latency. Each benchmark is defined using a macro that
// expands to a lambda, which lets us switch the implementation of `sin()` and
// easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
return LIBC_NAMESPACE::latency(Func, x); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));

#define BM_TWO_PI(Func) \
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
return LIBC_NAMESPACE::benchmarks::MathPerf< \
double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));

#define BM_LARGE_INT(Func) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::sin));
// Registers four SINGLE_WAVE_BENCHMARK entries in the LlvmLibcSinGpuBenchmark
// suite for one function and exponent range. The entries are named Name_1,
// Name_128, Name_1024 and Name_4096; the suffix is the N passed to
// BM_RANDOM_INPUT. The expansion deliberately omits the final semicolon so
// that a use site reads `BENCH(...);`.
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))

// Benchmarks for LIBC_NAMESPACE::sin over four exponent ranges. The last two
// arguments are (MIN_EXP, MAX_EXP).
BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);

#ifdef NVPTX_MATH_FOUND
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif

#ifdef AMDGPU_MATH_FOUND
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
#endif
4 changes: 4 additions & 0 deletions libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.array
)
61 changes: 41 additions & 20 deletions libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
Expand All @@ -17,14 +18,6 @@

#include <stdint.h>

// AMDGPU does not support input register constraints for i1 and i8, so we must
// cast them to uint16_t's before loading them into registers.
#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
else \
asm("" ::"v"(VARIABLE))

namespace LIBC_NAMESPACE_DECL {

// Returns the overhead associated with calling the profiling region. This
Expand All @@ -50,25 +43,29 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;

FORCE_TO_REGISTER(T, arg);

// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
// Get the current timestamp from the clock.
uint64_t start = gpu::processor_clock();

// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
FORCE_TO_REGISTER(T, arg);
asm("" ::"s"(start));

// Run the function under test and return its value.
auto result = f(arg);

// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));
if constexpr (cpp::is_same_v<decltype(result), char> ||
cpp::is_same_v<decltype(result), bool>)
// AMDGPU does not support input register constraints for i1 and i8, so we
// cast it to a 32-bit integer. This does not add an additional assembly
// instruction (https://godbolt.org/z/zxGqv8G91).
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));
else
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));

// Obtain the current timestamp after running the calculation and force
// ordering.
Expand All @@ -87,20 +84,19 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;

FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);
asm("" ::"s"(start));

auto result = f(arg1, arg2);

asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));
if constexpr (cpp::is_same_v<decltype(result), char> ||
cpp::is_same_v<decltype(result), bool>)
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));
else
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));

uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
Expand All @@ -109,6 +105,31 @@ template <typename F, typename T1, typename T2>
return stop - start;
}

// Provides throughput benchmarking.
//
// Calls `f` once for every element of `inputs` between two reads of
// gpu::processor_clock() and returns the difference between those two reads.
// The returned value covers all N calls together; it is not a per-call
// average.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
throughput(F f, const cpp::array<T, N> &inputs) {
// Empty asm that takes the address of the input array as an operand.
// NOTE(review): presumably keeps the compiler from optimizing the inputs
// away before the timed region - confirm.
asm("" ::"v"(&inputs));

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// Empty asm that takes the start timestamp as an operand, placed before the
// timed loop.
asm("" ::"s"(start));

for (auto input : inputs) {
auto result = f(input);

// Empty asm that takes each result as an operand so every call's result is
// used.
asm("" ::"v"(result));
}

uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
gpu::memory_fence();

// Return the time elapsed.
return stop - start;
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
4 changes: 4 additions & 0 deletions libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.array
)
37 changes: 32 additions & 5 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
Expand All @@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
volatile uint32_t x = 1;
uint32_t y = x;
uint64_t start = gpu::processor_clock();
asm("" ::"r"(y), "llr"(start));
asm("" ::"llr"(start));
uint32_t result = y;
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
Expand All @@ -42,15 +44,14 @@ template <typename F, typename T>
// not constant propagate it and remove the profiling region.
volatile T storage = t;
T arg = storage;
asm("" ::"r"(arg));

// Get the current timestamp from the clock.
gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// This forces the compiler to load the input argument and run the clock cycle
// counter before the profiling region.
asm("" ::"r"(arg), "llr"(start));
asm("" ::"llr"(start));

// Run the function under test and return its value.
auto result = f(arg);
Expand All @@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T2 storage2 = t2;
T1 arg = storage;
T2 arg2 = storage2;
asm("" ::"r"(arg), "r"(arg2));

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

asm("" ::"r"(arg), "r"(arg2), "llr"(start));
asm("" ::"llr"(start));

auto result = f(arg, arg2);

Expand All @@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {

return stop - start;
}

// Provides throughput benchmarking.
//
// Calls `f` once for every element of `inputs` between two reads of
// gpu::processor_clock() and returns the difference between those two reads.
// The returned value covers all N calls together; it is not a per-call
// average.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
throughput(F f, const cpp::array<T, N> &inputs) {
// Empty asm that takes the address of the input array as an operand.
// NOTE(review): presumably keeps the compiler from optimizing the inputs
// away before the timed region - confirm.
asm("" ::"r"(&inputs));

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// Empty asm that takes the start timestamp as an operand, placed before the
// timed loop.
asm("" ::"llr"(start));

// NOTE(review): `result` is a uint64_t regardless of what `f` returns, so the
// return value of `f` is converted on assignment inside the timed loop. It is
// also left uninitialized if `inputs` is empty (N == 0), yet it is read after
// the loop - confirm that N > 0 is guaranteed by callers.
uint64_t result;
for (auto input : inputs) {
// NOTE(review): the "r" constraint is applied to a value of type T here and
// to 64-bit values below - verify this is valid for every T that is
// benchmarked.
asm("" ::"r"(input));
result = f(input);
asm("" ::"r"(result));
}

uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
asm("" ::"r"(stop));
// Volatile copy of the last result so that it is stored after the timed
// region.
volatile auto output = result;

// Return the time elapsed.
return stop - start;
}
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
Loading