Skip to content

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions libc/benchmarks/gpu/LibcGpuBenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace benchmarks {

struct BenchmarkOptions {
uint32_t initial_iterations = 1;
uint32_t min_iterations = 50;
uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
Expand Down Expand Up @@ -111,9 +111,15 @@ class Benchmark {
};

// We want our random values to be approximately
// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
// 2^(max_exponent + 1)
template <typename T> static T get_rand_input() {
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
// Caveats:
// -EXP_BIAS corresponds to denormal values,
// EXP_BIAS + 1 corresponds to inf or nan.
template <typename T>
static T
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;

// Required to correctly instantiate FPBits for floats and doubles.
Expand All @@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
double scale =
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
static_cast<uint32_t>(fp.get_biased_exponent() * scale));
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}

Expand All @@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);

static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
StorageType ending_bit, StorageType step) {
uint64_t total_time = 0;
if (step <= 0)
step = 1;
volatile T result;
for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
T x = FPBits(bits).get_val();
total_time += LIBC_NAMESPACE::latency(f, x);
}
StorageType num_runs = (ending_bit - starting_bit) / step + 1;

return total_time / num_runs;
// Fills an array with N inputs from get_rand_input<T>, using min_exp and
// max_exp as the exponent bounds, times f over the whole array with
// LIBC_NAMESPACE::latency, and returns the total time divided by N.
//
// f:       the function under test.
// min_exp: lower exponent bound handed to get_rand_input.
// max_exp: upper exponent bound handed to get_rand_input.
template <size_t N = 1>
static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
  cpp::array<T, N> inputs;
  // get_rand_input is declared as (max_exp, min_exp), in that order, so the
  // bounds have to be passed upper-first.
  for (size_t i = 0; i < N; ++i)
    inputs[i] = get_rand_input<T>(max_exp, min_exp);

  uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);

  return total_time / N;
}
};

Expand All @@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())

#endif
60 changes: 25 additions & 35 deletions libc/benchmarks/gpu/src/math/sin_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif

constexpr double M_PI = 3.14159265358979323846;
uint64_t get_bits(double x) {
return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
}

// BENCHMARK() expects a function with no parameters that returns a
// uint64_t representing the latency. Each benchmark is defined using a macro
// that expands to a lambda, which lets us switch the implementation of `sin()`
// and easily register NVPTX benchmarks.
#define BM_RANDOM_INPUT(Func) \
[]() { \
double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
return LIBC_NAMESPACE::latency(Func, x); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));

#define BM_TWO_PI(Func) \
#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
Func, MIN_EXP, MAX_EXP); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));

#define BM_LARGE_INT(Func) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
}
BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::sin));
// Registers four SINGLE_WAVE_BENCHMARK entries in the LlvmLibcSinGpuBenchmark
// suite for Func, named Name_1, Name_128, Name_1024 and Name_4096. The numeric
// suffix is the N given to BM_RANDOM_INPUT; MIN_EXP and MAX_EXP are forwarded
// to BM_RANDOM_INPUT unchanged. The expansion has no trailing semicolon, so
// the caller supplies it.
#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))

// LIBC_NAMESPACE::sin benchmark groups. The last two arguments of each BENCH
// are (MIN_EXP, MAX_EXP).
// NOTE(review): the group names suggest the intended input magnitudes (for
// example SinTwoPi with exponents -10..3) -- confirm against get_rand_input.
BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);

#ifdef NVPTX_MATH_FOUND
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif

#ifdef AMDGPU_MATH_FOUND
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
#endif
4 changes: 4 additions & 0 deletions libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.array
)
51 changes: 35 additions & 16 deletions libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,10 @@
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include "src/__support/CPP/array.h"

#include <stdint.h>

// AMDGPU does not support input register constraints for i1 and i8, so we must
// cast them to uint16_t's before loading them into registers.
#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
else \
asm("" ::"v"(VARIABLE))

namespace LIBC_NAMESPACE_DECL {

// Returns the overhead associated with calling the profiling region. This
Expand All @@ -50,7 +43,8 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;

FORCE_TO_REGISTER(T, arg);
// VGPR constraints can only accept primitive values.
asm("" ::"v"(&arg));

// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
Expand All @@ -59,8 +53,7 @@ template <typename F, typename T>

// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
FORCE_TO_REGISTER(T, arg);
asm("" ::"s"(start));
asm("" ::"s"(start), "v"(&arg));

// Run the function under test and return its value.
auto result = f(arg);
Expand All @@ -87,15 +80,12 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;

FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);
asm("" ::"v"(&arg1), "v"(&arg2));

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);
asm("" ::"s"(start));
asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));

auto result = f(arg1, arg2);

Expand All @@ -109,6 +99,35 @@ template <typename F, typename T1, typename T2>
return stop - start;
}

// Provides throughput benchmarking.
//
// Calls `f` once on every element of `inputs`, back to back, and returns the
// difference between two gpu::processor_clock() readings taken around the
// whole loop. The returned value is the total for all N calls, not a per-call
// figure.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
latency(F f, const cpp::array<T, N> &inputs) {
// The array address is round-tripped through a volatile object and then
// used as a "v" (VGPR) asm input; presumably this stops the compiler from
// reasoning about the input values ahead of time.
volatile auto storage = &inputs;
auto array_pointer = storage;
asm("" ::"v"(array_pointer));
// Copy the array into a local. In source order this happens before the
// first clock read.
auto register_array = *array_pointer;

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// Empty asm taking both the start timestamp and the array pointer as inputs.
asm("" ::"s"(start), "v"(array_pointer));

for (auto input : register_array) {
auto result = f(input);

// Pass the result to an asm statement that ORs it with 0; presumably this
// keeps the call to `f` from being discarded as unused.
// NOTE(review): static_cast<uint32_t> is a value conversion, not a bit
// cast. When `f` returns a floating-point value, confirm this is intended.
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
static_cast<uint32_t>(result)));
}

uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
gpu::memory_fence();

// Return the time elapsed.
return stop - start;
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
4 changes: 4 additions & 0 deletions libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.array
)
32 changes: 32 additions & 0 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
Expand Down Expand Up @@ -94,6 +96,36 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {

return stop - start;
}

// Provides throughput benchmarking.
//
// Calls `f` once on every element of `inputs`, back to back, and returns the
// difference between two gpu::processor_clock() readings taken around the
// whole loop. The returned value is the total for all N calls, not a per-call
// figure.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t
latency(F f, const cpp::array<T, N> &inputs) {
// The array address is round-tripped through a volatile object and then
// used as an "r" asm input; presumably this stops the compiler from
// reasoning about the input values ahead of time.
volatile auto storage = &inputs;
auto array_pointer = storage;
asm("" ::"r"(array_pointer));
// Copy the array into a local. In source order this happens before the
// first clock read.
auto register_array = *array_pointer;

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// Empty asm taking both the array pointer and the start timestamp as inputs.
asm("" ::"r"(array_pointer), "llr"(start));

// NOTE(review): `result` is a uint64_t, so whatever `f` returns is converted
// to uint64_t on assignment. It is also left uninitialized if the loop body
// never runs -- confirm N == 0 cannot be instantiated.
uint64_t result;
for (auto input : register_array) {
// Use each element as an asm input before it is passed to `f`.
asm("" ::"r"(input));
result = f(input);
// Pass the result to an asm statement that ORs it with 0; presumably this
// keeps the call to `f` from being discarded as unused.
// NOTE(review): the instruction is or.b32 while the operand is a 64-bit
// variable -- confirm the operand width is what was intended.
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
}

uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
asm("" ::"r"(stop));
// Store the last result into a volatile local after the second clock read.
volatile auto output = result;

// Return the time elapsed.
return stop - start;
}
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
Loading