-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: None (jameshu15869) Changes: This PR implements lntue@2a15842 to provide better throughput benchmarking for libc `sin()` and `__nv_sin()`. These changes have not been tested on AMDGPU yet, only compiled. Full diff: https://github.com/llvm/llvm-project/pull/101917.diff 6 Files Affected:
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2b85b146ed745..39e4a6e9e0152 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
- uint32_t min_iterations = 50;
+ uint32_t min_iterations = 1;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
};
// We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+// -EXP_BIAS corresponding to denormal values,
+// EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+ int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
- double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+ double scale =
+ static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
- static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
public:
typedef T Func(T);
- static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
- StorageType ending_bit, StorageType step) {
- uint64_t total_time = 0;
- if (step <= 0)
- step = 1;
- volatile T result;
- for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
- T x = FPBits(bits).get_val();
- total_time += LIBC_NAMESPACE::latency(f, x);
- }
- StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
- return total_time / num_runs;
+ template <size_t N = 1>
+ static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+ cpp::array<T, N> inputs;
+ for (size_t i = 0; i < N; ++i)
+ inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+ uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+
+ return total_time / N;
}
};
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
-
#endif
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 5849ea3e99bb0..03f824deae6a5 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
#include "src/math/amdgpu/declarations.h"
#endif
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
- return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
// BENCHMARK() expects a function that with no parameters that returns a
// uint64_t representing the latency. Defining each benchmark using macro that
// expands to a lambda to allow us to switch the implementation of `sin()` to
// easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func) \
- []() { \
- double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
- return LIBC_NAMESPACE::latency(Func, x); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-
-#define BM_TWO_PI(Func) \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \
[]() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
+ Func, MIN_EXP, MAX_EXP); \
}
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
-#define BM_LARGE_INT(Func) \
- []() { \
- return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
- Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
- }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \
+ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \
+ BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
#ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
#endif
#ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
- BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
- BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
- BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index 179429db9a09a..aa5dcd33bee9c 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e308d619e9569..e53eb25f83930 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -14,17 +14,10 @@
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
+#include "src/__support/CPP/array.h"
#include <stdint.h>
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
- if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
- asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
- else \
- asm("" ::"v"(VARIABLE))
-
namespace LIBC_NAMESPACE_DECL {
// Returns the overhead associated with calling the profiling region. This
@@ -50,7 +43,8 @@ template <typename F, typename T>
volatile T storage = t;
T arg = storage;
- FORCE_TO_REGISTER(T, arg);
+ // VGPR constraints can only accept primitive values.
+ asm("" ::"v"(&arg));
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
@@ -59,8 +53,7 @@ template <typename F, typename T>
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
- FORCE_TO_REGISTER(T, arg);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg));
// Run the function under test and return its value.
auto result = f(arg);
@@ -87,15 +80,12 @@ template <typename F, typename T1, typename T2>
T1 arg1 = storage1;
T2 arg2 = storage2;
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
+ asm("" ::"v"(&arg1), "v"(&arg2));
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
- FORCE_TO_REGISTER(T1, arg1);
- FORCE_TO_REGISTER(T2, arg2);
- asm("" ::"s"(start));
+ asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
auto result = f(arg1, arg2);
@@ -109,6 +99,35 @@ template <typename F, typename T1, typename T2>
return stop - start;
}
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"v"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"s"(start), "v"(array_pointer));
+
+ for (auto input : register_array) {
+ auto result = f(input);
+
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 9958e16206a41..2723c8940814c 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
timing.h
DEPENDS
libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.attributes
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.array
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index b426dfd0ea153..dee8d6ea41f47 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
@@ -94,6 +96,36 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+ volatile auto storage = &inputs;
+ auto array_pointer = storage;
+ asm("" ::"r"(array_pointer));
+ auto register_array = *array_pointer;
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock();
+
+ asm("" ::"r"(array_pointer), "llr"(start));
+
+ uint64_t result;
+ for (auto input : register_array) {
+ asm("" ::"r"(input));
+ result = f(input);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+ }
+
+ uint64_t stop = gpu::processor_clock();
+ gpu::memory_fence();
+ asm("" ::"r"(stop));
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering if we shouldn't have separate functions for throughput and latency. We likely want to keep the old assembly constraints for the latency checks, but can use something different if we put it in an array.
Also, @lntue, is it necessary to even use a loop? If we want strict throughput couldn't we just do something like
#pragma unroll
for (int i = 0; i < DEPTH; ++i) {
auto x = fn(input);
asm("" : "r"(input) ::); // Probably need to trick the compiler into thinking this changed.
}
I don't think I changed the existing latency constraints; the new constraints in this diff are for an overloaded `latency()` function. EDIT: Sorry, I got confused - I did make some unnecessary changes to the latency asm constraints. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I see you added new ones called `latency` as well. Somewhat surprised the type deduction is working here if so.
Yeah, I think the original intent of changing the constraints on AMDGPU was that I realized using the pointer might be possible instead of having to do that |
I'll need to double check the ASM generated there, I forget if capturing the pointer prevented it from carrying the input in a register. |
This PR implements lntue@2a15842 to provide better throughput benchmarking for libc `sin()` and `__nv_sin()`. These changes have not been tested on AMDGPU yet, only compiled.