-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libc] Add Generic and NVPTX Sin Benchmark #99795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: None (jameshu15869) ChangesThis PR adds sin benchmarking for a range of values and on a pregenerated random distribution. Full diff: https://github.com/llvm/llvm-project/pull/99795.diff 6 Files Affected:
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 29e27724e1ab3..ba6958a0c68d5 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -47,13 +47,16 @@ add_unittest_framework_library(
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
+ libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
+ libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
+ libc.src.stdlib.rand
libc.benchmarks.gpu.timing.timing
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c926d8efd7db2..05a6621036b0b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -107,6 +107,9 @@ void print_results(Benchmark *b) {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
+ if (id == 0) {
+ LIBC_NAMESPACE::benchmarks::init_random_input();
+ }
gpu::sync_threads();
for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 29d7ba8b0a132..5d84959e17c4b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,13 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -102,6 +105,56 @@ class Benchmark {
return benchmark(options, func);
}
};
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+static constexpr int RANDOM_INPUT_SIZE = 1024;
+static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+
+static double get_rand() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+ uint64_t bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + MAX_EXPONENT / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+static void init_random_input() {
+ for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+ random_input[i] = get_rand();
+ }
+}
+
+template <typename T> class MathPerf {
+ using FPBits = fputil::FPBits<T>;
+ using StorageType = typename FPBits::StorageType;
+ static constexpr StorageType UIntMax =
+ cpp::numeric_limits<StorageType>::max();
+
+public:
+ typedef T Func(T);
+
+ static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+ StorageType ending_bit, StorageType step) {
+ uint64_t total_time = 0;
+ if (step <= 0)
+ step = 1;
+ volatile T result;
+ for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+ T x = FPBits(bits).get_val();
+ total_time += LIBC_NAMESPACE::latency(f, x);
+ }
+ StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+ return total_time / num_runs;
+ }
+};
+
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..2b27652e46ae9
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+ set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+ if (EXISTS ${libdevice_path})
+ set(nvptx_bitcode_link_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+ # Compile definition needed so the benchmark knows to register
+ # NVPTX benchmarks.
+ set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+ endif()
+endif()
+
+add_benchmark(
+ sin_benchmark
+ SUITE
+ libc-gpu-math-benchmarks
+ SRCS
+ sin_benchmark.cpp
+ DEPENDS
+ libc.src.math.sin
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.CPP.bit
+ COMPILE_OPTIONS
+ ${nvptx_math_found}
+ ${nvptx_bitcode_link_flags}
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..ac35e22b57287
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,55 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+ return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function that with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark using macro that
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func) \
+ []() { \
+ uint64_t total_time = 0; \
+ for (double i : LIBC_NAMESPACE::benchmarks::random_input) { \
+ total_time += LIBC_NAMESPACE::latency(Func, i); \
+ } \
+ return total_time / LIBC_NAMESPACE::benchmarks::random_input.size(); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+ BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+ BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Neat, it's interesting to get very specific timings against the vendor implementations.
@lntue do you know if anything here is common with the math stuff you already have written?
|
||
static double get_rand() { | ||
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>; | ||
uint64_t bits = LIBC_NAMESPACE::rand(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you want the results to be truly random we can probably just seed the RNG with the low bits of get_processor_clock()
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also wonder if we should make a reeantrant version of rand
so we can have a state private to each thread instead of the shared state.
I.e.
val = rand_r(&state)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think for math function tests we really care too much about real random, as long as it shuffled enough
static double get_rand() { | ||
using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>; | ||
uint64_t bits = LIBC_NAMESPACE::rand(); | ||
double scale = 0.5 + MAX_EXPONENT / 2048.0; | ||
FPBits fp(bits); | ||
fp.set_biased_exponent( | ||
static_cast<uint32_t>(fp.get_biased_exponent() * scale)); | ||
return fp.get_val(); | ||
} | ||
|
||
static void init_random_input() { | ||
LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock()); | ||
for (int i = 0; i < RANDOM_INPUT_SIZE; i++) { | ||
random_input[i] = get_rand(); | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I figured this would be handled in the benchmark itself, like we do it before the latency calculation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, yeah that makes sense. I was thinking it might reduce duplication if we have to do the same thing for multiple benchmarks, but I think you're right about moving it to the benchmark itself
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But now that I think about it, what was the way to share the array between all threads without using a global? I'm assuming that we want all threads to see the same random input, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can put it as a utility in the header.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean keep the array in the header but call init_random_array()
from the benchmark itself?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I just mean if you want a utility to get some random input, just make it a function in the header and have the benchmarks call it.
53030fc
to
81f86e5
Compare
81f86e5
to
a0f1905
Compare
template <size_t Size> | ||
static void init_random_double_input(cpp::array<double, Size> &values) { | ||
LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock()); | ||
for (int i = 0; i < Size; i++) { | ||
values[i] = get_rand_double(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't used anywhere, right? Can probably just drop it for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, I missed this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might want to move this srand()
call into the benchmark somewhere when we do other init stuff.
// The largest integer that can be stored in a double is 2^53 | ||
static constexpr int MAX_EXPONENT = 52; | ||
|
||
static double get_rand_double() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should probably template this because we'll want the same treatment for sin
and sinf
and sinf16
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since floats should have a different MAX_EXPONENT, how would you suggest we template this? Is there a nice way to switch the value of MAX_EXPONENT
when using floats and doubles?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FPBits<T>::FRACTION_LEN
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I forget the name of it, but it's standard type_traits
stuff, see https://godbolt.org/z/4TGrvadYY.
// Required to correctly instantiate FPBits for floats and doubles. | ||
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>), | ||
uint64_t, uint32_t>; | ||
RandType bits = LIBC_NAMESPACE::rand(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rand()
always returns an int, so it's always 32-bit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I was having some problems when trying to instantiate FPBits
where it would only accept uint64_t
for doubles and uint32_t
for floats. Are you able to check if you can reproduce this locally?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if that means we need to call rand
twice to get an actual 64-bit value... @lntue might have some better insight here. Maybe we just need a better reentrant random function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess we can put a call to srand(gpu::processor_clock())
inside of the benchmark registration code.
Ah, did you want to run |
|
Conceptually, should each individual benchmark (e.g. |
Just call it once when we initialize the benchmarks, the sequence will then effectively be "random" whenever any other thread gets a value from the sequence. |
I'm thinking about having thread 0 call |
Pretty much like this. diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index a9a912538cd8..2574eb46f160 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -136,8 +136,10 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
- if (id == 0)
+ if (id == 0) {
+ srand(gpu::processor_clock());
print_header();
+ }
gpu::sync_threads(); |
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :); | ||
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( | ||
static_cast<uint32_t>(result)) | ||
:); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this colon required?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, I swear I removed this already
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG, thanks. Looking forward to getting this working for AMD and then testing a few more functions.
This PR adds sin benchmarking for a range of values and on a pregenerated random distribution.