[libc][gpu] More benchmark for GPUs.

lntue · lntue · commit 2a158426d4b9 · 2024-08-02T00:04:22.000-04:00
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -111,9 +111,15 @@ class Benchmark {
 };
 
 // We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
+// Caveats:
+//   an exponent of -EXP_BIAS corresponds to denormal values,
+//   an exponent of EXP_BIAS + 1 corresponds to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
 
   // Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,16 @@ template <typename T> static T get_rand_input() {
            static_cast<uint64_t>(LIBC_NAMESPACE::rand());
   else
     bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+  // The raw biased exponent takes 2 * EXP_BIAS + 2 distinct values; squeeze
+  // them into the (max_exp - min_exp + 1) requested exponents so that the
+  // largest raw value still lands on max_exp rather than max_exp + 1.
+  double scale =
+      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 2);
   FPBits fp(bits);
+  // min_exp is an unbiased exponent, so re-apply the bias before storing it;
+  // with min_exp >= -EXP_BIAS the value cast below is never negative.
   fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp +
+                            FPBits::EXP_BIAS));
   return fp.get_val();
 }
 
@@ -141,19 +148,19 @@ template <typename T> class MathPerf {
 public:
   typedef T Func(T);
 
-  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
-                                    StorageType ending_bit, StorageType step) {
-    uint64_t total_time = 0;
-    if (step <= 0)
-      step = 1;
-    volatile T result;
-    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
-      T x = FPBits(bits).get_val();
-      total_time += LIBC_NAMESPACE::latency(f, x);
-    }
-    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
-    return total_time / num_runs;
+  // Returns the average latency of `f` over N random inputs whose exponents
+  // lie in [min_exp, max_exp].
+  template <size_t N = 1>
+  static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+    cpp::array<T, N> inputs;
+    // Note the parameter order: get_rand_input takes (max_exp, min_exp).
+    for (size_t i = 0; i < N; ++i)
+      inputs[i] = get_rand_input<T>(max_exp, min_exp);
+
+    // latency() receives the whole batch; divide for the per-call average.
+    uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+
+    return total_time / N;
   }
 };
 
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
 #include "src/math/amdgpu/declarations.h"
 #endif
 
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
-  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
 // BENCHMARK() expects a function that with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
 // expands to a lambda to allow us to switch the implementation of `sin()` to
 // easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func)                                                  \
-  []() {                                                                       \
-    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
-    return LIBC_NAMESPACE::latency(Func, x);                                   \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-
-#define BM_TWO_PI(Func)                                                        \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N)                             \
   []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
+        Func, MIN_EXP, MAX_EXP);                                               \
   }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
 
-#define BM_LARGE_INT(Func)                                                     \
-  []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP)                                    \
+  BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1,                                 \
+            BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1));                       \
+  BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128,                               \
+            BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128));                     \
+  BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024,                              \
+            BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024));                    \
+  BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096,                              \
+            BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
 
 #ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
 #endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@@ -43,14 +44,13 @@ namespace LIBC_NAMESPACE_DECL {
 // Profile a simple function and obtain its latency in clock cycles on the
 // system. This function cannot be inlined or else it will disturb the very
 // delicate balance of hard-coded dependencies.
-template <typename F, typename T>
-[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
-  // We need to store the input somewhere to guarantee that the compiler
-  // will not constant propagate it and remove the profiling region.
-  volatile T storage = t;
-  T arg = storage;
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+  // The inputs must stay opaque to the compiler so that it will not constant
+  // propagate them and remove the profiling region.
 
-  FORCE_TO_REGISTER(T, arg);
+  FORCE_TO_REGISTER(decltype(inputs), inputs);
 
   // The AMDGPU architecture needs to wait on pending results.
   gpu::memory_fence();
@@ -59,16 +59,18 @@ template <typename F, typename T>
 
   // This forces the compiler to load the input argument and run the clock
   // cycle counter before the profiling region.
-  FORCE_TO_REGISTER(T, arg);
+  FORCE_TO_REGISTER(decltype(inputs), inputs);
   asm("" ::"s"(start));
 
   // Run the function under test and return its value.
-  auto result = f(arg);
-
-  // This inline assembly performs a no-op which forces the result to both
-  // be used and prevents us from exiting this region before it's complete.
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
-      static_cast<uint32_t>(result)));
+  for (auto input : inputs) {
+    auto result = f(input);
+
+    // This inline assembly performs a no-op which forces the result to both
+    // be used and prevents us from exiting this region before it's complete.
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+        static_cast<uint32_t>(result)));
+  }
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.