[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (#101917)

jameshu15869 · web-flow · commit 9a070d6d0f0c · 2024-08-08T15:05:34.000-05:00
This PR implements lntue@2a15842 to provide better throughput benchmarking for libc `sin()` and `__nv_sin()`. These changes have not been tested on AMDGPU yet, only compiled.
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
 
 struct BenchmarkOptions {
   uint32_t initial_iterations = 1;
-  uint32_t min_iterations = 50;
+  uint32_t min_iterations = 1;
   uint32_t max_iterations = 10000000;
   uint32_t min_samples = 4;
   uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
 };
 
 // We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+//   -EXP_BIAS corresponding to denormal values,
+//   EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
 
   // Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
            static_cast<uint64_t>(LIBC_NAMESPACE::rand());
   else
     bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+  double scale =
+      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
   FPBits fp(bits);
   fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
   return fp.get_val();
 }
 
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
 public:
   typedef T Func(T);
 
-  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
-                                    StorageType ending_bit, StorageType step) {
-    uint64_t total_time = 0;
-    if (step <= 0)
-      step = 1;
-    volatile T result;
-    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
-      T x = FPBits(bits).get_val();
-      total_time += LIBC_NAMESPACE::latency(f, x);
-    }
-    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
-    return total_time / num_runs;
+  template <size_t N = 1>
+  static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
+    cpp::array<T, N> inputs;
+    for (size_t i = 0; i < N; ++i)
+      inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
+
+    return total_time / N;
   }
 };
 
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                       LIBC_NAMESPACE::gpu::get_lane_size())
-
 #endif
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
 #include "src/math/amdgpu/declarations.h"
 #endif
 
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
-  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
 // BENCHMARK() expects a function that with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
 // expands to a lambda to allow us to switch the implementation of `sin()` to
 // easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func)                                                  \
-  []() {                                                                       \
-    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
-    return LIBC_NAMESPACE::latency(Func, x);                                   \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-
-#define BM_TWO_PI(Func)                                                        \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N)                             \
   []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<                               \
+        double>::run_throughput_in_range<N>(Func, MIN_EXP, MAX_EXP);           \
   }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
 
-#define BM_LARGE_INT(Func)                                                     \
-  []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP)                                    \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1,                     \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1));           \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128,                   \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128));         \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024,                  \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024));        \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096,                  \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
 
 #ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
 #endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
     timing.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.attributes
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.array
 )
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@@ -17,14 +18,6 @@
 
 #include <stdint.h>
 
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE)                                      \
-  if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>)      \
-    asm("" ::"v"(static_cast<uint16_t>(VARIABLE)));                            \
-  else                                                                         \
-    asm("" ::"v"(VARIABLE))
-
 namespace LIBC_NAMESPACE_DECL {
 
 // Returns the overhead associated with calling the profiling region. This
@@ -50,25 +43,29 @@ template <typename F, typename T>
   volatile T storage = t;
   T arg = storage;
 
-  FORCE_TO_REGISTER(T, arg);
-
   // The AMDGPU architecture needs to wait on pending results.
   gpu::memory_fence();
   // Get the current timestamp from the clock.
   uint64_t start = gpu::processor_clock();
 
   // This forces the compiler to load the input argument and run the clock
   // cycle counter before the profiling region.
-  FORCE_TO_REGISTER(T, arg);
   asm("" ::"s"(start));
 
   // Run the function under test and return its value.
   auto result = f(arg);
 
   // This inline assembly performs a no-op which forces the result to both
   // be used and prevents us from exiting this region before it's complete.
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
-      static_cast<uint32_t>(result)));
+  if constexpr (cpp::is_same_v<decltype(result), char> ||
+                cpp::is_same_v<decltype(result), bool>)
+    // AMDGPU does not support input register constraints for i1 and i8, so we
+    // cast it to a 32-bit integer. This does not add an additional assembly
+    // instruction (https://godbolt.org/z/zxGqv8G91).
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+        static_cast<uint32_t>(result)));
+  else
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -87,20 +84,19 @@ template <typename F, typename T1, typename T2>
   T1 arg1 = storage1;
   T2 arg2 = storage2;
 
-  FORCE_TO_REGISTER(T1, arg1);
-  FORCE_TO_REGISTER(T2, arg2);
-
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
-  FORCE_TO_REGISTER(T1, arg1);
-  FORCE_TO_REGISTER(T2, arg2);
   asm("" ::"s"(start));
 
   auto result = f(arg1, arg2);
 
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
-      static_cast<uint32_t>(result)));
+  if constexpr (cpp::is_same_v<decltype(result), char> ||
+                cpp::is_same_v<decltype(result), bool>)
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+        static_cast<uint32_t>(result)));
+  else
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
@@ -109,6 +105,31 @@ template <typename F, typename T1, typename T2>
   return stop - start;
 }
 
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+throughput(F f, const cpp::array<T, N> &inputs) {
+  asm("" ::"v"(&inputs));
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start));
+
+  for (auto input : inputs) {
+    auto result = f(input);
+
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  gpu::memory_fence();
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
     timing.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.attributes
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.array
 )
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
@@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
   volatile uint32_t x = 1;
   uint32_t y = x;
   uint64_t start = gpu::processor_clock();
-  asm("" ::"r"(y), "llr"(start));
+  asm("" ::"llr"(start));
   uint32_t result = y;
   asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
   uint64_t stop = gpu::processor_clock();
@@ -42,15 +44,14 @@ template <typename F, typename T>
   // not constant propagate it and remove the profiling region.
   volatile T storage = t;
   T arg = storage;
-  asm("" ::"r"(arg));
 
   // Get the current timestamp from the clock.
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
   // This forces the compiler to load the input argument and run the clock cycle
   // counter before the profiling region.
-  asm("" ::"r"(arg), "llr"(start));
+  asm("" ::"llr"(start));
 
   // Run the function under test and return its value.
   auto result = f(arg);
@@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   volatile T2 storage2 = t2;
   T1 arg = storage;
   T2 arg2 = storage2;
-  asm("" ::"r"(arg), "r"(arg2));
 
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
-  asm("" ::"r"(arg), "r"(arg2), "llr"(start));
+  asm("" ::"llr"(start));
 
   auto result = f(arg, arg2);
 
@@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   return stop - start;
 }
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+throughput(F f, const cpp::array<T, N> &inputs) {
+  asm("" ::"r"(&inputs));
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  uint64_t result;
+  for (auto input : inputs) {
+    asm("" ::"r"(input));
+    result = f(input);
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop));
+  volatile auto output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

Original file line number	Diff line number	Diff line change
`@@ -4,4 +4,8 @@ add_header_library(`
`4`	`4`	`timing.h`
`5`	`5`	`DEPENDS`
`6`	`6`	`libc.src.__support.common`
	`7`	`+ libc.src.__support.macros.config`
	`8`	`+ libc.src.__support.macros.attributes`
	`9`	`+ libc.src.__support.CPP.type_traits`
	`10`	`+ libc.src.__support.CPP.array`
`7`	`11`	`)`