[libc] Add Generic and NVPTX Sin Benchmark (#99795)

jameshu15869 · web-flow · commit 677796cab327 · 2024-07-29T22:09:11.000-05:00
This PR adds sin benchmarking for a range of values and on a
pregenerated random distribution.
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,6 +22,8 @@ function(add_benchmark benchmark_name)
       ${BENCHMARK_LINK_LIBRARIES}
     DEPENDS
       libc.src.stdio.printf
+      libc.src.stdlib.srand
+      libc.src.stdlib.rand
       ${BENCHMARK_DEPENDS}
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
@@ -52,13 +54,17 @@ add_unittest_framework_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.array
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
+    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
     libc.benchmarks.gpu.timing.timing
     libc.src.stdio.printf
 )
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,6 +8,7 @@
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf.h"
+#include "src/stdlib/srand.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -136,8 +137,10 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0)
+  if (id == 0) {
     print_header();
+    LIBC_NAMESPACE::srand(gpu::processor_clock());
+  }
 
   gpu::sync_threads();
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,14 @@
 
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -105,6 +109,54 @@ class Benchmark {
     return benchmark(options, func);
   }
 };
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+template <typename T> static T get_rand_input() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+
+  // Required to correctly instantiate FPBits for floats and doubles.
+  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
+                                               uint64_t, uint32_t>;
+  RandType bits;
+  if constexpr (cpp::is_same_v<T, uint64_t>)
+    bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
+           static_cast<uint64_t>(LIBC_NAMESPACE::rand());
+  else
+    bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+template <typename T> class MathPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+  static constexpr StorageType UIntMax =
+      cpp::numeric_limits<StorageType>::max();
+
+public:
+  typedef T Func(T);
+
+  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+                                    StorageType ending_bit, StorageType step) {
+    uint64_t total_time = 0;
+    if (step <= 0)
+      step = 1;
+    volatile T result;
+    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+      T x = FPBits(bits).get_val();
+      total_time += LIBC_NAMESPACE::latency(f, x);
+    }
+    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+    return total_time / num_runs;
+  }
+};
+
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+  set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+  if (EXISTS ${libdevice_path})
+    set(nvptx_bitcode_link_flags
+        "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+    # Compile definition needed so the benchmark knows to register
+    # NVPTX benchmarks.
+    set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+  endif()
+endif()
+
+add_benchmark(
+  sin_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    sin_benchmark.cpp
+  DEPENDS
+    libc.src.math.sin
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.array
+  COMPILE_OPTIONS
+    ${nvptx_math_found}
+    ${nvptx_bitcode_link_flags}
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,53 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function that with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark using macro that
+// expands to a lambda to allow us to switch the implementation of `sin()` to
+// easily register NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func)                                                  \
+  []() {                                                                       \
+    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
+    return LIBC_NAMESPACE::latency(Func, x);                                   \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func)                                                        \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func)                                                     \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -34,7 +34,7 @@ namespace LIBC_NAMESPACE_DECL {
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
   uint32_t result = 0.0;
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
   asm("" ::"s"(start));
   uint64_t stop = gpu::processor_clock();
   return stop - start;
@@ -67,7 +67,8 @@ template <typename F, typename T>
 
   // This inline assembly performs a no-op which forces the result to both
   // be used and prevents us from exiting this region before it's complete.
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+      static_cast<uint32_t>(result)));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -98,7 +99,8 @@ template <typename F, typename T1, typename T2>
 
   auto result = f(arg1, arg2);
 
-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+      static_cast<uint32_t>(result)));
 
   uint64_t stop = gpu::processor_clock();
   asm("" ::"s"(stop));
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
   uint64_t start = gpu::processor_clock();
   asm("" ::"r"(y), "llr"(start));
   uint32_t result = y;
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
   uint64_t stop = gpu::processor_clock();
   volatile auto storage = result;
   return stop - start;
@@ -57,7 +57,7 @@ template <typename F, typename T>
 
   // This inline assembly performs a no-op which forces the result to both be
   // used and prevents us from exiting this region before it's complete.
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
@@ -85,7 +85,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   auto result = f(arg, arg2);
 
-  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
 
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`add_subdirectory(ctype)`
	`2`	`+add_subdirectory(math)`