[libc] Add Generic and NVPTX Sin Benchmark #99795

jameshu15869 · 2024-07-21T03:38:43Z

This PR adds sin benchmarking for a range of values and on a pregenerated random distribution.

llvmbot · 2024-07-21T03:39:11Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: None (jameshu15869)

Changes

This PR adds sin benchmarking for a range of values and on a pregenerated random distribution.

Full diff: https://github.com/llvm/llvm-project/pull/99795.diff

6 Files Affected:

(modified) libc/benchmarks/gpu/CMakeLists.txt (+3)
(modified) libc/benchmarks/gpu/LibcGpuBenchmark.cpp (+3)
(modified) libc/benchmarks/gpu/LibcGpuBenchmark.h (+53)
(modified) libc/benchmarks/gpu/src/CMakeLists.txt (+1)
(added) libc/benchmarks/gpu/src/math/CMakeLists.txt (+31)
(added) libc/benchmarks/gpu/src/math/sin_benchmark.cpp (+55)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 29e27724e1ab3..ba6958a0c68d5 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -47,13 +47,16 @@ add_unittest_framework_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
+    libc.src.__support.CPP.array
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
+    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
+    libc.src.stdlib.rand
     libc.benchmarks.gpu.timing.timing
 )
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index c926d8efd7db2..05a6621036b0b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -107,6 +107,9 @@ void print_results(Benchmark *b) {
 
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
+  if (id == 0) {
+    LIBC_NAMESPACE::benchmarks::init_random_input();
+  }
   gpu::sync_threads();
 
   for (Benchmark *b : benchmarks) {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 29d7ba8b0a132..5d84959e17c4b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,13 @@
 
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -102,6 +105,56 @@ class Benchmark {
     return benchmark(options, func);
   }
 };
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+static constexpr int RANDOM_INPUT_SIZE = 1024;
+static cpp::array<double, RANDOM_INPUT_SIZE> random_input;
+
+static double get_rand() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+static void init_random_input() {
+  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+    random_input[i] = get_rand();
+  }
+}
+
+template <typename T> class MathPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+  static constexpr StorageType UIntMax =
+      cpp::numeric_limits<StorageType>::max();
+
+public:
+  typedef T Func(T);
+
+  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+                                    StorageType ending_bit, StorageType step) {
+    uint64_t total_time = 0;
+    if (step <= 0)
+      step = 1;
+    volatile T result;
+    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+      T x = FPBits(bits).get_val();
+      total_time += LIBC_NAMESPACE::latency(f, x);
+    }
+    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+    return total_time / num_runs;
+  }
+};
+
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7b5909a..f15d082e4dd2b 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..2b27652e46ae9
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,31 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+if(CUDAToolkit_FOUND)
+  set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+  if (EXISTS ${libdevice_path})
+    set(nvptx_bitcode_link_flags
+        "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+    # Compile definition needed so the benchmark knows to register
+    # NVPTX benchmarks.
+    set(nvptx_math_found "-DNVPTX_MATH_FOUND=1")
+  endif()
+endif()
+
+add_benchmark(
+  sin_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    sin_benchmark.cpp
+  DEPENDS
+    libc.src.math.sin
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
+  COMPILE_OPTIONS
+    ${nvptx_math_found}
+    ${nvptx_bitcode_link_flags}
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000000000..ac35e22b57287
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,55 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+#include "src/stdlib/srand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function with no parameters that returns a uint64_t
+// representing the latency. Each benchmark is defined using a macro that
+// expands to a lambda, which lets us switch the implementation of `sin()` and
+// easily register NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func)                                                  \
+  []() {                                                                       \
+    uint64_t total_time = 0;                                                   \
+    for (double i : LIBC_NAMESPACE::benchmarks::random_input) {                \
+      total_time += LIBC_NAMESPACE::latency(Func, i);                          \
+    }                                                                          \
+    return total_time / LIBC_NAMESPACE::benchmarks::random_input.size();       \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func)                                                        \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func)                                                     \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
+        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
+  }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif

jhuber6

Neat, it's interesting to get very specific timings against the vendor implementations.

@lntue do you know if anything here is common with the math stuff you already have written?

jhuber6 · 2024-07-21T03:50:27Z

libc/benchmarks/gpu/LibcGpuBenchmark.h

+
+static double get_rand() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();


If you want the results to be truly random we can probably just seed the RNG with the low bits of get_processor_clock().

I also wonder if we should make a reentrant version of rand so we can have a state private to each thread instead of the shared state.

I.e.
val = rand_r(&state)

I don't think we really care much about true randomness for math function tests, as long as the inputs are shuffled enough.

jhuber6 · 2024-07-21T22:49:05Z

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

+static double get_rand() {
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;
+  uint64_t bits = LIBC_NAMESPACE::rand();
+  double scale = 0.5 + MAX_EXPONENT / 2048.0;
+  FPBits fp(bits);
+  fp.set_biased_exponent(
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+  return fp.get_val();
+}
+
+static void init_random_input() {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+  for (int i = 0; i < RANDOM_INPUT_SIZE; i++) {
+    random_input[i] = get_rand();
+  }
+}
+


I figured this would be handled in the benchmark itself, like we do it before the latency calculation.

Ah, yeah that makes sense. I was thinking it might reduce duplication if we have to do the same thing for multiple benchmarks, but I think you're right about moving it to the benchmark itself

But now that I think about it, what was the way to share the array between all threads without using a global? I'm assuming that we want all threads to see the same random input, right?

Can put it as a utility in the header.

Do you mean keep the array in the header but call init_random_array() from the benchmark itself?

Yeah I just mean if you want a utility to get some random input, just make it a function in the header and have the benchmarks call it.

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

libc/benchmarks/gpu/src/math/sin_benchmark.cpp

jhuber6 · 2024-07-28T02:15:29Z

libc/benchmarks/gpu/LibcGpuBenchmark.h

+template <size_t Size>
+static void init_random_double_input(cpp::array<double, Size> &values) {
+  LIBC_NAMESPACE::srand(LIBC_NAMESPACE::gpu::processor_clock());
+  for (int i = 0; i < Size; i++) {
+    values[i] = get_rand_double();
+  }
+}


This isn't used anywhere, right? Can probably just drop it for now.

Oops, I missed this

We might want to move this srand() call into the benchmark somewhere when we do other init stuff.

jhuber6 · 2024-07-28T02:17:09Z

libc/benchmarks/gpu/LibcGpuBenchmark.h

+// The largest integer that can be stored in a double is 2^53
+static constexpr int MAX_EXPONENT = 52;
+
+static double get_rand_double() {


We should probably template this because we'll want the same treatment for sin and sinf and sinf16.

Since floats should have a different MAX_EXPONENT, how would you suggest we template this? Is there a nice way to switch the value of MAX_EXPONENT when using floats and doubles?

FPBits<T>::FRACTION_LEN

I forget the name of it, but it's standard type_traits stuff, see https://godbolt.org/z/4TGrvadYY.

libc/cmake/modules/LLVMLibCCompileOptionRules.cmake

libc/benchmarks/gpu/timing/amdgpu/timing.h

jhuber6 · 2024-07-28T03:02:12Z

libc/benchmarks/gpu/LibcGpuBenchmark.h

+  // Required to correctly instantiate FPBits for floats and doubles.
+  using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
+                                               uint64_t, uint32_t>;
+  RandType bits = LIBC_NAMESPACE::rand();


rand() always returns an int, so it's always 32-bit.

I think I was having some problems when trying to instantiate FPBits where it would only accept uint64_t for doubles and uint32_t for floats. Are you able to check if you can reproduce this locally?

I wonder if that means we need to call rand twice to get an actual 64-bit value... @lntue might have some better insight here. Maybe we just need a better reentrant random function.

jhuber6

I guess we can put a call to srand(gpu::processor_clock()) inside of the benchmark registration code.

jameshu15869 · 2024-07-30T02:33:30Z

Ah, did you want to run srand() for every benchmark? Right now I have the call to srand() inside only the benchmark that uses it (e.g. BM_RANDOM_INPUT in sin_benchmark.cpp). I was thinking that way each iteration would definitely get a different value, if that makes sense

jhuber6 · 2024-07-30T02:36:08Z

Ah, did you want to run srand() for every benchmark? Right now I have the call to srand() inside only the benchmark that uses it (e.g. BM_RANDOM_INPUT in sin_benchmark.cpp). I was thinking that way each iteration would definitely get a different value, if that makes sense

srand() just initializes the seed, the implementation shares the global random state with every thread so we can just have a single one initialize it to something other than 1.

jameshu15869 · 2024-07-30T02:41:14Z

srand() just initializes the seed, the implementation shares the global random state with every thread so we can just have a single one initialize it to something other than 1.

Conceptually, should each individual benchmark (e.g. Sin, NvSin, etc.) call srand() once? Or should it just be called once for each invocation of our entire framework?

jhuber6 · 2024-07-30T02:42:32Z

srand() just initializes the seed, the implementation shares the global random state with every thread so we can just have a single one initialize it to something other than 1.

Conceptually, should each individual benchmark (e.g. Sin, NvSin, etc.) call srand() once? Or should it just be called once for each invocation of our entire framework?

Just call it once when we initialize the benchmarks, the sequence will then effectively be "random" whenever any other thread gets a value from the sequence.

jameshu15869 · 2024-07-30T02:46:48Z

Just call it once when we initialize the benchmarks, the sequence will then effectively be "random" whenever any other thread gets a value from the sequence.

I'm thinking about having thread 0 call srand() right before we start iterating through our benchmark list and running them (So it's only called once throughout the execution) - is that close to what you were thinking?

jhuber6 · 2024-07-30T02:49:00Z

Pretty much like this.

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index a9a912538cd8..2574eb46f160 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -136,8 +136,10 @@ void print_header() {
 void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
 
-  if (id == 0)
+  if (id == 0) {
+    srand(gpu::processor_clock());
     print_header();
+  }
 
   gpu::sync_threads();

jhuber6 · 2024-07-30T02:51:10Z

libc/benchmarks/gpu/timing/amdgpu/timing.h

-  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+          static_cast<uint32_t>(result))
+      :);


Is this colon required?

Oops, I swear I removed this already

jhuber6

LG, thanks. Looking forward to getting this working for AMD and then testing a few more functions.

jameshu15869 marked this pull request as draft July 21, 2024 03:38

llvmbot added the libc label Jul 21, 2024

jameshu15869 changed the title ~~[libc] Add Generic and NVPTX Sin Benchmark~~ [libc] DRAFT: Add Generic and NVPTX Sin Benchmark Jul 21, 2024

jhuber6 reviewed Jul 21, 2024

View reviewed changes

jameshu15869 force-pushed the sin-benchmark branch from 53030fc to 81f86e5 Compare July 22, 2024 02:01

jhuber6 reviewed Jul 24, 2024

View reviewed changes

libc/benchmarks/gpu/LibcGpuBenchmark.cpp Outdated Show resolved Hide resolved

libc/benchmarks/gpu/src/math/sin_benchmark.cpp Show resolved Hide resolved

jameshu15869 added 8 commits July 27, 2024 21:56

implement generic sin benchmark and compare with nvsin

dbb4f54

basic generic and nvptx sin benchmark

788f615

minor fixes

7d37fed

seed rand with processor clock

3a39304

move random array to per thread

c3e14be

use single variable instead of array

908bdef

allow AMDGPU to store doubles to register

2adc13d

minor fixes

a0f1905

jameshu15869 force-pushed the sin-benchmark branch from 81f86e5 to a0f1905 Compare July 28, 2024 02:13

jameshu15869 marked this pull request as ready for review July 28, 2024 02:13

llvmbot added the backend:AMDGPU label Jul 28, 2024

jhuber6 reviewed Jul 28, 2024

View reviewed changes

address comments

84c2afb

jhuber6 reviewed Jul 28, 2024

View reviewed changes

Correctly generate 64 bit random values

1729e13

jameshu15869 changed the title ~~[libc] DRAFT: Add Generic and NVPTX Sin Benchmark~~ [libc] Add Generic and NVPTX Sin Benchmark Jul 30, 2024

jhuber6 reviewed Jul 30, 2024

View reviewed changes

Switch to only 1 thread calling srand() once

080f61f

jhuber6 reviewed Jul 30, 2024

View reviewed changes

jameshu15869 added 2 commits July 29, 2024 22:51

Remove extra LIBC_NAMESPACE

ae902b0

remove unnecessary colons

5fc1219

jhuber6 approved these changes Jul 30, 2024

View reviewed changes

jhuber6 merged commit 677796c into llvm:main Jul 30, 2024
6 checks passed

[libc] Add Generic and NVPTX Sin Benchmark #99795

[libc] Add Generic and NVPTX Sin Benchmark #99795

Uh oh!

Conversation

jameshu15869 commented Jul 21, 2024

Uh oh!

llvmbot commented Jul 21, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

jameshu15869 commented Jul 30, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 commented Jul 30, 2024

Uh oh!

jameshu15869 commented Jul 30, 2024

Uh oh!

jhuber6 commented Jul 30, 2024

Uh oh!

jameshu15869 commented Jul 30, 2024

Uh oh!

jhuber6 commented Jul 30, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Jul 21, 2024 •

edited

Loading

jameshu15869 commented Jul 30, 2024 •

edited

Loading