[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

jameshu15869 · 2024-08-05T01:59:24Z

This PR implements lntue@2a15842 to provide better throughput benchmarking for libc sin() and __nv_sin().

These changes have not been tested on AMDGPU yet, only compiled.

llvmbot · 2024-08-05T01:59:56Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: None (jameshu15869)

Changes

This PR implements lntue@2a15842 to provide better throughput benchmarking for libc sin() and __nv_sin().

These changes have not been tested on AMDGPU yet, only compiled.

Full diff: https://github.com/llvm/llvm-project/pull/101917.diff

6 Files Affected:

(modified) libc/benchmarks/gpu/LibcGpuBenchmark.h (+22-20)
(modified) libc/benchmarks/gpu/src/math/sin_benchmark.cpp (+25-35)
(modified) libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt (+4)
(modified) libc/benchmarks/gpu/timing/amdgpu/timing.h (+35-16)
(modified) libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt (+4)
(modified) libc/benchmarks/gpu/timing/nvptx/timing.h (+32)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2b85b146ed745..39e4a6e9e0152 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -21,7 +21,7 @@ namespace benchmarks {
 
 struct BenchmarkOptions {
   uint32_t initial_iterations = 1;
-  uint32_t min_iterations = 50;
+  uint32_t min_iterations = 1;
   uint32_t max_iterations = 10000000;
   uint32_t min_samples = 4;
   uint32_t max_samples = 1000;
@@ -111,9 +111,15 @@ class Benchmark {
 };
 
 // We want our random values to be approximately
-// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
-// 2^(max_exponent + 1)
-template <typename T> static T get_rand_input() {
+// Output: a random number with the exponent field between min_exp and max_exp,
+// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
+// Caveats:
+//   -EXP_BIAS corresponding to denormal values,
+//   EXP_BIAS + 1 corresponding to inf or nan.
+template <typename T>
+static T
+get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
+               int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
 
   // Required to correctly instantiate FPBits for floats and doubles.
@@ -125,10 +131,11 @@ template <typename T> static T get_rand_input() {
            static_cast<uint64_t>(LIBC_NAMESPACE::rand());
   else
     bits = LIBC_NAMESPACE::rand();
-  double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+  double scale =
+      static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
   FPBits fp(bits);
   fp.set_biased_exponent(
-      static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+      static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
   return fp.get_val();
 }
 
@@ -141,19 +148,15 @@ template <typename T> class MathPerf {
 public:
   typedef T Func(T);
 
-  static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
-                                    StorageType ending_bit, StorageType step) {
-    uint64_t total_time = 0;
-    if (step <= 0)
-      step = 1;
-    volatile T result;
-    for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
-      T x = FPBits(bits).get_val();
-      total_time += LIBC_NAMESPACE::latency(f, x);
-    }
-    StorageType num_runs = (ending_bit - starting_bit) / step + 1;
-
-    return total_time / num_runs;
+  template <size_t N = 1>
+  static uint64_t run_perf_in_range(Func f, int min_exp, int max_exp) {
+    cpp::array<T, N> inputs;
+    for (size_t i = 0; i < N; ++i)
+      inputs[i] = get_rand_input<T>(min_exp, max_exp);
+
+    uint64_t total_time = LIBC_NAMESPACE::latency(f, inputs);
+
+    return total_time / N;
   }
 };
 
@@ -176,5 +179,4 @@ template <typename T> class MathPerf {
 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                       LIBC_NAMESPACE::gpu::get_lane_size())
-
 #endif
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
index 5849ea3e99bb0..03f824deae6a5 100644
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -15,51 +15,41 @@
 #include "src/math/amdgpu/declarations.h"
 #endif
 
-constexpr double M_PI = 3.14159265358979323846;
-uint64_t get_bits(double x) {
-  return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
-}
-
 // BENCHMARK() expects a function that with no parameters that returns a
 // uint64_t representing the latency. Defining each benchmark using macro that
 // expands to a lambda to allow us to switch the implementation of `sin()` to
 // easily register NVPTX benchmarks.
-#define BM_RANDOM_INPUT(Func)                                                  \
-  []() {                                                                       \
-    double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>();           \
-    return LIBC_NAMESPACE::latency(Func, x);                                   \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
-
-#define BM_TWO_PI(Func)                                                        \
+#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N)                             \
   []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64));                     \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range<N>( \
+        Func, MIN_EXP, MAX_EXP);                                               \
   }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
 
-#define BM_LARGE_INT(Func)                                                     \
-  []() {                                                                       \
-    return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range(    \
-        Func, 0, get_bits(1 << 30), get_bits(1 << 4));                         \
-  }
-BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::sin));
+#define BENCH(Name, Func, MIN_EXP, MAX_EXP)                                    \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1,                     \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1));           \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128,                   \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128));         \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024,                  \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024));        \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096,                  \
+                        BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023);
+BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3);
+BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
+BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
 
 #ifdef NVPTX_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
+BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
+BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
+BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
 #endif
 
 #ifdef AMDGPU_MATH_FOUND
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
-          BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
-          BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
-BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
-          BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
+BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
+BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
+BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
 #endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index 179429db9a09a..aa5dcd33bee9c 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
     timing.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.attributes
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.array
 )
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index e308d619e9569..e53eb25f83930 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -14,17 +14,10 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/CPP/array.h"
 
 #include <stdint.h>
 
-// AMDGPU does not support input register constraints for i1 and i8, so we must
-// cast them to uint16_t's before loading them into registers.
-#define FORCE_TO_REGISTER(TYPE, VARIABLE)                                      \
-  if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>)      \
-    asm("" ::"v"(static_cast<uint16_t>(VARIABLE)));                            \
-  else                                                                         \
-    asm("" ::"v"(VARIABLE))
-
 namespace LIBC_NAMESPACE_DECL {
 
 // Returns the overhead associated with calling the profiling region. This
@@ -50,7 +43,8 @@ template <typename F, typename T>
   volatile T storage = t;
   T arg = storage;
 
-  FORCE_TO_REGISTER(T, arg);
+  // VGPR constraints can only accept primitive values.
+  asm("" ::"v"(&arg));
 
   // The AMDGPU architecture needs to wait on pending results.
   gpu::memory_fence();
@@ -59,8 +53,7 @@ template <typename F, typename T>
 
   // This forces the compiler to load the input argument and run the clock
   // cycle counter before the profiling region.
-  FORCE_TO_REGISTER(T, arg);
-  asm("" ::"s"(start));
+  asm("" ::"s"(start), "v"(&arg));
 
   // Run the function under test and return its value.
   auto result = f(arg);
@@ -87,15 +80,12 @@ template <typename F, typename T1, typename T2>
   T1 arg1 = storage1;
   T2 arg2 = storage2;
 
-  FORCE_TO_REGISTER(T1, arg1);
-  FORCE_TO_REGISTER(T2, arg2);
+  asm("" ::"v"(&arg1), "v"(&arg2));
 
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
-  FORCE_TO_REGISTER(T1, arg1);
-  FORCE_TO_REGISTER(T2, arg2);
-  asm("" ::"s"(start));
+  asm("" ::"s"(start), "v"(&arg1), "v"(&arg2));
 
   auto result = f(arg1, arg2);
 
@@ -109,6 +99,35 @@ template <typename F, typename T1, typename T2>
   return stop - start;
 }
 
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+  volatile auto storage = &inputs;
+  auto array_pointer = storage;
+  asm("" ::"v"(array_pointer));
+  auto register_array = *array_pointer;
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start), "v"(array_pointer));
+
+  for (auto input : register_array) {
+    auto result = f(input);
+
+    asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+        static_cast<uint32_t>(result)));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  gpu::memory_fence();
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 9958e16206a41..2723c8940814c 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,4 +4,8 @@ add_header_library(
     timing.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.macros.attributes
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.array
 )
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index b426dfd0ea153..dee8d6ea41f47 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
 
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
@@ -94,6 +96,36 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   return stop - start;
 }
+
+// Provides throughput benchmarking.
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t
+latency(F f, const cpp::array<T, N> &inputs) {
+  volatile auto storage = &inputs;
+  auto array_pointer = storage;
+  asm("" ::"r"(array_pointer));
+  auto register_array = *array_pointer;
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"r"(array_pointer), "llr"(start));
+
+  uint64_t result;
+  for (auto input : register_array) {
+    asm("" ::"r"(input));
+    result = f(input);
+    asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop));
+  volatile auto output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

github-actions · 2024-08-05T02:03:09Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jhuber6

I'm wondering if we shouldn't have separate functions for throughput and latency. We likely want to keep the old assembly constraints for the latency checks, but can use something different if we put it in an array.

Also, @lntue, is it necessary to even use a loop? If we want strict throughput couldn't we just do something like

```cpp
#pragma unroll
for (int i = 0; i < DEPTH; ++i) {
  auto x = fn(input);
  asm("" : "r"(input) ::); // Probably need to trick the compiler into thinking this changed.
}
```

jameshu15869 · 2024-08-05T02:11:49Z

> We likely want to keep the old assembly constraints for the latency checks, but can use something different if we put it in an array.

I don't think I changed the existing latency constraints, the new constraints in this diff are for an overloaded latency() function - should I rename it to something else?

EDIT: Sorry, I got confused - I did make some unnecessary changes to the latency asm constraints.

jhuber6

Ah, I see you added new ones called latency as well. Somewhat surprised the type deduction is working here if so.

jameshu15869 · 2024-08-05T02:16:01Z

> Ah, I see you added new ones called latency as well.

Yeah, I think the original intent of changing the constraints on AMDGPU was that I realized using the pointer might be possible instead of having to do that FORCE_TO_REGISTER() workaround before - should we just keep that part in?

jhuber6 · 2024-08-05T02:25:06Z

> > Ah, I see you added new ones called latency as well.

> Yeah, I think the original intent of changing the constraints on AMDGPU was that I realized using the pointer might be possible instead of having to do that FORCE_TO_REGISTER() workaround before - should we just keep that part in?

I'll need to double check the ASM generated there, I forget if capturing the pointer prevented it from carrying the input in a register.

…t on nvptx

jameshu15869 added 2 commits August 4, 2024 21:52

add generic and nvptx sin throughput benchmark

557117d

add trailing new line

bffe71a

llvmbot added backend:AMDGPU libc labels Aug 5, 2024

jhuber6 reviewed Aug 5, 2024

View reviewed changes

run clang-format

2963568

jhuber6 reviewed Aug 5, 2024

View reviewed changes

jameshu15869 changed the title ~~[libc] [gpu] Add Generic and NvSin Throughput Benchmark~~ [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark Aug 8, 2024

jameshu15869 added 2 commits August 8, 2024 14:04

remove arg inline asm capture and capture array pointer for throughput on nvptx

df3f0fa

fix asm constraints for amdgpu

47a0f43

jhuber6 approved these changes Aug 8, 2024

View reviewed changes

jhuber6 merged commit 9a070d6 into llvm:main Aug 8, 2024
5 of 6 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

Uh oh!

jameshu15869 commented Aug 5, 2024

Uh oh!

llvmbot commented Aug 5, 2024 •

edited

Loading

Uh oh!

github-actions bot commented Aug 5, 2024 •

edited

Loading

Uh oh!

jhuber6 left a comment

Uh oh!

jameshu15869 commented Aug 5, 2024 •

edited

Loading

Uh oh!

jhuber6 left a comment

Uh oh!

jameshu15869 commented Aug 5, 2024 •

edited

Loading

Uh oh!

jhuber6 commented Aug 5, 2024

Uh oh!

Uh oh!

Uh oh!

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

[libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark #101917

Uh oh!

Conversation

jameshu15869 commented Aug 5, 2024

Uh oh!

llvmbot commented Aug 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Aug 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

jameshu15869 commented Aug 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

jameshu15869 commented Aug 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 commented Aug 5, 2024

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Aug 5, 2024 •

edited

Loading

github-actions bot commented Aug 5, 2024 •

edited

Loading

jameshu15869 commented Aug 5, 2024 •

edited

Loading

jameshu15869 commented Aug 5, 2024 •

edited

Loading