[libc] Use Atomics in GPU Benchmarks #98842
Conversation
@llvm/pr-subscribers-libc

Author: None (jameshu15869)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/98842.diff

4 Files Affected:
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1..eaeecbdacd23e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -43,6 +43,7 @@ add_unittest_framework_library(
libc.src.__support.CPP.functional
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
+ libc.src.__support.CPP.atomic
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 7f60c9cc4a2f4..d3c9e3350b1a8 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,6 +1,7 @@
#include "LibcGpuBenchmark.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
@@ -11,61 +12,110 @@ namespace LIBC_NAMESPACE {
namespace benchmarks {
FixedVector<Benchmark *, 64> benchmarks;
-cpp::array<BenchmarkResult, 1024> results;
void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
-BenchmarkResult
-reduce_results(const cpp::array<BenchmarkResult, 1024> &results) {
- BenchmarkResult result;
- uint64_t cycles_sum = 0;
- double standard_deviation_sum = 0;
- uint64_t min = UINT64_MAX;
- uint64_t max = 0;
- uint32_t samples_sum = 0;
- uint32_t iterations_sum = 0;
- clock_t time_sum = 0;
- uint64_t num_threads = gpu::get_num_threads();
- for (uint64_t i = 0; i < num_threads; i++) {
- BenchmarkResult current_result = results[i];
- cycles_sum += current_result.cycles;
- standard_deviation_sum += current_result.standard_deviation;
- min = cpp::min(min, current_result.min);
- max = cpp::max(max, current_result.max);
- samples_sum += current_result.samples;
- iterations_sum += current_result.total_iterations;
- time_sum += current_result.total_time;
+void update_sums(const BenchmarkResult &current_result,
+ cpp::Atomic<uint64_t> &active_threads,
+ cpp::Atomic<uint64_t> &cycles_sum,
+ cpp::Atomic<uint64_t> &standard_deviation_sum,
+ cpp::Atomic<uint64_t> &min, cpp::Atomic<uint64_t> &max,
+ cpp::Atomic<uint32_t> &samples_sum,
+ cpp::Atomic<uint32_t> &iterations_sum,
+ cpp::Atomic<clock_t> &time_sum) {
+ gpu::memory_fence();
+ active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+
+ cycles_sum.fetch_add(current_result.cycles, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.fetch_add(
+ static_cast<uint64_t>(current_result.standard_deviation),
+ cpp::MemoryOrder::RELAXED);
+
+ // Perform a CAS loop to atomically update the min
+ uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
+ while (!min.compare_exchange_strong(
+ orig_min, cpp::min(orig_min, current_result.min),
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
}
- result.cycles = cycles_sum / num_threads;
- result.standard_deviation = standard_deviation_sum / num_threads;
- result.min = min;
- result.max = max;
- result.samples = samples_sum / num_threads;
- result.total_iterations = iterations_sum / num_threads;
- result.total_time = time_sum / num_threads;
- return result;
+
+ // Perform a CAS loop to atomically update the max
+ uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
+ while (!max.compare_exchange_strong(
+ orig_max, cpp::max(orig_max, current_result.max),
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
+ }
+
+ samples_sum.fetch_add(current_result.samples, cpp::MemoryOrder::RELAXED);
+ iterations_sum.fetch_add(current_result.total_iterations,
+ cpp::MemoryOrder::RELAXED);
+ time_sum.fetch_add(current_result.total_time, cpp::MemoryOrder::RELAXED);
+ gpu::memory_fence();
+}
+
+cpp::Atomic<uint64_t> cycles_sum = 0;
+cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+cpp::Atomic<uint64_t> min = UINT64_MAX;
+cpp::Atomic<uint64_t> max = 0;
+cpp::Atomic<uint32_t> samples_sum = 0;
+cpp::Atomic<uint32_t> iterations_sum = 0;
+cpp::Atomic<clock_t> time_sum = 0;
+cpp::Atomic<uint64_t> active_threads = 0;
+
+void print_results(Benchmark *b) {
+ constexpr auto GREEN = "\033[32m";
+ constexpr auto RESET = "\033[0m";
+
+ BenchmarkResult result;
+ gpu::memory_fence();
+ int num_threads = active_threads.load(cpp::MemoryOrder::RELAXED);
+ result.cycles = cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.standard_deviation =
+ standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.min = min.load(cpp::MemoryOrder::RELAXED);
+ result.max = max.load(cpp::MemoryOrder::RELAXED);
+ result.samples = samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.total_iterations =
+ iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.total_time = time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ gpu::memory_fence();
+ log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
+ log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
+ << result.cycles << " cycles, " << result.min << " min, " << result.max
+ << " max, " << result.total_iterations << " iterations, "
+ << result.total_time << " ns, "
+ << static_cast<long>(result.standard_deviation)
+ << " stddev (num threads: " << num_threads << ")\n";
}
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
gpu::sync_threads();
- for (Benchmark *b : benchmarks)
- results[id] = b->run();
- gpu::sync_threads();
- if (id == 0) {
- for (Benchmark const *b : benchmarks) {
- BenchmarkResult all_results = reduce_results(results);
- constexpr auto GREEN = "\033[32m";
- constexpr auto RESET = "\033[0m";
- log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
- log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
- << all_results.cycles << " cycles, " << all_results.min << " min, "
- << all_results.max << " max, " << all_results.total_iterations
- << " iterations, " << all_results.total_time << " ns, "
- << static_cast<long>(all_results.standard_deviation) << " stddev\n";
+ for (Benchmark *b : benchmarks) {
+ gpu::memory_fence();
+ if (id == 0) {
+ active_threads.store(0, cpp::MemoryOrder::RELAXED);
+ cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+ min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
+ max.store(0, cpp::MemoryOrder::RELAXED);
+ samples_sum.store(0, cpp::MemoryOrder::RELAXED);
+ iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+ time_sum.store(0, cpp::MemoryOrder::RELAXED);
+ }
+ gpu::memory_fence();
+ gpu::sync_threads();
+
+ auto current_result = b->run();
+ update_sums(current_result, active_threads, cycles_sum,
+ standard_deviation_sum, min, max, samples_sum, iterations_sum,
+ time_sum);
+ gpu::sync_threads();
+
+ if (id == 0) {
+ print_results(b);
}
}
gpu::sync_threads();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index ffc858911b1c0..b6bc8acb17a15 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -87,6 +87,7 @@ class Benchmark {
}
static void run_benchmarks();
+ const cpp::string_view get_name() const { return name; }
protected:
static void add_benchmark(Benchmark *benchmark);
@@ -96,13 +97,12 @@ class Benchmark {
BenchmarkOptions options;
return benchmark(options, func);
}
- const cpp::string_view get_name() const { return name; }
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
- Func, #SuiteName "." #TestName);
+ Func, #SuiteName "." #TestName)
#endif
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 4050bc0ec77b9..6f8d247902f76 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -6,4 +6,16 @@ uint64_t BM_IsAlnum() {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
-BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum);
+
+uint64_t BM_IsAlnumCapital() {
+ char x = 'A';
+ return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumCapital, BM_IsAlnumCapital);
+
+uint64_t BM_IsAlnumNotAlnum() {
+ char x = '{';
+ return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumNotAlnum, BM_IsAlnumNotAlnum);
      cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
  }
Suggested change:
      cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED));
cpp::Atomic<uint64_t> cycles_sum = 0;
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
cpp::Atomic<uint64_t> min = UINT64_MAX;
cpp::Atomic<uint64_t> max = 0;
cpp::Atomic<uint32_t> samples_sum = 0;
cpp::Atomic<uint32_t> iterations_sum = 0;
cpp::Atomic<clock_t> time_sum = 0;
cpp::Atomic<uint64_t> active_threads = 0;
I'm thinking all of this should go into a big struct called BenchmarkState or something.
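
As a rough sketch of the grouping suggested above (the BenchmarkState name, its reset() helper, and its placement alongside the existing globals in LibcGpuBenchmark.cpp are all hypothetical, not part of this PR):

#include "src/__support/CPP/atomic.h" // already pulled in by this PR

// Hypothetical aggregation of the per-benchmark reduction state; the field
// names mirror the globals introduced in LibcGpuBenchmark.cpp.
struct BenchmarkState {
  cpp::Atomic<uint64_t> active_threads = 0;
  cpp::Atomic<uint64_t> cycles_sum = 0;
  cpp::Atomic<uint64_t> standard_deviation_sum = 0;
  cpp::Atomic<uint64_t> min = UINT64_MAX;
  cpp::Atomic<uint64_t> max = 0;
  cpp::Atomic<uint32_t> samples_sum = 0;
  cpp::Atomic<uint32_t> iterations_sum = 0;
  cpp::Atomic<clock_t> time_sum = 0;

  // Reset before each benchmark; in the current scheme only thread 0 would
  // call this between the two gpu::sync_threads() barriers.
  void reset() {
    active_threads.store(0, cpp::MemoryOrder::RELAXED);
    cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
    standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
    min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
    max.store(0, cpp::MemoryOrder::RELAXED);
    samples_sum.store(0, cpp::MemoryOrder::RELAXED);
    iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
    time_sum.store(0, cpp::MemoryOrder::RELAXED);
  }
};

update_sums() and print_results() could then take a single BenchmarkState reference instead of eight separate atomic references.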
  result.total_iterations =
      iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
  result.total_time = time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
  gpu::memory_fence();
This should be an atomic_thread_fence(cpp::MemoryOrder::RELEASE) most likely.
Is there a list of supported MemoryOrderings on NVPTX?
So, the ordering is actually a lie on NVPTX because they don't support specific thread fences, but it's good practice since AMDGPU does. On NVPTX it's just a full thread fence.
Oops, I think I misunderstood your original comment here
  while (!min.compare_exchange_strong(
      orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
      cpp::MemoryOrder::RELAXED)) {
  }
Suggested change:
  while (!min.compare_exchange_strong(
      orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
      cpp::MemoryOrder::RELAXED));
clang-format is insisting that the semicolon go on the next line - which do we prefer?
Do what clang-format says; the semicolon on the next line is better since it makes it more obvious it's a loop with no body.
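
For reference, a sketch of the layout clang-format settles on for the body-less loop (variable names taken from the snippet quoted above):

  // CAS loop with no body; clang-format places the terminating semicolon on
  // its own line so the empty body is obvious.
  while (!min.compare_exchange_strong(
      orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
      cpp::MemoryOrder::RELAXED))
    ;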
  iterations_sum.fetch_add(result.total_iterations,
                           cpp::MemoryOrder::RELAXED);
  time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
  gpu::memory_fence();
Suggested change:
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
Is there a specific reason for choosing the cpp fence instead of a memory fence? Are these two memory fences separate things?
They're two separate things. The memory fence is more "flush all pending memory operations"; the thread fence is "memory operations cannot move past this fence".
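
A minimal illustration of the distinction being drawn here, using the two calls already referenced in this thread (the placement is illustrative only, not a proposed change):

  // Per the discussion above: closer to "flush all pending memory operations".
  gpu::memory_fence();

  // Ordering constraint only: prior memory operations may not move past this
  // fence (release semantics). On NVPTX the specific ordering is not honored
  // and this behaves like a full thread fence.
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);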
}

void Benchmark::run_benchmarks() {
  uint64_t id = gpu::get_thread_id();
  gpu::sync_threads();

  for (Benchmark *b : benchmarks) {
    results[id] = b->run();
    gpu::memory_fence();
I don't think you need these memory fences.
This PR replaces our old method of reducing the benchmark results, which used an array indexed by thread ID, with atomics. This should help us implement single-threaded benchmarks.