Skip to content

Commit 568b01a

Browse files
committed
use atomics instead of reducing
1 parent 0fc4e30 commit 568b01a

File tree

3 files changed

+93
-41
lines changed

3 files changed

+93
-41
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ add_unittest_framework_library(
4343
libc.src.__support.CPP.functional
4444
libc.src.__support.CPP.limits
4545
libc.src.__support.CPP.algorithm
46+
libc.src.__support.CPP.atomic
4647
libc.src.__support.fixed_point.fx_rep
4748
libc.src.__support.macros.properties.types
4849
libc.src.__support.OSUtil.osutil

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 90 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "LibcGpuBenchmark.h"
22
#include "src/__support/CPP/algorithm.h"
33
#include "src/__support/CPP/array.h"
4+
#include "src/__support/CPP/atomic.h"
45
#include "src/__support/CPP/string.h"
56
#include "src/__support/FPUtil/sqrt.h"
67
#include "src/__support/GPU/utils.h"
@@ -12,60 +13,110 @@ namespace LIBC_NAMESPACE_DECL {
1213
namespace benchmarks {
1314

1415
// Registry of benchmark instances: Benchmark::add_benchmark appends to it and
// Benchmark::run_benchmarks iterates over it.
FixedVector<Benchmark *, 64> benchmarks;
15-
cpp::array<BenchmarkResult, 1024> results;
1616

1717
// Appends `benchmark` to the namespace-scope `benchmarks` list.
void Benchmark::add_benchmark(Benchmark *benchmark) {
1818
benchmarks.push_back(benchmark);
1919
}
2020

21-
// Diff hunk: the lines marked `NN-` are the removed reduce_results(), which
// averaged the first gpu::get_num_threads() entries of a `results` array; the
// lines marked `NN+` are the added update_sums() that replaces it.
BenchmarkResult
22-
reduce_results(const cpp::array<BenchmarkResult, 1024> &results) {
23-
BenchmarkResult result;
24-
uint64_t cycles_sum = 0;
25-
double standard_deviation_sum = 0;
26-
uint64_t min = UINT64_MAX;
27-
uint64_t max = 0;
28-
uint32_t samples_sum = 0;
29-
uint32_t iterations_sum = 0;
30-
clock_t time_sum = 0;
31-
uint64_t num_threads = gpu::get_num_threads();
32-
for (uint64_t i = 0; i < num_threads; i++) {
33-
BenchmarkResult current_result = results[i];
34-
cycles_sum += current_result.cycles;
35-
standard_deviation_sum += current_result.standard_deviation;
36-
min = cpp::min(min, current_result.min);
37-
max = cpp::max(max, current_result.max);
38-
samples_sum += current_result.samples;
39-
iterations_sum += current_result.total_iterations;
40-
time_sum += current_result.total_time;
21+
// Folds one thread's BenchmarkResult into the shared atomic accumulators.
// Increments `active_threads` by one, adds the cycles, standard deviation,
// samples, total_iterations and total_time fields to their respective sums,
// and lowers `min` / raises `max` through compare-exchange loops. The body is
// bracketed by gpu::memory_fence() calls; every fetch_add uses RELAXED
// ordering.
void update_sums(const BenchmarkResult &current_result,
22+
cpp::Atomic<uint64_t> &active_threads,
23+
cpp::Atomic<uint64_t> &cycles_sum,
24+
cpp::Atomic<uint64_t> &standard_deviation_sum,
25+
cpp::Atomic<uint64_t> &min, cpp::Atomic<uint64_t> &max,
26+
cpp::Atomic<uint32_t> &samples_sum,
27+
cpp::Atomic<uint32_t> &iterations_sum,
28+
cpp::Atomic<clock_t> &time_sum) {
29+
gpu::memory_fence();
30+
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
31+
32+
cycles_sum.fetch_add(current_result.cycles, cpp::MemoryOrder::RELAXED);
33+
// NOTE(review): the cast below drops the fractional part of each thread's
// standard deviation before it is summed; the removed reduce_results kept
// this sum in a double.
standard_deviation_sum.fetch_add(
34+
static_cast<uint64_t>(current_result.standard_deviation),
35+
cpp::MemoryOrder::RELAXED);
36+
37+
// Perform a CAS loop to atomically update the min
// The desired value is re-evaluated from `orig_min` on every retry because it
// is part of the loop condition, so the loop body is intentionally empty.
// This presumes compare_exchange_strong refreshes `orig_min` with the observed
// value on failure, as std::atomic does -- TODO confirm for cpp::Atomic.
38+
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
39+
while (!min.compare_exchange_strong(
40+
orig_min, cpp::min(orig_min, current_result.min),
41+
cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
4142
}
42-
result.cycles = cycles_sum / num_threads;
43-
result.standard_deviation = standard_deviation_sum / num_threads;
44-
result.min = min;
45-
result.max = max;
46-
result.samples = samples_sum / num_threads;
47-
result.total_iterations = iterations_sum / num_threads;
48-
result.total_time = time_sum / num_threads;
49-
return result;
43+
44+
// Perform a CAS loop to atomically update the max
// Same retry shape as the min update above, using cpp::max.
45+
uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
46+
while (!max.compare_exchange_strong(
47+
orig_max, cpp::max(orig_max, current_result.max),
48+
cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
49+
}
50+
51+
samples_sum.fetch_add(current_result.samples, cpp::MemoryOrder::RELAXED);
52+
iterations_sum.fetch_add(current_result.total_iterations,
53+
cpp::MemoryOrder::RELAXED);
54+
time_sum.fetch_add(current_result.total_time, cpp::MemoryOrder::RELAXED);
55+
gpu::memory_fence();
56+
}
57+
58+
// Namespace-scope accumulators for the benchmark currently being run. They
// are passed to update_sums() by every thread, read by print_results(), and
// reset to these same initial values by thread 0 at the top of each iteration
// of the loop in Benchmark::run_benchmarks().
cpp::Atomic<uint64_t> cycles_sum = 0;
59+
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
60+
cpp::Atomic<uint64_t> min = UINT64_MAX;
61+
cpp::Atomic<uint64_t> max = 0;
62+
cpp::Atomic<uint32_t> samples_sum = 0;
63+
cpp::Atomic<uint32_t> iterations_sum = 0;
64+
cpp::Atomic<clock_t> time_sum = 0;
65+
cpp::Atomic<uint64_t> active_threads = 0;
66+
67+
// Logs the aggregated result for benchmark `b`.
// Reads the namespace-scope atomic accumulators, divides each sum by the
// thread count stored in `active_threads` to obtain per-thread averages
// (`min` and `max` are reported as-is, not averaged), and writes a "[ RUN ]"
// line followed by an "[ OK ]" summary line to `log`.
void print_results(Benchmark *b) {
68+
constexpr auto GREEN = "\033[32m";
69+
constexpr auto RESET = "\033[0m";
70+
71+
BenchmarkResult result;
72+
gpu::memory_fence();
73+
// NOTE(review): `active_threads` is a cpp::Atomic<uint64_t>, so storing the
// load in an `int` narrows it. `num_threads` is also the divisor of every
// average below and is not checked for zero; the visible caller invokes this
// only after update_sums() has run -- confirm no other call path exists.
int num_threads = active_threads.load(cpp::MemoryOrder::RELAXED);
74+
result.cycles = cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
75+
result.standard_deviation =
76+
standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
77+
result.min = min.load(cpp::MemoryOrder::RELAXED);
78+
result.max = max.load(cpp::MemoryOrder::RELAXED);
79+
result.samples = samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
80+
result.total_iterations =
81+
iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
82+
result.total_time = time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
83+
gpu::memory_fence();
84+
// NOTE(review): the "[ RUN ]" line is emitted here, i.e. after the caller has
// already executed b->run(), so it does not mark the start of the run.
log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
85+
log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
86+
<< result.cycles << " cycles, " << result.min << " min, " << result.max
87+
<< " max, " << result.total_iterations << " iterations, "
88+
<< result.total_time << " ns, "
89+
// The standard deviation is printed as an integer (cast to long).
<< static_cast<long>(result.standard_deviation)
90+
<< " stddev (num threads: " << num_threads << ")\n";
5091
}
5192

5293
void Benchmark::run_benchmarks() {
5394
uint64_t id = gpu::get_thread_id();
5495
gpu::sync_threads();
5596

5697
for (Benchmark *b : benchmarks) {
57-
results[id] = b->run();
98+
gpu::memory_fence();
99+
if (id == 0) {
100+
active_threads.store(0, cpp::MemoryOrder::RELAXED);
101+
cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
102+
standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
103+
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
104+
max.store(0, cpp::MemoryOrder::RELAXED);
105+
samples_sum.store(0, cpp::MemoryOrder::RELAXED);
106+
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
107+
time_sum.store(0, cpp::MemoryOrder::RELAXED);
108+
}
109+
gpu::memory_fence();
58110
gpu::sync_threads();
111+
112+
auto current_result = b->run();
113+
update_sums(current_result, active_threads, cycles_sum,
114+
standard_deviation_sum, min, max, samples_sum, iterations_sum,
115+
time_sum);
116+
gpu::sync_threads();
117+
59118
if (id == 0) {
60-
BenchmarkResult all_results = reduce_results(results);
61-
constexpr auto GREEN = "\033[32m";
62-
constexpr auto RESET = "\033[0m";
63-
log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
64-
log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
65-
<< all_results.cycles << " cycles, " << all_results.min << " min, "
66-
<< all_results.max << " max, " << all_results.total_iterations
67-
<< " iterations, " << all_results.total_time << " ns, "
68-
<< static_cast<long>(all_results.standard_deviation) << " stddev\n";
119+
print_results(b);
69120
}
70121
}
71122
gpu::sync_threads();

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ class Benchmark {
8888
}
8989

9090
static void run_benchmarks();
91+
// Returns `name` as a cpp::string_view. This commit moves the accessor ahead
// of the `protected:` label; the free function print_results() calls it.
const cpp::string_view get_name() const { return name; }
9192

9293
protected:
9394
static void add_benchmark(Benchmark *benchmark);
@@ -97,13 +98,12 @@ class Benchmark {
9798
BenchmarkOptions options;
9899
return benchmark(options, func);
99100
}
100-
const cpp::string_view get_name() const { return name; }
101101
};
102102
} // namespace benchmarks
103103
} // namespace LIBC_NAMESPACE_DECL
104104

105105
// Declares a Benchmark object named <SuiteName>_<TestName>_Instance,
// constructed from Func and the string "<SuiteName>.<TestName>". The added
// (`+`) final line drops the trailing semicolon that the removed (`-`) line
// had, so each use of the macro must supply its own semicolon.
#define BENCHMARK(SuiteName, TestName, Func) \
106106
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
107-
Func, #SuiteName "." #TestName);
107+
Func, #SuiteName "." #TestName)
108108

109109
#endif

0 commit comments

Comments
 (0)