1
1
#include " LibcGpuBenchmark.h"
2
2
#include " src/__support/CPP/algorithm.h"
3
3
#include " src/__support/CPP/array.h"
4
+ #include " src/__support/CPP/atomic.h"
4
5
#include " src/__support/CPP/string.h"
5
6
#include " src/__support/FPUtil/sqrt.h"
6
7
#include " src/__support/GPU/utils.h"
@@ -12,60 +13,110 @@ namespace LIBC_NAMESPACE_DECL {
12
13
namespace benchmarks {
13
14
14
15
// Global registry of benchmarks: filled by Benchmark::add_benchmark and walked
// by Benchmark::run_benchmarks. The 64 is FixedVector's second template
// argument -- presumably its capacity; confirm against FixedVector.
FixedVector<Benchmark *, 64 > benchmarks;
15
- cpp::array<BenchmarkResult, 1024 > results;
16
16
17
17
// Appends `benchmark` to the global `benchmarks` registry, which
// Benchmark::run_benchmarks later iterates over.
// NOTE(review): any status returned by push_back is ignored here -- confirm
// what FixedVector does once the registry is full.
void Benchmark::add_benchmark(Benchmark *benchmark) {
  benchmarks.push_back(benchmark);
}
20
20
21
- BenchmarkResult
22
- reduce_results (const cpp::array<BenchmarkResult, 1024 > &results) {
23
- BenchmarkResult result;
24
- uint64_t cycles_sum = 0 ;
25
- double standard_deviation_sum = 0 ;
26
- uint64_t min = UINT64_MAX;
27
- uint64_t max = 0 ;
28
- uint32_t samples_sum = 0 ;
29
- uint32_t iterations_sum = 0 ;
30
- clock_t time_sum = 0 ;
31
- uint64_t num_threads = gpu::get_num_threads ();
32
- for (uint64_t i = 0 ; i < num_threads; i++) {
33
- BenchmarkResult current_result = results[i];
34
- cycles_sum += current_result.cycles ;
35
- standard_deviation_sum += current_result.standard_deviation ;
36
- min = cpp::min (min, current_result.min );
37
- max = cpp::max (max, current_result.max );
38
- samples_sum += current_result.samples ;
39
- iterations_sum += current_result.total_iterations ;
40
- time_sum += current_result.total_time ;
21
+ void update_sums (const BenchmarkResult ¤t_result,
22
+ cpp::Atomic<uint64_t > &active_threads,
23
+ cpp::Atomic<uint64_t > &cycles_sum,
24
+ cpp::Atomic<uint64_t > &standard_deviation_sum,
25
+ cpp::Atomic<uint64_t > &min, cpp::Atomic<uint64_t > &max,
26
+ cpp::Atomic<uint32_t > &samples_sum,
27
+ cpp::Atomic<uint32_t > &iterations_sum,
28
+ cpp::Atomic<clock_t > &time_sum) {
29
+ gpu::memory_fence ();
30
+ active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
31
+
32
+ cycles_sum.fetch_add (current_result.cycles , cpp::MemoryOrder::RELAXED);
33
+ standard_deviation_sum.fetch_add (
34
+ static_cast <uint64_t >(current_result.standard_deviation ),
35
+ cpp::MemoryOrder::RELAXED);
36
+
37
+ // Perform a CAS loop to atomically update the min
38
+ uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
39
+ while (!min.compare_exchange_strong (
40
+ orig_min, cpp::min (orig_min, current_result.min ),
41
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
41
42
}
42
- result.cycles = cycles_sum / num_threads;
43
- result.standard_deviation = standard_deviation_sum / num_threads;
44
- result.min = min;
45
- result.max = max;
46
- result.samples = samples_sum / num_threads;
47
- result.total_iterations = iterations_sum / num_threads;
48
- result.total_time = time_sum / num_threads;
49
- return result;
43
+
44
+ // Perform a CAS loop to atomically update the max
45
+ uint64_t orig_max = max.load (cpp::MemoryOrder::RELAXED);
46
+ while (!max.compare_exchange_strong (
47
+ orig_max, cpp::max (orig_max, current_result.max ),
48
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
49
+ }
50
+
51
+ samples_sum.fetch_add (current_result.samples , cpp::MemoryOrder::RELAXED);
52
+ iterations_sum.fetch_add (current_result.total_iterations ,
53
+ cpp::MemoryOrder::RELAXED);
54
+ time_sum.fetch_add (current_result.total_time , cpp::MemoryOrder::RELAXED);
55
+ gpu::memory_fence ();
56
+ }
57
+
58
+ cpp::Atomic<uint64_t > cycles_sum = 0 ;
59
+ cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
60
+ cpp::Atomic<uint64_t > min = UINT64_MAX;
61
+ cpp::Atomic<uint64_t > max = 0 ;
62
+ cpp::Atomic<uint32_t > samples_sum = 0 ;
63
+ cpp::Atomic<uint32_t > iterations_sum = 0 ;
64
+ cpp::Atomic<clock_t > time_sum = 0 ;
65
+ cpp::Atomic<uint64_t > active_threads = 0 ;
66
+
67
+ void print_results (Benchmark *b) {
68
+ constexpr auto GREEN = " \033 [32m" ;
69
+ constexpr auto RESET = " \033 [0m" ;
70
+
71
+ BenchmarkResult result;
72
+ gpu::memory_fence ();
73
+ int num_threads = active_threads.load (cpp::MemoryOrder::RELAXED);
74
+ result.cycles = cycles_sum.load (cpp::MemoryOrder::RELAXED) / num_threads;
75
+ result.standard_deviation =
76
+ standard_deviation_sum.load (cpp::MemoryOrder::RELAXED) / num_threads;
77
+ result.min = min.load (cpp::MemoryOrder::RELAXED);
78
+ result.max = max.load (cpp::MemoryOrder::RELAXED);
79
+ result.samples = samples_sum.load (cpp::MemoryOrder::RELAXED) / num_threads;
80
+ result.total_iterations =
81
+ iterations_sum.load (cpp::MemoryOrder::RELAXED) / num_threads;
82
+ result.total_time = time_sum.load (cpp::MemoryOrder::RELAXED) / num_threads;
83
+ gpu::memory_fence ();
84
+ log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
85
+ log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
86
+ << result.cycles << " cycles, " << result.min << " min, " << result.max
87
+ << " max, " << result.total_iterations << " iterations, "
88
+ << result.total_time << " ns, "
89
+ << static_cast <long >(result.standard_deviation )
90
+ << " stddev (num threads: " << num_threads << " )\n " ;
50
91
}
51
92
52
93
void Benchmark::run_benchmarks () {
53
94
uint64_t id = gpu::get_thread_id ();
54
95
gpu::sync_threads ();
55
96
56
97
for (Benchmark *b : benchmarks) {
57
- results[id] = b->run ();
98
+ gpu::memory_fence ();
99
+ if (id == 0 ) {
100
+ active_threads.store (0 , cpp::MemoryOrder::RELAXED);
101
+ cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
102
+ standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
103
+ min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
104
+ max.store (0 , cpp::MemoryOrder::RELAXED);
105
+ samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
106
+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
107
+ time_sum.store (0 , cpp::MemoryOrder::RELAXED);
108
+ }
109
+ gpu::memory_fence ();
58
110
gpu::sync_threads ();
111
+
112
+ auto current_result = b->run ();
113
+ update_sums (current_result, active_threads, cycles_sum,
114
+ standard_deviation_sum, min, max, samples_sum, iterations_sum,
115
+ time_sum);
116
+ gpu::sync_threads ();
117
+
59
118
if (id == 0 ) {
60
- BenchmarkResult all_results = reduce_results (results);
61
- constexpr auto GREEN = " \033 [32m" ;
62
- constexpr auto RESET = " \033 [0m" ;
63
- log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
64
- log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
65
- << all_results.cycles << " cycles, " << all_results.min << " min, "
66
- << all_results.max << " max, " << all_results.total_iterations
67
- << " iterations, " << all_results.total_time << " ns, "
68
- << static_cast <long >(all_results.standard_deviation ) << " stddev\n " ;
119
+ print_results (b);
69
120
}
70
121
}
71
122
gpu::sync_threads ();
0 commit comments