Skip to content

Commit 4538802

Browse files
authored
[MicroBenchmarks/LoopVectorization] Add Microbenchmark for Epilogue Vectorization (#165)
This microbenchmark attempts to test the impact of epilogue vectorization on loops with varying epilogue lengths and vector widths for loops with and without reductions. PR: #165
1 parent 4403710 commit 4538802

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

MicroBenchmarks/LoopVectorization/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,12 @@ llvm_test_executable(LoopInterleavingBenchmarks
2626
)
2727

2828
target_link_libraries(LoopInterleavingBenchmarks benchmark)
29+
30+
llvm_test_run()

# Epilogue vectorization microbenchmark: built from main.cpp and
# EpilogueVectorization.cpp and linked against the benchmark library.
llvm_test_executable(LoopEpilogueVectorizationBenchmarks
  main.cpp
  EpilogueVectorization.cpp
)

target_link_libraries(LoopEpilogueVectorizationBenchmarks benchmark)
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// This program tests the performance impact of epilogue vectorization
// with varying epilogue lengths and vector widths.
3+
#include <cstdint>
#include <limits>
#include <memory>
#include <random>
#include <type_traits>
#include <utility>

#include "benchmark/benchmark.h"
8+
9+
// Pseudo-random engine used to fill the benchmark inputs. It is
// default-constructed, so it starts from the engine's fixed default seed.
static std::mt19937 rng;

// Running total of the values returned by the benchmarked kernels; the
// benchmark loop adds each kernel's return value to it.
uint64_t g_sum = 0;

// Initialize array A with random numbers.
//
// Fills the first \p N elements of \p A with values drawn uniformly from the
// full range of \p Ty using the file-level generator `rng`. Elements at index
// \p N and beyond are left untouched.
//
// std::uniform_int_distribution is only specified for short, int, long,
// long long and their unsigned counterparts. Instantiating it with a
// character-sized type such as uint8_t is undefined behaviour and is rejected
// at compile time by some standard libraries. One-byte element types are
// therefore sampled through a 16-bit distribution that is still bounded to
// Ty's range, and the result is converted back on store.
template <typename Ty>
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
  using WideTy = typename std::conditional<std::is_signed<Ty>::value, short,
                                           unsigned short>::type;
  using DistTy = typename std::conditional<sizeof(Ty) == 1, WideTy, Ty>::type;

  std::uniform_int_distribution<DistTy> distrib(
      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
  for (unsigned I = 0; I < N; I++)
    A[I] = static_cast<Ty>(distrib(rng));
}
20+
21+
// Calls \p f with the perfectly forwarded \p args and returns its result as a
// uint64_t. The wrapper carries the optnone attribute, the intent being that
// the callee is not optimized based on the arguments it is called with.
template <typename F, typename... Args>
static uint64_t __attribute__((optnone))
callThroughOptnone(F &&f, Args &&...args) {
  return f(std::forward<Args>(args)...);
}
26+
27+
template <typename Ty>
28+
static void __attribute__((always_inline))
29+
runBenchForEpilogueVectorization(benchmark::State &state,
30+
uint64_t (*Fn)(Ty *, Ty *, Ty *, int)) {
31+
auto Iterations = state.range(0);
32+
std::unique_ptr<Ty[]> A(new Ty[Iterations]);
33+
std::unique_ptr<Ty[]> B(new Ty[Iterations]);
34+
std::unique_ptr<Ty[]> C(new Ty[Iterations]);
35+
init_data(A, Iterations);
36+
init_data(B, Iterations);
37+
init_data(C, Iterations);
38+
for (auto _ : state) {
39+
benchmark::DoNotOptimize(A);
40+
benchmark::DoNotOptimize(B);
41+
benchmark::DoNotOptimize(C);
42+
benchmark::ClobberMemory();
43+
g_sum += callThroughOptnone(Fn, &A[0], &B[0], &C[0], Iterations);
44+
}
45+
}
46+
47+
// Element-wise addition kernel: for every index in [0, Iterations) stores
// B[index] + C[index] into A[index]. Always returns 0; the uint64_t return
// type matches the signature of the reduction kernel so both can be passed
// to the same benchmark driver.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  for (int Idx = 0; Idx < Iterations; ++Idx)
    A[Idx] = B[Idx] + C[Idx];
  return 0;
}
55+
56+
// Reduction kernel: returns the sum of the first \p Iterations elements of
// \p A, accumulated in a 64-bit unsigned integer. \p B and \p C are not read;
// they are accepted so the signature matches the element-wise kernel.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopWithReductionAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  uint64_t Total = 0;
  for (int Idx = 0; Idx < Iterations; ++Idx)
    Total += A[Idx];
  return Total;
}
65+
66+
// Benchmark entry point for the element-wise addition kernel (loopAutoVec)
// instantiated for element type \p Ty.
template <typename Ty>
void benchAutoVec(benchmark::State &state) {
  runBenchForEpilogueVectorization<Ty>(state, loopAutoVec<Ty>);
}
69+
70+
// Benchmark entry point for the reduction kernel (loopWithReductionAutoVec)
// instantiated for element type \p Ty.
template <typename Ty>
void benchReductionAutoVec(benchmark::State &state) {
  runBenchForEpilogueVectorization<Ty>(state, loopWithReductionAutoVec<Ty>);
}
73+
74+
#ifdef ALL_LOOP_EPILOGUE_TESTS
75+
BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->DenseRange(65, 127, 1);
76+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->DenseRange(65, 127, 1);
77+
BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->DenseRange(65, 127, 1);
78+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->DenseRange(65, 127, 1);
79+
BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->DenseRange(65, 127, 1);
80+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->DenseRange(65, 127, 1);
81+
#else
82+
BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->Arg(65)->Arg(127);
83+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->Arg(65)->Arg(127);
84+
BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->Arg(65)->Arg(127);
85+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->Arg(65)->Arg(127);
86+
BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->Arg(65)->Arg(127);
87+
BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->Arg(65)->Arg(127);
88+
#endif

0 commit comments

Comments
 (0)