|
| 1 | +// This program tests performance impact of Epilogue Vectorization |
| 2 | +// with varying epilogue lengths, and vector widths. |
#include <cstdint>
#include <limits>
#include <memory>
#include <random>
#include <type_traits>
#include <utility>

#include "benchmark/benchmark.h"
| 8 | + |
// Pseudo-random engine shared by every init_data() call. It is
// default-constructed and never explicitly seeded in this file.
static std::mt19937 rng;
// Running total of the values returned by the benchmarked kernels.
uint64_t g_sum = 0;

// Fill the first \p N elements of \p A with random values covering the full
// range of \p Ty.
//
// std::uniform_int_distribution is only specified for short, int, long,
// long long and their unsigned variants; instantiating it with a
// character-sized type such as uint8_t is undefined and is rejected outright
// by some standard libraries. The values are therefore drawn from a 64-bit
// distribution of matching signedness, bounded to Ty's range, and cast down.
template <typename Ty>
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
  using WideTy = typename std::conditional<std::is_signed<Ty>::value, int64_t,
                                           uint64_t>::type;
  std::uniform_int_distribution<WideTy> distrib(
      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
  for (unsigned I = 0; I < N; I++)
    A[I] = static_cast<Ty>(distrib(rng)); // Always in Ty's range; cast is lossless.
}
| 20 | + |
// Forwards \p args to \p f from inside a function marked optnone, so the
// callee is not optimized based on the argument values that are visible at
// the call site. The callee's result is returned as a uint64_t.
template <typename F, typename... Args>
__attribute__((optnone)) static uint64_t callThroughOptnone(F &&f,
                                                            Args &&...args) {
  uint64_t Result = f(std::forward<Args>(args)...);
  return Result;
}
| 26 | + |
| 27 | +template <typename Ty> |
| 28 | +static void __attribute__((always_inline)) |
| 29 | +runBenchForEpilogueVectorization(benchmark::State &state, |
| 30 | + uint64_t (*Fn)(Ty *, Ty *, Ty *, int)) { |
| 31 | + auto Iterations = state.range(0); |
| 32 | + std::unique_ptr<Ty[]> A(new Ty[Iterations]); |
| 33 | + std::unique_ptr<Ty[]> B(new Ty[Iterations]); |
| 34 | + std::unique_ptr<Ty[]> C(new Ty[Iterations]); |
| 35 | + init_data(A, Iterations); |
| 36 | + init_data(B, Iterations); |
| 37 | + init_data(C, Iterations); |
| 38 | + for (auto _ : state) { |
| 39 | + benchmark::DoNotOptimize(A); |
| 40 | + benchmark::DoNotOptimize(B); |
| 41 | + benchmark::DoNotOptimize(C); |
| 42 | + benchmark::ClobberMemory(); |
| 43 | + g_sum += callThroughOptnone(Fn, &A[0], &B[0], &C[0], Iterations); |
| 44 | + } |
| 45 | +} |
| 46 | + |
// Element-wise addition kernel: stores B[i] + C[i] into A[i] for the first
// \p Iterations elements. Marked noinline so the loop is compiled as its own
// function. Always returns 0; the return value only exists so the kernel
// matches the function-pointer signature used by the benchmark driver.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  for (int Idx = 0; Idx < Iterations; ++Idx)
    A[Idx] = B[Idx] + C[Idx];
  return 0;
}
| 55 | + |
// Sum-reduction kernel: adds up the first \p Iterations elements of \p A in a
// 64-bit accumulator and returns the total. \p B and \p C are not read; they
// are only present so the kernel matches the function-pointer signature used
// by the benchmark driver. Marked noinline so the loop is compiled as its own
// function.
template <typename Ty>
static uint64_t __attribute__((noinline))
loopWithReductionAutoVec(Ty *A, Ty *B, Ty *C, int Iterations) {
  uint64_t Total = 0;
  for (int Idx = 0; Idx < Iterations; ++Idx)
    Total += A[Idx];
  return Total;
}
| 65 | + |
| 66 | +template <typename Ty> void benchAutoVec(benchmark::State &state) { |
| 67 | + runBenchForEpilogueVectorization<Ty>(state, &loopAutoVec<Ty>); |
| 68 | +} |
| 69 | + |
| 70 | +template <typename Ty> void benchReductionAutoVec(benchmark::State &state) { |
| 71 | + runBenchForEpilogueVectorization<Ty>(state, &loopWithReductionAutoVec<Ty>); |
| 72 | +} |
| 73 | + |
| 74 | +#ifdef ALL_LOOP_EPILOGUE_TESTS |
| 75 | +BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->DenseRange(65, 127, 1); |
| 76 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->DenseRange(65, 127, 1); |
| 77 | +BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->DenseRange(65, 127, 1); |
| 78 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->DenseRange(65, 127, 1); |
| 79 | +BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->DenseRange(65, 127, 1); |
| 80 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->DenseRange(65, 127, 1); |
| 81 | +#else |
| 82 | +BENCHMARK_TEMPLATE(benchAutoVec, uint8_t)->Arg(65)->Arg(127); |
| 83 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint8_t)->Arg(65)->Arg(127); |
| 84 | +BENCHMARK_TEMPLATE(benchAutoVec, uint16_t)->Arg(65)->Arg(127); |
| 85 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint16_t)->Arg(65)->Arg(127); |
| 86 | +BENCHMARK_TEMPLATE(benchAutoVec, uint32_t)->Arg(65)->Arg(127); |
| 87 | +BENCHMARK_TEMPLATE(benchReductionAutoVec, uint32_t)->Arg(65)->Arg(127); |
| 88 | +#endif |
0 commit comments