+// This program tests vectorized truncates & zero-extends for performance and
+// correctness
 #include <iostream>
 #include <memory>
 #include <random>
@@ -11,70 +13,178 @@ static std::mt19937 rng;
 // Initialize array A with random numbers.
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
-  std::uniform_int_distribution<uint64_t> distrib(
-      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
-  for (unsigned i = 0; i < N; i++)
-    A[i] = static_cast<Ty>(distrib(rng));
+  std::uniform_int_distribution<Ty> distrib(std::numeric_limits<Ty>::min(),
+                                            std::numeric_limits<Ty>::max());
+  for (unsigned I = 0; I < N; I++)
+    A[I] = distrib(rng);
+}
+
+// Truncate/Zero-extend elements to create expected results with no
+// vectorization
+template <typename Ty1, typename Ty2>
+static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(disable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int iterations) {
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop with
+// vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// with vectorization width 8
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
+                                               int Iterations) {
+#pragma clang loop vectorize_width(8) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// with vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
+                                                int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoopWithVW8(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecInLoop(benchmark::State &state,
+                             void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  Fn(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
-  }
-}
-
-// Truncate/Zero-extend each vector element in a vectorized loop
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
-#pragma clang loop interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoop(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
+                                    void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
   init_data(B, ITERATIONS);
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
 // Add vectorized truncate or zero-extend operation benchmarks for different element types
-#define ADD_BENCHMARK(ty1, ty2) \
-  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state); \
+#define ADD_BENCHMARK(ty1, ty2) \
+  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW8); \
   } \
-  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
-  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoop<ty1, ty2>(state); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW16); \
  } \
-  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW8); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW16); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);
 
 /* Vectorized truncate operations */
-ADD_BENCHMARK(uint64_t, uint8_t)
-ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint16_t, uint8_t)
-
+ADD_BENCHMARK(uint32_t, uint8_t)
+ADD_BENCHMARK(uint64_t, uint8_t)
+ADD_BENCHMARK(uint32_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint32_t)
 
 /* Vectorized zero extend operations */
+ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
+ADD_BENCHMARK(uint8_t, uint64_t)
+ADD_BENCHMARK(uint16_t, uint32_t)
+ADD_BENCHMARK(uint16_t, uint64_t)
+ADD_BENCHMARK(uint32_t, uint64_t)
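For readers tracing the ADD_BENCHMARK macro above: each invocation such as ADD_BENCHMARK(uint64_t, uint8_t) registers six benchmarks, one per kernel variant. Hand-expanding the first two registrations gives roughly the following (an illustrative expansion derived from the macro text, not code that appears in the commit):

void benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_(
    benchmark::State &state) {
  // Runs the width-8 vectorized truncation kernel through the shared harness,
  // which first checks the result against the unvectorized reference.
  benchForTruncOrZextVecInLoop<uint64_t, uint8_t>(state,
                                                  &truncOrZextVecInLoopWithVW8);
}
BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_);

void benchForTruncOrZextVecInLoopWithVW16From_uint64_t_To_uint8_t_(
    benchmark::State &state) {
  benchForTruncOrZextVecInLoop<uint64_t, uint8_t>(
      state, &truncOrZextVecInLoopWithVW16);
}
BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_uint64_t_To_uint8_t_);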
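As a reminder of what the B[I] = A[I] kernels above compute: assigning between unsigned integer types of different widths is a truncation (keep only the low bits) when narrowing and a zero extension (fill the high bits with zeros) when widening. A minimal standalone illustration, independent of the benchmark file:

#include <cassert>
#include <cstdint>

int main() {
  // Truncation: uint64_t -> uint8_t keeps only the low 8 bits.
  uint64_t Wide = 0x1234; // 0b0001'0010'0011'0100
  uint8_t Narrow = static_cast<uint8_t>(Wide);
  assert(Narrow == 0x34);

  // Zero extension: uint8_t -> uint32_t pads the high bits with zeros.
  uint8_t Byte = 0xFF;
  uint32_t Word = Byte;
  assert(Word == 0x000000FFu);
  return 0;
}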