Skip to content

Commit e1c9ff3

Browse files
committed
[libc++] Optimize mismatch tail
1 parent 6c6f71a commit e1c9ff3

File tree

3 files changed

+86
-17
lines changed

3 files changed

+86
-17
lines changed

libcxx/benchmarks/algorithms/mismatch.bench.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010
#include <benchmark/benchmark.h>
1111
#include <random>
1212

13+
// Registers the benchmark arguments used by the mismatch benchmarks: the dense range 1..8,
// followed by i - 1, i and i + 1 for every power of two i starting at 16.
// Intended to be passed to BENCHMARK(...)->Apply(BenchmarkSizes).
// NOTE(review): the arguments are presumably element counts; confirm against bm_mismatch.
void BenchmarkSizes(benchmark::internal::Benchmark* Benchmark) {
14+
Benchmark->DenseRange(1, 8);
15+
for (size_t i = 16; i != 1 << 20; i *= 2) { // the loop exits before i reaches 1 << 20, so the last i is 1 << 19
16+
Benchmark->Arg(i - 1); // one below the power of two
17+
Benchmark->Arg(i);
18+
Benchmark->Arg(i + 1); // one above the power of two
19+
}
20+
}
21+
1322
// TODO: Look into benchmarking aligned and unaligned memory explicitly
1423
// (currently things happen to be aligned because they are malloced that way)
1524
template <class T>
@@ -24,8 +33,8 @@ static void bm_mismatch(benchmark::State& state) {
2433
benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin()));
2534
}
2635
}
27-
BENCHMARK(bm_mismatch<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
28-
BENCHMARK(bm_mismatch<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
29-
BENCHMARK(bm_mismatch<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
36+
BENCHMARK(bm_mismatch<char>)->Apply(BenchmarkSizes);
37+
BENCHMARK(bm_mismatch<short>)->Apply(BenchmarkSizes);
38+
BENCHMARK(bm_mismatch<int>)->Apply(BenchmarkSizes);
3039

3140
BENCHMARK_MAIN();

libcxx/include/__algorithm/mismatch.h

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,27 +64,59 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __
6464
constexpr size_t __unroll_count = 4;
6565
constexpr size_t __vec_size = __native_vector_size<_Tp>;
6666
using __vec = __simd_vector<_Tp, __vec_size>;
67-
while (!__libcpp_is_constant_evaluated() && static_cast<size_t>(__last1 - __first1) >= __unroll_count * __vec_size)
68-
[[__unlikely__]] {
69-
__vec __lhs[__unroll_count];
70-
__vec __rhs[__unroll_count];
71-
72-
for (size_t __i = 0; __i != __unroll_count; ++__i) {
73-
__lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size);
74-
__rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size);
67+
68+
if (!__libcpp_is_constant_evaluated()) {
69+
auto __orig_first1 = __first1;
70+
auto __last2 = __first2 + (__last1 - __first1);
71+
while (static_cast<size_t>(__last1 - __first1) >= __unroll_count * __vec_size) [[__unlikely__]] {
72+
__vec __lhs[__unroll_count];
73+
__vec __rhs[__unroll_count];
74+
75+
for (size_t __i = 0; __i != __unroll_count; ++__i) {
76+
__lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size);
77+
__rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size);
78+
}
79+
80+
for (size_t __i = 0; __i != __unroll_count; ++__i) {
81+
if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
82+
auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
83+
return {__first1 + __offset, __first2 + __offset};
84+
}
85+
}
86+
87+
__first1 += __unroll_count * __vec_size;
88+
__first2 += __unroll_count * __vec_size;
7589
}
7690

77-
for (size_t __i = 0; __i != __unroll_count; ++__i) {
78-
if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
79-
auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
91+
// check the remaining 0-3 vectors
92+
while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
93+
if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
94+
!std::__all_of(__cmp_res)) {
95+
auto __offset = std::__find_first_not_set(__cmp_res);
8096
return {__first1 + __offset, __first2 + __offset};
8197
}
98+
__first1 += __vec_size;
99+
__first2 += __vec_size;
82100
}
83101

84-
__first1 += __unroll_count * __vec_size;
85-
__first2 += __unroll_count * __vec_size;
102+
if (__last1 - __first1 == 0)
103+
return {__first1, __first2};
104+
105+
// Check if we can load elements in front of the current pointer. If that's the case load a vector at
106+
// (last - vector_size) to check the remaining elements
107+
if (static_cast<size_t>(__first1 - __orig_first1) >= __vec_size) {
108+
__first1 = __last1 - __vec_size;
109+
__first2 = __last2 - __vec_size;
110+
auto __offset =
111+
std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
112+
return {__first1 + __offset, __first2 + __offset};
113+
} // else loop over the elements individually
114+
115+
// TODO: Consider vectorizing the loop tail further with
116+
// - smaller vectors
117+
// - loading bytes out of range if it's known to be safe
86118
}
87-
// TODO: Consider vectorizing the tail
119+
88120
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
89121
}
90122

libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,5 +144,33 @@ int main(int, char**) {
144144
}
145145
}
146146

147+
{ // check the tail of the vectorized loop
148+
for (size_t vec_size = 1; vec_size != 256; ++vec_size) {
149+
{
150+
std::vector<char> lhs(256);
151+
std::vector<char> rhs(256);
152+
153+
check<char*>(lhs, rhs, lhs.size());
154+
lhs.back() = 1;
155+
check<char*>(lhs, rhs, lhs.size() - 1);
156+
lhs.back() = 0;
157+
rhs.back() = 1;
158+
check<char*>(lhs, rhs, lhs.size() - 1);
159+
rhs.back() = 0;
160+
}
161+
{
162+
std::vector<int> lhs(256);
163+
std::vector<int> rhs(256);
164+
165+
check<int*>(lhs, rhs, lhs.size());
166+
lhs.back() = 1;
167+
check<int*>(lhs, rhs, lhs.size() - 1);
168+
lhs.back() = 0;
169+
rhs.back() = 1;
170+
check<int*>(lhs, rhs, lhs.size() - 1);
171+
rhs.back() = 0;
172+
}
173+
}
174+
}
147175
return 0;
148176
}

0 commit comments

Comments
 (0)