Skip to content

Commit 1c5041e

Browse files
committed
[libc++] Vectorize mismatch
1 parent 2d0137d commit 1c5041e

File tree

11 files changed

+335
-190
lines changed

11 files changed

+335
-190
lines changed

libcxx/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ set(BENCHMARK_TESTS
182182
algorithms/make_heap_then_sort_heap.bench.cpp
183183
algorithms/min.bench.cpp
184184
algorithms/min_max_element.bench.cpp
185+
algorithms/mismatch.bench.cpp
185186
algorithms/pop_heap.bench.cpp
186187
algorithms/pstl.stable_sort.bench.cpp
187188
algorithms/push_heap.bench.cpp
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <algorithm>
#include <benchmark/benchmark.h>
#include <random>
#include <vector>
12+
13+
template <class T>
14+
static void bm_mismatch(benchmark::State& state) {
15+
std::vector<T> vec1(state.range(), '1');
16+
std::vector<T> vec2(state.range(), '1');
17+
std::mt19937_64 rng(std::random_device{}());
18+
19+
for (auto _ : state) {
20+
auto idx = rng() % vec1.size();
21+
vec1[idx] = '2';
22+
benchmark::DoNotOptimize(vec1);
23+
benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin()));
24+
vec1[idx] = '1';
25+
}
26+
}
27+
// Register bm_mismatch for char, short and int elements. Each instantiation is
// given the same size arguments: DenseRange(1, 8) followed by Range(16, 1 << 20).
BENCHMARK(bm_mismatch<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
28+
BENCHMARK(bm_mismatch<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
29+
BENCHMARK(bm_mismatch<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
30+
31+
BENCHMARK_MAIN();

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ set(files
217217
__algorithm/shift_right.h
218218
__algorithm/shuffle.h
219219
__algorithm/sift_down.h
220+
__algorithm/simd_utils.h
220221
__algorithm/sort.h
221222
__algorithm/sort_heap.h
222223
__algorithm/stable_partition.h

libcxx/include/__algorithm/mismatch.h

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,89 @@
1111
#define _LIBCPP___ALGORITHM_MISMATCH_H
1212

1313
#include <__algorithm/comp.h>
14+
#include <__algorithm/simd_utils.h>
15+
#include <__algorithm/unwrap_iter.h>
1416
#include <__config>
15-
#include <__iterator/iterator_traits.h>
17+
#include <__functional/identity.h>
18+
#include <__type_traits/invoke.h>
19+
#include <__type_traits/is_constant_evaluated.h>
20+
#include <__type_traits/is_equality_comparable.h>
21+
#include <__type_traits/operation_traits.h>
22+
#include <__utility/move.h>
1623
#include <__utility/pair.h>
1724

1825
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1926
# pragma GCC system_header
2027
#endif
2128

29+
_LIBCPP_PUSH_MACROS
30+
#include <__undef_macros>
31+
2232
_LIBCPP_BEGIN_NAMESPACE_STD
2333

34+
template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2>
35+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2>
36+
__mismatch_loop(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
37+
while (__first1 != __last1) {
38+
if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
39+
break;
40+
++__first1;
41+
++__first2;
42+
}
43+
return std::make_pair(std::move(__first1), std::move(__first2));
44+
}
45+
46+
template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2>
47+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2>
48+
__mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
49+
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
50+
}
51+
52+
#if _LIBCPP_VECTORIZE_ALGORIHTMS
53+
54+
template <class _Tp,
55+
class _Pred,
56+
class _Proj1,
57+
class _Proj2,
58+
__enable_if_t<is_integral<_Tp>::value && __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value &&
59+
__is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
60+
int> = 0>
61+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
62+
__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
63+
constexpr size_t __unroll_count = 4;
64+
constexpr size_t __vec_size = __native_vector_size<_Tp>;
65+
using __vec = __simd_vector<_Tp, __vec_size>;
66+
while (!__libcpp_is_constant_evaluated() && static_cast<size_t>(__last1 - __first1) >= __unroll_count * __vec_size) {
67+
__vec __lhs[__unroll_count];
68+
__vec __rhs[__unroll_count];
69+
70+
for (size_t __i = 0; __i != __unroll_count; ++__i) {
71+
__lhs[__i] = std::__load_vector<__vec_size>(__first1 + __i * __vec_size);
72+
__rhs[__i] = std::__load_vector<__vec_size>(__first2 + __i * __vec_size);
73+
}
74+
75+
for (size_t __i = 0; __i != __unroll_count; ++__i) {
76+
if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
77+
auto __offset = __i * __unroll_count + std::__find_first_not_set(__cmp_res);
78+
return {__first1 + __offset, __first2 + __offset};
79+
}
80+
}
81+
82+
__first1 += __unroll_count * __vec_size;
83+
__first2 += __unroll_count * __vec_size;
84+
}
85+
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
86+
}
87+
88+
#endif // _LIBCPP_VECTORIZE_ALGORIHTMS
89+
2490
// std::mismatch taking one full range, the start of a second range and a
// caller-supplied binary predicate.
//
// This hunk shows both versions of the body. The lines whose gutter marker ends
// in '-' are the removed hand-written loop, which called `__pred` on each pair
// of elements and returned the iterators at the first failing pair. The lines
// whose marker ends in '+' are the replacement: the iterators are passed through
// `std::__unwrap_iter`, the search is delegated to `std::__mismatch` with an
// `__identity` projection for both ranges, and the resulting positions are
// converted back with `std::__rewrap_iter` before being returned as a pair.
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
2591
_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
2692
mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) {
27-
for (; __first1 != __last1; ++__first1, (void)++__first2)
28-
if (!__pred(*__first1, *__first2))
29-
break;
30-
return pair<_InputIterator1, _InputIterator2>(__first1, __first2);
93+
__identity __proj;
94+
auto __res = std::__mismatch(
95+
std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred, __proj, __proj);
96+
return std::make_pair(std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second));
3197
}
3298

3399
template <class _InputIterator1, class _InputIterator2>
@@ -59,4 +125,6 @@ mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi
59125

60126
_LIBCPP_END_NAMESPACE_STD
61127

128+
_LIBCPP_POP_MACROS
129+
62130
#endif // _LIBCPP___ALGORITHM_MISMATCH_H
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H
10+
#define _LIBCPP___ALGORITHM_SIMD_UTILS_H
11+
12+
#include <__bit/bit_cast.h>
13+
#include <__bit/countr.h>
14+
#include <__config>
15+
#include <__type_traits/is_arithmetic.h>
16+
#include <__type_traits/is_same.h>
17+
#include <__utility/integer_sequence.h>
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
22+
# pragma GCC system_header
23+
#endif
24+
25+
#if _LIBCPP_STD_VER >= 14 && __has_attribute(__ext_vector_type__) && __has_builtin(__builtin_reduce_and) && \
26+
__has_builtin(__builtin_convertvector)
27+
# define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1
28+
#else
29+
# define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 0
30+
#endif
31+
32+
#if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS && !defined(__OPTIMIZE_SIZE__)
33+
# define _LIBCPP_VECTORIZE_ALGORIHTMS 1
34+
#else
35+
# define _LIBCPP_VECTORIZE_ALGORIHTMS 0
36+
#endif
37+
38+
#if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS
39+
40+
_LIBCPP_BEGIN_NAMESPACE_STD
41+
42+
// Number of `_Tp` elements that fit into the widest vector register advertised
// by the target's predefined macros: 64 bytes for AVX-512F, 32 for AVX, 16 for
// SSE or NEON, 8 for MMX, and a single element when none of them is defined.
//
// NEON is detected through both `__ARM_NEON` (the spelling specified by the Arm
// C Language Extensions) and the legacy `__ARM_NEON__`, so that targets which
// only define the former do not fall through to the one-element fallback.
# if defined(__AVX512F__)
template <class _Tp>
inline constexpr size_t __native_vector_size = 64 / sizeof(_Tp);
# elif defined(__AVX__)
template <class _Tp>
inline constexpr size_t __native_vector_size = 32 / sizeof(_Tp);
# elif defined(__SSE__) || defined(__ARM_NEON) || defined(__ARM_NEON__)
template <class _Tp>
inline constexpr size_t __native_vector_size = 16 / sizeof(_Tp);
# elif defined(__MMX__)
template <class _Tp>
inline constexpr size_t __native_vector_size = 8 / sizeof(_Tp);
# else
template <class _Tp>
inline constexpr size_t __native_vector_size = 1;
# endif
58+
59+
template <class _Tp, size_t _Np>
60+
using __simd_vector __attribute__((__ext_vector_type__(_Np))) = _Tp;
61+
62+
template <size_t _Np, class _Tp>
63+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<_Tp, _Np> __load_vector(const _Tp* __ptr) noexcept {
64+
return [=]<size_t... _Indices>(index_sequence<_Indices...>) noexcept {
65+
return __simd_vector<_Tp, _Np>{__ptr[_Indices]...};
66+
}(make_index_sequence<_Np>{});
67+
}
68+
69+
template <class _Tp, size_t _Np>
70+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
71+
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
72+
}
73+
74+
template <class _Tp, size_t _Np>
75+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
76+
using __mask_vec = __simd_vector<bool, _Np>;
77+
78+
auto __impl = [&]<class _MaskT>(_MaskT) noexcept {
79+
return std::__countr_zero(std::__bit_cast<_MaskT>(__builtin_convertvector(__vec, __mask_vec)));
80+
};
81+
82+
if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
83+
return __impl(uint8_t{});
84+
} else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) {
85+
return __impl(uint16_t{});
86+
} else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) {
87+
return __impl(uint32_t{});
88+
} else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) {
89+
return __impl(uint64_t{});
90+
} else {
91+
static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type");
92+
}
93+
}
94+
95+
template <class _Tp, size_t _Np>
96+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<_Tp, _Np> __vec) noexcept {
97+
return std::__find_first_set(~__vec);
98+
}
99+
100+
_LIBCPP_END_NAMESPACE_STD
101+
102+
#endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS
104+
105+
#endif // _LIBCPP___ALGORITHM_SIMD_UTILS_H

libcxx/include/__bit/bit_cast.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919

2020
_LIBCPP_BEGIN_NAMESPACE_STD
2121

22+
#ifndef _LIBCPP_CXX03_LANG
23+
24+
// Internal bit-cast helper: produces a `_ToType` from `__from` by forwarding to
// the `__builtin_bit_cast` builtin. It is declared whenever `_LIBCPP_CXX03_LANG`
// is not defined, outside of the `_LIBCPP_STD_VER >= 20` section that follows.
template <class _ToType, class _FromType>
25+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI constexpr _ToType __bit_cast(const _FromType& __from) noexcept {
26+
return __builtin_bit_cast(_ToType, __from);
27+
}
28+
29+
#endif // _LIBCPP_CXX03_LANG
30+
2231
#if _LIBCPP_STD_VER >= 20
2332

2433
template <class _ToType, class _FromType>

libcxx/include/__bit/countr.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,8 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_ct
3535
return __builtin_ctzll(__x);
3636
}
3737

38-
#if _LIBCPP_STD_VER >= 20
39-
40-
template <__libcpp_unsigned_integer _Tp>
41-
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept {
38+
template <class _Tp>
39+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countr_zero(_Tp __t) _NOEXCEPT {
4240
if (__t == 0)
4341
return numeric_limits<_Tp>::digits;
4442

@@ -59,6 +57,13 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) n
5957
}
6058
}
6159

60+
#if _LIBCPP_STD_VER >= 20
61+
62+
template <__libcpp_unsigned_integer _Tp>
63+
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept {
64+
return std::__countr_zero(__t);
65+
}
66+
6267
template <__libcpp_unsigned_integer _Tp>
6368
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_one(_Tp __t) noexcept {
6469
return __t != numeric_limits<_Tp>::max() ? std::countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits;

libcxx/include/libcxx.imp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@
217217
{ include: [ "<__algorithm/shift_right.h>", "private", "<algorithm>", "public" ] },
218218
{ include: [ "<__algorithm/shuffle.h>", "private", "<algorithm>", "public" ] },
219219
{ include: [ "<__algorithm/sift_down.h>", "private", "<algorithm>", "public" ] },
220+
{ include: [ "<__algorithm/simd_utils.h>", "private", "<algorithm>", "public" ] },
220221
{ include: [ "<__algorithm/sort.h>", "private", "<algorithm>", "public" ] },
221222
{ include: [ "<__algorithm/sort_heap.h>", "private", "<algorithm>", "public" ] },
222223
{ include: [ "<__algorithm/stable_partition.h>", "private", "<algorithm>", "public" ] },

libcxx/include/module.modulemap.in

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,10 @@ module std_private_algorithm_minmax [system
697697
export *
698698
}
699699
module std_private_algorithm_minmax_element [system] { header "__algorithm/minmax_element.h" }
700-
module std_private_algorithm_mismatch [system] { header "__algorithm/mismatch.h" }
700+
module std_private_algorithm_mismatch [system] {
701+
header "__algorithm/mismatch.h"
702+
export std_private_algorithm_simd_utils
703+
}
701704
module std_private_algorithm_move [system] { header "__algorithm/move.h" }
702705
module std_private_algorithm_move_backward [system] { header "__algorithm/move_backward.h" }
703706
module std_private_algorithm_next_permutation [system] { header "__algorithm/next_permutation.h" }
@@ -1048,6 +1051,7 @@ module std_private_algorithm_sort [system
10481051
header "__algorithm/sort.h"
10491052
export std_private_debug_utils_strict_weak_ordering_check
10501053
}
1054+
module std_private_algorithm_simd_utils [system] { header "__algorithm/simd_utils.h" }
10511055
module std_private_algorithm_sort_heap [system] { header "__algorithm/sort_heap.h" }
10521056
module std_private_algorithm_stable_partition [system] { header "__algorithm/stable_partition.h" }
10531057
module std_private_algorithm_stable_sort [system] { header "__algorithm/stable_sort.h" }

0 commit comments

Comments
 (0)