Skip to content

Commit 7717a54

Browse files
authored
[libc++] Optimize ranges::equal for vector<bool>::iterator (#121084)
This PR optimizes the performance of `std::ranges::equal` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to 188x for aligned equality comparison and 82x for unaligned equality comparison. Moreover, comprehensive tests covering up to 4 storage words (256 bits) with odd and even bit sizes are provided, which validate the proposed optimizations in this patch.
1 parent 7ffeab3 commit 7717a54

File tree

8 files changed

+451
-221
lines changed

8 files changed

+451
-221
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ Improvements and New Features
4747
- The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for
4848
``std::vector<bool>::iterator``, resulting in a performance improvement of up to 2000x.
4949

50+
- The ``std::ranges::equal`` algorithm has been optimized for ``std::vector<bool>::iterator``, resulting in a performance
51+
improvement of up to 188x.
52+
5053
- Updated formatting library to Unicode 16.0.0.
5154

5255
Deprecations and Removals

libcxx/include/__algorithm/equal.h

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,20 @@
1111
#define _LIBCPP___ALGORITHM_EQUAL_H
1212

1313
#include <__algorithm/comp.h>
14+
#include <__algorithm/min.h>
1415
#include <__algorithm/unwrap_iter.h>
1516
#include <__config>
1617
#include <__functional/identity.h>
18+
#include <__fwd/bit_reference.h>
1719
#include <__iterator/distance.h>
1820
#include <__iterator/iterator_traits.h>
21+
#include <__memory/pointer_traits.h>
1922
#include <__string/constexpr_c_functions.h>
2023
#include <__type_traits/desugars_to.h>
2124
#include <__type_traits/enable_if.h>
2225
#include <__type_traits/invoke.h>
2326
#include <__type_traits/is_equality_comparable.h>
27+
#include <__type_traits/is_same.h>
2428
#include <__type_traits/is_volatile.h>
2529
#include <__utility/move.h>
2630

@@ -33,6 +37,136 @@ _LIBCPP_PUSH_MACROS
3337

3438
_LIBCPP_BEGIN_NAMESPACE_STD
3539

40+
// Word-at-a-time equality of two bit ranges that start at different bit offsets.
//
// Compares the (__last1 - __first1) bits beginning at __first1 with the same number of bits beginning at
// __first2 and returns false at the first mismatching chunk, true otherwise (including for an empty or
// negative-length range). The dispatchers in this header call this function when
// __first1.__ctz_ != __first2.__ctz_; __ctz_ is used here as the bit offset inside the word *__seg_.
//
// Because the offsets differ, each chunk taken from a single word of range 1 may straddle two adjacent
// words of range 2, so most steps perform two masked comparisons.
//
// NOTE(review): ~__storage_type(0) undergoes integral promotion when __storage_type is narrower than int,
// in which case the right shifts below would operate on a negative int -- confirm that every supported
// __storage_type is at least as wide as int.
template <class _Cp, bool _IsConst1, bool _IsConst2>
41+
[[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
42+
__equal_unaligned(__bit_iterator<_Cp, _IsConst1> __first1,
43+
__bit_iterator<_Cp, _IsConst1> __last1,
44+
__bit_iterator<_Cp, _IsConst2> __first2) {
45+
using _It = __bit_iterator<_Cp, _IsConst1>;
46+
using difference_type = typename _It::difference_type;
47+
using __storage_type = typename _It::__storage_type;
48+
49+
const int __bits_per_word = _It::__bits_per_word;
50+
// __n: number of bits still to be compared.
difference_type __n = __last1 - __first1;
51+
if (__n > 0) {
52+
// do first word
// Range 1 starts mid-word: compare the (at most __clz_f) bits up to the end of its first word.
53+
if (__first1.__ctz_ != 0) {
54+
unsigned __clz_f = __bits_per_word - __first1.__ctz_;
55+
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
56+
__n -= __dn;
57+
// Mask selecting __dn bits of range 1's word, starting at bit __first1.__ctz_.
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
58+
__storage_type __b = *__first1.__seg_ & __m;
59+
unsigned __clz_r = __bits_per_word - __first2.__ctz_;
60+
// __ddn: how many of those __dn bits fit into the current word of range 2.
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
61+
__m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
62+
// Shift range 1's bits so that they line up with range 2's offset before comparing.
if (__first2.__ctz_ > __first1.__ctz_) {
63+
if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_)))
64+
return false;
65+
} else {
66+
if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_)))
67+
return false;
68+
}
69+
// Advance range 2 by __ddn bits (moves to the next word when the current one is exhausted).
__first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word;
70+
__first2.__ctz_ = static_cast<unsigned>((__ddn + __first2.__ctz_) % __bits_per_word);
71+
__dn -= __ddn;
72+
// Any bits left over are compared against the low bits of range 2's next word.
if (__dn > 0) {
73+
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
74+
if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn)))
75+
return false;
76+
__first2.__ctz_ = static_cast<unsigned>(__dn);
77+
}
78+
++__first1.__seg_;
79+
// __first1.__ctz_ = 0;
80+
}
81+
// __first1.__ctz_ == 0;
82+
// do middle words
// Each full word of range 1 is split in two: its low __clz_r bits are matched against the upper part
// of range 2's current word (mask __m), its remaining high bits against the lower part of the next
// word (mask ~__m).
83+
unsigned __clz_r = __bits_per_word - __first2.__ctz_;
84+
__storage_type __m = ~__storage_type(0) << __first2.__ctz_;
85+
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) {
86+
__storage_type __b = *__first1.__seg_;
87+
if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_))
88+
return false;
89+
++__first2.__seg_;
90+
if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r))
91+
return false;
92+
}
93+
// do last word
// Fewer than __bits_per_word bits remain; they start at bit 0 of range 1's current word.
94+
if (__n > 0) {
95+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
96+
__storage_type __b = *__first1.__seg_ & __m;
97+
// __dn: portion of the tail that fits into range 2's current word.
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
98+
__m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
99+
if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_))
100+
return false;
101+
__first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word;
102+
__first2.__ctz_ = static_cast<unsigned>((__dn + __first2.__ctz_) % __bits_per_word);
103+
__n -= __dn;
104+
// The rest of the tail spills into the low bits of range 2's next word.
if (__n > 0) {
105+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
106+
if ((*__first2.__seg_ & __m) != (__b >> __dn))
107+
return false;
108+
}
109+
}
110+
}
111+
return true;
112+
}
113+
114+
// Word-at-a-time equality of two bit ranges that start at the same bit offset.
//
// Compares the (__last1 - __first1) bits beginning at __first1 with the same number of bits beginning at
// __first2 and returns false at the first mismatching word, true otherwise (including for an empty or
// negative-length range). The dispatchers in this header call this function when
// __first1.__ctz_ == __first2.__ctz_, so one mask applies to the corresponding words of both ranges and
// both segment pointers advance in lock step.
//
// NOTE(review): ~__storage_type(0) undergoes integral promotion when __storage_type is narrower than int,
// in which case the right shifts below would operate on a negative int -- confirm that every supported
// __storage_type is at least as wide as int.
template <class _Cp, bool _IsConst1, bool _IsConst2>
115+
[[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
116+
__equal_aligned(__bit_iterator<_Cp, _IsConst1> __first1,
117+
__bit_iterator<_Cp, _IsConst1> __last1,
118+
__bit_iterator<_Cp, _IsConst2> __first2) {
119+
using _It = __bit_iterator<_Cp, _IsConst1>;
120+
using difference_type = typename _It::difference_type;
121+
using __storage_type = typename _It::__storage_type;
122+
123+
const int __bits_per_word = _It::__bits_per_word;
124+
// __n: number of bits still to be compared.
difference_type __n = __last1 - __first1;
125+
if (__n > 0) {
126+
// do first word
// The ranges start mid-word: compare only the bits from the shared offset up to the end of the
// word (or up to __n bits, whichever is smaller).
127+
if (__first1.__ctz_ != 0) {
128+
unsigned __clz = __bits_per_word - __first1.__ctz_;
129+
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
130+
__n -= __dn;
131+
// Mask selecting __dn bits starting at bit __first1.__ctz_.
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
132+
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
133+
return false;
134+
++__first2.__seg_;
135+
++__first1.__seg_;
136+
// __first1.__ctz_ = 0;
137+
// __first2.__ctz_ = 0;
138+
}
139+
// __first1.__ctz_ == 0;
140+
// __first2.__ctz_ == 0;
141+
// do middle words
// Full words can be compared directly, without masking.
142+
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_)
143+
if (*__first2.__seg_ != *__first1.__seg_)
144+
return false;
145+
// do last word
// Compare only the low __n bits of the final, partially covered word.
146+
if (__n > 0) {
147+
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
148+
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
149+
return false;
150+
}
151+
}
152+
return true;
153+
}
154+
155+
// Overload of __equal_iter_impl for a pair of __bit_iterator ranges.
//
// Participates in overload resolution only when __desugars_to_v<__equal_tag, _BinaryPredicate, bool, bool>
// holds, i.e. the predicate is recognized as a plain equality comparison on bool. The predicate argument is
// therefore unnamed and never invoked; the comparison is done on whole storage words instead.
//
// Dispatches on the starting bit offsets: equal offsets use __equal_aligned, differing offsets use
// __equal_unaligned. Both compare __last1 - __first1 bits starting at __first2.
template <class _Cp,
156+
bool _IsConst1,
157+
bool _IsConst2,
158+
class _BinaryPredicate,
159+
__enable_if_t<__desugars_to_v<__equal_tag, _BinaryPredicate, bool, bool>, int> = 0>
160+
[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_iter_impl(
161+
__bit_iterator<_Cp, _IsConst1> __first1,
162+
__bit_iterator<_Cp, _IsConst1> __last1,
163+
__bit_iterator<_Cp, _IsConst2> __first2,
164+
_BinaryPredicate) {
165+
if (__first1.__ctz_ == __first2.__ctz_)
166+
return std::__equal_aligned(__first1, __last1, __first2);
167+
return std::__equal_unaligned(__first1, __last1, __first2);
168+
}
169+
36170
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
37171
[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_iter_impl(
38172
_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate& __pred) {
@@ -94,6 +228,28 @@ __equal_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _Up*, _Pred&, _Proj1&,
94228
return std::__constexpr_memcmp_equal(__first1, __first2, __element_count(__last1 - __first1));
95229
}
96230

231+
// Overload of __equal_impl for a pair of __bit_iterator ranges.
//
// Participates in overload resolution only when the predicate is recognized as a plain equality comparison
// on bool (__desugars_to_v<__equal_tag, _Pred, bool, bool>) and both projections are identity
// (__is_identity<_Proj1>, __is_identity<_Proj2>). The predicate and projections are therefore unnamed and
// never invoked; the comparison is done on whole storage words instead.
//
// The fourth iterator (the end of the second range) is unnamed and unused: exactly __last1 - __first1 bits
// are compared starting at __first2.
// NOTE(review): presumably the caller has already checked that both ranges have the same length -- confirm.
//
// Dispatches on the starting bit offsets: equal offsets use __equal_aligned, differing offsets use
// __equal_unaligned.
template <class _Cp,
232+
bool _IsConst1,
233+
bool _IsConst2,
234+
class _Pred,
235+
class _Proj1,
236+
class _Proj2,
237+
__enable_if_t<__desugars_to_v<__equal_tag, _Pred, bool, bool> && __is_identity<_Proj1>::value &&
238+
__is_identity<_Proj2>::value,
239+
int> = 0>
240+
[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_impl(
241+
__bit_iterator<_Cp, _IsConst1> __first1,
242+
__bit_iterator<_Cp, _IsConst1> __last1,
243+
__bit_iterator<_Cp, _IsConst2> __first2,
244+
__bit_iterator<_Cp, _IsConst2>,
245+
_Pred&,
246+
_Proj1&,
247+
_Proj2&) {
248+
if (__first1.__ctz_ == __first2.__ctz_)
249+
return std::__equal_aligned(__first1, __last1, __first2);
250+
return std::__equal_unaligned(__first1, __last1, __first2);
251+
}
252+
97253
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
98254
[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
99255
equal(_InputIterator1 __first1,

libcxx/include/__bit_reference

Lines changed: 33 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,28 @@
1010
#ifndef _LIBCPP___BIT_REFERENCE
1111
#define _LIBCPP___BIT_REFERENCE
1212

13+
#include <__algorithm/comp.h>
1314
#include <__algorithm/copy.h>
1415
#include <__algorithm/copy_backward.h>
1516
#include <__algorithm/copy_n.h>
17+
#include <__algorithm/equal.h>
1618
#include <__algorithm/min.h>
1719
#include <__assert>
1820
#include <__bit/countr.h>
1921
#include <__compare/ordering.h>
2022
#include <__config>
2123
#include <__cstddef/ptrdiff_t.h>
2224
#include <__cstddef/size_t.h>
25+
#include <__functional/identity.h>
2326
#include <__fwd/bit_reference.h>
2427
#include <__iterator/iterator_traits.h>
2528
#include <__memory/construct_at.h>
2629
#include <__memory/pointer_traits.h>
2730
#include <__type_traits/conditional.h>
31+
#include <__type_traits/desugars_to.h>
2832
#include <__type_traits/enable_if.h>
2933
#include <__type_traits/is_constant_evaluated.h>
34+
#include <__type_traits/is_same.h>
3035
#include <__type_traits/is_unsigned.h>
3136
#include <__type_traits/void_t.h>
3237
#include <__utility/pair.h>
@@ -428,127 +433,6 @@ rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle,
428433
return __r;
429434
}
430435

431-
// equal
432-
433-
template <class _Cp, bool _IC1, bool _IC2>
434-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_unaligned(
435-
__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) {
436-
using _It = __bit_iterator<_Cp, _IC1>;
437-
using difference_type = typename _It::difference_type;
438-
using __storage_type = typename _It::__storage_type;
439-
440-
const int __bits_per_word = _It::__bits_per_word;
441-
difference_type __n = __last1 - __first1;
442-
if (__n > 0) {
443-
// do first word
444-
if (__first1.__ctz_ != 0) {
445-
unsigned __clz_f = __bits_per_word - __first1.__ctz_;
446-
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
447-
__n -= __dn;
448-
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
449-
__storage_type __b = *__first1.__seg_ & __m;
450-
unsigned __clz_r = __bits_per_word - __first2.__ctz_;
451-
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
452-
__m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
453-
if (__first2.__ctz_ > __first1.__ctz_) {
454-
if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_)))
455-
return false;
456-
} else {
457-
if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_)))
458-
return false;
459-
}
460-
__first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word;
461-
__first2.__ctz_ = static_cast<unsigned>((__ddn + __first2.__ctz_) % __bits_per_word);
462-
__dn -= __ddn;
463-
if (__dn > 0) {
464-
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
465-
if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn)))
466-
return false;
467-
__first2.__ctz_ = static_cast<unsigned>(__dn);
468-
}
469-
++__first1.__seg_;
470-
// __first1.__ctz_ = 0;
471-
}
472-
// __first1.__ctz_ == 0;
473-
// do middle words
474-
unsigned __clz_r = __bits_per_word - __first2.__ctz_;
475-
__storage_type __m = ~__storage_type(0) << __first2.__ctz_;
476-
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) {
477-
__storage_type __b = *__first1.__seg_;
478-
if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_))
479-
return false;
480-
++__first2.__seg_;
481-
if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r))
482-
return false;
483-
}
484-
// do last word
485-
if (__n > 0) {
486-
__m = ~__storage_type(0) >> (__bits_per_word - __n);
487-
__storage_type __b = *__first1.__seg_ & __m;
488-
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
489-
__m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
490-
if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_))
491-
return false;
492-
__first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word;
493-
__first2.__ctz_ = static_cast<unsigned>((__dn + __first2.__ctz_) % __bits_per_word);
494-
__n -= __dn;
495-
if (__n > 0) {
496-
__m = ~__storage_type(0) >> (__bits_per_word - __n);
497-
if ((*__first2.__seg_ & __m) != (__b >> __dn))
498-
return false;
499-
}
500-
}
501-
}
502-
return true;
503-
}
504-
505-
template <class _Cp, bool _IC1, bool _IC2>
506-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_aligned(
507-
__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) {
508-
using _It = __bit_iterator<_Cp, _IC1>;
509-
using difference_type = typename _It::difference_type;
510-
using __storage_type = typename _It::__storage_type;
511-
512-
const int __bits_per_word = _It::__bits_per_word;
513-
difference_type __n = __last1 - __first1;
514-
if (__n > 0) {
515-
// do first word
516-
if (__first1.__ctz_ != 0) {
517-
unsigned __clz = __bits_per_word - __first1.__ctz_;
518-
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
519-
__n -= __dn;
520-
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
521-
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
522-
return false;
523-
++__first2.__seg_;
524-
++__first1.__seg_;
525-
// __first1.__ctz_ = 0;
526-
// __first2.__ctz_ = 0;
527-
}
528-
// __first1.__ctz_ == 0;
529-
// __first2.__ctz_ == 0;
530-
// do middle words
531-
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_)
532-
if (*__first2.__seg_ != *__first1.__seg_)
533-
return false;
534-
// do last word
535-
if (__n > 0) {
536-
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
537-
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
538-
return false;
539-
}
540-
}
541-
return true;
542-
}
543-
544-
template <class _Cp, bool _IC1, bool _IC2>
545-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
546-
equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) {
547-
if (__first1.__ctz_ == __first2.__ctz_)
548-
return std::__equal_aligned(__first1, __last1, __first2);
549-
return std::__equal_unaligned(__first1, __last1, __first2);
550-
}
551-
552436
template <class _Cp, bool _IsConst, typename _Cp::__storage_type>
553437
class __bit_iterator {
554438
public:
@@ -771,15 +655,36 @@ private:
771655
template <class _Dp>
772656
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
773657
rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>);
774-
template <class _Dp, bool _IC1, bool _IC2>
775-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool
776-
__equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>);
777-
template <class _Dp, bool _IC1, bool _IC2>
658+
template <class _Dp, bool _IsConst1, bool _IsConst2>
778659
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool
779-
__equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>);
780-
template <class _Dp, bool _IC1, bool _IC2>
660+
__equal_aligned(__bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst2>);
661+
template <class _Dp, bool _IsConst1, bool _IsConst2>
781662
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool
782-
equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>);
663+
__equal_unaligned(__bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst2>);
664+
template <class _Dp,
665+
bool _IsConst1,
666+
bool _IsConst2,
667+
class _BinaryPredicate,
668+
__enable_if_t<__desugars_to_v<__equal_tag, _BinaryPredicate, bool, bool>, int> >
669+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool __equal_iter_impl(
670+
__bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst1>, __bit_iterator<_Dp, _IsConst2>, _BinaryPredicate);
671+
template <class _Dp,
672+
bool _IsConst1,
673+
bool _IsConst2,
674+
class _Pred,
675+
class _Proj1,
676+
class _Proj2,
677+
__enable_if_t<__desugars_to_v<__equal_tag, _Pred, bool, bool> && __is_identity<_Proj1>::value &&
678+
__is_identity<_Proj2>::value,
679+
int> >
680+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool __equal_impl(
681+
__bit_iterator<_Dp, _IsConst1> __first1,
682+
__bit_iterator<_Dp, _IsConst1> __last1,
683+
__bit_iterator<_Dp, _IsConst2> __first2,
684+
__bit_iterator<_Dp, _IsConst2>,
685+
_Pred&,
686+
_Proj1&,
687+
_Proj2&);
783688
template <bool _ToFind, class _Dp, bool _IC>
784689
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, _IC>
785690
__find_bool(__bit_iterator<_Dp, _IC>, typename __size_difference_type_traits<_Dp>::size_type);

0 commit comments

Comments
 (0)