Skip to content

Commit a12744f

Browse files
authored
[libc++] Optimize ranges::swap_ranges for vector<bool>::iterator (#121150)
This PR optimizes the performance of `std::ranges::swap_ranges` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to **611x** for aligned range swaps and **78x** for unaligned range swaps. Additionally, comprehensive tests covering up to 4 storage words (256 bits) with odd and even bit sizes are provided to validate the optimizations proposed in this patch.
1 parent b08769c commit a12744f

File tree

6 files changed

+291
-151
lines changed

6 files changed

+291
-151
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ Improvements and New Features
5050
- The ``std::ranges::equal`` algorithm has been optimized for ``std::vector<bool>::iterator``, resulting in a performance
5151
improvement of up to 188x.
5252

53+
- The ``std::ranges::swap_ranges`` algorithm has been optimized for ``std::vector<bool>::iterator``, resulting in a
54+
performance improvement of up to 611x.
55+
5356
- Updated formatting library to Unicode 16.0.0.
5457

5558
Deprecations and Removals

libcxx/include/__algorithm/swap_ranges.h

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@
1010
#define _LIBCPP___ALGORITHM_SWAP_RANGES_H
1111

1212
#include <__algorithm/iterator_operations.h>
13+
#include <__algorithm/min.h>
1314
#include <__config>
15+
#include <__fwd/bit_reference.h>
1416
#include <__utility/move.h>
1517
#include <__utility/pair.h>
18+
#include <__utility/swap.h>
1619

1720
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1821
# pragma GCC system_header
@@ -23,6 +26,165 @@ _LIBCPP_PUSH_MACROS
2326

2427
_LIBCPP_BEGIN_NAMESPACE_STD
2528

29+
// Swaps the bits of [__first, __last) with the equally long run of bits that
// starts at __result, and returns __result advanced past the last swapped bit.
// Each iterator is handled through __seg_ (the storage word it points into) and
// __ctz_ (the bit offset inside that word). The same mask is applied to the
// source word and the destination word, so this routine relies on both
// iterators having the same bit offset; the 2+1 bit-iterator __swap_ranges
// overload only dispatches here when __first1.__ctz_ == __first2.__ctz_.
template <class _Cl, class _Cr>
30+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_aligned(
31+
__bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
32+
using _I1 = __bit_iterator<_Cl, false>;
33+
using difference_type = typename _I1::difference_type;
34+
using __storage_type = typename _I1::__storage_type;
35+
36+
const int __bits_per_word = _I1::__bits_per_word;
37+
// __n counts the bits that still have to be swapped.
difference_type __n = __last - __first;
38+
if (__n > 0) {
39+
// do first word: the range starts in the middle of a word, so only the bits
// from __first.__ctz_ upwards (at most __n of them) may be touched.
40+
if (__first.__ctz_ != 0) {
41+
// Number of bits from the current offset to the end of the word.
unsigned __clz = __bits_per_word - __first.__ctz_;
42+
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
43+
__n -= __dn;
44+
// Mask selecting exactly the __dn bits starting at offset __first.__ctz_.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
45+
// Extract the masked bits from each word, clear them, then OR in the bits
// taken from the other word; bits outside the mask are left untouched.
__storage_type __b1 = *__first.__seg_ & __m;
46+
*__first.__seg_ &= ~__m;
47+
__storage_type __b2 = *__result.__seg_ & __m;
48+
*__result.__seg_ &= ~__m;
49+
*__result.__seg_ |= __b1;
50+
*__first.__seg_ |= __b2;
51+
// Advance __result by __dn bits, carrying into the next word when needed.
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
52+
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
53+
++__first.__seg_;
54+
// __first.__ctz_ = 0;
55+
}
56+
// __first.__ctz_ == 0;
57+
// do middle words: while a full word remains, swap whole storage words.
58+
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_)
59+
swap(*__first.__seg_, *__result.__seg_);
60+
// do last word: fewer than __bits_per_word bits remain; swap only the low
// __n bits of the two words.
61+
if (__n > 0) {
62+
// Mask selecting the low __n bits.
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
63+
__storage_type __b1 = *__first.__seg_ & __m;
64+
*__first.__seg_ &= ~__m;
65+
__storage_type __b2 = *__result.__seg_ & __m;
66+
*__result.__seg_ &= ~__m;
67+
*__result.__seg_ |= __b1;
68+
*__first.__seg_ |= __b2;
69+
// The returned iterator points just past the last swapped bit.
__result.__ctz_ = static_cast<unsigned>(__n);
70+
}
71+
}
72+
return __result;
73+
}
74+
75+
// Swaps the bits of [__first, __last) with the equally long run of bits that
// starts at __result when the two iterators sit at different bit offsets, and
// returns __result advanced past the last swapped bit. Because the offsets
// differ, the bits taken from one source word generally land in two adjacent
// destination words, so every step moves bits with explicit shifts.
// NOTE(review): the middle-word loop shifts by
// __clz_r = __bits_per_word - __result.__ctz_, which would presumably be a
// shift by the full word width if __result.__ctz_ were 0 there; the code appears
// to rely on callers taking this path only when the two offsets differ (the 2+1
// __swap_ranges overload checks this) - confirm before calling it directly.
template <class _Cl, class _Cr>
76+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_unaligned(
77+
__bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
78+
using _I1 = __bit_iterator<_Cl, false>;
79+
using difference_type = typename _I1::difference_type;
80+
using __storage_type = typename _I1::__storage_type;
81+
82+
const int __bits_per_word = _I1::__bits_per_word;
83+
// __n counts the bits that still have to be swapped.
difference_type __n = __last - __first;
84+
if (__n > 0) {
85+
// do first word: the source range starts in the middle of a word.
86+
if (__first.__ctz_ != 0) {
87+
// Bits from the source offset to the end of the source word.
unsigned __clz_f = __bits_per_word - __first.__ctz_;
88+
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
89+
__n -= __dn;
90+
// Pull the __dn source bits out of the source word and clear them there.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
91+
__storage_type __b1 = *__first.__seg_ & __m;
92+
*__first.__seg_ &= ~__m;
93+
// Bits from the destination offset to the end of the destination word.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
94+
// __ddn is how many of the __dn bits fit into the current destination word.
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
95+
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
96+
__storage_type __b2 = *__result.__seg_ & __m;
97+
*__result.__seg_ &= ~__m;
98+
// Shift each extracted bit group by the offset difference so it lines up
// with the hole cleared in the other word.
if (__result.__ctz_ > __first.__ctz_) {
99+
unsigned __s = __result.__ctz_ - __first.__ctz_;
100+
*__result.__seg_ |= __b1 << __s;
101+
*__first.__seg_ |= __b2 >> __s;
102+
} else {
103+
unsigned __s = __first.__ctz_ - __result.__ctz_;
104+
*__result.__seg_ |= __b1 >> __s;
105+
*__first.__seg_ |= __b2 << __s;
106+
}
107+
// Advance __result by the __ddn bits just written.
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
108+
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
109+
__dn -= __ddn;
110+
// Whatever did not fit spills into the low __dn bits of the next
// destination word.
if (__dn > 0) {
111+
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
112+
__b2 = *__result.__seg_ & __m;
113+
*__result.__seg_ &= ~__m;
114+
// The leftover source bits start __ddn bits above the source offset.
unsigned __s = __first.__ctz_ + __ddn;
115+
*__result.__seg_ |= __b1 >> __s;
116+
*__first.__seg_ |= __b2 << __s;
117+
__result.__ctz_ = static_cast<unsigned>(__dn);
118+
}
119+
++__first.__seg_;
120+
// __first.__ctz_ = 0;
121+
}
122+
// __first.__ctz_ == 0;
123+
// do middle words: each full source word is exchanged with a span that
// straddles two destination words.
124+
// __m selects the bits of a destination word at or above __result.__ctz_.
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
125+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
126+
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
127+
__storage_type __b1 = *__first.__seg_;
128+
// Upper part of the current destination word <-> low bits of the source word.
__storage_type __b2 = *__result.__seg_ & __m;
129+
*__result.__seg_ &= ~__m;
130+
*__result.__seg_ |= __b1 << __result.__ctz_;
131+
// Plain assignment: the whole source word is being replaced.
*__first.__seg_ = __b2 >> __result.__ctz_;
132+
++__result.__seg_;
133+
// Lower part of the next destination word <-> high bits of the source word.
__b2 = *__result.__seg_ & ~__m;
134+
*__result.__seg_ &= __m;
135+
*__result.__seg_ |= __b1 >> __clz_r;
136+
*__first.__seg_ |= __b2 << __clz_r;
137+
}
138+
// do last word: fewer than __bits_per_word source bits remain, stored in the
// low __n bits of the source word.
139+
if (__n > 0) {
140+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
141+
__storage_type __b1 = *__first.__seg_ & __m;
142+
*__first.__seg_ &= ~__m;
143+
// __dn is how many of the remaining bits fit into the current destination word.
__storage_type __dn = std::min<__storage_type>(__n, __clz_r);
144+
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
145+
__storage_type __b2 = *__result.__seg_ & __m;
146+
*__result.__seg_ &= ~__m;
147+
*__result.__seg_ |= __b1 << __result.__ctz_;
148+
*__first.__seg_ |= __b2 >> __result.__ctz_;
149+
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
150+
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
151+
__n -= __dn;
152+
// The rest goes into the low __n bits of the next destination word.
if (__n > 0) {
153+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
154+
__b2 = *__result.__seg_ & __m;
155+
*__result.__seg_ &= ~__m;
156+
*__result.__seg_ |= __b1 >> __dn;
157+
*__first.__seg_ |= __b2 << __dn;
158+
__result.__ctz_ = static_cast<unsigned>(__n);
159+
}
160+
}
161+
}
162+
return __result;
163+
}
164+
165+
// 2+1 iterators: size2 >= size1; used by std::swap_ranges.
// Overload for two non-const __bit_iterator ranges. Swaps the bits of
// [__first1, __last1) with the bits starting at __first2 and returns the pair
// {__last1, iterator one past the last bit swapped in the second range}.
// The first, unnamed template parameter is not used in the body; the 2+2
// bit-iterator overload passes its _AlgPolicy there.
166+
template <class, class _Cl, class _Cr>
167+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
168+
__swap_ranges(__bit_iterator<_Cl, false> __first1,
169+
__bit_iterator<_Cl, false> __last1,
170+
__bit_iterator<_Cr, false> __first2) {
171+
// Equal bit offsets: matching bits sit at the same position in their words,
// so the aligned routine can be used.
if (__first1.__ctz_ == __first2.__ctz_)
172+
return std::make_pair(__last1, std::__swap_ranges_aligned(__first1, __last1, __first2));
173+
// Different bit offsets: use the routine that shifts bits between words.
return std::make_pair(__last1, std::__swap_ranges_unaligned(__first1, __last1, __first2));
174+
}
175+
176+
// 2+2 iterators: used by std::ranges::swap_ranges.
// Overload for two non-const __bit_iterator ranges that both carry an end
// iterator. Only as many bits as the shorter range holds are swapped; the
// returned pair holds the end positions reached in the first and second range.
177+
template <class _AlgPolicy, class _Cl, class _Cr>
178+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
179+
__swap_ranges(__bit_iterator<_Cl, false> __first1,
180+
__bit_iterator<_Cl, false> __last1,
181+
__bit_iterator<_Cr, false> __first2,
182+
__bit_iterator<_Cr, false> __last2) {
183+
// First range is strictly shorter: it bounds the swap, and the second range
// ends wherever the 2+1 overload stopped.
if (__last1 - __first1 < __last2 - __first2)
184+
return std::make_pair(__last1, std::__swap_ranges<_AlgPolicy>(__first1, __last1, __first2).second);
185+
// Otherwise the second range bounds the swap: call the 2+1 overload with the
// roles exchanged and report the position reached in the first range.
return std::make_pair(std::__swap_ranges<_AlgPolicy>(__first2, __last2, __first1).second, __last2);
186+
}
187+
26188
// 2+2 iterators: the shorter size will be used.
27189
template <class _AlgPolicy, class _ForwardIterator1, class _Sentinel1, class _ForwardIterator2, class _Sentinel2>
28190
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2>

libcxx/include/__bit_reference

Lines changed: 6 additions & 151 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <__algorithm/copy_n.h>
1717
#include <__algorithm/equal.h>
1818
#include <__algorithm/min.h>
19+
#include <__algorithm/swap_ranges.h>
1920
#include <__assert>
2021
#include <__bit/countr.h>
2122
#include <__compare/ordering.h>
@@ -215,152 +216,6 @@ private:
215216
__mask_(__m) {}
216217
};
217218

218-
// swap_ranges
219-
220-
// Swaps the bits of [__first, __last) with the equally long run of bits that
// starts at __result, and returns __result advanced past the last swapped bit.
// Each iterator is handled through __seg_ (the storage word it points into) and
// __ctz_ (the bit offset inside that word). The same mask is applied to the
// source word and the destination word, so this routine relies on both
// iterators having the same bit offset; the bit-iterator swap_ranges overload
// only dispatches here when __first1.__ctz_ == __first2.__ctz_.
template <class _Cl, class _Cr>
221-
_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_aligned(
222-
__bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
223-
using _I1 = __bit_iterator<_Cl, false>;
224-
using difference_type = typename _I1::difference_type;
225-
using __storage_type = typename _I1::__storage_type;
226-
227-
const int __bits_per_word = _I1::__bits_per_word;
228-
// __n counts the bits that still have to be swapped.
difference_type __n = __last - __first;
229-
if (__n > 0) {
230-
// do first word: the range starts in the middle of a word, so only the bits
// from __first.__ctz_ upwards (at most __n of them) may be touched.
231-
if (__first.__ctz_ != 0) {
232-
// Number of bits from the current offset to the end of the word.
unsigned __clz = __bits_per_word - __first.__ctz_;
233-
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
234-
__n -= __dn;
235-
// Mask selecting exactly the __dn bits starting at offset __first.__ctz_.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
236-
// Extract the masked bits from each word, clear them, then OR in the bits
// taken from the other word; bits outside the mask are left untouched.
__storage_type __b1 = *__first.__seg_ & __m;
237-
*__first.__seg_ &= ~__m;
238-
__storage_type __b2 = *__result.__seg_ & __m;
239-
*__result.__seg_ &= ~__m;
240-
*__result.__seg_ |= __b1;
241-
*__first.__seg_ |= __b2;
242-
// Advance __result by __dn bits, carrying into the next word when needed.
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
243-
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
244-
++__first.__seg_;
245-
// __first.__ctz_ = 0;
246-
}
247-
// __first.__ctz_ == 0;
248-
// do middle words: while a full word remains, swap whole storage words.
249-
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_)
250-
swap(*__first.__seg_, *__result.__seg_);
251-
// do last word: fewer than __bits_per_word bits remain; swap only the low
// __n bits of the two words.
252-
if (__n > 0) {
253-
// Mask selecting the low __n bits.
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
254-
__storage_type __b1 = *__first.__seg_ & __m;
255-
*__first.__seg_ &= ~__m;
256-
__storage_type __b2 = *__result.__seg_ & __m;
257-
*__result.__seg_ &= ~__m;
258-
*__result.__seg_ |= __b1;
259-
*__first.__seg_ |= __b2;
260-
// The returned iterator points just past the last swapped bit.
__result.__ctz_ = static_cast<unsigned>(__n);
261-
}
262-
}
263-
return __result;
264-
}
265-
266-
// Swaps the bits of [__first, __last) with the equally long run of bits that
// starts at __result when the two iterators sit at different bit offsets, and
// returns __result advanced past the last swapped bit. Because the offsets
// differ, the bits taken from one source word generally land in two adjacent
// destination words, so every step moves bits with explicit shifts.
// NOTE(review): the middle-word loop shifts by
// __clz_r = __bits_per_word - __result.__ctz_, which would presumably be a
// shift by the full word width if __result.__ctz_ were 0 there; the code appears
// to rely on callers taking this path only when the two offsets differ (the
// bit-iterator swap_ranges overload checks this) - confirm before calling it
// directly.
template <class _Cl, class _Cr>
267-
_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_unaligned(
268-
__bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
269-
using _I1 = __bit_iterator<_Cl, false>;
270-
using difference_type = typename _I1::difference_type;
271-
using __storage_type = typename _I1::__storage_type;
272-
273-
const int __bits_per_word = _I1::__bits_per_word;
274-
// __n counts the bits that still have to be swapped.
difference_type __n = __last - __first;
275-
if (__n > 0) {
276-
// do first word: the source range starts in the middle of a word.
277-
if (__first.__ctz_ != 0) {
278-
// Bits from the source offset to the end of the source word.
unsigned __clz_f = __bits_per_word - __first.__ctz_;
279-
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
280-
__n -= __dn;
281-
// Pull the __dn source bits out of the source word and clear them there.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
282-
__storage_type __b1 = *__first.__seg_ & __m;
283-
*__first.__seg_ &= ~__m;
284-
// Bits from the destination offset to the end of the destination word.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
285-
// __ddn is how many of the __dn bits fit into the current destination word.
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
286-
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
287-
__storage_type __b2 = *__result.__seg_ & __m;
288-
*__result.__seg_ &= ~__m;
289-
// Shift each extracted bit group by the offset difference so it lines up
// with the hole cleared in the other word.
if (__result.__ctz_ > __first.__ctz_) {
290-
unsigned __s = __result.__ctz_ - __first.__ctz_;
291-
*__result.__seg_ |= __b1 << __s;
292-
*__first.__seg_ |= __b2 >> __s;
293-
} else {
294-
unsigned __s = __first.__ctz_ - __result.__ctz_;
295-
*__result.__seg_ |= __b1 >> __s;
296-
*__first.__seg_ |= __b2 << __s;
297-
}
298-
// Advance __result by the __ddn bits just written.
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
299-
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
300-
__dn -= __ddn;
301-
// Whatever did not fit spills into the low __dn bits of the next
// destination word.
if (__dn > 0) {
302-
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
303-
__b2 = *__result.__seg_ & __m;
304-
*__result.__seg_ &= ~__m;
305-
// The leftover source bits start __ddn bits above the source offset.
unsigned __s = __first.__ctz_ + __ddn;
306-
*__result.__seg_ |= __b1 >> __s;
307-
*__first.__seg_ |= __b2 << __s;
308-
__result.__ctz_ = static_cast<unsigned>(__dn);
309-
}
310-
++__first.__seg_;
311-
// __first.__ctz_ = 0;
312-
}
313-
// __first.__ctz_ == 0;
314-
// do middle words: each full source word is exchanged with a span that
// straddles two destination words.
315-
// __m selects the bits of a destination word at or above __result.__ctz_.
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
316-
unsigned __clz_r = __bits_per_word - __result.__ctz_;
317-
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
318-
__storage_type __b1 = *__first.__seg_;
319-
// Upper part of the current destination word <-> low bits of the source word.
__storage_type __b2 = *__result.__seg_ & __m;
320-
*__result.__seg_ &= ~__m;
321-
*__result.__seg_ |= __b1 << __result.__ctz_;
322-
// Plain assignment: the whole source word is being replaced.
*__first.__seg_ = __b2 >> __result.__ctz_;
323-
++__result.__seg_;
324-
// Lower part of the next destination word <-> high bits of the source word.
__b2 = *__result.__seg_ & ~__m;
325-
*__result.__seg_ &= __m;
326-
*__result.__seg_ |= __b1 >> __clz_r;
327-
*__first.__seg_ |= __b2 << __clz_r;
328-
}
329-
// do last word: fewer than __bits_per_word source bits remain, stored in the
// low __n bits of the source word.
330-
if (__n > 0) {
331-
__m = ~__storage_type(0) >> (__bits_per_word - __n);
332-
__storage_type __b1 = *__first.__seg_ & __m;
333-
*__first.__seg_ &= ~__m;
334-
// __dn is how many of the remaining bits fit into the current destination word.
__storage_type __dn = std::min<__storage_type>(__n, __clz_r);
335-
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
336-
__storage_type __b2 = *__result.__seg_ & __m;
337-
*__result.__seg_ &= ~__m;
338-
*__result.__seg_ |= __b1 << __result.__ctz_;
339-
*__first.__seg_ |= __b2 >> __result.__ctz_;
340-
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
341-
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
342-
__n -= __dn;
343-
// The rest goes into the low __n bits of the next destination word.
if (__n > 0) {
344-
__m = ~__storage_type(0) >> (__bits_per_word - __n);
345-
__b2 = *__result.__seg_ & __m;
346-
*__result.__seg_ &= ~__m;
347-
*__result.__seg_ |= __b1 >> __dn;
348-
*__first.__seg_ |= __b2 << __dn;
349-
__result.__ctz_ = static_cast<unsigned>(__n);
350-
}
351-
}
352-
}
353-
return __result;
354-
}
355-
356-
// swap_ranges overload for non-const __bit_iterator arguments. Swaps the bits
// of [__first1, __last1) with the bits starting at __first2 and returns the
// iterator one past the last bit swapped in the second range.
template <class _Cl, class _Cr>
357-
inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> swap_ranges(
358-
__bit_iterator<_Cl, false> __first1, __bit_iterator<_Cl, false> __last1, __bit_iterator<_Cr, false> __first2) {
359-
// Equal bit offsets: matching bits sit at the same position in their words,
// so the aligned routine can be used.
if (__first1.__ctz_ == __first2.__ctz_)
360-
return std::__swap_ranges_aligned(__first1, __last1, __first2);
361-
// Different bit offsets: use the routine that shifts bits between words.
return std::__swap_ranges_unaligned(__first1, __last1, __first2);
362-
}
363-
364219
// rotate
365220

366221
template <class _Cp>
@@ -644,14 +499,14 @@ private:
644499
template <class _AlgPolicy>
645500
friend struct __copy_backward_impl;
646501
template <class _Cl, class _Cr>
647-
friend __bit_iterator<_Cr, false>
502+
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false>
648503
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
649504
template <class _Cl, class _Cr>
650-
friend __bit_iterator<_Cr, false>
505+
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false>
651506
__swap_ranges_unaligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
652-
template <class _Cl, class _Cr>
653-
friend __bit_iterator<_Cr, false>
654-
swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
507+
template <class, class _Cl, class _Cr>
508+
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
509+
__swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
655510
template <class _Dp>
656511
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
657512
rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>);

0 commit comments

Comments
 (0)