Skip to content

Commit a066217

Browse files
authored
[libc++] Speed up set_intersection() by fast-forwarding over ranges of non-matching elements with one-sided binary search. (#75230)
One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general advantage of being constant time in the best case, with the downside of executing at most 2*log(N) comparisons vs classic binary search's exact log(N). There are two scenarios in which it really shines: the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the container's size upfront, which adds N iterator increments to the complexity. The second one is when traversing the container in order, trying to fast-forward to the next value: in that case the classic algorithm requires at least O(N*log(N)) comparisons and, for non-random-access iterators, O(N^2) iterator increments, whereas the one-sided version will yield O(N) operations on both counts, with a best-case of O(log(N)) comparisons which is very common in practice.
1 parent eb7d54a commit a066217

File tree

15 files changed

+985
-219
lines changed

15 files changed

+985
-219
lines changed

libcxx/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ set(BENCHMARK_TESTS
135135
algorithms/ranges_sort.bench.cpp
136136
algorithms/ranges_sort_heap.bench.cpp
137137
algorithms/ranges_stable_sort.bench.cpp
138+
algorithms/set_intersection.bench.cpp
138139
algorithms/sort.bench.cpp
139140
algorithms/sort_heap.bench.cpp
140141
algorithms/stable_sort.bench.cpp
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <algorithm>
10+
#include <cstdlib>
11+
#include <iterator>
12+
#include <set>
13+
#include <vector>
14+
15+
#include "common.h"
16+
#include "test_iterators.h"
17+
18+
namespace {
19+
20+
// types of containers we'll want to test, covering interesting iterator types
21+
// Container selector for the benchmark matrix: maps a list of template
// arguments onto std::vector and carries the label used in benchmark names.
struct VectorContainer {
  static constexpr const char* Name = "Vector";

  template <typename... Ts>
  using type = std::vector<Ts...>;
};
27+
28+
// Container selector for the benchmark matrix: maps a list of template
// arguments onto std::set and carries the label used in benchmark names.
struct SetContainer {
  static constexpr const char* Name = "Set";

  template <typename... Ts>
  using type = std::set<Ts...>;
};
34+
35+
using AllContainerTypes = std::tuple<VectorContainer, SetContainer>;
36+
37+
// Where the values shared by both inputs are located; set_intersection
// performance may depend on this.
enum class OverlapPosition {
  None = 0,  // the two inputs share no value
  Front = 1, // shared values sit at the front (matches at the back would
             // perform identically, so there is no separate "Back" case)
  Interlaced = 2,
};
44+
45+
// Enumerates the three OverlapPosition values for the benchmark generator
// (presumably as a tuple of types -- see EnumValuesAsTuple in common.h).
// Names holds one label per enumerator, in declaration order.
struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 3> {
  static constexpr const char* Names[] = {
      "None",
      "Front",
      "Interlaced",
  };
};
48+
49+
// Forward-iterator adaptor: every increment moves the wrapped iterator forward
// by `stride_` elements (one base increment at a time, so any incrementable
// iterator works).
//
// The adaptor does not know where the underlying range ends. Callers must make
// sure the end sentinel is reachable in a whole number of strides from the
// start, otherwise incrementing steps past it and never compares equal.
template <typename Wrapped>
struct StridedFwdIt {
  Wrapped base_;
  unsigned stride_;

  using iterator_category = std::forward_iterator_tag;
  using difference_type   = typename Wrapped::difference_type;
  using value_type        = typename Wrapped::value_type;
  using pointer           = typename Wrapped::pointer;
  using reference         = typename Wrapped::reference;

  // `stride` must be non-zero, otherwise the iterator could never advance.
  StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); }

  // Pre-increment returns the iterator itself (by reference), as the iterator
  // requirements mandate; the previous version returned a copy.
  StridedFwdIt& operator++() {
    for (unsigned i = 0; i < stride_; ++i)
      ++base_;
    return *this;
  }

  // Post-increment: advance, but hand back the position before the move.
  StridedFwdIt operator++(int) {
    auto tmp = *this;
    ++*this;
    return tmp;
  }

  // Dereferencing forwards to the wrapped iterator and uses its own
  // reference/pointer types, so wrapping a const iterator works too.
  reference operator*() const { return *base_; }

  // Must yield something `->` can be applied to; the previous version returned
  // a reference, which made `it->member` ill-formed.
  pointer operator->() const { return base_.operator->(); }

  // Equality only looks at the position; the stride is not compared.
  bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; }
  bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
};

template <typename Wrapped>
StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
82+
83+
template <typename T>
84+
std::vector<T> getVectorOfRandom(size_t N) {
85+
std::vector<T> v;
86+
fillValues(v, N, Order::Random);
87+
sortValues(v, Order::Random);
88+
return std::vector<T>(v);
89+
}
90+
91+
// Realistically, data won't all be nicely contiguous in a container,
92+
// we'll go through some effort to ensure that it's shuffled through memory
93+
// this is especially important for containers with non-contiguous element
94+
// storage, but it will affect even a std::vector, because when you copy a
95+
// std::vector<std::string> the underlying data storage position for the char
96+
// arrays of the copy are likely to have high locality
97+
template <class Container>
98+
std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
99+
using ValueType = typename Container::value_type;
100+
auto move_into = [](auto first, auto last) {
101+
Container out;
102+
std::move(first, last, std::inserter(out, out.begin()));
103+
return out;
104+
};
105+
const auto src_size = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
106+
std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
107+
108+
if (pos == OverlapPosition::None) {
109+
std::sort(src.begin(), src.end());
110+
return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(src.begin() + size1, src.end()));
111+
}
112+
113+
// All other overlap types will have to copy some part of the data, but if
114+
// we copy after sorting it will likely have high locality, so we sort
115+
// each copy separately
116+
auto copy = src;
117+
std::sort(src.begin(), src.end());
118+
std::sort(copy.begin(), copy.end());
119+
120+
switch (pos) {
121+
case OverlapPosition::None:
122+
// we like -Wswitch :)
123+
break;
124+
125+
case OverlapPosition::Front:
126+
return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
127+
128+
case OverlapPosition::Interlaced:
129+
const auto stride1 = size1 < size2 ? size2 / size1 : 1;
130+
const auto stride2 = size2 < size1 ? size1 / size2 : 1;
131+
return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
132+
move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
133+
}
134+
std::abort(); // would be std::unreachable() if it could
135+
return std::pair<Container, Container>();
136+
}
137+
138+
template <class ValueType, class Container, class Overlap>
139+
struct SetIntersection {
140+
using ContainerType = typename Container::template type<Value<ValueType>>;
141+
size_t size1_;
142+
size_t size2_;
143+
144+
SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
145+
146+
bool skip() const noexcept {
147+
// let's save some time and skip simmetrical runs
148+
return size1_ < size2_;
149+
}
150+
151+
void run(benchmark::State& state) const {
152+
auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
153+
std::vector<Value<ValueType>> out(std::min(size1_, size2_));
154+
155+
const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
156+
for (const auto& _ : state) {
157+
while (state.KeepRunningBatch(BATCH_SIZE)) {
158+
for (unsigned i = 0; i < BATCH_SIZE; ++i) {
159+
const auto& [c1, c2] = input;
160+
auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
161+
benchmark::DoNotOptimize(res);
162+
}
163+
}
164+
}
165+
}
166+
167+
std::string name() const {
168+
return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' +
169+
std::to_string(size1_) + '_' + std::to_string(size2_);
170+
}
171+
};
172+
173+
} // namespace
174+
175+
// Entry point: lets the benchmark library consume its command-line flags,
// rejects anything it did not recognise, registers one SetIntersection
// benchmark per combination of value type, container type, overlap position
// and pair of sizes, then runs whatever was selected.
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);

  const bool has_unknown_args = benchmark::ReportUnrecognizedArguments(argc, argv);
  if (has_unknown_args)
    return 1;

  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
      Quantities, Quantities);

  benchmark::RunSpecifiedBenchmarks();
  return 0;
}

libcxx/docs/ReleaseNotes/19.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ Improvements and New Features
7171
- The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
7272
up to 100x.
7373

74+
- The ``std::set_intersection`` and ``std::ranges::set_intersection`` algorithms have been optimized to fast-forward over
75+
contiguous ranges of non-matching values, reducing the number of comparisons from linear to
76+
logarithmic growth with the number of elements in best-case scenarios.
77+
7478
- The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
7579

7680
- The ``_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT`` macro has been added to make the declarations in ``<locale>``

libcxx/include/__algorithm/iterator_operations.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <__algorithm/iter_swap.h>
1313
#include <__algorithm/ranges_iterator_concept.h>
14+
#include <__assert>
1415
#include <__config>
1516
#include <__iterator/advance.h>
1617
#include <__iterator/distance.h>
@@ -160,6 +161,59 @@ struct _IterOps<_ClassicAlgPolicy> {
160161
_LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 void __advance_to(_Iter& __first, _Iter __last) {
161162
__first = __last;
162163
}
164+
165+
// advance with sentinel, a la std::ranges::advance
166+
template <class _Iter>
167+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter>
168+
__advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) {
169+
return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
170+
}
171+
172+
private:
173+
// advance with sentinel, a la std::ranges::advance -- InputIterator specialization
174+
template <class _InputIter>
175+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to(
176+
_InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
177+
__difference_type<_InputIter> __dist = 0;
178+
for (; __dist < __count && __iter != __sentinel; ++__dist)
179+
++__iter;
180+
return __count - __dist;
181+
}
182+
183+
// advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
184+
template <class _BiDirIter>
185+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter>
186+
__advance_to(_BiDirIter& __iter,
187+
__difference_type<_BiDirIter> __count,
188+
const _BiDirIter& __sentinel,
189+
bidirectional_iterator_tag) {
190+
__difference_type<_BiDirIter> __dist = 0;
191+
if (__count >= 0)
192+
for (; __dist < __count && __iter != __sentinel; ++__dist)
193+
++__iter;
194+
else
195+
for (__count = -__count; __dist < __count && __iter != __sentinel; ++__dist)
196+
--__iter;
197+
return __count - __dist;
198+
}
199+
200+
// advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
201+
template <class _RandIter>
202+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter>
203+
__advance_to(_RandIter& __iter,
204+
__difference_type<_RandIter> __count,
205+
const _RandIter& __sentinel,
206+
random_access_iterator_tag) {
207+
auto __dist = _IterOps::distance(__iter, __sentinel);
208+
_LIBCPP_ASSERT_VALID_INPUT_RANGE(
209+
__count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");
210+
if (__count < 0)
211+
__dist = __dist > __count ? __dist : __count;
212+
else
213+
__dist = __dist < __count ? __dist : __count;
214+
__iter += __dist;
215+
return __count - __dist;
216+
}
163217
};
164218

165219
_LIBCPP_END_NAMESPACE_STD

libcxx/include/__algorithm/lower_bound.h

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,13 @@
2727

2828
_LIBCPP_BEGIN_NAMESPACE_STD
2929

30-
template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
31-
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
32-
__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
33-
auto __len = _IterOps<_AlgPolicy>::distance(__first, __last);
34-
30+
template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
31+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
32+
_Iter __first,
33+
const _Type& __value,
34+
typename iterator_traits<_Iter>::difference_type __len,
35+
_Comp& __comp,
36+
_Proj& __proj) {
3537
while (__len != 0) {
3638
auto __l2 = std::__half_positive(__len);
3739
_Iter __m = __first;
@@ -46,6 +48,48 @@ __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp,
4648
return __first;
4749
}
4850

51+
// One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
52+
// advantage of being \Omega(1) rather than the classic algorithm's \Omega(log(n)), with the downside of executing at
53+
// most 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
54+
// the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the
55+
// container's size upfront, which adds \Omega(n) iterator increments to the complexity. The second one is when you're
56+
// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
57+
// would yield \Omega(n*log(n)) comparisons and, for non-random-access iterators, \Omega(n^2) iterator increments,
58+
// whereas the one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
59+
// comparisons.
60+
template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
61+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
62+
__lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
63+
// step = 0, ensuring we can always short-circuit when distance is 1 later on
64+
if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
65+
return __first;
66+
67+
using _Distance = typename iterator_traits<_ForwardIterator>::difference_type;
68+
for (_Distance __step = 1; __first != __last; __step <<= 1) {
69+
auto __it = __first;
70+
auto __dist = __step - _IterOps<_AlgPolicy>::__advance_to(__it, __step, __last);
71+
// once we reach the last range where needle can be we must start
72+
// looking inwards, bisecting that range
73+
if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {
74+
// we've already checked the previous value and it was less, we can save
75+
// one comparison by skipping bisection
76+
if (__dist == 1)
77+
return __it;
78+
return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
79+
}
80+
// range not found, move forward!
81+
__first = __it;
82+
}
83+
return __first;
84+
}
85+
86+
template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
87+
_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
88+
__lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
89+
const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
90+
return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
91+
}
92+
4993
template <class _ForwardIterator, class _Tp, class _Compare>
5094
_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
5195
lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {

0 commit comments

Comments
 (0)