llvm
diff --git a/‎libcxx/benchmarks/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎libcxx/benchmarks/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎libcxx/benchmarks/algorithms/set_intersection.bench.cpp
Lines changed: 184 additions & 0 deletions b/‎libcxx/benchmarks/algorithms/set_intersection.bench.cpp
Lines changed: 184 additions & 0 deletions
diff --git a/‎libcxx/docs/ReleaseNotes/19.rst
Lines changed: 4 additions & 0 deletions b/‎libcxx/docs/ReleaseNotes/19.rst
Lines changed: 4 additions & 0 deletions
diff --git a/‎libcxx/include/__algorithm/iterator_operations.h
Lines changed: 54 additions & 0 deletions b/‎libcxx/include/__algorithm/iterator_operations.h
Lines changed: 54 additions & 0 deletions
diff --git a/‎libcxx/include/__algorithm/lower_bound.h
Lines changed: 49 additions & 5 deletions b/‎libcxx/include/__algorithm/lower_bound.h
Lines changed: 49 additions & 5 deletions
@@ -135,6 +135,7 @@ set(BENCHMARK_TESTS
     algorithms/ranges_sort.bench.cpp
     algorithms/ranges_sort_heap.bench.cpp
     algorithms/ranges_stable_sort.bench.cpp
+    algorithms/set_intersection.bench.cpp
     algorithms/sort.bench.cpp
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
 
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <set>
+#include <vector>
+
+#include "common.h"
+#include "test_iterators.h"
+
+namespace {
+
+// types of containers we'll want to test, covering interesting iterator types
+struct VectorContainer {
+  // label spliced into the benchmark name by SetIntersection::name()
+  static constexpr const char* Name = "Vector";
+
+  // container type to instantiate for the given element type (plus any extra template arguments)
+  template <class... Ts>
+  using type = std::vector<Ts...>;
+};
+
+struct SetContainer {
+  // label spliced into the benchmark name by SetIntersection::name()
+  static constexpr const char* Name = "Set";
+
+  // container type to instantiate for the given element type (plus any extra template arguments)
+  template <class... Ts>
+  using type = std::set<Ts...>;
+};
+
+// every container descriptor that main() feeds to the benchmark generator
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer>;
+
+// set_intersection performance may depend on where matching values lie
+enum class OverlapPosition {
+  // the two inputs are built from disjoint slices of one sorted sequence
+  None,
+  // both inputs are prefixes of the same sorted sequence
+  Front,
+  // performance-wise, matches at the back are identical to ones at the front
+  // both inputs are strided samples taken across the same sorted sequence
+  Interlaced,
+};
+
+// Exposes the three OverlapPosition enumerators to the benchmark generator through EnumValuesAsTuple.
+// NOTE(review): Names is listed in enumerator declaration order; presumably it is indexed by enumerator value, so keep
+// both in sync -- confirm against EnumValuesAsTuple.
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 3> {
+  static constexpr const char* Names[] = {"None", "Front", "Interlaced"};
+};
+
+// Forward iterator adaptor which, for each increment, moves the underlying iterator forward stride_ elements.
+// The increment never checks for the end of the underlying range, and equality compares the underlying iterators
+// only: a [begin, end) pair of StridedFwdIt therefore terminates only when the underlying range's length is a
+// multiple of the stride.
+template <typename Wrapped>
+struct StridedFwdIt {
+  Wrapped base_;    // current position in the underlying sequence
+  unsigned stride_; // number of underlying increments performed per increment of this iterator; never 0
+
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type   = typename Wrapped::difference_type;
+  using value_type        = typename Wrapped::value_type;
+  using pointer           = typename Wrapped::pointer;
+  using reference         = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); }
+
+  // pre-increment yields a reference to *this (not a copy), as the iterator requirements demand
+  StridedFwdIt& operator++() {
+    for (unsigned i = 0; i < stride_; ++i)
+      ++base_;
+    return *this;
+  }
+  // post-increment yields the position held before advancing
+  StridedFwdIt operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  value_type& operator*() { return *base_; }
+  const value_type& operator*() const { return *base_; }
+  // operator-> has to produce a pointer for `it->member` to be well-formed
+  value_type* operator->() { return &*base_; }
+  const value_type* operator->() const { return &*base_; }
+  // the stride takes no part in comparisons: two iterators are equal when they wrap the same position
+  bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; }
+  bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
+};
+// lets `StridedFwdIt(it, stride)` deduce Wrapped from its first argument
+template <typename Wrapped>
+StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
+
+// Produces a vector of N elements by running the fillValues and sortValues helpers with Order::Random,
+// and returns a copy of the vector they operated on.
+template <typename T>
+std::vector<T> getVectorOfRandom(size_t N) {
+  std::vector<T> values;
+  fillValues(values, N, Order::Random);
+  sortValues(values, Order::Random);
+  // hand back a freshly copy-constructed vector rather than the working one
+  std::vector<T> result(values);
+  return result;
+}
+
+// Realistically, data won't all be nicely contiguous in a container,
+// we'll go through some effort to ensure that it's shuffled through memory
+// this is especially important for containers with non-contiguous element
+// storage, but it will affect even a std::vector, because when you copy a
+// std::vector<std::string> the underlying data storage position for the char
+// arrays of the copy are likely to have high locality
+// Generates the two sorted inputs for a set_intersection run.
+//   size1, size2 -- number of elements requested for the first / second container
+//   pos          -- how the values of the two containers should overlap
+// Returns the pair (first container, second container). Aborts if pos is not a known enumerator.
+template <class Container>
+std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
+  using ValueType = typename Container::value_type;
+
+  // builds a Container by moving the elements of [first, last) into it
+  const auto build_from = [](auto first, auto last) {
+    Container result;
+    std::move(first, last, std::inserter(result, result.begin()));
+    return result;
+  };
+
+  // disjoint inputs are carved out of one sequence holding both of them; overlapping inputs are each drawn from
+  // the same sequence, which only needs to be as long as the larger one
+  size_t src_size = std::max(size1, size2);
+  if (pos == OverlapPosition::None)
+    src_size = size1 + size2;
+  std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
+
+  if (pos == OverlapPosition::None) {
+    std::sort(src.begin(), src.end());
+    const auto split = src.begin() + size1;
+    return std::make_pair(build_from(src.begin(), split), build_from(split, src.end()));
+  }
+
+  // Every remaining overlap type needs part of the data twice. Copying after sorting would likely give the copy
+  // high locality, so the copy is taken first and each vector is sorted on its own.
+  std::vector<ValueType> copy = src;
+  std::sort(src.begin(), src.end());
+  std::sort(copy.begin(), copy.end());
+
+  switch (pos) {
+  case OverlapPosition::None:
+    // already handled above; listed so that -Wswitch keeps checking the switch is exhaustive
+    break;
+
+  case OverlapPosition::Front:
+    return std::make_pair(build_from(src.begin(), src.begin() + size1), build_from(copy.begin(), copy.begin() + size2));
+
+  case OverlapPosition::Interlaced: {
+    // the smaller input samples the sequence with a stride equal to the size ratio, the larger one takes every element
+    size_t stride1 = 1;
+    size_t stride2 = 1;
+    if (size1 < size2)
+      stride1 = size2 / size1;
+    if (size2 < size1)
+      stride2 = size1 / size2;
+    return std::make_pair(build_from(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
+                          build_from(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
+  }
+  }
+  std::abort(); // would be std::unreachable() if it could
+  return std::pair<Container, Container>();
+}
+
+// Benchmark descriptor: times std::set_intersection over two inputs of size1_ and size2_ elements held in
+// Container::type<Value<ValueType>>, generated by genCacheUnfriendlyData with the overlap selected by Overlap.
+template <class ValueType, class Container, class Overlap>
+struct SetIntersection {
+  using ContainerType = typename Container::template type<Value<ValueType>>;
+  size_t size1_;
+  size_t size2_;
+
+  SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
+
+  bool skip() const noexcept {
+    // let's save some time and skip symmetrical runs
+    return size1_ < size2_;
+  }
+
+  void run(benchmark::State& state) const {
+    auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
+    // the intersection can never hold more elements than the smaller input
+    std::vector<Value<ValueType>> out(std::min(size1_, size2_));
+
+    const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
+    // KeepRunningBatch() must be the only construct driving `state`: wrapping it in a range-for over the same state
+    // would mix two loop protocols that each start and finish the measurement on their own.
+    while (state.KeepRunningBatch(BATCH_SIZE)) {
+      for (size_t i = 0; i < BATCH_SIZE; ++i) {
+        const auto& [c1, c2] = input;
+        auto res             = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+        // keep the result observable so the call is not optimized away
+        benchmark::DoNotOptimize(res);
+      }
+    }
+  }
+
+  // benchmark name: SetIntersection<Overlap>_<Container><ValueType>_<size1>_<size2>
+  std::string name() const {
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' +
+           std::to_string(size1_) + '_' + std::to_string(size2_);
+  }
+};
+
+} // namespace
+
+// Entry point: hands the command line to the benchmark library, registers the SetIntersection benchmarks and runs
+// whichever ones were selected.
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  // refuse to run when an argument was not recognized
+  const bool has_unknown_args = benchmark::ReportUnrecognizedArguments(argc, argv);
+  if (has_unknown_args)
+    return 1;
+
+  // SetIntersection is instantiated over the listed type tuples; Quantities supplies both size arguments
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
+      Quantities, Quantities);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
@@ -71,6 +71,10 @@ Improvements and New Features
 - The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
   up to 100x.
 
+- The ``std::set_intersection`` and ``std::ranges::set_intersection`` algorithms have been optimized to fast-forward over
+  contiguous ranges of non-matching values, reducing the growth of the number of comparisons from linear to
+  logarithmic in the number of elements in best-case scenarios.
+
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
 
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT`` macro has been added to make the declarations in ``<locale>``
 
@@ -11,6 +11,7 @@
 
 #include <__algorithm/iter_swap.h>
 #include <__algorithm/ranges_iterator_concept.h>
+#include <__assert>
 #include <__config>
 #include <__iterator/advance.h>
 #include <__iterator/distance.h>
@@ -160,6 +161,59 @@ struct _IterOps<_ClassicAlgPolicy> {
   _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 void __advance_to(_Iter& __first, _Iter __last) {
     __first = __last;
   }
+
+  // Sentinel-bounded advance, modelled on std::ranges::advance: moves __iter by up to __count positions without
+  // going past __sentinel and returns the part of __count that could not be covered.
+  template <class _Iter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter>
+  __advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) {
+    // dispatch on the iterator category to select one of the tagged overloads
+    using _Category = typename iterator_traits<_Iter>::iterator_category;
+    return _IterOps::__advance_to(__iter, __count, __sentinel, _Category());
+  }
+
+private:
+  // Sentinel-bounded advance, modelled on std::ranges::advance -- InputIterator overload.
+  // Increments __iter one step at a time until __count steps were taken or __sentinel is reached, and returns the
+  // number of requested steps that were not taken.
+  template <class _InputIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to(
+      _InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
+    __difference_type<_InputIter> __steps = 0;
+    while (__steps < __count && __iter != __sentinel) {
+      ++__iter;
+      ++__steps;
+    }
+    return __count - __steps;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
+  // Moves __iter forward (__count >= 0) or backward (__count < 0) one step at a time, stopping early when
+  // __sentinel is reached. Returns __count minus the signed number of steps actually taken, so the remainder is
+  // zero or carries the sign of __count, in line with the random-access overload and std::ranges::advance.
+  template <class _BiDirIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter>
+  __advance_to(_BiDirIter& __iter,
+               __difference_type<_BiDirIter> __count,
+               const _BiDirIter& __sentinel,
+               bidirectional_iterator_tag) {
+    __difference_type<_BiDirIter> __dist = 0;
+    if (__count >= 0)
+      for (; __dist < __count && __iter != __sentinel; ++__dist)
+        ++__iter;
+    else
+      // count downwards rather than negating __count: the result keeps its sign and the most negative value of the
+      // difference type cannot overflow
+      for (; __dist > __count && __iter != __sentinel; --__dist)
+        --__iter;
+    return __count - __dist;
+  }
+
+  // Sentinel-bounded advance, modelled on std::ranges::advance -- random-access overload.
+  // Clamps __count to the signed distance between __iter and __sentinel, jumps there in a single step and returns
+  // the part of __count that was clamped away.
+  template <class _RandIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter>
+  __advance_to(_RandIter& __iter,
+               __difference_type<_RandIter> __count,
+               const _RandIter& __sentinel,
+               random_access_iterator_tag) {
+    auto __jump = _IterOps::distance(__iter, __sentinel);
+    // the sentinel has to lie in the direction of travel
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(
+        __count == 0 || (__jump < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");
+    if (__count < 0) {
+      // moving backwards: take whichever of the two is closer to zero
+      if (!(__jump > __count))
+        __jump = __count;
+    } else {
+      // moving forwards: never jump further than requested
+      if (!(__jump < __count))
+        __jump = __count;
+    }
+    __iter += __jump;
+    return __count - __jump;
+  }
 };
 
 _LIBCPP_END_NAMESPACE_STD
 
@@ -27,11 +27,13 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
-  auto __len = _IterOps<_AlgPolicy>::distance(__first, __last);
-
+template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
+    _Iter __first,
+    const _Type& __value,
+    typename iterator_traits<_Iter>::difference_type __len,
+    _Comp& __comp,
+    _Proj& __proj) {
   while (__len != 0) {
     auto __l2 = std::__half_positive(__len);
     _Iter __m = __first;
@@ -46,6 +48,48 @@ __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp,
   return __first;
 }
 
+// One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
+// advantage of being \Omega(1) rather than the classic algorithm's \Omega(log(n)), with the downside of executing at
+// most 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the
+// container's size upfront, which adds \Omega(n) iterator increments to the complexity. The second one is when you're
+// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
+// would yield \Omega(n*log(n)) comparisons and, for non-random-access iterators, \Omega(n^2) iterator increments,
+// whereas the one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
+// comparisons.
+// Returns the first position in [__first, __last) whose projected element does not compare less than __value (or
+// __last if there is none), probing forward from __first in steps that double each round and bisecting the final
+// step.
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  // step = 0: settle the empty range and the "first element already qualifies" case up front, so that every later
+  // round knows *__first compared less than __value
+  if (__first == __last)
+    return __first;
+  if (!std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
+    return __first;
+
+  using _Distance  = typename iterator_traits<_ForwardIterator>::difference_type;
+  _Distance __step = 1;
+  while (__first != __last) {
+    // probe __step elements ahead (or stop at __last) and work out how far we actually got
+    auto __probe          = __first;
+    const auto __leftover = _IterOps<_AlgPolicy>::__advance_to(__probe, __step, __last);
+    auto __dist           = __step - __leftover;
+
+    // the answer lies within the last step once the probe hit the end or an element that is not less than __value
+    const bool __bracketed =
+        __probe == __last || !std::__invoke(__comp, std::__invoke(__proj, *__probe), __value);
+    if (__bracketed) {
+      // a single-element step needs no bisection: its only predecessor was already found to be less
+      if (__dist == 1)
+        return __probe;
+      return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+    }
+
+    // not there yet: continue from the probe with a doubled step
+    __first = __probe;
+    __step <<= 1;
+  }
+  return __first;
+}
+
+// Classic lower bound: measures the length of [__first, __last) and hands it to the bisecting search.
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  return std::__lower_bound_bisecting<_AlgPolicy>(
+      __first, __value, _IterOps<_AlgPolicy>::distance(__first, __last), __comp, __proj);
+}
+
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
 lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {