Skip to content

[libc++] Optimize ranges::{for_each, for_each_n} for segmented iterators #132896

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions libcxx/docs/ReleaseNotes/21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@ Improvements and New Features
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
in C++23 and later.

- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
- The ``std::for_each_n``, ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for
segmented iterators, resulting in a performance improvement of up to 17.7x for ``std::deque<short>`` iterators, and up
to 13.9x for ``std::ranges::join_view<std::vector<std::vector<short>>>`` iterators.

- The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.
Expand Down
35 changes: 24 additions & 11 deletions libcxx/include/__algorithm/for_each.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,54 @@

#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__functional/identity.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/invoke.h>
#include <__utility/move.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD

// Generic element-by-element loop over [__first, __last).
// NOTE(review): this diff hunk lists two signatures back to back -- a void-returning one that calls
// __f(*__first) directly, and a projection-taking one that returns the iterator.
template <class _InputIterator, class _Sent, class _Func>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __for_each(_InputIterator __first, _Sent __last, _Func& __f) {
// Projection-taking form: for every element, __f is invoked (via std::__invoke) on the result of invoking
// __proj on *__first. The iterator that compared equal to __last is returned.
template <class _InputIterator, class _Sent, class _Func, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
for (; __first != __last; ++__first)
__f(*__first);
std::__invoke(__f, std::__invoke(__proj, *__first));
return __first;
}

#ifndef _LIBCPP_CXX03_LANG
// Overload enabled when __is_segmented_iterator<_SegmentedIterator>::value is true.
// std::__for_each_segment is given a callback that receives a pair of local iterators; each such
// local sub-range is processed with std::__for_each.
template <class _SegmentedIterator,
class _Function,
class _Func,
class _Proj,
__enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) {
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
std::__for_each(__lfirst, __llast, __func);
std::__for_each(__lfirst, __llast, __func, __proj);
});
// __first and __last share one type in this overload, so __last itself serves as the returned end iterator.
return __last;
}
#endif // !_LIBCPP_CXX03_LANG

// std::for_each entry point: forwards the range and the function object to std::__for_each and
// returns the function object __f by value.
template <class _InputIterator, class _Function>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
std::__for_each(__first, __last, __f);
template <class _InputIterator, class _Func>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func
for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
// The four-argument __for_each takes its projection as `_Proj&`, so a named __identity object is created
// to bind to it (presumably a pass-through projection from <__functional/identity.h> -- confirm there).
__identity __proj;
std::__for_each(__first, __last, __f, __proj);
return __f;
}

_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___ALGORITHM_FOR_EACH_H
26 changes: 16 additions & 10 deletions libcxx/include/__algorithm/for_each_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
#include <__algorithm/for_each.h>
#include <__algorithm/for_each_n_segment.h>
#include <__config>
#include <__functional/identity.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/disjunction.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/invoke.h>
#include <__type_traits/negation.h>
#include <__utility/convert_to_integral.h>
#include <__utility/move.h>
Expand All @@ -33,16 +35,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _InputIterator,
class _Size,
class _Func,
class _Proj,
__enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
_Or< _Not<__is_segmented_iterator<_InputIterator> >,
_Not<__has_random_access_local_iterator<_InputIterator> > >::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
_IntegralSize __n = __orig_n;
while (__n > 0) {
__f(*__first);
std::__invoke(__f, std::__invoke(__proj, *__first));
++__first;
--__n;
}
Expand All @@ -52,39 +55,42 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
// Overload enabled when __has_random_access_iterator_category<_RandIter>::value is true.
// The count is converted to the iterator's difference_type, the end iterator is computed as
// __first + __n, and the resulting range is handed to std::__for_each. The computed end iterator is returned.
template <class _RandIter,
class _Size,
class _Func,
class _Proj,
__enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
auto __last = __first + __n;
std::__for_each(__first, __last, __f);
return std::move(__last);
std::__for_each(__first, __last, __f, __proj);
return __last;
}

#ifndef _LIBCPP_CXX03_LANG
// Overload enabled for segmented iterators that do not have a random-access category themselves but
// whose local iterator type does. std::__for_each_n_segment is given a callback that receives a pair
// of local iterators; each local sub-range is processed with std::__for_each, and the value returned
// by std::__for_each_n_segment is returned to the caller.
template <class _SegmentedIterator,
class _Size,
class _Func,
class _Proj,
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
__is_segmented_iterator<_SegmentedIterator>::value &&
__has_random_access_iterator_category<
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
std::__for_each(__lfirst, __llast, __f);
std::__for_each(__lfirst, __llast, __f, __proj);
});
}
#endif // !_LIBCPP_CXX03_LANG

#if _LIBCPP_STD_VER >= 17

// std::for_each_n entry point (inside the `_LIBCPP_STD_VER >= 17` guard): forwards to
// std::__for_each_n and returns the iterator it produces.
template <class _InputIterator, class _Size, class _Function>
template <class _InputIterator, class _Size, class _Func>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
return std::__for_each_n(__first, __orig_n, __f);
for_each_n(_InputIterator __first, _Size __orig_n, _Func __f) {
// The __for_each_n overloads take the projection as `_Proj&`, hence the named __identity object.
__identity __proj;
return std::__for_each_n(__first, __orig_n, __f, __proj);
}

#endif // _LIBCPP_STD_VER >= 17
Expand Down
18 changes: 14 additions & 4 deletions libcxx/include/__algorithm/ranges_for_each.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
#ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
#define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H

#include <__algorithm/for_each.h>
#include <__algorithm/for_each_n.h>
#include <__algorithm/in_fun_result.h>
#include <__concepts/assignable.h>
#include <__config>
#include <__functional/identity.h>
#include <__functional/invoke.h>
#include <__iterator/concepts.h>
#include <__iterator/projected.h>
#include <__ranges/access.h>
Expand Down Expand Up @@ -41,9 +43,17 @@ struct __for_each {
// Shared implementation helper: applies __func to the projection of every element in [__first, __last)
// and returns a for_each_result built from the end iterator and the function object moved out of __func.
template <class _Iter, class _Sent, class _Proj, class _Func>
_LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
__for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
for (; __first != __last; ++__first)
std::invoke(__func, std::invoke(__proj, *__first));
return {std::move(__first), std::move(__func)};
// In the case where we have different iterator and sentinel types, the segmented iterator optimization
// in std::for_each will not kick in. Therefore, we prefer std::for_each_n in that case (whenever we can
// obtain the `n`).
if constexpr (!std::assignable_from<_Iter&, _Sent> && std::sized_sentinel_for<_Sent, _Iter>) {
// The sized_sentinel_for check above is what makes `__last - __first` available here.
auto __n = __last - __first;
auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
return {std::move(__end), std::move(__func)};
} else {
auto __end = std::__for_each(std::move(__first), std::move(__last), __func, __proj);
return {std::move(__end), std::move(__func)};
}
}

public:
Expand Down
9 changes: 3 additions & 6 deletions libcxx/include/__algorithm/ranges_for_each_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
#define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H

#include <__algorithm/for_each_n.h>
#include <__algorithm/in_fun_result.h>
#include <__config>
#include <__functional/identity.h>
#include <__functional/invoke.h>
#include <__iterator/concepts.h>
#include <__iterator/incrementable_traits.h>
#include <__iterator/iterator_traits.h>
Expand Down Expand Up @@ -40,11 +40,8 @@ struct __for_each_n {
// Call operator: applies __func to the projection of __count successive elements starting at __first and
// returns a for_each_n_result built from the advanced iterator and the function object moved out of __func.
template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
_LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
while (__count-- > 0) {
std::invoke(__func, std::invoke(__proj, *__first));
++__first;
}
return {std::move(__first), std::move(__func)};
// Delegating form: std::__for_each_n performs the iteration and yields the end iterator.
auto __last = std::__for_each_n(std::move(__first), __count, __func, __proj);
return {std::move(__last), std::move(__func)};
}
};

Expand Down
1 change: 1 addition & 0 deletions libcxx/include/experimental/iterator
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ _LIBCPP_POP_MACROS
# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
# include <cstddef>
# include <iosfwd>
# include <optional>
# include <type_traits>
# endif
#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/mutex
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,7 @@ _LIBCPP_POP_MACROS
# include <initializer_list>
# include <iosfwd>
# include <new>
# include <optional>
# include <stdexcept>
# include <system_error>
# include <type_traits>
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/shared_mutex
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ _LIBCPP_POP_MACROS
# endif // _LIBCPP_HAS_THREADS

# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
# include <optional>
# include <system_error>
# endif
#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cstddef>
#include <deque>
#include <list>
#include <ranges>
#include <string>
#include <vector>

Expand All @@ -23,6 +24,7 @@ int main(int argc, char** argv) {
// {std,ranges}::for_each
{
auto bm = []<class Container>(std::string name, auto for_each) {
using ElemType = typename Container::value_type;
benchmark::RegisterBenchmark(
name,
[for_each](auto& st) {
Expand All @@ -33,15 +35,14 @@ int main(int argc, char** argv) {

for ([[maybe_unused]] auto _ : st) {
benchmark::DoNotOptimize(c);
auto result = for_each(first, last, [](int& x) { x = std::clamp(x, 10, 100); });
auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
benchmark::DoNotOptimize(result);
}
})
->Arg(8)
->Arg(32)
->Arg(50) // non power-of-two
->Arg(8192)
->Arg(1 << 20);
->Arg(8192);
};
bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
Expand All @@ -51,6 +52,42 @@ int main(int argc, char** argv) {
bm.operator()<std::list<int>>("rng::for_each(list<int>)", std::ranges::for_each);
}

// {std,ranges}::for_each for join_view
{
  // Registers one benchmark per call. `Container` is a container of containers (e.g. vector<vector<int>>);
  // `for_each` is the algorithm under test, called as for_each(first, last, func) on the iterators of a
  // std::views::join over the container.
  auto bm = []<class Container>(std::string name, auto for_each) {
    using C1       = typename Container::value_type;
    using ElemType = typename C1::value_type;

    benchmark::RegisterBenchmark(
        name,
        [for_each](auto& st) {
          std::size_t const size     = st.range(0);
          std::size_t const seg_size = 256;
          std::size_t const segments = (size + seg_size - 1) / seg_size; // ceil(size / seg_size)
          Container c(segments);
          // Every segment holds seg_size elements except the last one, which takes whatever is left.
          // The remaining count is recomputed per segment so that no unsigned counter is decremented
          // past zero after the final segment.
          for (std::size_t i = 0; i < segments; ++i) {
            std::size_t const remaining = size - i * seg_size;
            c[i].resize(std::min(seg_size, remaining), ElemType(1));
          }

          auto view  = c | std::views::join;
          auto first = view.begin();
          auto last  = view.end();

          for ([[maybe_unused]] auto _ : st) {
            benchmark::DoNotOptimize(c);
            auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
            benchmark::DoNotOptimize(result);
          }
        })
        ->Arg(8)
        ->Arg(32)
        ->Arg(50) // non power-of-two
        ->Arg(8192);
  };
  bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
  // The benchmark name now closes both parentheses, matching the std:: variant registered above.
  bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>))", std::ranges::for_each);
}

benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
int main(int argc, char** argv) {
auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };

// std::for_each_n
// {std,ranges}::for_each_n
{
auto bm = []<class Container>(std::string name, auto for_each_n) {
using ElemType = typename Container::value_type;
Expand All @@ -41,19 +41,17 @@ int main(int argc, char** argv) {
->Arg(8)
->Arg(32)
->Arg(50) // non power-of-two
->Arg(1024)
->Arg(4096)
->Arg(8192)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
->Arg(8192);
};
bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
bm.operator()<std::vector<int>>("rng::for_each_n(vector<int>)", std::ranges::for_each_n);
Comment on lines 48 to +49
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the same numbers as for the std::for_each benchmarks.

bm.operator()<std::deque<int>>("rng::for_each_n(deque<int>)", std::ranges::for_each_n);
bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
}

// std::for_each_n for join_view
// {std,ranges}::for_each_n for join_view
{
auto bm = []<class Container>(std::string name, auto for_each_n) {
using C1 = typename Container::value_type;
Expand Down Expand Up @@ -81,14 +79,11 @@ int main(int argc, char** argv) {
->Arg(8)
->Arg(32)
->Arg(50) // non power-of-two
->Arg(1024)
->Arg(4096)
->Arg(8192)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
->Arg(8192);
};
bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
// ranges counterpart of the std::for_each_n registration; the benchmark name closes both parentheses.
bm.operator()<std::vector<std::vector<int>>>(
"rng::for_each_n(join_view(vector<vector<int>>))", std::ranges::for_each_n);
}

benchmark::Initialize(&argc, argv);
Expand Down
Loading
Loading