Skip to content

[libc++] Optimize std::for_each_n for segmented iterators #135468

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions libcxx/docs/ReleaseNotes/21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ Improvements and New Features
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
in C++23 and later.

- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.

Deprecations and Removals
-------------------------

Expand Down
1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ set(files
__algorithm/find_segment_if.h
__algorithm/for_each.h
__algorithm/for_each_n.h
__algorithm/for_each_n_segment.h
__algorithm/for_each_segment.h
__algorithm/generate.h
__algorithm/generate_n.h
Expand Down
69 changes: 62 additions & 7 deletions libcxx/include/__algorithm/for_each_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,87 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H

#include <__algorithm/for_each.h>
#include <__algorithm/for_each_n_segment.h>
#include <__config>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/disjunction.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/negation.h>
#include <__utility/convert_to_integral.h>
#include <__utility/move.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 17
_LIBCPP_BEGIN_NAMESPACE_STD

template <class _InputIterator, class _Size, class _Function>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
template <class _InputIterator,
class _Size,
class _Func,
__enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
_Or< _Not<__is_segmented_iterator<_InputIterator> >,
_Not<__has_random_access_local_iterator<_InputIterator> > >::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
_IntegralSize __n = __orig_n;
while (__n > 0) {
__f(*__first);
++__first;
--__n;
}
return __first;
return std::move(__first);
}

#endif
template <class _RandIter,
class _Size,
class _Func,
__enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
auto __last = __first + __n;
std::__for_each(__first, __last, __f);
return std::move(__last);
}

#ifndef _LIBCPP_CXX03_LANG
template <class _SegmentedIterator,
class _Size,
class _Func,
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
__is_segmented_iterator<_SegmentedIterator>::value &&
__has_random_access_iterator_category<
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
std::__for_each(__lfirst, __llast, __f);
});
}
#endif // !_LIBCPP_CXX03_LANG

#if _LIBCPP_STD_VER >= 17

template <class _InputIterator, class _Size, class _Function>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
return std::__for_each_n(__first, __orig_n, __f);
}

#endif // _LIBCPP_STD_VER >= 17

_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H
63 changes: 63 additions & 0 deletions libcxx/include/__algorithm/for_each_n_segment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H

#include <__config>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_BEGIN_NAMESPACE_STD

// __for_each_n_segment optimizes linear iteration over segmented iterators. It processes a segmented
// input range [__first, __first + __n) by applying the functor __func to each element within the segment.
// The return value of __func is ignored, and the function returns an iterator pointing to one past the
// last processed element in the input range.

template <class _SegmentedIterator, class _Size, class _Functor>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
__has_random_access_iterator_category<
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
"__for_each_n_segment only works with segmented iterators with random-access local iterators");
if (__orig_n <= 0)
return __first;

using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
using __local_iter_t = typename _Traits::__local_iterator;
using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
__difference_t __n = __orig_n;
auto __seg = _Traits::__segment(__first);
auto __local_first = _Traits::__local(__first);
__local_iter_t __local_last;

while (__n > 0) {
__local_last = _Traits::__end(__seg);
auto __seg_size = __local_last - __local_first;
if (__n <= __seg_size) {
__local_last = __local_first + __n;
__func(__local_first, __local_last);
break;
}
__func(__local_first, __local_last);
__n -= __seg_size;
__local_first = _Traits::__begin(++__seg);
}

return _Traits::__compose(__seg, __local_last);
}

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
6 changes: 6 additions & 0 deletions libcxx/include/__iterator/segmented_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

#include <__config>
#include <__cstddef/size_t.h>
#include <__iterator/iterator_traits.h>
#include <__type_traits/integral_constant.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
Expand Down Expand Up @@ -74,6 +75,11 @@ struct __has_specialization<_Tp, sizeof(_Tp) * 0> : true_type {};
template <class _Iterator>
using __is_segmented_iterator _LIBCPP_NODEBUG = __has_specialization<__segmented_iterator_traits<_Iterator> >;

template <class _SegmentedIterator>
struct __has_random_access_local_iterator
: __has_random_access_iterator_category<
typename __segmented_iterator_traits< _SegmentedIterator >::__local_iterator > {};

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP___SEGMENTED_ITERATOR_H
1 change: 1 addition & 0 deletions libcxx/include/module.modulemap.in
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ module std [system] {
module find_segment_if { header "__algorithm/find_segment_if.h" }
module find { header "__algorithm/find.h" }
module for_each_n { header "__algorithm/for_each_n.h" }
module for_each_n_segment { header "__algorithm/for_each_n_segment.h" }
module for_each_segment { header "__algorithm/for_each_segment.h" }
module for_each { header "__algorithm/for_each.h" }
module generate_n { header "__algorithm/generate_n.h" }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UNSUPPORTED: c++03, c++11, c++14, c++17

#include <algorithm>
#include <cstddef>
#include <deque>
#include <list>
#include <ranges>
#include <string>
#include <vector>

#include <benchmark/benchmark.h>

int main(int argc, char** argv) {
auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };

// std::for_each_n
{
auto bm = []<class Container>(std::string name, auto for_each_n) {
using ElemType = typename Container::value_type;
benchmark::RegisterBenchmark(
name,
[for_each_n](auto& st) {
std::size_t const n = st.range(0);
Container c(n, 1);
auto first = c.begin();

for ([[maybe_unused]] auto _ : st) {
benchmark::DoNotOptimize(c);
auto result = for_each_n(first, n, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
benchmark::DoNotOptimize(result);
}
})
->Arg(8)
->Arg(32)
->Arg(50) // non power-of-two
->Arg(1024)
->Arg(4096)
->Arg(8192)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
};
bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
}

// std::for_each_n for join_view
{
auto bm = []<class Container>(std::string name, auto for_each_n) {
using C1 = typename Container::value_type;
using ElemType = typename C1::value_type;
benchmark::RegisterBenchmark(
name,
[for_each_n](auto& st) {
std::size_t const size = st.range(0);
std::size_t const seg_size = 256;
std::size_t const segments = (size + seg_size - 1) / seg_size;
Container c(segments);
for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
c[i].resize(std::min(seg_size, n), ElemType(1));
}

auto view = c | std::views::join;
auto first = view.begin();

for ([[maybe_unused]] auto _ : st) {
benchmark::DoNotOptimize(c);
auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
benchmark::DoNotOptimize(result);
}
})
->Arg(8)
->Arg(32)
->Arg(50) // non power-of-two
->Arg(1024)
->Arg(4096)
->Arg(8192)
->Arg(1 << 14)
->Arg(1 << 16)
->Arg(1 << 18);
};
bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
}

benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}
Loading
Loading