Skip to content

Commit ee6ec2c

Browse files
committed
[libc++][PSTL] Implement std::reduce and std::transform_reduce
Reviewed By: ldionne, #libc Spies: libcxx-commits, miyuki Differential Revision: https://reviews.llvm.org/D150736
1 parent b1f4168 commit ee6ec2c

33 files changed

+813
-504
lines changed

libcxx/include/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ set(files
8282
__algorithm/pstl_backends/cpu_backends/serial.h
8383
__algorithm/pstl_backends/cpu_backends/thread.h
8484
__algorithm/pstl_backends/cpu_backends/transform.h
85+
__algorithm/pstl_backends/cpu_backends/transform_reduce.h
8586
__algorithm/pstl_copy.h
8687
__algorithm/pstl_fill.h
8788
__algorithm/pstl_find.h
@@ -517,6 +518,8 @@ set(files
517518
__numeric/iota.h
518519
__numeric/midpoint.h
519520
__numeric/partial_sum.h
521+
__numeric/pstl_reduce.h
522+
__numeric/pstl_transform_reduce.h
520523
__numeric/reduce.h
521524
__numeric/transform_exclusive_scan.h
522525
__numeric/transform_inclusive_scan.h
@@ -787,6 +790,7 @@ set(files
787790
__type_traits/nat.h
788791
__type_traits/negation.h
789792
__type_traits/noexcept_move_assign_container.h
793+
__type_traits/operation_traits.h
790794
__type_traits/predicate_traits.h
791795
__type_traits/promote.h
792796
__type_traits/rank.h

libcxx/include/__algorithm/pstl_backend.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,29 @@ A PSTL parallel backend is a tag type to which the following functions are assoc
4242
_OutIterator __result,
4343
_BinaryOperation __op);
4444
45+
template <class _ExecutionPolicy,
46+
class _Iterator1,
47+
class _Iterator2,
48+
class _Tp,
49+
class _BinaryOperation1,
50+
class _BinaryOperation2>
51+
_Tp __pstl_transform_reduce(_Backend,
52+
_Iterator1 __first1,
53+
_Iterator1 __last1,
54+
_Iterator2 __first2,
55+
_Iterator2 __last2,
56+
_Tp __init,
57+
_BinaryOperation1 __reduce,
58+
_BinaryOperation2 __transform);
59+
60+
template <class _ExecutionPolicy, class _Iterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
61+
_Tp __pstl_transform_reduce(_Backend,
62+
_Iterator __first,
63+
_Iterator __last,
64+
_Tp __init,
65+
_BinaryOperation __reduce,
66+
_UnaryOperation __transform);
67+
4568
// TODO: Complete this list
4669
4770
The following functions are optional but can be provided. If provided, they are used by the corresponding
@@ -81,6 +104,12 @@ implemented, all the algorithms will eventually forward to the basis algorithms
81104
_OutIterator __result,
82105
_Comp __comp);
83106
107+
template <class _ExecutionPolicy, class _Iterator, class _Tp, class _BinaryOperation>
108+
_Tp __pstl_reduce(_Backend, _Iterator __first, _Iterator __last, _Tp __init, _BinaryOperation __op);
109+
110+
template <class _ExecutionPolicy, class _Iterator>
111+
__iter_value_type<_Iterator> __pstl_reduce(_Backend, _Iterator __first, _Iterator __last);
112+
84113
// TODO: Complete this list
85114
86115
*/

libcxx/include/__algorithm/pstl_backends/cpu_backend.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
template <class _RandomAccessIterator, class _Functor>
1818
void __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
1919
20+
template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
21+
_Tp __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
22+
2023
// Cancel the execution of other jobs - they aren't needed anymore
2124
void __cancel_execution();
2225
@@ -38,10 +41,12 @@
3841
*/
3942

4043
#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
44+
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
4145
#include <__algorithm/pstl_backends/cpu_backends/fill.h>
4246
#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
4347
#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
4448
#include <__algorithm/pstl_backends/cpu_backends/merge.h>
4549
#include <__algorithm/pstl_backends/cpu_backends/transform.h>
50+
#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
4651

4752
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H

libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H
1111

1212
#include <__config>
13+
#include <cstddef>
1314

1415
#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
1516
# include <__algorithm/pstl_backends/cpu_backends/serial.h>
@@ -23,10 +24,16 @@
2324
# pragma GCC system_header
2425
#endif
2526

27+
#if _LIBCPP_STD_VER >= 17
28+
2629
_LIBCPP_BEGIN_NAMESPACE_STD
2730

2831
struct __cpu_backend_tag {};
2932

33+
inline constexpr size_t __lane_size = 64;
34+
3035
_LIBCPP_END_NAMESPACE_STD
3136

37+
#endif // _LIBCPP_STD_VER >= 17
38+
3239
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H

libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,6 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
5353
return __extremum != __initial_dist ? __first + __extremum : __last;
5454
}
5555

56-
const std::size_t __lane_size = 64;
57-
5856
template <class _Index, class _DifferenceType, class _Compare>
5957
_LIBCPP_HIDE_FROM_ABI _Index
6058
__simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {

libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H
1212

1313
#include <__config>
14+
#include <__utility/move.h>
1415

1516
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1617
# pragma GCC system_header
@@ -28,6 +29,12 @@ _LIBCPP_HIDE_FROM_ABI void __parallel_for(_RandomAccessIterator __first, _Random
2829
__f(__first, __last);
2930
}
3031

32+
// Backend hook for a parallel transform-reduce. In this implementation the
// unary-op and binary-op arguments are unnamed and unused: the whole
// [__first, __last) range is handed to __reduce in a single call together with
// __init, and that call's result is returned.
template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
33+
_LIBCPP_HIDE_FROM_ABI _Tp
34+
__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
35+
return __reduce(std::move(__first), std::move(__last), std::move(__init)); // one call covering the full range
36+
}
37+
3138
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
3239

3340
template <class _RandomAccessIterator1,

libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <__assert>
1313
#include <__config>
14+
#include <__utility/move.h>
1415

1516
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1617
# pragma GCC system_header
@@ -31,6 +32,12 @@ _LIBCPP_HIDE_FROM_ABI void __parallel_for(_RandomAccessIterator __first, _Random
3132
__f(__first, __last);
3233
}
3334

35+
// Backend hook for a parallel transform-reduce. As written here the unary-op
// and binary-op arguments are unnamed and ignored; __reduce is invoked once on
// the complete [__first, __last) range with __init and its result is returned.
template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
36+
_LIBCPP_HIDE_FROM_ABI _Tp
37+
__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
38+
return __reduce(std::move(__first), std::move(__last), std::move(__init)); // no splitting of the range happens here
39+
}
40+
3441
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
3542

3643
template <class _RandomAccessIterator1,
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H
10+
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H
11+
12+
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
13+
#include <__config>
14+
#include <__iterator/iterator_traits.h>
15+
#include <__numeric/transform_reduce.h>
16+
#include <__type_traits/is_arithmetic.h>
17+
#include <__type_traits/is_execution_policy.h>
18+
#include <__type_traits/operation_traits.h>
19+
#include <__utility/move.h>
20+
#include <__utility/terminate_on_exception.h>
21+
#include <new>
22+
23+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
24+
# pragma GCC system_header
25+
#endif
26+
27+
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
28+
29+
_LIBCPP_BEGIN_NAMESPACE_STD
30+
31+
// Overload enabled only when _BinaryOperation is recognised by
// __is_trivial_plus_operation for (_Tp, _Tp) and _Tp is an arithmetic type.
// The binary-operation argument is unnamed and never called: the loop adds
// __f(__i) for every __i in [0, __n) to __init with `+=`, annotated with
// _PSTL_PRAGMA_SIMD_REDUCTION(+ : __init), and returns the accumulated value.
template <
32+
typename _DifferenceType,
33+
typename _Tp,
34+
typename _BinaryOperation,
35+
typename _UnaryOperation,
36+
__enable_if_t<__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value && is_arithmetic_v<_Tp>, int> = 0>
37+
_LIBCPP_HIDE_FROM_ABI _Tp
38+
__simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _UnaryOperation __f) noexcept {
39+
_PSTL_PRAGMA_SIMD_REDUCTION(+ : __init)
40+
for (_DifferenceType __i = 0; __i < __n; ++__i)
41+
__init += __f(__i); // __f is called with the index, not with an element
42+
return __init;
43+
}
44+
45+
// General overload: enabled for every (_BinaryOperation, _Tp) pair that does
// NOT satisfy "trivial plus operation && arithmetic _Tp" (the enable_if below
// is the exact negation of that condition).
//
// __f maps an index in [0, __n) to a value; __binary_op combines two values.
// When more than two blocks' worth of elements are available and at least two
// _Tp objects fit in __lane_size bytes, partial results are kept in
// __block_size independent slots ("lanes"):
//   1. each lane i is constructed from __binary_op(__f(i), __f(__block_size + i));
//   2. every further full block is folded into the lanes slot by slot;
//   3. the leftover elements (fewer than __block_size) are folded into the
//      first lanes;
//   4. the lanes are folded into __init in index order and then destroyed.
// Otherwise the function falls back to a plain left-to-right fold into __init.
// Note that the lane path combines elements in a different order than the
// fallback path does.
template <
46+
typename _Size,
47+
typename _Tp,
48+
typename _BinaryOperation,
49+
typename _UnaryOperation,
50+
__enable_if_t<!(__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value && is_arithmetic_v<_Tp>), int> = 0>
51+
_LIBCPP_HIDE_FROM_ABI _Tp
52+
__simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept {
53+
const _Size __block_size = __lane_size / sizeof(_Tp); // how many _Tp objects fit in __lane_size bytes
54+
if (__n > 2 * __block_size && __block_size > 1) {
55+
alignas(__lane_size) char __lane_buffer[__lane_size]; // raw storage; _Tp objects are placement-constructed into it
56+
_Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
57+
58+
// initializer
59+
_PSTL_PRAGMA_SIMD
60+
for (_Size __i = 0; __i < __block_size; ++__i) {
61+
::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i))); // consumes the first two blocks
62+
}
63+
// main loop
64+
_Size __i = 2 * __block_size; // first index not consumed by the initializer
65+
const _Size __last_iteration = __block_size * (__n / __block_size); // __n rounded down to a multiple of __block_size
66+
for (; __i < __last_iteration; __i += __block_size) {
67+
_PSTL_PRAGMA_SIMD
68+
for (_Size __j = 0; __j < __block_size; ++__j) {
69+
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j));
70+
}
71+
}
72+
// remainder
73+
_PSTL_PRAGMA_SIMD
74+
for (_Size __j = 0; __j < __n - __last_iteration; ++__j) {
75+
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j));
76+
}
77+
// combiner
78+
for (_Size __j = 0; __j < __block_size; ++__j) {
79+
__init = __binary_op(std::move(__init), std::move(__lane[__j]));
80+
}
81+
// destroyer
82+
_PSTL_PRAGMA_SIMD
83+
for (_Size __j = 0; __j < __block_size; ++__j) {
84+
__lane[__j].~_Tp(); // explicit destructor call pairs with the placement-new in the initializer
85+
}
86+
} else {
87+
for (_Size __i = 0; __i < __n; ++__i) {
88+
__init = __binary_op(std::move(__init), __f(__i));
89+
}
90+
}
91+
return __init;
92+
}
93+
94+
// CPU-backend implementation of the two-range transform_reduce: combines
// __transform(a, b) over corresponding elements of [__first1, __last1) and the
// range starting at __first2, folding with __reduce and starting from __init.
//
// Dispatch, decided at compile time via `if constexpr`:
//   * parallel policy and both iterators random access: forward to
//     __par_backend::__parallel_transform_reduce inside
//     std::__terminate_on_exception. The last lambda handles one sub-range
//     ("brick") by re-entering this function with the parallel part of the
//     policy removed (__remove_parallel_policy_t).
//   * unsequenced policy and both iterators random access: forward to
//     std::__simd_transform_reduce, driving it by index.
//   * anything else: forward to the serial std::transform_reduce.
//
// NOTE(review): the brick lambda is not `mutable`, so its by-copy captures are
// const inside the body and std::move(__reduce) / std::move(__transform) there
// select the copy constructor rather than the move constructor. Looks
// intentional since the lambda may be invoked more than once, but confirm.
template <class _ExecutionPolicy,
95+
class _ForwardIterator1,
96+
class _ForwardIterator2,
97+
class _Tp,
98+
class _BinaryOperation1,
99+
class _BinaryOperation2>
100+
_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
101+
__cpu_backend_tag,
102+
_ForwardIterator1 __first1,
103+
_ForwardIterator1 __last1,
104+
_ForwardIterator2 __first2,
105+
_Tp __init,
106+
_BinaryOperation1 __reduce,
107+
_BinaryOperation2 __transform) {
108+
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
109+
__has_random_access_iterator_category<_ForwardIterator1>::value &&
110+
__has_random_access_iterator_category<_ForwardIterator2>::value) {
111+
return std::__terminate_on_exception([&] {
112+
return __par_backend::__parallel_transform_reduce(
113+
__first1,
114+
std::move(__last1),
115+
[__first1, __first2, __transform](_ForwardIterator1 __iter) {
116+
return __transform(*__iter, *(__first2 + (__iter - __first1))); // pair *__iter with the same offset in range 2
117+
},
118+
std::move(__init),
119+
std::move(__reduce),
120+
[__first1, __first2, __reduce, __transform](
121+
_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
122+
return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
123+
__cpu_backend_tag{},
124+
__brick_first,
125+
std::move(__brick_last),
126+
__first2 + (__brick_first - __first1), // start of the matching sub-range in range 2
127+
std::move(__brick_init),
128+
std::move(__reduce),
129+
std::move(__transform));
130+
});
131+
});
132+
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
133+
__has_random_access_iterator_category<_ForwardIterator1>::value &&
134+
__has_random_access_iterator_category<_ForwardIterator2>::value) {
135+
return std::__simd_transform_reduce(
136+
__last1 - __first1, std::move(__init), std::move(__reduce), [&](__iter_diff_t<_ForwardIterator1> __i) {
137+
return __transform(__first1[__i], __first2[__i]); // index-based access into both ranges
138+
});
139+
} else {
140+
return std::transform_reduce(
141+
std::move(__first1),
142+
std::move(__last1),
143+
std::move(__first2),
144+
std::move(__init),
145+
std::move(__reduce),
146+
std::move(__transform));
147+
}
148+
}
149+
150+
// CPU-backend implementation of the single-range transform_reduce: folds
// __transform(x) for each x in [__first, __last) with __reduce, starting from
// __init.
//
// Dispatch, decided at compile time via `if constexpr`:
//   * parallel policy and random-access iterator: forward to
//     __par_backend::__parallel_transform_reduce inside
//     std::__terminate_on_exception. The last lambda processes one sub-range
//     ("brick") by re-entering this function with the parallel part of the
//     policy removed (__remove_parallel_policy_t).
//   * unsequenced policy and random-access iterator: forward to
//     std::__simd_transform_reduce, driving it by index.
//   * anything else: forward to the serial std::transform_reduce.
//
// NOTE(review): the `[=]` brick lambda is not `mutable`, so std::move on the
// captured __reduce / __transform yields const rvalues and copies them; confirm
// this is intended.
template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
151+
_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
152+
__cpu_backend_tag,
153+
_ForwardIterator __first,
154+
_ForwardIterator __last,
155+
_Tp __init,
156+
_BinaryOperation __reduce,
157+
_UnaryOperation __transform) {
158+
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
159+
__has_random_access_iterator_category<_ForwardIterator>::value) {
160+
return std::__terminate_on_exception([&] {
161+
return __par_backend::__parallel_transform_reduce(
162+
std::move(__first),
163+
std::move(__last),
164+
[__transform](_ForwardIterator __iter) { return __transform(*__iter); }, // per-element transform
165+
std::move(__init),
166+
std::move(__reduce),
167+
[=](_ForwardIterator __brick_first, _ForwardIterator __brick_last, _Tp __brick_init) {
168+
return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
169+
__cpu_backend_tag{},
170+
std::move(__brick_first),
171+
std::move(__brick_last),
172+
std::move(__brick_init),
173+
std::move(__reduce),
174+
std::move(__transform));
175+
});
176+
});
177+
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
178+
__has_random_access_iterator_category<_ForwardIterator>::value) {
179+
return std::__simd_transform_reduce(
180+
__last - __first,
181+
std::move(__init),
182+
std::move(__reduce),
183+
[=, &__transform](__iter_diff_t<_ForwardIterator> __i) { return __transform(__first[__i]); }); // __first by copy, __transform by reference
184+
} else {
185+
return std::transform_reduce(
186+
std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
187+
}
188+
}
189+
190+
_LIBCPP_END_NAMESPACE_STD
191+
192+
#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
193+
194+
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H

libcxx/include/__functional/operations.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <__functional/binary_function.h>
1515
#include <__functional/unary_function.h>
1616
#include <__type_traits/integral_constant.h>
17+
#include <__type_traits/operation_traits.h>
1718
#include <__type_traits/predicate_traits.h>
1819
#include <__utility/forward.h>
1920

@@ -40,6 +41,14 @@ struct _LIBCPP_TEMPLATE_VIS plus
4041
};
4142
_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(plus);
4243

44+
template <class _Tp>
45+
struct __is_trivial_plus_operation<plus<_Tp>, _Tp, _Tp> : true_type {};
46+
47+
#if _LIBCPP_STD_VER >= 14
48+
template <class _Tp, class _Up>
49+
struct __is_trivial_plus_operation<plus<>, _Tp, _Up> : true_type {};
50+
#endif
51+
4352
#if _LIBCPP_STD_VER >= 14
4453
template <>
4554
struct _LIBCPP_TEMPLATE_VIS plus<void>

0 commit comments

Comments
 (0)