4
4
macro_rules! impl_arithmetic_reductions {
5
5
( $id: ident, $elem_ty: ident) => {
6
6
impl $id {
7
- /// Lane-wise addition of the vector elements.
7
+ /// Horizontal sum of the vector elements.
8
8
///
9
- /// FIXME: document guarantees with respect to:
10
- /// * integers: overflow behavior
11
- /// * floats: order and NaNs
9
+ /// The intrinsic performs a tree-reduction of the vector elements.
10
+ /// That is, for an 8 element vector:
11
+ ///
12
+ /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
13
+ ///
14
+ /// # Integer vectors
15
+ ///
16
+ /// If an operation overflows it returns the mathematical result
17
+ /// modulo `2^n` where `n` is the bit width of the element type.
18
+ ///
19
+ /// # Floating-point vectors
20
+ ///
21
+ /// If one of the vector elements is `NaN`, the reduction returns
22
+ /// `NaN`.
12
23
#[ cfg( not( target_arch = "aarch64" ) ) ]
13
24
#[ inline]
14
- pub fn sum ( self ) -> $elem_ty {
25
+ pub fn wrapping_sum ( self ) -> $elem_ty {
15
26
use coresimd:: simd_llvm:: simd_reduce_add_ordered;
16
27
unsafe { simd_reduce_add_ordered( self , 0 as $elem_ty) }
17
28
}
18
- /// Lane-wise addition of the vector elements.
29
+ /// Horizontal sum of the vector elements.
30
+ ///
31
+ /// The intrinsic performs a tree-reduction of the vector elements.
32
+ /// That is, for an 8 element vector:
33
+ ///
34
+ /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
35
+ ///
36
+ /// # Integer vectors
37
+ ///
38
+ /// If an operation overflows it returns the mathematical result
39
+ /// modulo `2^n` where `n` is the bit width of the element type.
40
+ ///
41
+ /// # Floating-point vectors
19
42
///
20
- /// FIXME: document guarantees with respect to:
21
- /// * integers: overflow behavior
22
- /// * floats: order and NaNs
43
+ /// If one of the vector elements is `NaN`, the reduction returns
44
+ /// `NaN`.
23
45
#[ cfg( target_arch = "aarch64" ) ]
24
46
#[ inline]
25
- pub fn sum ( self ) -> $elem_ty {
47
+ pub fn wrapping_sum ( self ) -> $elem_ty {
26
48
// FIXME: broken on AArch64
27
49
// https://bugs.llvm.org/show_bug.cgi?id=36796
50
+ use super :: codegen:: wrapping:: Wrapping ;
28
51
let mut x = self . extract( 0 ) as $elem_ty;
29
52
for i in 1 ..$id:: lanes( ) {
30
- x += self . extract( i) as $elem_ty;
53
+ x = Wrapping :: add ( x , self . extract( i) as $elem_ty) ;
31
54
}
32
55
x
33
56
}
34
57
35
- /// Lane-wise multiplication of the vector elements.
58
+ /// Horizontal product of the vector elements.
36
59
///
37
- /// FIXME: document guarantees with respect to:
38
- /// * integers: overflow behavior
39
- /// * floats: order and NaNs
60
+ /// The intrinsic performs a tree-reduction of the vector elements.
61
+ /// That is, for an 8 element vector:
62
+ ///
63
+ /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
64
+ ///
65
+ /// # Integer vectors
66
+ ///
67
+ /// If an operation overflows it returns the mathematical result
68
+ /// modulo `2^n` where `n` is the bit width of the element type.
69
+ ///
70
+ /// # Floating-point vectors
71
+ ///
72
+ /// If one of the vector elements is `NaN`, the reduction returns
73
+ /// `NaN`.
40
74
#[ cfg( not( target_arch = "aarch64" ) ) ]
41
75
#[ inline]
42
- pub fn product ( self ) -> $elem_ty {
76
+ pub fn wrapping_product ( self ) -> $elem_ty {
43
77
use coresimd:: simd_llvm:: simd_reduce_mul_ordered;
44
78
unsafe { simd_reduce_mul_ordered( self , 1 as $elem_ty) }
45
79
}
46
- /// Lane-wise multiplication of the vector elements.
80
+ /// Horizontal product of the vector elements.
81
+ ///
82
+ /// The intrinsic performs a tree-reduction of the vector elements.
83
+ /// That is, for an 8 element vector:
84
+ ///
85
+ /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
86
+ ///
87
+ /// # Integer vectors
88
+ ///
89
+ /// If an operation overflows it returns the mathematical result
90
+ /// modulo `2^n` where `n` is the bit width of the element type.
91
+ ///
92
+ /// # Floating-point vectors
47
93
///
48
- /// FIXME: document guarantees with respect to:
49
- /// * integers: overflow behavior
50
- /// * floats: order and NaNs
94
+ /// If one of the vector elements is `NaN`, the reduction returns
95
+ /// `NaN`.
51
96
#[ cfg( target_arch = "aarch64" ) ]
52
97
#[ inline]
53
- pub fn product ( self ) -> $elem_ty {
98
+ pub fn wrapping_product ( self ) -> $elem_ty {
54
99
// FIXME: broken on AArch64
55
100
// https://bugs.llvm.org/show_bug.cgi?id=36796
101
+ use super :: codegen:: wrapping:: Wrapping ;
56
102
let mut x = self . extract( 0 ) as $elem_ty;
57
103
for i in 1 ..$id:: lanes( ) {
58
- x *= self . extract( i) as $elem_ty;
104
+ x = Wrapping :: mul ( x , self . extract( i) as $elem_ty) ;
59
105
}
60
106
x
61
107
}
@@ -78,25 +124,25 @@ macro_rules! test_arithmetic_reductions {
78
124
}
79
125
80
126
#[ test]
81
- fn sum ( ) {
127
+ fn wrapping_sum ( ) {
82
128
use coresimd:: simd:: $id;
83
129
let v = $id:: splat( 0 as $elem_ty) ;
84
- assert_eq!( v. sum ( ) , 0 as $elem_ty) ;
130
+ assert_eq!( v. wrapping_sum ( ) , 0 as $elem_ty) ;
85
131
let v = $id:: splat( 1 as $elem_ty) ;
86
- assert_eq!( v. sum ( ) , $id:: lanes( ) as $elem_ty) ;
132
+ assert_eq!( v. wrapping_sum ( ) , $id:: lanes( ) as $elem_ty) ;
87
133
let v = alternating( 2 ) ;
88
134
assert_eq!(
89
- v. sum ( ) ,
135
+ v. wrapping_sum ( ) ,
90
136
( $id:: lanes( ) / 2 + $id:: lanes( ) ) as $elem_ty
91
137
) ;
92
138
}
93
139
#[ test]
94
- fn product ( ) {
140
+ fn wrapping_product ( ) {
95
141
use coresimd:: simd:: $id;
96
142
let v = $id:: splat( 0 as $elem_ty) ;
97
- assert_eq!( v. product ( ) , 0 as $elem_ty) ;
143
+ assert_eq!( v. wrapping_product ( ) , 0 as $elem_ty) ;
98
144
let v = $id:: splat( 1 as $elem_ty) ;
99
- assert_eq!( v. product ( ) , 1 as $elem_ty) ;
145
+ assert_eq!( v. wrapping_product ( ) , 1 as $elem_ty) ;
100
146
let f = match $id:: lanes( ) {
101
147
64 => 16 ,
102
148
32 => 8 ,
@@ -105,7 +151,7 @@ macro_rules! test_arithmetic_reductions {
105
151
} ;
106
152
let v = alternating( f) ;
107
153
assert_eq!(
108
- v. product ( ) ,
154
+ v. wrapping_product ( ) ,
109
155
( 2_usize . pow( ( $id:: lanes( ) / f) as u32 ) as $elem_ty)
110
156
) ;
111
157
}
0 commit comments