19
19
return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
20
20
}
21
21
22
- // FOIL-based long mul_hi
23
- //
24
- // Summary: Treat mul_hi(long x, long y) as:
25
- // (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
26
- // and b and d are the low-order parts of x and y.
27
- // Thinking back to algebra, we use FOIL to do the work.
22
+ #define __CLC_MUL_HI_DEC_IMPL (BTYPE , TYPE , BITS ) \
23
+ __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
24
+ __CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
25
+ __CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
26
+ __CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
27
+ __CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
28
+ __CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
29
+
28
30
_CLC_OVERLOAD _CLC_DEF long __clc_mul_hi (long x , long y ) {
29
31
long f , o , i ;
30
32
ulong l ;
@@ -81,32 +83,33 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
81
83
return (f + (__clc_hadd (o , (i + (l >> 32 ))) >> 31 ));
82
84
}
83
85
84
- #define __CLC_MUL_HI_VEC (GENTYPE ) \
85
- _CLC_OVERLOAD _CLC_DEF GENTYPE##2 __clc_mul_hi(GENTYPE##2 x, GENTYPE##2 y) { \
86
- return (GENTYPE##2){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1)}; \
87
- } \
88
- _CLC_OVERLOAD _CLC_DEF GENTYPE##3 __clc_mul_hi(GENTYPE##3 x, GENTYPE##3 y) { \
89
- return (GENTYPE##3){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1), \
90
- __clc_mul_hi(x.s2, y.s2)}; \
91
- } \
92
- _CLC_OVERLOAD _CLC_DEF GENTYPE##4 __clc_mul_hi(GENTYPE##4 x, GENTYPE##4 y) { \
93
- return (GENTYPE##4){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
94
- } \
95
- _CLC_OVERLOAD _CLC_DEF GENTYPE##8 __clc_mul_hi(GENTYPE##8 x, GENTYPE##8 y) { \
96
- return (GENTYPE##8){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
97
- } \
98
- _CLC_OVERLOAD _CLC_DEF GENTYPE##16 __clc_mul_hi(GENTYPE##16 x, \
99
- GENTYPE##16 y) { \
100
- return (GENTYPE##16){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
86
+ // Vector-based mul_hi implementation for long/ulong. See comments in the scalar
87
+ // versions for more detail.
88
+ #define __CLC_MUL_HI_LONG_VEC_IMPL (TY , UTY ) \
89
+ _CLC_OVERLOAD _CLC_DEF TY __clc_mul_hi(TY x, TY y) { \
90
+ TY f, o, i; \
91
+ UTY l; \
92
+ \
93
+ TY x_hi = x >> 32; \
94
+ TY x_lo = x & UINT_MAX; \
95
+ TY y_hi = y >> 32; \
96
+ TY y_lo = y & UINT_MAX; \
97
+ \
98
+ f = x_hi * y_hi; \
99
+ o = x_hi * y_lo; \
100
+ i = x_lo * y_hi; \
101
+ l = __CLC_CONVERT_TY(x_lo * y_lo, UTY); \
102
+ i += __CLC_CONVERT_TY(l >> (UTY)32, TY); \
103
+ \
104
+ return f + (__clc_hadd(o, i) >> (TY)31); \
101
105
}
102
106
103
- #define __CLC_MUL_HI_DEC_IMPL (BTYPE , TYPE , BITS ) \
104
- __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
105
- __CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
106
- __CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
107
- __CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
108
- __CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
109
- __CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
107
+ #define __CLC_MUL_HI_LONG_IMPL (BTYPE , UBTYPE ) \
108
+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##2, UBTYPE##2) \
109
+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##3, UBTYPE##3) \
110
+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##4, UBTYPE##4) \
111
+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##8, UBTYPE##8) \
112
+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##16, UBTYPE##16)
110
113
111
114
#define __CLC_MUL_HI_TYPES () \
112
115
__CLC_MUL_HI_DEC_IMPL(short, char, 8) \
@@ -115,14 +118,15 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
115
118
__CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
116
119
__CLC_MUL_HI_DEC_IMPL(long, int, 32) \
117
120
__CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
118
- __CLC_MUL_HI_VEC (long) \
119
- __CLC_MUL_HI_VEC( ulong)
121
+ __CLC_MUL_HI_LONG_IMPL (long, ulong) \
122
+ __CLC_MUL_HI_LONG_IMPL(ulong, ulong)
120
123
121
124
__CLC_MUL_HI_TYPES ()
122
125
123
126
#undef __CLC_MUL_HI_TYPES
127
+ #undef __CLC_MUL_HI_LONG_IMPL
128
+ #undef __CLC_MUL_HI_LONG_VEC_IMPL
124
129
#undef __CLC_MUL_HI_DEC_IMPL
125
130
#undef __CLC_MUL_HI_IMPL
126
- #undef __CLC_MUL_HI_VEC
127
131
#undef __CLC_MUL_HI_VEC_IMPL
128
132
#undef __CLC_CONVERT_TY
0 commit comments