Skip to content

Commit 3c4f2fa

Browse files
committed
optimize mul_hi long
1 parent 9a2c0dd commit 3c4f2fa

File tree

1 file changed

+37
-33
lines changed

1 file changed

+37
-33
lines changed

libclc/clc/lib/generic/integer/clc_mul_hi.cl

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
2020
}
2121

22-
// FOIL-based long mul_hi
23-
//
24-
// Summary: Treat mul_hi(long x, long y) as:
25-
// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
26-
// and b and d are the low-order parts of x and y.
27-
// Thinking back to algebra, we use FOIL to do the work.
22+
#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
23+
__CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
24+
__CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
25+
__CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
26+
__CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
27+
__CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
28+
__CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
29+
2830
_CLC_OVERLOAD _CLC_DEF long __clc_mul_hi(long x, long y) {
2931
long f, o, i;
3032
ulong l;
@@ -81,32 +83,33 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
8183
return (f + (__clc_hadd(o, (i + (l >> 32))) >> 31));
8284
}
8385

84-
#define __CLC_MUL_HI_VEC(GENTYPE) \
85-
_CLC_OVERLOAD _CLC_DEF GENTYPE##2 __clc_mul_hi(GENTYPE##2 x, GENTYPE##2 y) { \
86-
return (GENTYPE##2){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1)}; \
87-
} \
88-
_CLC_OVERLOAD _CLC_DEF GENTYPE##3 __clc_mul_hi(GENTYPE##3 x, GENTYPE##3 y) { \
89-
return (GENTYPE##3){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1), \
90-
__clc_mul_hi(x.s2, y.s2)}; \
91-
} \
92-
_CLC_OVERLOAD _CLC_DEF GENTYPE##4 __clc_mul_hi(GENTYPE##4 x, GENTYPE##4 y) { \
93-
return (GENTYPE##4){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
94-
} \
95-
_CLC_OVERLOAD _CLC_DEF GENTYPE##8 __clc_mul_hi(GENTYPE##8 x, GENTYPE##8 y) { \
96-
return (GENTYPE##8){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
97-
} \
98-
_CLC_OVERLOAD _CLC_DEF GENTYPE##16 __clc_mul_hi(GENTYPE##16 x, \
99-
GENTYPE##16 y) { \
100-
return (GENTYPE##16){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
86+
// Vector-based mul_hi implementation for long/ulong. See comments in the scalar
87+
// versions for more detail.
88+
#define __CLC_MUL_HI_LONG_VEC_IMPL(TY, UTY) \
89+
_CLC_OVERLOAD _CLC_DEF TY __clc_mul_hi(TY x, TY y) { \
90+
TY f, o, i; \
91+
UTY l; \
92+
\
93+
TY x_hi = x >> 32; \
94+
TY x_lo = x & UINT_MAX; \
95+
TY y_hi = y >> 32; \
96+
TY y_lo = y & UINT_MAX; \
97+
\
98+
f = x_hi * y_hi; \
99+
o = x_hi * y_lo; \
100+
i = x_lo * y_hi; \
101+
l = __CLC_CONVERT_TY(x_lo * y_lo, UTY); \
102+
i += __CLC_CONVERT_TY(l >> (UTY)32, TY); \
103+
\
104+
return f + (__clc_hadd(o, i) >> (TY)31); \
101105
}
102106

103-
#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
104-
__CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
105-
__CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
106-
__CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
107-
__CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
108-
__CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
109-
__CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
107+
#define __CLC_MUL_HI_LONG_IMPL(BTYPE, UBTYPE) \
108+
__CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##2, UBTYPE##2) \
109+
__CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##3, UBTYPE##3) \
110+
__CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##4, UBTYPE##4) \
111+
__CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##8, UBTYPE##8) \
112+
__CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##16, UBTYPE##16)
110113

111114
#define __CLC_MUL_HI_TYPES() \
112115
__CLC_MUL_HI_DEC_IMPL(short, char, 8) \
@@ -115,14 +118,15 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
115118
__CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
116119
__CLC_MUL_HI_DEC_IMPL(long, int, 32) \
117120
__CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
118-
__CLC_MUL_HI_VEC(long) \
119-
__CLC_MUL_HI_VEC(ulong)
121+
__CLC_MUL_HI_LONG_IMPL(long, ulong) \
122+
__CLC_MUL_HI_LONG_IMPL(ulong, ulong)
120123

121124
__CLC_MUL_HI_TYPES()
122125

123126
#undef __CLC_MUL_HI_TYPES
127+
#undef __CLC_MUL_HI_LONG_IMPL
128+
#undef __CLC_MUL_HI_LONG_VEC_IMPL
124129
#undef __CLC_MUL_HI_DEC_IMPL
125130
#undef __CLC_MUL_HI_IMPL
126-
#undef __CLC_MUL_HI_VEC
127131
#undef __CLC_MUL_HI_VEC_IMPL
128132
#undef __CLC_CONVERT_TY

0 commit comments

Comments
 (0)