Skip to content

Commit 9a2c0dd

Browse files
committed
optimize mul_hi
1 parent 151b6c8 commit 9a2c0dd

File tree

1 file changed

+18
-3
lines changed

1 file changed

+18
-3
lines changed

libclc/clc/lib/generic/integer/clc_mul_hi.cl

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22
#include <clc/integer/definitions.h>
33
#include <clc/internal/clc.h>
44

5+
// Converts the vector value X to the vector type TY by forwarding both
// arguments unchanged to the compiler builtin __builtin_convertvector.
// TODO: Replace with __clc_convert_<type> when available
6+
#define __CLC_CONVERT_TY(X, TY) __builtin_convertvector(X, TY)
7+
8+
// Defines one __clc_mul_hi overload taking and returning GENTYPE.
// Both operands are converted to BGENTYPE, multiplied in that type, shifted
// right by GENSIZE (cast to BGENTYPE), and the result is converted back to
// GENTYPE. Within this file the macro is expanded with BTYPE##N / TYPE##N
// vector type pairs.
// NOTE(review): the shift yields the high half of the product only if each
// BGENTYPE element is twice as wide as a GENTYPE element and GENSIZE is the
// GENTYPE element width in bits (e.g. short / char / 8) - confirm for every
// instantiation.
#define __CLC_MUL_HI_VEC_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
9+
_CLC_OVERLOAD _CLC_DEF GENTYPE __clc_mul_hi(GENTYPE x, GENTYPE y) { \
10+
BGENTYPE large_x = __CLC_CONVERT_TY(x, BGENTYPE); \
11+
BGENTYPE large_y = __CLC_CONVERT_TY(y, BGENTYPE); \
12+
BGENTYPE large_mul_hi = (large_x * large_y) >> (BGENTYPE)GENSIZE; \
13+
return __CLC_CONVERT_TY(large_mul_hi, GENTYPE); \
14+
}
15+
516
// For all types EXCEPT long, which is implemented separately
617
#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
718
_CLC_OVERLOAD _CLC_DEF GENTYPE __clc_mul_hi(GENTYPE x, GENTYPE y) { \
@@ -14,7 +25,6 @@
1425
// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
1526
// and b and d are the low-order parts of x and y.
1627
// Thinking back to algebra, we use FOIL (First, Outer, Inner, Last) to do the work.
17-
1828
_CLC_OVERLOAD _CLC_DEF long __clc_mul_hi(long x, long y) {
1929
long f, o, i;
2030
ulong l;
@@ -92,7 +102,11 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
92102

93103
// Emits the __clc_mul_hi overloads for one (BTYPE, TYPE, BITS) triple:
// first the scalar overload via __CLC_MUL_HI_IMPL, then one
// __CLC_MUL_HI_VEC_IMPL expansion per vector width (2, 3, 4, 8, 16), with the
// width token-pasted onto both BTYPE and TYPE. BITS is forwarded unchanged as
// the third argument of every expansion.
// The line marked "95-" is the previous __CLC_MUL_HI_VEC(TYPE) expansion that
// this commit removes in favor of the explicit per-width expansions.
#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
94104
__CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
95-
__CLC_MUL_HI_VEC(TYPE)
105+
__CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
106+
__CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
107+
__CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
108+
__CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
109+
__CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
96110

97111
#define __CLC_MUL_HI_TYPES() \
98112
__CLC_MUL_HI_DEC_IMPL(short, char, 8) \
@@ -110,4 +124,5 @@ __CLC_MUL_HI_TYPES()
110124
// Undefine the helper macros used by this file so they are not visible to
// code that follows. This commit drops the __CLC_B32 undef (line marked
// "113-") and adds undefs for the two macros it introduces,
// __CLC_MUL_HI_VEC_IMPL and __CLC_CONVERT_TY.
// NOTE(review): __CLC_MUL_HI_VEC is still undefined here although its only
// visible use was removed by this commit - confirm whether its definition
// (not shown in these hunks) is still needed.
#undef __CLC_MUL_HI_DEC_IMPL
111125
#undef __CLC_MUL_HI_IMPL
112126
#undef __CLC_MUL_HI_VEC
113-
#undef __CLC_B32
127+
#undef __CLC_MUL_HI_VEC_IMPL
128+
#undef __CLC_CONVERT_TY

0 commit comments

Comments
 (0)