Skip to content

Commit c8eb865

Browse files
authored
[libclc] Move mad to the CLC library (#123607)
All targets build `__clc_mad` -- even SPIR-V targets -- since it compiles to the optimal `llvm.fmuladd` intrinsic. There is no change to the bytecode generated for non-SPIR-V targets. The `mix` builtin, which is implemented as a wrapper around `mad`, is left as an OpenCL-layer wrapper around `__clc_mad`. I don't know if it's worth having a specific CLC version of `mix`. The changes to the other CLC files/functions move uses of `mad` to `__clc_mad` and reformat the affected code. There is also one additional instance of `trunc` becoming `__clc_trunc`, which was missed before.
1 parent 8368018 commit c8eb865

File tree

22 files changed

+2014
-1840
lines changed

22 files changed

+2014
-1840
lines changed

libclc/clc/include/clc/clcmacro.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,33 @@
184184
return BUILTIN(x); \
185185
}
186186

187+
#define _CLC_DEFINE_TERNARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \
188+
ARG2_TYPE, ARG3_TYPE) \
189+
_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y, \
190+
ARG3_TYPE z) { \
191+
return BUILTIN(x, y, z); \
192+
} \
193+
_CLC_DEF _CLC_OVERLOAD RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \
194+
ARG3_TYPE##2 z) { \
195+
return BUILTIN(x, y, z); \
196+
} \
197+
_CLC_DEF _CLC_OVERLOAD RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \
198+
ARG3_TYPE##3 z) { \
199+
return BUILTIN(x, y, z); \
200+
} \
201+
_CLC_DEF _CLC_OVERLOAD RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \
202+
ARG3_TYPE##4 z) { \
203+
return BUILTIN(x, y, z); \
204+
} \
205+
_CLC_DEF _CLC_OVERLOAD RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \
206+
ARG3_TYPE##8 z) { \
207+
return BUILTIN(x, y, z); \
208+
} \
209+
_CLC_DEF _CLC_OVERLOAD RET_TYPE##16 FUNCTION( \
210+
ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
211+
return BUILTIN(x, y, z); \
212+
}
213+
187214
#ifdef cl_khr_fp16
188215

189216
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

libclc/clc/include/clc/math/clc_mad.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef __CLC_MATH_CLC_MAD_H__
2+
#define __CLC_MATH_CLC_MAD_H__
3+
4+
#define __CLC_BODY <clc/math/ternary_decl.inc>
5+
#define __CLC_FUNCTION __clc_mad
6+
7+
#include <clc/math/gentype.inc>
8+
9+
#undef __CLC_BODY
10+
#undef __CLC_FUNCTION
11+
12+
#endif // __CLC_MATH_CLC_MAD_H__
libclc/clc/include/clc/math/ternary_decl.inc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a,
2+
__CLC_GENTYPE b,
3+
__CLC_GENTYPE c);

libclc/clc/lib/clspv/SOURCES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
../generic/math/clc_ceil.cl
22
../generic/math/clc_fabs.cl
33
../generic/math/clc_floor.cl
4+
../generic/math/clc_mad.cl
45
../generic/math/clc_rint.cl
56
../generic/math/clc_trunc.cl
67
../generic/shared/clc_clamp.cl

libclc/clc/lib/generic/SOURCES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ integer/clc_abs_diff.cl
77
math/clc_ceil.cl
88
math/clc_fabs.cl
99
math/clc_floor.cl
10+
math/clc_mad.cl
1011
math/clc_rint.cl
1112
math/clc_trunc.cl
1213
relational/clc_all.cl
libclc/clc/lib/generic/math/clc_mad.cl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#include <clc/internal/clc.h>
2+
3+
#define __CLC_BODY <clc_mad.inc>
4+
#include <clc/math/gentype.inc>
libclc/clc/lib/generic/math/clc_mad.inc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad(__CLC_GENTYPE a, __CLC_GENTYPE b,
2+
__CLC_GENTYPE c) {
3+
#pragma OPENCL FP_CONTRACT ON
4+
return a * b + c;
5+
}

libclc/clc/lib/spirv/SOURCES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
../generic/math/clc_ceil.cl
66
../generic/math/clc_fabs.cl
77
../generic/math/clc_floor.cl
8+
../generic/math/clc_mad.cl
89
../generic/math/clc_rint.cl
910
../generic/math/clc_trunc.cl
1011
../generic/shared/clc_clamp.cl

libclc/clc/lib/spirv64/SOURCES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
../generic/math/clc_ceil.cl
66
../generic/math/clc_fabs.cl
77
../generic/math/clc_floor.cl
8+
../generic/math/clc_mad.cl
89
../generic/math/clc_rint.cl
910
../generic/math/clc_trunc.cl
1011
../generic/shared/clc_clamp.cl

libclc/generic/include/clc/math/ternary_decl.inc

Lines changed: 0 additions & 1 deletion
This file was deleted.

libclc/generic/lib/common/mix.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include <clc/clc.h>
2+
#include <clc/math/clc_mad.h>
23

34
#define __CLC_BODY <mix.inc>
45
#include <clc/math/gentype.inc>

libclc/generic/lib/common/mix.inc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE a) {
2-
return mad( y - x, a, x );
1+
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
2+
__CLC_GENTYPE a) {
3+
return __clc_mad(y - x, a, x);
34
}
45

56
#ifndef __CLC_SCALAR
6-
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_SCALAR_GENTYPE a) {
7-
return mix(x, y, (__CLC_GENTYPE)a);
7+
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mix(__CLC_GENTYPE x, __CLC_GENTYPE y,
8+
__CLC_SCALAR_GENTYPE a) {
9+
return mix(x, y, (__CLC_GENTYPE)a);
810
}
911
#endif

libclc/generic/lib/math/clc_exp10.cl

Lines changed: 100 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include <clc/clc.h>
2424
#include <clc/clcmacro.h>
25+
#include <clc/math/clc_mad.h>
2526
#include <clc/relational/clc_isnan.h>
2627

2728
#include "config.h"
@@ -53,98 +54,109 @@
5354
//
5455
// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
5556

56-
_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x)
57-
{
58-
const float X_MAX = 0x1.344134p+5f; // 128*log2/log10 : 38.53183944498959
59-
const float X_MIN = -0x1.66d3e8p+5f; // -149*log2/log10 : -44.8534693539332
60-
61-
const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; // 64*log10/log2 : 212.6033980727912
62-
const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; // log2/(64 * log10) lead : 0.004699707
63-
const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; // log2/(64 * log10) tail : 0.00000388665057
64-
const float R_LN10 = 0x1.26bb1cp+1f;
65-
66-
int return_nan = __clc_isnan(x);
67-
int return_inf = x > X_MAX;
68-
int return_zero = x < X_MIN;
69-
70-
int n = convert_int(x * R_64_BY_LOG10_2);
71-
72-
float fn = (float)n;
73-
int j = n & 0x3f;
74-
int m = n >> 6;
75-
int m2 = m << EXPSHIFTBITS_SP32;
76-
float r;
77-
78-
r = R_LN10 * mad(fn, -R_LOG10_2_BY_64_TL, mad(fn, -R_LOG10_2_BY_64_LD, x));
79-
80-
// Truncated Taylor series for e^r
81-
float z2 = mad(mad(mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, 0x1.000000p-1f), r*r, r);
82-
83-
float two_to_jby64 = USE_TABLE(exp_tbl, j);
84-
z2 = mad(two_to_jby64, z2, two_to_jby64);
85-
86-
float z2s = z2 * as_float(0x1 << (m + 149));
87-
float z2n = as_float(as_int(z2) + m2);
88-
z2 = m <= -126 ? z2s : z2n;
89-
90-
91-
z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
92-
z2 = return_zero ? 0.0f : z2;
93-
z2 = return_nan ? x : z2;
94-
return z2;
57+
_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) {
58+
// 128*log2/log10 : 38.53183944498959
59+
const float X_MAX = 0x1.344134p+5f;
60+
// -149*log2/log10 : -44.8534693539332
61+
const float X_MIN = -0x1.66d3e8p+5f;
62+
// 64*log10/log2 : 212.6033980727912
63+
const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f;
64+
// log2/(64 * log10) lead : 0.004699707
65+
const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f;
66+
// log2/(64 * log10) tail : 0.00000388665057
67+
const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f;
68+
const float R_LN10 = 0x1.26bb1cp+1f;
69+
70+
int return_nan = __clc_isnan(x);
71+
int return_inf = x > X_MAX;
72+
int return_zero = x < X_MIN;
73+
74+
int n = convert_int(x * R_64_BY_LOG10_2);
75+
76+
float fn = (float)n;
77+
int j = n & 0x3f;
78+
int m = n >> 6;
79+
int m2 = m << EXPSHIFTBITS_SP32;
80+
float r;
81+
82+
r = R_LN10 *
83+
__clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x));
84+
85+
// Truncated Taylor series for e^r
86+
float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f),
87+
r, 0x1.000000p-1f),
88+
r * r, r);
89+
90+
float two_to_jby64 = USE_TABLE(exp_tbl, j);
91+
z2 = __clc_mad(two_to_jby64, z2, two_to_jby64);
92+
93+
float z2s = z2 * as_float(0x1 << (m + 149));
94+
float z2n = as_float(as_int(z2) + m2);
95+
z2 = m <= -126 ? z2s : z2n;
96+
97+
z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2;
98+
z2 = return_zero ? 0.0f : z2;
99+
z2 = return_nan ? x : z2;
100+
return z2;
95101
}
96102
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float)
97103

98104
#ifdef cl_khr_fp64
99-
_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
100-
{
101-
const double X_MAX = 0x1.34413509f79ffp+8; // 1024*ln(2)/ln(10)
102-
const double X_MIN = -0x1.434e6420f4374p+8; // -1074*ln(2)/ln(10)
103-
104-
const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; // 64*ln(10)/ln(2)
105-
const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; // head ln(2)/(64*ln(10))
106-
const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; // tail ln(2)/(64*ln(10))
107-
const double R_LN10 = 0x1.26bb1bbb55516p+1; // ln(10)
108-
109-
int n = convert_int(x * R_64_BY_LOG10_2);
110-
111-
double dn = (double)n;
112-
113-
int j = n & 0x3f;
114-
int m = n >> 6;
115-
116-
double r = R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
117-
118-
// 6 term tail of Taylor expansion of e^r
119-
double z2 = r * fma(r,
120-
fma(r,
121-
fma(r,
122-
fma(r,
123-
fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
124-
0x1.5555555555555p-5),
125-
0x1.5555555555555p-3),
126-
0x1.0000000000000p-1),
127-
1.0);
128-
129-
double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
130-
z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
131-
132-
int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
133-
134-
int n1 = m >> 2;
135-
int n2 = m-n1;
136-
double z3= z2 * as_double(((long)n1 + 1023) << 52);
137-
z3 *= as_double(((long)n2 + 1023) << 52);
138-
139-
z2 = ldexp(z2, m);
140-
z2 = small_value ? z3: z2;
141-
142-
z2 = __clc_isnan(x) ? x : z2;
143-
144-
z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
145-
z2 = x < X_MIN ? 0.0 : z2;
146-
147-
return z2;
105+
_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) {
106+
// 1024*ln(2)/ln(10)
107+
const double X_MAX = 0x1.34413509f79ffp+8;
108+
// -1074*ln(2)/ln(10)
109+
const double X_MIN = -0x1.434e6420f4374p+8;
110+
// 64*ln(10)/ln(2)
111+
const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7;
112+
// head ln(2)/(64*ln(10))
113+
const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8;
114+
// tail ln(2)/(64*ln(10))
115+
const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37;
116+
// ln(10)
117+
const double R_LN10 = 0x1.26bb1bbb55516p+1;
118+
119+
int n = convert_int(x * R_64_BY_LOG10_2);
120+
121+
double dn = (double)n;
122+
123+
int j = n & 0x3f;
124+
int m = n >> 6;
125+
126+
double r =
127+
R_LN10 * fma(-R_LOG10_2_BY_64_TL, dn, fma(-R_LOG10_2_BY_64_LD, dn, x));
128+
129+
// 6 term tail of Taylor expansion of e^r
130+
double z2 =
131+
r *
132+
fma(r,
133+
fma(r,
134+
fma(r,
135+
fma(r, fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7),
136+
0x1.5555555555555p-5),
137+
0x1.5555555555555p-3),
138+
0x1.0000000000000p-1),
139+
1.0);
140+
141+
double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
142+
z2 = fma(tv.s0 + tv.s1, z2, tv.s1) + tv.s0;
143+
144+
int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0));
145+
146+
int n1 = m >> 2;
147+
int n2 = m - n1;
148+
double z3 = z2 * as_double(((long)n1 + 1023) << 52);
149+
z3 *= as_double(((long)n2 + 1023) << 52);
150+
151+
z2 = ldexp(z2, m);
152+
z2 = small_value ? z3 : z2;
153+
154+
z2 = __clc_isnan(x) ? x : z2;
155+
156+
z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2;
157+
z2 = x < X_MIN ? 0.0 : z2;
158+
159+
return z2;
148160
}
149161
_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
150162
#endif

libclc/generic/lib/math/clc_hypot.cl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <clc/clc.h>
2424
#include <clc/clcmacro.h>
2525
#include <clc/integer/clc_abs.h>
26+
#include <clc/math/clc_mad.h>
2627
#include <clc/relational/clc_isnan.h>
2728
#include <clc/shared/clc_clamp.h>
2829
#include <math/clc_hypot.h>
@@ -48,7 +49,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
4849
float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
4950
float fx = as_float(ux) * fi_exp;
5051
float fy = as_float(uy) * fi_exp;
51-
retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp;
52+
retval = sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
5253

5354
retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval;
5455
retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32

0 commit comments

Comments
 (0)