|
| 1 | +/*========================== begin_copyright_notice ============================ |
| 2 | +
|
| 3 | +Copyright (C) 2024 Intel Corporation |
| 4 | +
|
| 5 | +SPDX-License-Identifier: MIT |
| 6 | +
|
| 7 | +============================= end_copyright_notice ===========================*/ |
| 8 | +#include "../imf.h" |
| 9 | +#pragma OPENCL FP_CONTRACT OFF |
// Threshold constants for the asinh kernel, stored as IEEE-754 binary32 bit
// patterns in the first union member (w) and read back through .f or .w.
//
// __sasinh_ha_large_x: 0x49800000 is 2^20. When |x| is not below this value
// the kernel replaces |x| + sqrt(1 + x^2) by |x| * 0.5 and adds 2 to the
// exponent used in the log computation.
| 10 | +static __constant union { |
| 11 | + unsigned int w; |
| 12 | + float f; |
| 13 | + int i; |
| 14 | +} __sasinh_ha_large_x = {0x49800000u}; |
// __sasinh_ha_small_x: 0x39800000 is 2^-12. Below this value of |x| the
// kernel returns x + (signed zero) instead of the log-based result.
| 15 | +static __constant union { |
| 16 | + unsigned int w; |
| 17 | + float f; |
| 18 | + int i; |
| 19 | +} __sasinh_ha_small_x = {0x39800000u}; |
// __sasinh_ha_largest_norm: 0x7f7fffff is the largest finite binary32 value.
// The kernel compares the raw bits of |x| against .w, so any larger bit
// pattern (Inf or NaN) takes the same fixup path as tiny inputs.
| 20 | +// largest norm |
| 21 | +static __constant union { |
| 22 | + unsigned int w; |
| 23 | + float f; |
| 24 | + int i; |
| 25 | +} __sasinh_ha_largest_norm = {0x7f7fffffu}; |
// Two-part representation of log(2) as binary32 bit patterns:
// __sasinh_ha_ln2h (0x3f317218, about 0.6931472) and __sasinh_ha_ln2l
// (0xb102e308, a small negative value), presumably the high part and its
// low-order correction.
// NOTE(review): neither constant is referenced by the code visible in this
// file; the kernel multiplies the exponent by the __sasinh_ha_fln2 pair
// instead. Confirm whether these are still needed.
| 26 | +// log(2) |
| 27 | +static __constant union { |
| 28 | + unsigned int w; |
| 29 | + float f; |
| 30 | + int i; |
| 31 | +} __sasinh_ha_ln2l = {0xb102e308u}; |
| 32 | +static __constant union { |
| 33 | + unsigned int w; |
| 34 | + float f; |
| 35 | + int i; |
| 36 | +} __sasinh_ha_ln2h = {0x3f317218u}; |
// Polynomial coefficients c9 .. c0 for the log() step of the asinh kernel,
// stored as binary32 bit patterns. The kernel evaluates them in Horner form
// on the reduced argument R, starting from c9 and ending at c1:
//   poly = (((c9 * R + c8) * R + c7) ... ) * R + c1
// The signs alternate (c9 negative, c8 positive, ...). c1 is exactly -0.5f,
// c2 is close to 1/3 and c3 is close to -1/4.
// NOTE(review): c0 (0x3f800000, exactly 1.0f) is not referenced by the code
// visible in this file; the kernel adds the __sasinh_ha_fc0 pair in its place.
| 37 | +static __constant union { |
| 38 | + unsigned int w; |
| 39 | + float f; |
| 40 | + int i; |
| 41 | +} __sasinh_ha_c9 = {0xbd3bc2cau}; |
| 42 | +static __constant union { |
| 43 | + unsigned int w; |
| 44 | + float f; |
| 45 | + int i; |
| 46 | +} __sasinh_ha_c8 = {0x3dd8bd42u}; |
| 47 | +static __constant union { |
| 48 | + unsigned int w; |
| 49 | + float f; |
| 50 | + int i; |
| 51 | +} __sasinh_ha_c7 = {0xbe075e7fu}; |
| 52 | +static __constant union { |
| 53 | + unsigned int w; |
| 54 | + float f; |
| 55 | + int i; |
| 56 | +} __sasinh_ha_c6 = {0x3e1445e9u}; |
| 57 | +static __constant union { |
| 58 | + unsigned int w; |
| 59 | + float f; |
| 60 | + int i; |
| 61 | +} __sasinh_ha_c5 = {0xbe2a6712u}; |
| 62 | +static __constant union { |
| 63 | + unsigned int w; |
| 64 | + float f; |
| 65 | + int i; |
| 66 | +} __sasinh_ha_c4 = {0x3e4cb1a3u}; |
| 67 | +static __constant union { |
| 68 | + unsigned int w; |
| 69 | + float f; |
| 70 | + int i; |
| 71 | +} __sasinh_ha_c3 = {0xbe800059u}; |
| 72 | +static __constant union { |
| 73 | + unsigned int w; |
| 74 | + float f; |
| 75 | + int i; |
| 76 | +} __sasinh_ha_c2 = {0x3eaaaae2u}; |
| 77 | +static __constant union { |
| 78 | + unsigned int w; |
| 79 | + float f; |
| 80 | + int i; |
| 81 | +} __sasinh_ha_c1 = {0xbf000000u}; |
| 82 | +static __constant union { |
| 83 | + unsigned int w; |
| 84 | + float f; |
| 85 | + int i; |
| 86 | +} __sasinh_ha_c0 = {0x3f800000u}; |
// High/low float pairs: element [0] is the leading part, element [1] a small
// correction that is added separately to retain extra precision.
//
// __sasinh_ha_fc0: pair summing to approximately 1. The kernel adds it to
// R * poly(R) before the final multiplication by R.
| 87 | +static __constant float __sasinh_ha_fc0[] = { |
| 88 | + 0x1.fffffep-1, 0x1.dcd7b4p-25, // HI + LO: 0.99999994 + 5.5511784e-08 |
| 89 | + // [0x3f7fffff + 0x336e6bda] |
| 90 | +}; |
// __sasinh_ha_fln2: pair summing to approximately log(2). The kernel
// multiplies it by the integer exponent to form the expon * log(2) term.
| 91 | +static __constant float __sasinh_ha_fln2[] = { |
| 92 | + 0x1.62e42ep-1, 0x1.efa39ep-25, // HI + LO: 0.69314712 + 5.7699989e-08 |
| 93 | + // [0x3f317217 + 0x3377d1cf] |
| 94 | +}; |
/*
 * Single-precision inverse hyperbolic sine kernel.
 *
 * Reads the argument from *a, writes the result to *r, and returns nRet,
 * which is initialised to 0 and never modified in this function.
 *
 * Outline of the computation:
 *   1. Form 1 + x^2 as a high/low pair (z2h, z2l). fma recovers the rounding
 *      error of x * x, and the max/min ordering recovers the rounding error
 *      of the addition.
 *   2. Approximate sqrt(1 + x^2) by S0h = z2h * (1 / sqrt(z2h)) and refine it
 *      with one correction term, giving the pair (Sh, Sl).
 *   3. Build Y = |x| + sqrt(1 + x^2) as the pair (Yh, Yl). When |x| is not
 *      below __sasinh_ha_large_x, Y is replaced by |x| * 0.5 and 2 is added
 *      to the exponent afterwards, so the result becomes log(2 * |x|).
 *   4. Split Y into 2^expon * (1 + R) and evaluate
 *        expon * ln2 + R * (fc0 + R * poly(R))
 *      with high/low float pairs for the products and sums.
 *   5. Copy the sign of x onto the result. For |x| below __sasinh_ha_small_x
 *      and for Inf/NaN inputs, return x + (signed zero) instead.
 *
 * The statement order and operand order of the error-compensation steps are
 * significant; the file disables FP contraction for the same reason.
 */
| 95 | +__attribute__((always_inline)) inline int |
| 96 | +__ocl_svml_internal_sasinh_ha(float *a, float *r) { |
| 97 | + int nRet = 0; |
| 98 | + float x = *a; |
| 99 | + float x2h, z2h, x2l, z2l, A, B, Bh, Sh, S0h, Sl, RS, E, Yhh; |
// NOTE(review): Rl and exponf are declared but never used in this function.
| 100 | + float Bl, poly, R, Rl, R0, exponf; |
| 101 | + union { |
| 102 | + unsigned int w; |
| 103 | + float f; |
| 104 | + int i; |
| 105 | + } Yh, Yl, res, xin, sgn, xa, two_expon; |
| 106 | + int expon, e23, iexpon_corr; |
// x2h: rounded x * x. x2l (computed below): exact residual x * x - x2h.
| 107 | + x2h = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(x, x, 0.0f); |
| 108 | + z2h = x2h + 1.0f; |
// A is the larger and B the smaller of (x2h, 1), so that Bh = z2h - A and
// Bl = B - Bh give the rounding error of the sum x2h + 1.
| 109 | + A = SPIRV_OCL_BUILTIN(fmax, _f32_f32, )(x2h, 1.0f); |
| 110 | + B = SPIRV_OCL_BUILTIN(fmin, _f32_f32, )(x2h, 1.0f); |
| 111 | + x2l = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(x, x, (-x2h)); |
| 112 | + Bh = z2h - A; |
| 113 | + Bl = B - Bh; |
// z2l: low part of 1 + x^2 (product residual plus sum residual).
| 114 | + z2l = x2l + Bl; |
// RS = 1 / sqrt(z2h); S0h = z2h * RS is the first approximation of sqrt(z2h).
| 115 | + RS = 1.0f / SPIRV_OCL_BUILTIN(sqrt, _f32, )(z2h); |
| 116 | + S0h = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(z2h, RS, 0.0f); |
| 117 | + // rsqrt(z2h)*0.5 |
| 118 | + RS *= 0.5f; |
| 119 | + // (1+x^2) - Sh^2 |
| 120 | + E = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )((-S0h), S0h, z2h); |
| 121 | + E = E + z2l; |
| 122 | + // sqrt(1+x^2)_low |
| 123 | + Sl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(E, RS, 0.0f); |
// Renormalise (S0h, Sl) into (Sh, Sl): Sh takes the rounded sum and Sl keeps
// what the rounding dropped.
| 124 | + Sh = S0h + Sl; |
| 125 | + Yhh = Sh - S0h; |
| 126 | + Sl = Sl - Yhh; |
| 127 | + xa.f = SPIRV_OCL_BUILTIN(fabs, _f32, )(x); |
| 128 | + // |x| + Sh + Sl |
| 129 | + Yh.f = xa.f + Sh; |
| 130 | + Yhh = Yh.f - Sh; |
| 131 | + Yl.f = xa.f - Yhh; |
| 132 | + Yl.f = Yl.f + Sl; |
| 133 | + // set Yh, Yl for large |x| |
| 134 | + // will use exponent correction in log computation, for large x |
| 135 | + Yh.f = (xa.f < __sasinh_ha_large_x.f) ? Yh.f : xa.f * 0.5f; |
| 136 | + Yl.f = (xa.f < __sasinh_ha_large_x.f) ? Yl.f : 0; |
| 137 | + // fixup needed for x near largest normal |
| 138 | + iexpon_corr = (xa.f < __sasinh_ha_large_x.f) ? 0 : 2; |
// Adding 0x00400000 carries into the exponent field when the top mantissa bit
// of Yh is set, so expon is the exponent of Yh rounded rather than truncated.
// The +2 correction for large |x| is applied further down via iexpon_corr.
| 139 | + // expon(Yh) + 2 |
| 140 | + expon = ((Yh.w + 0x00400000) >> 23) - 0x7f; |
| 141 | + // new expon |
| 142 | + e23 = expon << 23; |
| 143 | + // 2^(-expon) |
| 144 | + two_expon.w = 0x3f800000 - e23; |
| 145 | + // Yl * 2^(-expon) |
| 146 | + Yl.f *= two_expon.f; |
// Subtracting e23 from the raw bits lowers the exponent field of Yh by expon,
// which scales Yh by 2^(-expon) without a floating-point multiply.
| 147 | + // Yh * 2^(-expon-2) |
| 148 | + Yh.w -= e23; |
| 149 | + // reduced argument |
| 150 | + R0 = Yh.f - 1.0f; |
| 151 | + R = Yl.f + R0; |
| 152 | + // add exponent correction |
| 153 | + expon += iexpon_corr; |
// Horner evaluation in R from c9 down to c1.
| 154 | + // log() polynomial |
| 155 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(__sasinh_ha_c9.f, R, |
| 156 | + __sasinh_ha_c8.f); |
| 157 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c7.f); |
| 158 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c6.f); |
| 159 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c5.f); |
| 160 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c4.f); |
| 161 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c3.f); |
| 162 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c2.f); |
| 163 | + poly = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(poly, R, __sasinh_ha_c1.f); |
// From here on values are carried as {high, low} float pairs:
//   fR     = reduced argument (R0, scaled Yl)
//   fPoly  = running result, starting from (poly, 0)
//   fExpon = log(2) pair, later multiplied by expon
| 164 | + float fR[2], fPoly[2], fExpon[2]; |
| 165 | + fR[0] = R0; |
| 166 | + fR[1] = Yl.f; |
| 167 | + fPoly[0] = poly; |
| 168 | + fPoly[1] = 0.0f; |
| 169 | + fExpon[0] = __sasinh_ha_fln2[0]; |
| 170 | + fExpon[1] = __sasinh_ha_fln2[1]; |
// fPoly = fPoly * fR. __phl is the exact residual of the leading product; the
// cross terms are accumulated into the low part.
| 171 | + { |
| 172 | + float __ph, __phl; |
| 173 | + __ph = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[0], 0.0f); |
| 174 | + __phl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[0], -__ph); |
| 175 | + fPoly[1] = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[1], fR[0], __phl); |
| 176 | + fPoly[1] = |
| 177 | + SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[1], fPoly[1]); |
| 178 | + fPoly[0] = __ph; |
| 179 | + }; |
// fPoly = fPoly + __sasinh_ha_fc0. __ahl is the part of fPoly[0] that was
// lost when rounding the high sum.
| 180 | + { |
| 181 | + float __ph, __ahl, __ahh; |
| 182 | + __ph = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], 1.0f, |
| 183 | + __sasinh_ha_fc0[0]); |
| 184 | + __ahh = |
| 185 | + SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(__ph, 1.0f, -__sasinh_ha_fc0[0]); |
| 186 | + __ahl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], 1.0f, -__ahh); |
| 187 | + fPoly[1] = (fPoly[1] + __sasinh_ha_fc0[1]) + __ahl; |
| 188 | + fPoly[0] = __ph; |
| 189 | + }; |
| 190 | + ; |
// fExpon = fExpon * expon.
// NOTE(review): expon is an int passed where the other arguments are float;
// presumably this relies on implicit int-to-float conversion. Confirm
// against the SPIRV_OCL_BUILTIN expansion.
| 191 | + { |
| 192 | + float __ph, __phl; |
| 193 | + __ph = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fExpon[0], expon, 0.0f); |
| 194 | + __phl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fExpon[0], expon, -__ph); |
| 195 | + fExpon[1] = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fExpon[1], expon, __phl); |
| 196 | + fExpon[0] = __ph; |
| 197 | + }; |
// fPoly = fPoly * fR (second multiplication by the reduced argument).
| 198 | + { |
| 199 | + float __ph, __phl; |
| 200 | + __ph = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[0], 0.0f); |
| 201 | + __phl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[0], -__ph); |
| 202 | + fPoly[1] = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[1], fR[0], __phl); |
| 203 | + fPoly[1] = |
| 204 | + SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], fR[1], fPoly[1]); |
| 205 | + fPoly[0] = __ph; |
| 206 | + }; |
// fPoly = fPoly + fExpon, i.e. add the expon * log(2) term.
| 207 | + { |
| 208 | + float __ph, __ahl, __ahh; |
| 209 | + __ph = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], 1.0f, fExpon[0]); |
| 210 | + __ahh = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(__ph, 1.0f, -fExpon[0]); |
| 211 | + __ahl = SPIRV_OCL_BUILTIN(fma, _f32_f32_f32, )(fPoly[0], 1.0f, -__ahh); |
| 212 | + fPoly[1] = (fPoly[1] + fExpon[1]) + __ahl; |
| 213 | + fPoly[0] = __ph; |
| 214 | + }; |
| 215 | + ; |
// Collapse the pair to one float, then restore the sign of x. sgn.w holds the
// bits in which x and |x| differ, and XOR-ing them into res flips its sign
// exactly when x is negative.
| 216 | + res.f = fPoly[0] + fPoly[1]; |
| 217 | + xin.f = x; |
| 218 | + sgn.w = xin.w ^ xa.w; |
| 219 | + res.w ^= sgn.w; |
// The bit comparison xa.w > largest_norm.w is true for Inf and NaN patterns.
// In both fixup cases the result is x plus a zero carrying the sign of x.
| 220 | + // fixup for small or Inf/NaN |
| 221 | + res.f = ((xa.f < __sasinh_ha_small_x.f) | (xa.w > __sasinh_ha_largest_norm.w)) |
| 222 | + ? (x + sgn.f) |
| 223 | + : res.f; |
| 224 | + *r = res.f; |
| 225 | + return nRet; |
| 226 | +} |
/*
 * Scalar entry point for the single-precision asinh kernel.
 *
 * Forwards x to __ocl_svml_internal_sasinh_ha and returns the value that the
 * kernel stores through its output pointer. The kernel's integer status is
 * discarded.
 */
float __ocl_svml_asinhf_ha(float x) {
  float result;

  (void)__ocl_svml_internal_sasinh_ha(&x, &result);

  return result;
}
0 commit comments