|
| 1 | +//===----------------------------------------------------------------------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | + |
| 9 | +_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, |
| 10 | + __CLC_ADDRESS_SPACE int *quo) { |
| 11 | + x = __clc_flush_denormal_if_not_supported(x); |
| 12 | + y = __clc_flush_denormal_if_not_supported(y); |
| 13 | + int ux = __clc_as_int(x); |
| 14 | + int ax = ux & EXSIGNBIT_SP32; |
| 15 | + float xa = __clc_as_float(ax); |
| 16 | + int sx = ux ^ ax; |
| 17 | + int ex = ax >> EXPSHIFTBITS_SP32; |
| 18 | + |
| 19 | + int uy = __clc_as_int(y); |
| 20 | + int ay = uy & EXSIGNBIT_SP32; |
| 21 | + float ya = __clc_as_float(ay); |
| 22 | + int sy = uy ^ ay; |
| 23 | + int ey = ay >> EXPSHIFTBITS_SP32; |
| 24 | + |
| 25 | + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); |
| 26 | + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); |
| 27 | + int c; |
| 28 | + int k = ex - ey; |
| 29 | + |
| 30 | + uint q = 0; |
| 31 | + |
| 32 | + while (k > 0) { |
| 33 | + c = xr >= yr; |
| 34 | + q = (q << 1) | c; |
| 35 | + xr -= c ? yr : 0.0f; |
| 36 | + xr += xr; |
| 37 | + --k; |
| 38 | + } |
| 39 | + |
| 40 | + c = xr > yr; |
| 41 | + q = (q << 1) | c; |
| 42 | + xr -= c ? yr : 0.0f; |
| 43 | + |
| 44 | + int lt = ex < ey; |
| 45 | + |
| 46 | + q = lt ? 0 : q; |
| 47 | + xr = lt ? xa : xr; |
| 48 | + yr = lt ? ya : yr; |
| 49 | + |
| 50 | + c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1)); |
| 51 | + xr -= c ? yr : 0.0f; |
| 52 | + q += c; |
| 53 | + |
| 54 | + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); |
| 55 | + xr *= lt ? 1.0f : s; |
| 56 | + |
| 57 | + int qsgn = sx == sy ? 1 : -1; |
| 58 | + int quot = (q & 0x7f) * qsgn; |
| 59 | + |
| 60 | + c = ax == ay; |
| 61 | + quot = c ? qsgn : quot; |
| 62 | + xr = c ? 0.0f : xr; |
| 63 | + |
| 64 | + xr = __clc_as_float(sx ^ __clc_as_int(xr)); |
| 65 | + |
| 66 | + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | |
| 67 | + ay == 0; |
| 68 | + quot = c ? 0 : quot; |
| 69 | + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; |
| 70 | + |
| 71 | + *quo = quot; |
| 72 | + |
| 73 | + return xr; |
| 74 | +} |
| 75 | + |
| 76 | +// remquo signature is special, we don't have macro for this |
| 77 | +#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \ |
| 78 | + _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo( \ |
| 79 | + TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, \ |
| 80 | + __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) { \ |
| 81 | + int##HALF_VEC_SIZE lo, hi; \ |
| 82 | + TYPE##VEC_SIZE ret; \ |
| 83 | + ret.lo = __clc_remquo(x.lo, y.lo, &lo); \ |
| 84 | + ret.hi = __clc_remquo(x.hi, y.hi, &hi); \ |
| 85 | + (*quo).lo = lo; \ |
| 86 | + (*quo).hi = hi; \ |
| 87 | + return ret; \ |
| 88 | + } |
| 89 | + |
| 90 | +#define __VEC3_REMQUO(TYPE) \ |
| 91 | + _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo( \ |
| 92 | + TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) { \ |
| 93 | + int2 lo; \ |
| 94 | + int hi; \ |
| 95 | + TYPE##3 ret; \ |
| 96 | + ret.s01 = __clc_remquo(x.s01, y.s01, &lo); \ |
| 97 | + ret.s2 = __clc_remquo(x.s2, y.s2, &hi); \ |
| 98 | + (*quo).s01 = lo; \ |
| 99 | + (*quo).s2 = hi; \ |
| 100 | + return ret; \ |
| 101 | + } |
| 102 | +__VEC_REMQUO(float, 2, ) |
| 103 | +__VEC3_REMQUO(float) |
| 104 | +__VEC_REMQUO(float, 4, 2) |
| 105 | +__VEC_REMQUO(float, 8, 4) |
| 106 | +__VEC_REMQUO(float, 16, 8) |
| 107 | + |
| 108 | +#ifdef cl_khr_fp64 |
| 109 | + |
| 110 | +#pragma OPENCL EXTENSION cl_khr_fp64 : enable |
| 111 | + |
| 112 | +_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, |
| 113 | + __CLC_ADDRESS_SPACE int *pquo) { |
| 114 | + ulong ux = __clc_as_ulong(x); |
| 115 | + ulong ax = ux & ~SIGNBIT_DP64; |
| 116 | + ulong xsgn = ux ^ ax; |
| 117 | + double dx = __clc_as_double(ax); |
| 118 | + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); |
| 119 | + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); |
| 120 | + xexp1 = xexp < 1 ? xexp1 : xexp; |
| 121 | + |
| 122 | + ulong uy = __clc_as_ulong(y); |
| 123 | + ulong ay = uy & ~SIGNBIT_DP64; |
| 124 | + double dy = __clc_as_double(ay); |
| 125 | + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); |
| 126 | + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); |
| 127 | + yexp1 = yexp < 1 ? yexp1 : yexp; |
| 128 | + |
| 129 | + int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1; |
| 130 | + |
| 131 | + // First assume |x| > |y| |
| 132 | + |
| 133 | + // Set ntimes to the number of times we need to do a |
| 134 | + // partial remainder. If the exponent of x is an exact multiple |
| 135 | + // of 53 larger than the exponent of y, and the mantissa of x is |
| 136 | + // less than the mantissa of y, ntimes will be one too large |
| 137 | + // but it doesn't matter - it just means that we'll go round |
| 138 | + // the loop below one extra time. |
| 139 | + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); |
| 140 | + double w = __clc_ldexp(dy, ntimes * 53); |
| 141 | + w = ntimes == 0 ? dy : w; |
| 142 | + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; |
| 143 | + |
| 144 | + // Each time round the loop we compute a partial remainder. |
| 145 | + // This is done by subtracting a large multiple of w |
| 146 | + // from x each time, where w is a scaled up version of y. |
| 147 | + // The subtraction must be performed exactly in quad |
| 148 | + // precision, though the result at each stage can |
| 149 | + // fit exactly in a double precision number. |
| 150 | + int i; |
| 151 | + double t, v, p, pp; |
| 152 | + |
| 153 | + for (i = 0; i < ntimes; i++) { |
| 154 | + // Compute integral multiplier |
| 155 | + t = __clc_trunc(dx / w); |
| 156 | + |
| 157 | + // Compute w * t in quad precision |
| 158 | + p = w * t; |
| 159 | + pp = __clc_fma(w, t, -p); |
| 160 | + |
| 161 | + // Subtract w * t from dx |
| 162 | + v = dx - p; |
| 163 | + dx = v + (((dx - v) - p) - pp); |
| 164 | + |
| 165 | + // If t was one too large, dx will be negative. Add back one w. |
| 166 | + dx += dx < 0.0 ? w : 0.0; |
| 167 | + |
| 168 | + // Scale w down by 2^(-53) for the next iteration |
| 169 | + w *= scale; |
| 170 | + } |
| 171 | + |
| 172 | + // One more time |
| 173 | + // Variable todd says whether the integer t is odd or not |
| 174 | + t = __clc_floor(dx / w); |
| 175 | + long lt = (long)t; |
| 176 | + int todd = lt & 1; |
| 177 | + |
| 178 | + p = w * t; |
| 179 | + pp = __clc_fma(w, t, -p); |
| 180 | + v = dx - p; |
| 181 | + dx = v + (((dx - v) - p) - pp); |
| 182 | + i = dx < 0.0; |
| 183 | + todd ^= i; |
| 184 | + dx += i ? w : 0.0; |
| 185 | + |
| 186 | + lt -= i; |
| 187 | + |
| 188 | + // At this point, dx lies in the range [0,dy) |
| 189 | + |
| 190 | + // For the remainder function, we need to adjust dx |
| 191 | + // so that it lies in the range (-y/2, y/2] by carefully |
| 192 | + // subtracting w (== dy == y) if necessary. The rigmarole |
| 193 | + // with todd is to get the correct sign of the result |
| 194 | + // when x/y lies exactly half way between two integers, |
| 195 | + // when we need to choose the even integer. |
| 196 | + |
| 197 | + int al = (2.0 * dx > w) | (todd & (2.0 * dx == w)); |
| 198 | + double dxl = dx - (al ? w : 0.0); |
| 199 | + |
| 200 | + int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w)); |
| 201 | + double dxg = dx - (ag ? w : 0.0); |
| 202 | + |
| 203 | + dx = dy < 0x1.0p+1022 ? dxl : dxg; |
| 204 | + lt += dy < 0x1.0p+1022 ? al : ag; |
| 205 | + int quo = ((int)lt & 0x7f) * qsgn; |
| 206 | + |
| 207 | + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); |
| 208 | + dx = __clc_as_double(ax); |
| 209 | + |
| 210 | + // Now handle |x| == |y| |
| 211 | + int c = dx == dy; |
| 212 | + t = __clc_as_double(xsgn); |
| 213 | + quo = c ? qsgn : quo; |
| 214 | + ret = c ? t : ret; |
| 215 | + |
| 216 | + // Next, handle |x| < |y| |
| 217 | + c = dx < dy; |
| 218 | + quo = c ? 0 : quo; |
| 219 | + ret = c ? x : ret; |
| 220 | + |
| 221 | + c &= (yexp < 1023 & 2.0 * dx > dy) | (dx > 0.5 * dy); |
| 222 | + quo = c ? qsgn : quo; |
| 223 | + // we could use a conversion here instead since qsgn = +-1 |
| 224 | + p = qsgn == 1 ? -1.0 : 1.0; |
| 225 | + t = __clc_fma(y, p, x); |
| 226 | + ret = c ? t : ret; |
| 227 | + |
| 228 | + // We don't need anything special for |x| == 0 |
| 229 | + |
| 230 | + // |y| is 0 |
| 231 | + c = dy == 0.0; |
| 232 | + quo = c ? 0 : quo; |
| 233 | + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; |
| 234 | + |
| 235 | + // y is +-Inf, NaN |
| 236 | + c = yexp > BIASEDEMAX_DP64; |
| 237 | + quo = c ? 0 : quo; |
| 238 | + t = y == y ? x : y; |
| 239 | + ret = c ? t : ret; |
| 240 | + |
| 241 | + // x is +=Inf, NaN |
| 242 | + c = xexp > BIASEDEMAX_DP64; |
| 243 | + quo = c ? 0 : quo; |
| 244 | + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; |
| 245 | + |
| 246 | + *pquo = quo; |
| 247 | + return ret; |
| 248 | +} |
| 249 | +__VEC_REMQUO(double, 2, ) |
| 250 | +__VEC3_REMQUO(double) |
| 251 | +__VEC_REMQUO(double, 4, 2) |
| 252 | +__VEC_REMQUO(double, 8, 4) |
| 253 | +__VEC_REMQUO(double, 16, 8) |
| 254 | + |
| 255 | +#endif |
| 256 | + |
| 257 | +#ifdef cl_khr_fp16 |
| 258 | + |
| 259 | +#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
| 260 | + |
| 261 | +_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, |
| 262 | + __CLC_ADDRESS_SPACE int *pquo) { |
| 263 | + return (half)__clc_remquo((float)x, (float)y, pquo); |
| 264 | +} |
| 265 | +__VEC_REMQUO(half, 2, ) |
| 266 | +__VEC3_REMQUO(half) |
| 267 | +__VEC_REMQUO(half, 4, 2) |
| 268 | +__VEC_REMQUO(half, 8, 4) |
| 269 | +__VEC_REMQUO(half, 16, 8) |
| 270 | + |
| 271 | +#endif |
0 commit comments