Merge pull request #5784 from apple/eng/PR-102641225

TNorthover · web-flow · commit 8a7a0962ae52 · 2022-12-14T11:17:45.000Z
[compiler-rt][X86] Add half <-> x86_fp80 conversion builtins
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -283,6 +283,7 @@ endif ()
 # long double is not 80 bits on Android or MSVC.
 set(x86_80_BIT_SOURCES
   divxc3.c
+  extendhfxf2.c
   fixxfdi.c
   fixxfti.c
   fixunsxfdi.c
@@ -294,6 +295,7 @@ set(x86_80_BIT_SOURCES
   floatuntixf.c
   mulxc3.c
   powixf2.c
+  truncxfhf2.c
 )
 
 if (NOT MSVC)
diff --git a/compiler-rt/lib/builtins/extendhfxf2.c b/compiler-rt/lib/builtins/extendhfxf2.c
@@ -0,0 +1,24 @@
+//===-- lib/extendhfxf2.c - half -> x86 FP80 conversion -----------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define FP80_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_80BIT)
+
+#define SRC_HALF
+#define DST_FP80
+#include "fp_extend_impl.inc"
+
+// Public entry point: widen a half-precision value to x86 80-bit long double.
+// NOINLINE forwarding stands in for an alias, which has no portable spelling.
+COMPILER_RT_ABI NOINLINE long double __extendhfxf2(src_t a) {
+  return __extendXfYf2__(a);
+}
+
+#endif
diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
@@ -58,22 +58,37 @@ static const int srcSigBits = 10;
 typedef float dst_t;
 typedef uint32_t dst_rep_t;
 #define DST_REP_C UINT32_C
+static const int dstBits = 32;
 static const int dstSigBits = 23;
+static const int dstIntBits = 0;
 
 #elif defined DST_DOUBLE
 typedef double dst_t;
 typedef uint64_t dst_rep_t;
 #define DST_REP_C UINT64_C
+static const int dstBits = 64;
 static const int dstSigBits = 52;
+static const int dstIntBits = 0;
+
+#elif defined DST_FP80
+typedef long double dst_t;
+typedef __uint128_t dst_rep_t;
+#define DST_REP_C (__uint128_t)
+static const int dstBits = 80;
+static const int dstSigBits = 64;
+static const int dstIntBits = 1;
+
 
 #elif defined DST_QUAD
 typedef long double dst_t;
 typedef __uint128_t dst_rep_t;
 #define DST_REP_C (__uint128_t)
+static const int dstBits = 128;
 static const int dstSigBits = 112;
+static const int dstIntBits = 0;
 
 #else
-#error Destination should be single, double, or quad precision!
+#error Destination should be single, double, fp80, or quad precision!
 #endif // end destination precision
 
 // End of specialization parameters.  Two helper routines for conversion to and
diff --git a/compiler-rt/lib/builtins/fp_extend_impl.inc b/compiler-rt/lib/builtins/fp_extend_impl.inc
@@ -52,12 +52,12 @@ static __inline dst_t __extendXfYf2__(src_t a) {
   const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1);
   const src_rep_t srcNaNCode = srcQNaN - 1;
 
-  const int dstBits = sizeof(dst_t) * CHAR_BIT;
   const int dstExpBits = dstBits - dstSigBits - 1;
   const int dstInfExp = (1 << dstExpBits) - 1;
   const int dstExpBias = dstInfExp >> 1;
 
   const dst_rep_t dstMinNormal = DST_REP_C(1) << dstSigBits;
+  const dst_rep_t dstSignificandMask = dstMinNormal - 1;
 
   // Break a into a sign and representation of the absolute value.
   const src_rep_t aRep = srcToRep(a);
@@ -72,6 +72,19 @@ static __inline dst_t __extendXfYf2__(src_t a) {
     // Extend to the destination type by shifting the significand and
     // exponent into the proper position and rebiasing the exponent.
     absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits);
+
+    if (dstIntBits) {
+      // x86_fp80 has an explicit instead of implicit integer bit at the top of
+      // the significand. Canonical values (except denormals & zero) set it to
+      // 1.
+      dst_rep_t absSignificand = absResult & dstSignificandMask;
+      absSignificand >>= 1;
+      absSignificand |= (dst_rep_t)1 << (dstSigBits - 1);
+
+      absResult &= ~dstSignificandMask;
+      absResult |= absSignificand;
+    }
+
     absResult += (dst_rep_t)(dstExpBias - srcExpBias) << dstSigBits;
   }
 
@@ -81,16 +94,20 @@ static __inline dst_t __extendXfYf2__(src_t a) {
     // bit (if needed) and right-aligning the rest of the trailing NaN
     // payload field.
     absResult = (dst_rep_t)dstInfExp << dstSigBits;
-    absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - srcSigBits);
-    absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - srcSigBits);
+    if (dstIntBits)
+      absResult |= (dst_rep_t)1 << (dstSigBits - 1);
+    absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - dstIntBits - srcSigBits);
+    absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - dstIntBits - srcSigBits);
   }
 
   else if (aAbs) {
     // a is denormal.
     // renormalize the significand and clear the leading bit, then insert
     // the correct adjusted exponent in the destination type.
     const int scale = src_rep_t_clz(aAbs) - src_rep_t_clz(srcMinNormal);
-    absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits + scale);
-    absResult ^= dstMinNormal;
+    absResult = (dst_rep_t)aAbs << (dstSigBits - dstIntBits - srcSigBits + scale);
+    // Clear (don't toggle) bit dstSigBits: with an explicit integer bit the
+    // leading 1 lands one place lower, and XOR would set an exponent bit.
+    absResult &= ~dstMinNormal;
     const int resultExponent = dstExpBias - srcExpBias - scale + 1;
     absResult |= (dst_rep_t)resultExponent << dstSigBits;
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
@@ -104,6 +104,12 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
 
 COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b);
 
+#elif defined FP80_PRECISION
+#if __LDBL_MANT_DIG__ == 64
+#define CRT_LDBL_80BIT
+// Only x86 does 80-bit floats, only support extend/trunc.
+#endif
+
 #elif defined QUAD_PRECISION
 #if __LDBL_MANT_DIG__ == 113 && defined(__SIZEOF_INT128__)
 #define CRT_LDBL_128BIT
@@ -202,7 +208,7 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
 #undef Word_FullMask
 #endif // __LDBL_MANT_DIG__ == 113 && __SIZEOF_INT128__
 #else
-#error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined.
+#error SINGLE_PRECISION, DOUBLE_PRECISION, FP80_PRECISION, or QUAD_PRECISION must be defined.
 #endif
 
 #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) ||                  \
diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h
@@ -19,22 +19,36 @@
 typedef float src_t;
 typedef uint32_t src_rep_t;
 #define SRC_REP_C UINT32_C
+static const int srcBits = 32;
 static const int srcSigBits = 23;
+static const int srcIntBits = 0;
 
 #elif defined SRC_DOUBLE
 typedef double src_t;
 typedef uint64_t src_rep_t;
 #define SRC_REP_C UINT64_C
+static const int srcBits = 64;
 static const int srcSigBits = 52;
+static const int srcIntBits = 0;
+
+#elif defined SRC_FLT80
+typedef long double src_t;
+typedef __uint128_t src_rep_t;
+#define SRC_REP_C (__uint128_t)
+static const int srcBits = 80;
+static const int srcSigBits = 64;
+static const int srcIntBits = 1;
 
 #elif defined SRC_QUAD
 typedef long double src_t;
 typedef __uint128_t src_rep_t;
 #define SRC_REP_C (__uint128_t)
+static const int srcBits = 128;
 static const int srcSigBits = 112;
+static const int srcIntBits = 0;
 
 #else
-#error Source should be double precision or quad precision!
+#error Source should be double precision, fp80 precision, or quad precision!
 #endif // end source precision
 
 #if defined DST_DOUBLE
@@ -77,7 +91,13 @@ static __inline src_rep_t srcToRep(src_t x) {
     src_t f;
     src_rep_t i;
   } rep = {.f = x};
-  return rep.i;
+  src_rep_t res = rep.i;
+
+  // Zero out the padding bits from the union if needed.
+  if (sizeof(src_rep_t) > sizeof(src_t))
+    res &= (((src_rep_t)1 << sizeof(src_t)*CHAR_BIT) - 1);
+
+  return res;
 }
 
 static __inline dst_t dstFromRep(dst_rep_t x) {
diff --git a/compiler-rt/lib/builtins/fp_trunc_impl.inc b/compiler-rt/lib/builtins/fp_trunc_impl.inc
@@ -1,3 +1,4 @@
+int printf(const char *, ...);
 //= lib/fp_trunc_impl.inc - high precision -> low precision conversion *-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -38,10 +39,28 @@
 
 #include "fp_trunc.h"
 
+// x86_fp80 has an explicit integer bit at the top of the significand.
+// This allowed more weird denormals, infinities and NaNs in 8087 & 80287;
+// but from 387 onwards those are treated as invalid and we can just
+// ignore the issue by converting to a canonical "usual-format" IEEE-754.
+
+static src_rep_t removeExplicitIntBit(src_rep_t in) {
+  // Formats with an implicit integer bit are already in the usual layout.
+  if (srcIntBits == 0)
+    return in;
+
+  const src_rep_t sigMask = (SRC_REP_C(1) << srcSigBits) - 1;
+  const src_rep_t upperBits = in & ~sigMask;
+
+  // Shift the significand up by one so the explicit integer bit falls off
+  // the top of the field, leaving only fraction bits behind.
+  const src_rep_t fraction = ((in & sigMask) << 1) & sigMask;
+  return upperBits | fraction;
+}
+
 static __inline dst_t __truncXfYf2__(src_t a) {
   // Various constants whose values follow from the type parameters.
   // Any reasonable optimizer will fold and propagate all of these.
-  const int srcBits = sizeof(src_t) * CHAR_BIT;
   const int srcExpBits = srcBits - srcSigBits - 1;
   const int srcInfExp = (1 << srcExpBits) - 1;
   const int srcExpBias = srcInfExp >> 1;
@@ -71,14 +90,15 @@ static __inline dst_t __truncXfYf2__(src_t a) {
 
   // Break a into a sign and representation of the absolute value.
   const src_rep_t aRep = srcToRep(a);
-  const src_rep_t aAbs = aRep & srcAbsMask;
+  src_rep_t aAbs = aRep & srcAbsMask;
   const src_rep_t sign = aRep & srcSignMask;
   dst_rep_t absResult;
 
   if (aAbs - underflow < aAbs - overflow) {
     // The exponent of a is within the range of normal numbers in the
     // destination format.  We can convert by simply right-shifting with
     // rounding and adjusting the exponent.
+    aAbs = removeExplicitIntBit(aAbs);
     absResult = aAbs >> (srcSigBits - dstSigBits);
     absResult -= (dst_rep_t)(srcExpBias - dstExpBias) << dstSigBits;
 
@@ -104,10 +124,11 @@ static __inline dst_t __truncXfYf2__(src_t a) {
     // a underflows on conversion to the destination type or is an exact
     // zero.  The result may be a denormal or zero.  Extract the exponent
     // to get the shift amount for the denormalization.
+    aAbs = removeExplicitIntBit(aAbs);
     const int aExp = aAbs >> srcSigBits;
     const int shift = srcExpBias - dstExpBias - aExp + 1;
 
-    const src_rep_t significand = (aRep & srcSignificandMask) | srcMinNormal;
+    const src_rep_t significand = (aAbs & srcSignificandMask) | srcMinNormal;
 
     // Right shift by the denormalization amount with sticky.
     if (shift > srcSigBits) {
diff --git a/compiler-rt/lib/builtins/truncxfhf2.c b/compiler-rt/lib/builtins/truncxfhf2.c
@@ -0,0 +1,23 @@
+//===-- lib/truncxfhf2.c - x86 FP80 -> half conversion ------------*- C -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define FP80_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_80BIT)
+
+#define SRC_FLT80
+#define DST_HALF
+#include "fp_trunc_impl.inc"
+// x86 FP80 -> half entry point. NOTE(review): confirm dst_t is _Float16.
+COMPILER_RT_ABI _Float16 __truncxfhf2(long double a) {
+  return __truncXfYf2__(a);
+}
+
+#endif
diff --git a/compiler-rt/test/builtins/Unit/extendhfxf2_test.c b/compiler-rt/test/builtins/Unit/extendhfxf2_test.c
@@ -0,0 +1,96 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_extendhfxf2
+
+#include <stdio.h>
+
+#include "fp_test.h"
+
+long double __extendhfxf2(TYPE_FP16 a);
+
+int test__extendhfxf2(TYPE_FP16 a, uint64_t expectedHi, uint64_t expectedLo)
+{
+    // Run the conversion and check it against the expected hi/lo words.
+    const long double actual = __extendhfxf2(a);
+    const int mismatch = compareResultLD(actual, expectedHi, expectedLo);
+    if (mismatch != 0) {
+        printf("error in test__extendhfxf2(%#.4x) = %Lf, "
+               "expected %Lf\n", toRep16(a), actual,
+               fromRep80(expectedHi, expectedLo));
+    }
+    return mismatch;
+}
+
+char assumption_1[sizeof(TYPE_FP16) * CHAR_BIT == 16] = {0};
+
+int main()
+{
+    // qNaN (only the quiet bit set in the half significand)
+    if (test__extendhfxf2(fromRep16(0x7e00),
+                          UINT64_C(0x7fff),
+                          UINT64_C(0xc000000000000000)))
+        return 1;
+    // quiet NaN carrying additional payload bits
+    if (test__extendhfxf2(fromRep16(0x7f80),
+                          UINT64_C(0x7fff),
+                          UINT64_C(0xf000000000000000)))
+        return 1;
+    // inf
+    if (test__extendhfxf2(fromRep16(0x7c00),
+                          UINT64_C(0x7fff),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    // -inf
+    if (test__extendhfxf2(fromRep16(0xfc00),
+                          UINT64_C(0xffff),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    // zero
+    if (test__extendhfxf2(fromRep16(0x0),
+                          UINT64_C(0x0000),
+                          UINT64_C(0x0000000000000000)))
+        return 1;
+    // -zero
+    if (test__extendhfxf2(fromRep16(0x8000),
+                          UINT64_C(0x8000),
+                          UINT64_C(0x0000000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x4248),
+                          UINT64_C(0x4000),
+                          UINT64_C(0xc900000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0xc248),
+                          UINT64_C(0xc000),
+                          UINT64_C(0xc900000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x6e62),
+                          UINT64_C(0x400b),
+                          UINT64_C(0xcc40000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x3c00),
+                          UINT64_C(0x3fff),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x0400),
+                          UINT64_C(0x3ff1),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    // denormal inputs (expected results have the explicit integer bit set)
+    if (test__extendhfxf2(fromRep16(0x0010),
+                          UINT64_C(0x3feb),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x0001),
+                          UINT64_C(0x3fe7),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    if (test__extendhfxf2(fromRep16(0x8001),
+                          UINT64_C(0xbfe7),
+                          UINT64_C(0x8000000000000000)))
+        return 1;
+    // largest finite half; the conversion must be exact
+    if (test__extendhfxf2(fromRep16(0x7bff),
+                          UINT64_C(0x400e),
+                          UINT64_C(0xffe0000000000000)))
+        return 1;
+    return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fp_test.h b/compiler-rt/test/builtins/Unit/fp_test.h
diff --git a/compiler-rt/test/builtins/Unit/truncxfhf2_test.c b/compiler-rt/test/builtins/Unit/truncxfhf2_test.c