[SYCL] Add bfloat16 generic implementation for fmax, fmin (#7732)

jinge90 · bader · web-flow · commit eb1ed10667f1 · 2022-12-15T10:37:33.000-08:00
Signed-off-by: jinge90 <ge.jin@intel.com>

Co-authored-by: Alexey Bader <alexey.bader@intel.com>
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -30,6 +30,14 @@ uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
 }
 } // namespace detail
 
+// According to bfloat16 format, NAN value's exponent field is 0xFF and
+// significand has non-zero bits.
+template <typename T>
+std::enable_if_t<std::is_same<T, bfloat16>::value, bool> isnan(T x) {
+  oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
+  return (((XBits & 0x7F80) == 0x7F80) && (XBits & 0x7F)) ? true : false;
+}
+
 template <typename T>
 std::enable_if_t<std::is_same<T, bfloat16>::value, T> fabs(T x) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
@@ -74,20 +82,31 @@ std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmin(T x, T y) {
   oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
   return oneapi::detail::bitsToBfloat16(__clc_fmin(XBits, YBits));
 #else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error(
-      "bfloat16 math functions are not currently supported on the host device.",
-      PI_ERROR_INVALID_DEVICE);
+  static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0;
+  oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
+  oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
+  if (isnan(x) && isnan(y))
+    return oneapi::detail::bitsToBfloat16(CanonicalNan);
+
+  if (isnan(x))
+    return y;
+  if (isnan(y))
+    return x;
+  if (((XBits | YBits) ==
+       static_cast<oneapi::detail::Bfloat16StorageT>(0x8000)) &&
+      !(XBits & YBits))
+    return oneapi::detail::bitsToBfloat16(
+        static_cast<oneapi::detail::Bfloat16StorageT>(0x8000));
+
+  return (x < y) ? x : y;
 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 }
 
 template <size_t N>
 sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
                                sycl::marray<bfloat16, N> y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   sycl::marray<bfloat16, N> res;
-
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   for (size_t i = 0; i < N / 2; i++) {
     auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2),
                                   detail::to_uint32_t(y, i * 2));
@@ -101,15 +120,12 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
         oneapi::detail::bfloat16ToBits(y[N - 1]);
     res[N - 1] = oneapi::detail::bitsToBfloat16(__clc_fmin(XBits, YBits));
   }
-
-  return res;
 #else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error(
-      "bfloat16 math functions are not currently supported on the host device.",
-      PI_ERROR_INVALID_DEVICE);
+  for (size_t i = 0; i < N; i++) {
+    res[i] = fmin(x[i], y[i]);
+  }
 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return res;
 }
 
 template <typename T>
@@ -119,20 +135,30 @@ std::enable_if_t<std::is_same<T, bfloat16>::value, T> fmax(T x, T y) {
   oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
   return oneapi::detail::bitsToBfloat16(__clc_fmax(XBits, YBits));
 #else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error(
-      "bfloat16 math functions are not currently supported on the host device.",
-      PI_ERROR_INVALID_DEVICE);
+  static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0;
+  oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x);
+  oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y);
+  if (isnan(x) && isnan(y))
+    return oneapi::detail::bitsToBfloat16(CanonicalNan);
+
+  if (isnan(x))
+    return y;
+  if (isnan(y))
+    return x;
+  if (((XBits | YBits) ==
+       static_cast<oneapi::detail::Bfloat16StorageT>(0x8000)) &&
+      !(XBits & YBits))
+    return oneapi::detail::bitsToBfloat16(0);
+
+  return (x > y) ? x : y;
 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 }
 
 template <size_t N>
 sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
                                sycl::marray<bfloat16, N> y) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   sycl::marray<bfloat16, N> res;
-
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   for (size_t i = 0; i < N / 2; i++) {
     auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2),
                                   detail::to_uint32_t(y, i * 2));
@@ -146,14 +172,12 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
         oneapi::detail::bfloat16ToBits(y[N - 1]);
     res[N - 1] = oneapi::detail::bitsToBfloat16(__clc_fmax(XBits, YBits));
   }
-  return res;
 #else
-  std::ignore = x;
-  std::ignore = y;
-  throw runtime_error(
-      "bfloat16 math functions are not currently supported on the host device.",
-      PI_ERROR_INVALID_DEVICE);
+  for (size_t i = 0; i < N; i++) {
+    res[i] = fmax(x[i], y[i]);
+  }
 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  return res;
 }
 
 template <typename T>
diff --git a/sycl/include/sycl/ext/oneapi/experimental/sycl_complex.hpp b/sycl/include/sycl/ext/oneapi/experimental/sycl_complex.hpp
@@ -1202,7 +1202,7 @@ SYCL_EXTERNAL complex<_Tp> acos(const complex<_Tp> &__x) {
   }
   if (sycl::isinf(__x.imag()))
     return complex<_Tp>(__pi / _Tp(2), -__x.imag());
-  if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
+  if (__x.real() == 0 && (__x.imag() == 0 || sycl::isnan(__x.imag())))
     return complex<_Tp>(__pi / _Tp(2), -__x.imag());
   complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
   if (sycl::signbit(__x.imag()))

Original file line number	Diff line number	Diff line change
`@@ -1202,7 +1202,7 @@ SYCL_EXTERNAL complex<_Tp> acos(const complex<_Tp> &__x) {`
`1202`	`1202`	`}`
`1203`	`1203`	`if (sycl::isinf(__x.imag()))`
`1204`	`1204`	`return complex<_Tp>(__pi / _Tp(2), -__x.imag());`
`1205`		`- if (__x.real() == 0 && (__x.imag() == 0 \|\| isnan(__x.imag())))`
	`1205`	`+ if (__x.real() == 0 && (__x.imag() == 0 \|\| sycl::isnan(__x.imag())))`
`1206`	`1206`	`return complex<_Tp>(__pi / _Tp(2), -__x.imag());`
`1207`	`1207`	`complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));`
`1208`	`1208`	`if (sycl::signbit(__x.imag()))`