[SYCL] Add generic impl for some bf16 math functions (#8583)

jinge90 · JackAKirk · web-flow · commit c7759bb8cd0b · 2023-05-15T23:08:08.000-07:00
Signed-off-by: jinge90 &lt;ge.jin@intel.com&gt;
Co-authored-by: JackAKirk &lt;jack.kirk@codeplay.com&gt;
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
@@ -22,7 +22,7 @@
 
 == Notice
 
-Copyright © 2022-2022 Intel Corporation. All rights reserved.
+Copyright © 2022-2023 Intel Corporation. All rights reserved.
 
 Khronos® is a registered trademark and SYCL™ and SPIR™ are trademarks of
 The Khronos Group Inc. OpenCL™ is a trademark of Apple Inc. used by permission
@@ -55,13 +55,16 @@ specification.
 
 == Overview
 
-This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax`, `fabs`
-and `isnan` SYCL floating point math functions. These functions can be used as
-element wise operations on matrices, supplementing the `bfloat16` support
-in the sycl_ext_oneapi_matrix extension.
+This extension adds `bfloat16` support to the `fma`, `fmin`, `fmax`, `fabs`,
+`isnan`, `ceil`, `floor`, `cos`, `sin`, `exp`, `exp2`, `exp10`, `log`, `log2`,
+`log10`, `rint`, `sqrt`, `rsqrt` and `trunc` SYCL floating point math functions.
+These functions can be used as element wise operations on matrices, supplementing
+the `bfloat16` support in the sycl_ext_oneapi_matrix extension.
 
-The descriptions of the `fma`, `fmin`, `fmax`, `fabs` and `isnan` SYCL floating
-point math functions can be found in the SYCL specification:
+The descriptions of the `fma`, `fmin`, `fmax`, `fabs`, `isnan`, `ceil`, `floor`,
+`cos`, `sin`, `exp`, `exp2`, `exp10`, `log`, `log2`, `log10`, `rint`, `sqrt`,
+`rsqrt` and `trunc` SYCL floating point math functions can be found in the SYCL
+specification:
 https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
 == Specification
@@ -80,7 +83,7 @@ supports.
 [%header,cols="1,5"]
 |===
 |Value |Description
-|1     |Initial extension version. Base features are supported.
+|1     |The APIs of this experimental extension are not versioned, so the feature-test macro always has this value.
 |===   
 
 === Extension to `enum class aspect`
@@ -184,7 +187,194 @@ T fabs(T x);
 
 ===== Description
 
-Compute absolute value of a `bfloat16`.
+Compute absolute value of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== ceil
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T ceil(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Returns `x` rounded to an integral value using the round to positive infinity rounding mode for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== floor
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T floor(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Returns `x` rounded to an integral value using the round to negative infinity rounding mode
+for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== cos
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T cos(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute cosine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== sin
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T sin(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute sine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+
+==== exp
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T exp(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute the base-e exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== exp2
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T exp2(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute the base-2 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== exp10
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T exp10(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute the base-10 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== log
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T log(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute natural logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== log2
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T log2(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute base-2 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== log10
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T log10(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute base-10 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+
+==== rint
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T rint(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Returns `x` rounded to an integral value using the round to nearest even rounding mode
+for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== sqrt
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T sqrt(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== rsqrt
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T rsqrt(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Compute inverse square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+
+==== trunc
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+template <typename T>
+T trunc(T x);
+} // namespace sycl::ext::oneapi::experimental
+```
+
+===== Description
+
+Returns `x` rounded to an integral value using the round to zero rounding mode
+for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
 
 == Issues
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -232,6 +232,53 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+#define BFLOAT16_MATH_FP32_WRAPPERS(op)                                        \
+  template <typename T>                                                        \
+  std::enable_if_t<std::is_same<T, bfloat16>::value, T> op(T x) {              \
+    return sycl::ext::oneapi::bfloat16{sycl::op(float{x})};                    \
+  }
+// Above: bfloat16 op(x) computed as sycl::op(float{x}). Below: per-element marray form.
+#define BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(op)                                 \
+  template <size_t N>                                                          \
+  sycl::marray<bfloat16, N> op(sycl::marray<bfloat16, N> x) {                  \
+    sycl::marray<bfloat16, N> res;                                             \
+    for (size_t i = 0; i < N; i++) {                                           \
+      res[i] = op(x[i]);                                                       \
+    }                                                                          \
+    return res;                                                                \
+  }
+
+BFLOAT16_MATH_FP32_WRAPPERS(ceil)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil)
+BFLOAT16_MATH_FP32_WRAPPERS(cos)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos)
+BFLOAT16_MATH_FP32_WRAPPERS(exp)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp)
+BFLOAT16_MATH_FP32_WRAPPERS(exp10)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10)
+BFLOAT16_MATH_FP32_WRAPPERS(exp2)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2)
+BFLOAT16_MATH_FP32_WRAPPERS(floor)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor)
+BFLOAT16_MATH_FP32_WRAPPERS(log)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log)
+BFLOAT16_MATH_FP32_WRAPPERS(log2)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2)
+BFLOAT16_MATH_FP32_WRAPPERS(log10)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10)
+BFLOAT16_MATH_FP32_WRAPPERS(rint)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint)
+BFLOAT16_MATH_FP32_WRAPPERS(rsqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt)
+BFLOAT16_MATH_FP32_WRAPPERS(sin)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin)
+BFLOAT16_MATH_FP32_WRAPPERS(sqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt)
+BFLOAT16_MATH_FP32_WRAPPERS(trunc)
+BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc)
+
+#undef BFLOAT16_MATH_FP32_WRAPPERS
+#undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY
 } // namespace ext::oneapi::experimental
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.cpp
@@ -37,8 +37,9 @@ bool check(bool a, bool b) { return (a != b); }
                                                                      cgh);     \
       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
       cgh.parallel_for(N, [=](id<1> index) {                                   \
+        float ABF16 = float{bfloat16{A[index]}};                               \
         if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]}),   \
-                  sycl::NAME(A[index]))) {                                     \
+                  sycl::NAME(ABF16))) {                                        \
           ERR[0] = 1;                                                          \
         }                                                                      \
       });                                                                      \
@@ -61,7 +62,8 @@ bool check(bool a, bool b) { return (a != b); }
         }                                                                      \
         marray<RETTY, SZ> res = NAME(arg);                                     \
         for (int i = 0; i < SZ; i++) {                                         \
-          if (check(res[i], sycl::NAME(A[index][i]))) {                        \
+          float ABF16 = float{bfloat16{A[index][i]}};                          \
+          if (check(res[i], sycl::NAME(ABF16))) {                              \
             ERR[0] = 1;                                                        \
           }                                                                    \
         }                                                                      \
@@ -90,8 +92,10 @@ bool check(bool a, bool b) { return (a != b); }
                                                                      cgh);     \
       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
       cgh.parallel_for(N, [=](id<1> index) {                                   \
+        float ABF16 = float{bfloat16{A[index]}};                               \
+        float BBF16 = float{bfloat16{B[index]}};                               \
         if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]}),                \
-                  NAME(A[index], B[index]))) {                                 \
+                  NAME(ABF16, BBF16))) {                                       \
           ERR[0] = 1;                                                          \
         }                                                                      \
       });                                                                      \
@@ -118,7 +122,9 @@ bool check(bool a, bool b) { return (a != b); }
         }                                                                      \
         marray<bfloat16, SZ> res = NAME(arg0, arg1);                           \
         for (int i = 0; i < SZ; i++) {                                         \
-          if (check(res[i], NAME(A[index][i], B[index][i]))) {                 \
+          float ABF16 = float{bfloat16{A[index][i]}};                          \
+          float BBF16 = float{bfloat16{B[index][i]}};                          \
+          if (check(res[i], NAME(ABF16, BBF16))) {                             \
             ERR[0] = 1;                                                        \
           }                                                                    \
         }                                                                      \
@@ -150,9 +156,12 @@ bool check(bool a, bool b) { return (a != b); }
                                                                      cgh);     \
       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
       cgh.parallel_for(N, [=](id<1> index) {                                   \
+        float ABF16 = float{bfloat16{A[index]}};                               \
+        float BBF16 = float{bfloat16{B[index]}};                               \
+        float CBF16 = float{bfloat16{C[index]}};                               \
         if (check(NAME(bfloat16{A[index]}, bfloat16{B[index]},                 \
                        bfloat16{C[index]}),                                    \
-                  NAME(A[index], B[index], C[index]))) {                       \
+                  NAME(ABF16, BBF16, CBF16))) {                                \
           ERR[0] = 1;                                                          \
         }                                                                      \
       });                                                                      \
@@ -183,7 +192,10 @@ bool check(bool a, bool b) { return (a != b); }
         }                                                                      \
         marray<bfloat16, SZ> res = NAME(arg0, arg1, arg2);                     \
         for (int i = 0; i < SZ; i++) {                                         \
-          if (check(res[i], NAME(A[index][i], B[index][i], C[index][i]))) {    \
+          float ABF16 = float{bfloat16{A[index][i]}};                          \
+          float BBF16 = float{bfloat16{B[index][i]}};                          \
+          float CBF16 = float{bfloat16{C[index][i]}};                          \
+          if (check(res[i], NAME(ABF16, BBF16, CBF16))) {                      \
             ERR[0] = 1;                                                        \
           }                                                                    \
         }                                                                      \
@@ -245,5 +257,35 @@ int main() {
   a[0] = a[N - 1] = NAN;
   TEST_BUILTIN_1(isnan, bool);
 
+  // The original inputs 'a[0...N-1]' are in the range [-0.5, 0.5),
+  // so they need to be updated for generic math testing.
+  // sin, cos testing
+  for (int i = 0; i < N; ++i) {
+    a[i] = (i / (float)(N - 1)) * 6.28;
+    if ((i & 0x1) == 0x1)
+      a[i] = -a[i];
+  }
+  TEST_BUILTIN_1(cos, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(sin, sycl::ext::oneapi::bfloat16);
+
+  // ceil, floor, trunc, exp, exp2, exp10, rint testing
+  TEST_BUILTIN_1(ceil, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(floor, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(trunc, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(exp, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(exp10, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(exp2, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(rint, sycl::ext::oneapi::bfloat16);
+
+  // log, log2, log10, sqrt, rsqrt testing, the input
+  // must be positive.
+  for (int i = 0; i < N; ++i)
+    a[i] = a[i] + 8.5;
+  TEST_BUILTIN_1(sqrt, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(rsqrt, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(log, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(log2, sycl::ext::oneapi::bfloat16);
+  TEST_BUILTIN_1(log10, sycl::ext::oneapi::bfloat16);
+
   return 0;
 }