[SYCL] Optimize vec<bfloat> math builtins (#14106)

uditagarwal97 · web-flow · commit 0032a8407d3b · 2024-06-24T17:01:58.000-07:00
Follow-up to, and blocked by, #14105. Currently, `vec<bfloat>` math builtins perform element-by-element operations. This PR optimizes the `vec<bfloat>` math builtins by: (1) converting `vec<bfloat>` to `vec<float>`; (2) performing the operation on `vec<float>` (which uses SPIR-V built-ins underneath for optimized vector operations); and (3) converting the return value back to `vec<bfloat>`. Look at the beautiful diff in `check_device_code/vector/vector_bf16_builtins.cpp` to visualize the device code generated before and after this optimization.
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -62,13 +62,26 @@ template <size_t N> sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x) {
 template <typename T, int N = num_elements_v<T>>
 std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<int16_t, N>>
 isnan(T x) {
+
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  // Convert BFloat16 vector to float vec and call isnan().
+  sycl::vec<float, N> FVec =
+      x.template convert<float, sycl::rounding_mode::automatic>();
+  auto Res = isnan(FVec);
+
+  // For vec<float>, the return type of isnan is vec<int32_t> so,
+  // an explicit conversion is required to vec<int16_t>.
+  return Res.template convert<int16_t>();
+#else
+
   sycl::vec<int16_t, N> res;
   for (size_t i = 0; i < N; i++) {
     // The result of isnan is 0 or 1 but SPEC requires
     // isnan() of vec/swizzle to return -1 or 0.
     res[i] = isnan(x[i]) ? -1 : 0;
   }
   return res;
+#endif
 }
 
 /******************* fabs ********************/
@@ -120,11 +133,19 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
 template <typename T, int N = num_elements_v<T>>
 std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>>
 fabs(T x) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  // Convert BFloat16 vector to float vec.
+  sycl::vec<float, N> FVec =
+      x.template convert<float, sycl::rounding_mode::automatic>();
+  auto Res = fabs(FVec);
+  return Res.template convert<bfloat16>();
+#else
   sycl::vec<bfloat16, N> res;
   for (size_t i = 0; i < N; i++) {
     res[i] = fabs(x[i]);
   }
   return res;
+#endif
 }
 
 /******************* fmin ********************/
@@ -193,11 +214,21 @@ std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
                      N1 == N2,
                  sycl::vec<bfloat16, N1>>
 fmin(T1 x, T2 y) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  // Convert BFloat16 vectors to float vecs.
+  sycl::vec<float, N1> FVecX =
+      x.template convert<float, sycl::rounding_mode::automatic>();
+  sycl::vec<float, N1> FVecY =
+      y.template convert<float, sycl::rounding_mode::automatic>();
+  auto Res = fmin(FVecX, FVecY);
+  return Res.template convert<bfloat16>();
+#else
   sycl::vec<bfloat16, N1> res;
   for (size_t i = 0; i < N1; i++) {
     res[i] = fmin(x[i], y[i]);
   }
   return res;
+#endif
 }
 
 /******************* fmax ********************/
@@ -265,11 +296,21 @@ std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
                      N1 == N2,
                  sycl::vec<bfloat16, N1>>
 fmax(T1 x, T2 y) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  // Convert BFloat16 vectors to float vecs.
+  sycl::vec<float, N1> FVecX =
+      x.template convert<float, sycl::rounding_mode::automatic>();
+  sycl::vec<float, N1> FVecY =
+      y.template convert<float, sycl::rounding_mode::automatic>();
+  auto Res = fmax(FVecX, FVecY);
+  return Res.template convert<bfloat16>();
+#else
   sycl::vec<bfloat16, N1> res;
   for (size_t i = 0; i < N1; i++) {
     res[i] = fmax(x[i], y[i]);
   }
   return res;
+#endif
 }
 
 /******************* fma *********************/
@@ -327,11 +368,24 @@ std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
                      is_vec_or_swizzle_bf16_v<T3> && N1 == N2 && N2 == N3,
                  sycl::vec<bfloat16, N1>>
 fma(T1 x, T2 y, T3 z) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  // Convert BFloat16 vectors to float vecs.
+  sycl::vec<float, N1> FVecX =
+      x.template convert<float, sycl::rounding_mode::automatic>();
+  sycl::vec<float, N1> FVecY =
+      y.template convert<float, sycl::rounding_mode::automatic>();
+  sycl::vec<float, N1> FVecZ =
+      z.template convert<float, sycl::rounding_mode::automatic>();
+
+  auto Res = fma(FVecX, FVecY, FVecZ);
+  return Res.template convert<bfloat16>();
+#else
   sycl::vec<bfloat16, N1> res;
   for (size_t i = 0; i < N1; i++) {
     res[i] = fma(x[i], y[i], z[i]);
   }
   return res;
+#endif
 }
 
 /******************* unary math operations ********************/
@@ -352,6 +406,18 @@ fma(T1 x, T2 y, T3 z) {
     return res;                                                                \
   }
 
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op)                                    \
+  /* Overload for BF16 vec and swizzles. */                                    \
+  template <typename T, int N = num_elements_v<T>>                             \
+  std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>> op(    \
+      T x) {                                                                   \
+    sycl::vec<float, N> FVec =                                                 \
+        x.template convert<float, sycl::rounding_mode::automatic>();           \
+    auto Res = op(FVec);                                                       \
+    return Res.template convert<bfloat16>();                                   \
+  }
+#else
 #define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op)                                    \
   /* Overload for BF16 vec and swizzles. */                                    \
   template <typename T, int N = num_elements_v<T>>                             \
@@ -363,6 +429,7 @@ fma(T1 x, T2 y, T3 z) {
     }                                                                          \
     return res;                                                                \
   }
+#endif
 
 BFLOAT16_MATH_FP32_WRAPPERS(ceil)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil)
diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp
@@ -1363,7 +1363,11 @@ class SwizzleOp {
   template <typename convertT, rounding_mode roundingMode>
   vec<convertT, sizeof...(Indexes)> convert() const {
     // First materialize the swizzle to vec_t and then apply convert() to it.
-    vec_t Tmp = *this;
+    vec_t Tmp;
+    std::array<int, getNumElements()> Idxs{Indexes...};
+    for (size_t I = 0; I < Idxs.size(); ++I) {
+      Tmp[I] = (*m_Vector)[Idxs[I]];
+    }
     return Tmp.template convert<convertT, roundingMode>();
   }
 
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
@@ -31,12 +31,11 @@ bool check(bool a, bool b) { return (a != b); }
     for (int i = 0; i < SZ; i++) {                                             \
       arg[i] = INPVAL;                                                         \
     }                                                                          \
-    /* Perform the operation. */                                               \              
-    vec<RETTY, SZ>                                                             \
-        res = sycl::ext::oneapi::experimental::NAME(arg);                      \
+    /* Perform the operation. */                                               \
+    vec<RETTY, SZ> res = sycl::ext::oneapi::experimental::NAME(arg);           \
     vec<RETTY, 2> res2 =                                                       \
         sycl::ext::oneapi::experimental::NAME(arg.template swizzle<0, 0>());   \
-    /* Check the result. */                                                    \                   
+    /* Check the result. */                                                    \
     if (res2[0] != res[0] || res2[1] != res[0]) {                              \
       ERR[0] += 1;                                                             \
     }                                                                          \
@@ -56,9 +55,8 @@ bool check(bool a, bool b) { return (a != b); }
       arg[i] = INPVAL;                                                         \
       arg2[i] = inpVal2;                                                       \
     }                                                                          \
-    /* Perform the operation. */                                               \              
-    vec<RETTY, SZ>                                                             \
-        res = sycl::ext::oneapi::experimental::NAME(arg, arg2);                \
+    /* Perform the operation. */                                               \
+    vec<RETTY, SZ> res = sycl::ext::oneapi::experimental::NAME(arg, arg2);     \
     /* Swizzle and vec different combination. */                               \
     vec<RETTY, 2> res2 = sycl::ext::oneapi::experimental::NAME(                \
         arg.template swizzle<0, 0>(), arg2.template swizzle<0, 0>());          \
diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp