Fix vec conversion paths for the older vec implementation

uditagarwal97 · uditagarwal97 · commit 7651a30b8c4e · 2024-06-14T14:11:57.000-07:00
diff --git a/sycl/include/sycl/vector.hpp b/sycl/include/sycl/vector.hpp
@@ -732,7 +732,8 @@ template <typename Type, int NumElements> class vec {
     if constexpr (!IsUsingArrayOnDevice) {
       return m_Data;
     } else {
-      return sycl::bit_cast<vector_t>(m_Data);
+      auto ptr = bit_cast<const vector_t *>((&m_Data)->data());
+      return *ptr;
     }
   }
 #endif // __SYCL_DEVICE_ONLY__
@@ -788,77 +789,64 @@ template <typename Type, int NumElements> class vec {
     using bfloat16 = sycl::ext::oneapi::bfloat16;
     static_assert(std::is_integral_v<vec_data_t<convertT>> ||
                       detail::is_floating_point<convertT>::value ||
-                      std::is_same_v<bfloat16, convertT>,
+                      std::is_same_v<convertT, bfloat16>,
                   "Unsupported convertT");
     using T = vec_data_t<DataT>;
     using R = vec_data_t<convertT>;
     using OpenCLT = detail::ConvertToOpenCLType_t<T>;
     using OpenCLR = detail::ConvertToOpenCLType_t<R>;
-
     vec<convertT, NumElements> Result;
 
-    // we are not on CUDA, see intel/llvm#11840
-#if defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
-    // Convert BF16 vector -> float vector and vice versa.
-    if constexpr (((IsBfloat16 && std::is_same_v<convertT, float>) ||
-                   (std::is_same_v<convertT, bfloat16> &&
-                    std::is_same_v<DataT, float>)) &&
-                  NumElements > 1) {
-
-      using BF16ExtType = sycl::ext::oneapi::detail::Bfloat16StorageT
-          __attribute__((ext_vector_type(NumElements)));
-      using FloatExtType = float __attribute__((ext_vector_type(NumElements)));
-      vec<convertT, NumElements> convertedVec;
-
-      if constexpr (IsBfloat16)
-        convertedVec =
-            detail::convertImpl<bfloat16, float, roundingMode, NumElements,
-                                BF16ExtType, FloatExtType>(
-                static_cast<vector_t>(*this));
-      else
-        convertedVec =
-            detail::convertImpl<float, bfloat16, roundingMode, NumElements,
-                                FloatExtType, BF16ExtType>(
-                static_cast<vector_t>(*this));
-
-      return vec<convertT, NumElements>(convertedVec);
-    } else if constexpr (NumElements > 1) {
-      using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
-      using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
-      // Whole vector conversion can only be done, if:
-      constexpr bool canUseNativeVectorConvert =
-          // - both vectors are represented using native vector types;
-          NativeVec && vec<convertT, NumElements>::NativeVec &&
-          // - vec storage has an equivalent OpenCL native vector it is
-          // implicitly
-          //   convertible to. There are some corner cases where it is not the
-          //   case with char, long and long long types.
-          std::is_convertible_v<decltype(m_Data), OpenCLVecT> &&
-          std::is_convertible_v<decltype(Result.m_Data), OpenCLVecR> &&
-          // - it is not a signed to unsigned (or vice versa) conversion
-          //   see comments within 'convertImpl' for more details;
-          !detail::is_sint_to_from_uint<T, R>::value &&
-          // - destination type is not bool. bool is stored as integer under the
-          //   hood and therefore conversion to bool looks like conversion
-          //   between two integer types. Since bit pattern for true and false
-          //   is not defined, there is no guarantee that integer conversion
-          //   yields right results here;
-          !std::is_same_v<convertT, bool>;
-      if constexpr (canUseNativeVectorConvert) {
-        Result.m_Data = detail::convertImpl<T, R, roundingMode, NumElements,
-                                            OpenCLVecT, OpenCLVecR>(m_Data);
-        return Result;
+#if defined(__SYCL_DEVICE_ONLY__)
+    using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
+    using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
+    // Whole vector conversion can only be done, if:
+    constexpr bool canUseNativeVectorConvert =
+#ifdef __NVPTX__
+        // - we are not on CUDA, see intel/llvm#11840
+        false &&
+#endif
+        // - both vectors are represented using native vector types;
+        NativeVec && vec<convertT, NumElements>::NativeVec &&
+        // - vec storage has an equivalent OpenCL native vector it is implicitly
+        //   convertible to. There are some corner cases where it is not the
+        //   case with char, long and long long types.
+        std::is_convertible_v<decltype(m_Data), OpenCLVecT> &&
+        std::is_convertible_v<decltype(Result.m_Data), OpenCLVecR> &&
+        // - it is not a signed to unsigned (or vice versa) conversion
+        //   see comments within 'convertImpl' for more details;
+        !detail::is_sint_to_from_uint<T, R>::value &&
+        // - destination type is not bool. bool is stored as integer under the
+        //   hood and therefore conversion to bool looks like conversion between
+        //   two integer types. Since bit pattern for true and false is not
+        //   defined, there is no guarantee that integer conversion yields
+        //   right results here;
+        !std::is_same_v<convertT, bool>;
+    if constexpr (canUseNativeVectorConvert) {
+      Result.m_Data = detail::convertImpl<T, R, roundingMode, NumElements,
+                                          OpenCLVecT, OpenCLVecR>(m_Data);
+    } else
+#endif // defined(__SYCL_DEVICE_ONLY__)
+    {
+      // Otherwise, we fall back to per-element conversion:
+      for (size_t I = 0; I < NumElements; ++I) {
+        // Destination element type is bf16 (e.g. float -> bf16).
+        if constexpr (std::is_same_v<convertT, bfloat16>) {
+          Result[I] = bfloat16((*this)[I]);
+        } else
+        // Source element type is bf16 (e.g. bf16 -> float).
+        if constexpr (std::is_same_v<DataT, bfloat16>) {
+          Result[I] = (float)((*this)[I]);
+        }
+        else {
+          Result.setValue(
+              I, vec_data<convertT>::get(
+                     detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
+                         vec_data<T>::get(getValue(I)))));
+        }
       }
     }
-#endif // defined(__SYCL_DEVICE_ONLY__)
 
-    // Otherwise, we fallback to per-element conversion:
-    for (size_t I = 0; I < NumElements; ++I) {
-      Result.setValue(
-          I, vec_data<convertT>::get(
-                 detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
-                     vec_data<DataT>::get(getValue(I)))));
-    }
     return Result;
   }
 
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp
@@ -10,8 +10,8 @@
 // TODO enable opaque pointers support on CPU.
 // UNSUPPORTED: cpu || accelerator
 
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
+// UN: %{build} -o %t.out
+// UN: %{run} %t.out
 // RUN: %if preview-breaking-changes-supported %{  %{build} -fpreview-breaking-changes -o %t2.out   %}
 // RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out  %}
 
@@ -140,8 +140,8 @@ int main() {
     std::cout << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << std::endl;
     std::cout << "div[0]: " << double_division[0]       << "  div[1]: " << double_division[1] << std::endl;
     std::cout << "Float convert ref0: " << double_float[0]    << "    ref1: " << double_float[1] << std::endl;
-    std::cout << "convert[0]: " << fConv2[0]             << "  convert[1]: " << fConv2[1] << std::endl;
-    std::cout << "bf16 convert[0]: " << brev2[0]             << "  bf16 convert[1]: " << brev2[1] << std::endl;
+    std::cout << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << std::endl;
+    std::cout << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << std::endl;
 
     assert(twoA[0] == double_float[0]);                      assert(twoA[1] == double_float[1]);
     assert(addition_ref0 == double_addition[0]);             assert(addition_ref1 == double_addition[1]);
@@ -178,8 +178,8 @@ int main() {
             out << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << sycl::endl;
             out << "div[0]: " << device_division[0]       << "  div[1]: " << device_division[1] << sycl::endl;
             out << "Float convert ref0: " << device_float[0]    << "    ref1: " << device_float[1] << sycl::endl;
-            out << "convert[0]: " << fConv2[0]             << "  convert[1]: " << fConv2[1] << sycl::endl;
-            out << "bf16 convert[0]: " << brev2[0]             << "  bf16 convert[1]: " << brev2[1] << sycl::endl;
+            out << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << sycl::endl;
+            out << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << sycl::endl;
 
             acc[7] = (twoA[0] == device_float[0]) && (twoA[1] == device_float[1]);
             acc[8] = (addition_ref0 == device_addition[0]) && (addition_ref1 == device_addition[1]);