Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Test corrections after moving bfloat16 support out of experimental status. #1129

Merged
merged 92 commits into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
92 commits
Select commit Hold shift + click to select a range
f6bb8a3
[SYCL] Localize variables declared in inline asms.
rdeodhar Jan 20, 2021
bb523e8
[SYCL] Test for disabling range rounding.
rdeodhar Jan 25, 2021
d2d1da9
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Jan 26, 2021
cda63e1
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Jan 26, 2021
69b6a46
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Feb 4, 2021
d49e0f7
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Mar 23, 2021
9af3a0d
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Mar 30, 2021
1e7b325
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Apr 1, 2021
acd2770
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Apr 6, 2021
a9b4c23
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Apr 22, 2021
49d9fca
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar May 8, 2021
8d3654d
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Jun 7, 2021
47f7ef9
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Aug 18, 2021
03a870a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 8, 2021
23b318e
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 5, 2021
7dcf812
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Feb 9, 2022
37b6097
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Feb 24, 2022
84b320a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Mar 4, 2022
2659c72
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Mar 23, 2022
de6173b
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Apr 7, 2022
fb1b230
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Apr 26, 2022
45b3951
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Jul 13, 2022
a4f01f3
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Aug 4, 2022
d69e8a1
[SYCL] Adjustments to tests to account for moving bfloat16 support ou…
rdeodhar Aug 4, 2022
8da975e
Formatting change.
rdeodhar Aug 4, 2022
9e34353
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Aug 24, 2022
d7b1b15
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Aug 25, 2022
aa5f69c
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Aug 31, 2022
0717d52
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 8, 2022
29452c0
Corrections for bfloat16 moved out of experimental.
rdeodhar Sep 9, 2022
b2ade7a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 12, 2022
bef81ca
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 14, 2022
63b05d1
Correct test to run on multiple root devices.
rdeodhar Sep 14, 2022
17ad74e
Merge branch 'intel' of https://github.com/rdeodhar/llvm-test-suite i…
rdeodhar Sep 16, 2022
1ba36eb
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 16, 2022
cb629ff
Removed test correction since test is now disabled.
rdeodhar Sep 16, 2022
6b0e62b
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 16, 2022
30b81b1
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 20, 2022
df36f2d
Modifications and additions to bfloat16 tests.
rdeodhar Sep 20, 2022
5bf2384
Formatting change.
rdeodhar Sep 20, 2022
e9683a1
Formatting change.
rdeodhar Sep 20, 2022
e7a9191
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 21, 2022
603d4b6
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 22, 2022
23077db
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 22, 2022
c066b3a
Enable bfloat16 test on GPU only.
rdeodhar Sep 23, 2022
44e492b
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 26, 2022
3028e7e
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 27, 2022
b0c0848
Reenable bfloat16 test on CPU.
rdeodhar Sep 27, 2022
6cba127
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 27, 2022
64e4bbd
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Sep 29, 2022
7b0c9a2
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 1, 2022
8f51bc2
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 4, 2022
49a522a
Corrected test to run on host and device.
rdeodhar Oct 4, 2022
fcd6c3e
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 4, 2022
7e84648
Update SYCL/BFloat16/bfloat16_type.cpp
rdeodhar Oct 4, 2022
c60ab99
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 4, 2022
314719e
Merge branch 'l30' of https://github.com/rdeodhar/llvm-test-suite int…
rdeodhar Oct 4, 2022
3bb080c
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 6, 2022
2b77ff2
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 7, 2022
dfc9a67
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 8, 2022
e021ad7
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 12, 2022
af254b3
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 24, 2022
8f492d6
Added test for use of fallback bfloat16 library.
rdeodhar Oct 25, 2022
4805061
Formatting change
rdeodhar Oct 25, 2022
33c7d2b
Enhanced the bfloat16 fallback test.
rdeodhar Oct 25, 2022
21dcd20
Formatting change.
rdeodhar Oct 25, 2022
cbfe491
Adjusted bfloat16 aspect.
rdeodhar Oct 25, 2022
3721b97
Changes to test to set up required environment.
rdeodhar Oct 26, 2022
44c6ccb
Adjustment to test.
rdeodhar Oct 26, 2022
c7d9386
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 26, 2022
c3155bf
Test adjustment.
rdeodhar Oct 26, 2022
f2edee8
Reenabled some tests with specific GPU requirement.
rdeodhar Oct 26, 2022
9634b60
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Oct 27, 2022
6ee676a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 7, 2022
2c6f4ce
Changes for bfloat16 moved out of experimental.
rdeodhar Nov 7, 2022
43768d8
Aspect bfloat16 has been removed.
rdeodhar Nov 8, 2022
5ba6d74
Aspect bfloat16 has been replaced by bfloat16_math_functions.
rdeodhar Nov 8, 2022
ccc85df
Check aspect fp16 before using sycl::half.
rdeodhar Nov 8, 2022
3b60cb1
Account for lack of fp16 support on some devices.
rdeodhar Nov 8, 2022
1b0d6f2
Reduce expected precision of bfloat16 calculations.
rdeodhar Nov 9, 2022
0bc6438
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 9, 2022
8d9f34f
Adjustments for bfloat16 header.
rdeodhar Nov 9, 2022
8792d0a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 15, 2022
f1975c5
Test adjustment.
rdeodhar Nov 15, 2022
80c3468
Replace double constants with float.
rdeodhar Nov 18, 2022
db0a1bc
Replace double constants with float.
rdeodhar Nov 18, 2022
f6c14e0
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 18, 2022
33a36ce
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 21, 2022
8511e52
Fix test to return 0 from main.
rdeodhar Nov 22, 2022
6564c68
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 22, 2022
8d44444
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 23, 2022
6928b2a
Merge branch 'intel' of https://github.com/intel/llvm-test-suite into…
rdeodhar Nov 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions SYCL/BFloat16/bfloat16_builtins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <vector>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental;
using namespace sycl::ext::oneapi;

constexpr int N = 60; // divisible by all tested array sizes
constexpr float bf16_eps = 0.00390625;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Line 225, should it be ext_oneapi_bfloat16_math_functions?

Thanks

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should be ext_oneapi_bfloat16_math_functions. Made the change.

Expand Down Expand Up @@ -222,7 +222,7 @@ bool check(float a, float b) {
int main() {
queue q;

if (q.get_device().has(aspect::ext_oneapi_bfloat16)) {
if (q.get_device().has(aspect::ext_oneapi_bfloat16_math_functions)) {
std::vector<float> a(N), b(N), c(N);
int err = 0;

Expand Down
70 changes: 70 additions & 0 deletions SYCL/BFloat16/bfloat16_conversions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// UNSUPPORTED: hip
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
// Currently the feature is supported only on CPU and GPU, natively or by
// software emulation.
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

//==---------- bfloat16_conversions.cpp - SYCL bfloat16 type test ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;

template <typename T> T calculate(T a, T b) {
sycl::ext::oneapi::bfloat16 x = -a;
sycl::ext::oneapi::bfloat16 y = b;
sycl::ext::oneapi::bfloat16 z = x + y;
T result = z;
return result;
}

// Run calculate() inside a single-task kernel on the device behind Q and
// check the bfloat16-rounded sum. Returns 0 on success, 1 on mismatch.
template <typename T> int test_device(queue Q) {
  T elems[3] = {-7.0f, 8.1f, 0.0f};

  buffer<T, 1> dataBuf{elems, 3};
  Q.submit([&](handler &cgh) {
    accessor acc{dataBuf, cgh, read_write};
    cgh.single_task([=]() { acc[2] = calculate(acc[0], acc[1]); });
  });

  // Creating a host_accessor blocks until the kernel has finished writing.
  host_accessor out{dataBuf, read_only};
  std::cout << "Device Result = " << out[2] << std::endl;
  // 15.125 is (-(-7.0)) + 8.1 after rounding through bfloat16.
  return out[2] == 15.125f ? 0 : 1;
}

// Evaluate calculate() entirely on the host: (-(-5.6)) + (-1.1) rounded
// through bfloat16 should equal 4.5. Returns 0 on success, 1 on failure.
template <typename T> int test_host() {
  T first{-5.6f};
  T second{-1.1f};
  T sum = calculate(first, second);
  std::cout << "Host Result = " << sum << std::endl;
  return sum == 4.5f ? 0 : 1;
}

// Entry point: run the host and device conversion checks for both
// sycl::half and float, OR-ing the per-test results together.
// Returns nonzero if any check failed.
int main() {
  queue Q;

  int failures = test_host<sycl::half>();
  failures |= test_host<float>();
  // sycl::half kernels are only valid on devices reporting the fp16 aspect.
  if (Q.get_device().has(aspect::fp16))
    failures |= test_device<sycl::half>(Q);
  failures |= test_device<float>(Q);

  if (failures)
    std::cout << "FAIL\n";
  else
    std::cout << "PASS\n";

  return failures;
}
83 changes: 83 additions & 0 deletions SYCL/BFloat16/bfloat16_example.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
///
/// Check if bfloat16 example works using fallback libraries
///

// REQUIRES: opencl-aot, ocloc, cpu, gpu-intel-gen9
// UNSUPPORTED: cuda
// CUDA is not compatible with SPIR.

// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can add the following test cases:

// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

These are interesting because they compile for PVC (native bfloat support) and run on CPU (fallback support).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out

#include <sycl/sycl.hpp>

using namespace sycl;
using sycl::ext::oneapi::bfloat16;

float foo(float a, float b) {
// Convert from float to bfloat16.
bfloat16 A{a};
bfloat16 B{b};

// Convert A and B from bfloat16 to float, do addition on floating-point
// numbers, then convert the result to bfloat16 and store it in C.
bfloat16 C = A + B;

// Return the result converted from bfloat16 to float.
return C;
}

int main(int argc, char *argv[]) {
float data[3] = {7.0f, 8.1f, 0.0f};

float result_host = foo(7.0f, 8.1f);
std::cout << "CPU Result = " << result_host << std::endl;
if (std::abs(15.1f - result_host) > 0.1f) {
std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
return 1;
}

queue deviceQueue;
buffer<float, 1> buf{data, 3};

deviceQueue.submit([&](handler &cgh) {
accessor numbers{buf, cgh, read_write};
cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
});

host_accessor hostOutAcc{buf, read_only};
float result_device = hostOutAcc[2];
std::cout << "GPU Result = " << result_device << std::endl;
if (std::abs(result_host - result_device) > 0.1f) {
std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
return 1;
}

return 0;
}
15 changes: 8 additions & 7 deletions SYCL/BFloat16/bfloat16_type.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
// UNSUPPORTED: cuda || hip
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// TODO currently the feature isn't supported on most of the devices
// need to enable the test when the aspect and device_if feature are
// introduced
// RUNx: %CPU_RUN_PLACEHOLDER %t.out
// RUNx: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: hip
// RUN: %if cuda %{%clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out %}
// TODO enable the below when CI supports >=sm_80
// RUNx: %if cuda %{%GPU_RUN_PLACEHOLDER %t.out %}
// RUN: %clangxx -fsycl %s -o %t.out
// TODO currently the feature isn't supported on FPGA.
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

//==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==//
Expand Down
96 changes: 53 additions & 43 deletions SYCL/BFloat16/bfloat16_type.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include <iostream>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
#include <sycl/ext/oneapi/bfloat16.hpp>
#include <sycl/sycl.hpp>

#include <cmath>
Expand All @@ -11,8 +11,7 @@ constexpr size_t N = 100;
template <typename T> void assert_close(const T &C, const float ref) {
for (size_t i = 0; i < N; i++) {
auto diff = C[i] - ref;
assert(std::fabs(static_cast<float>(diff)) <
std::numeric_limits<float>::epsilon());
assert(std::fabs(static_cast<float>(diff)) < 0.1);
}
}

Expand All @@ -21,7 +20,7 @@ void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
q.submit([&](handler &cgh) {
auto A = a.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class calc_conv>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 AVal{A[index]};
A[index] = AVal;
});
});
Expand All @@ -34,9 +33,8 @@ void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
q.submit([&](handler &cgh) {
auto A = a.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class calc_conv_impl>(r, [=](id<1> index) {
uint16_t AVal =
sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
A[index] = sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
sycl::ext::oneapi::bfloat16 AVal = A[index];
A[index] = float(AVal);
});
});

Expand All @@ -52,9 +50,9 @@ void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
auto B = b.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class calc_add_expl>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 BVal{B[index]};
sycl::ext::oneapi::bfloat16 CVal = AVal + BVal;
C[index] = CVal;
});
});
Expand All @@ -71,9 +69,9 @@ void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
auto B = b.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class calc_sub>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 BVal{B[index]};
sycl::ext::oneapi::bfloat16 CVal = AVal - BVal;
C[index] = CVal;
});
});
Expand All @@ -88,8 +86,8 @@ void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
auto A = a.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class calc_minus>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 CVal = -AVal;
C[index] = CVal;
});
});
Expand All @@ -106,9 +104,9 @@ void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
auto B = b.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class calc_mul>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 BVal{B[index]};
sycl::ext::oneapi::bfloat16 CVal = AVal * BVal;
C[index] = CVal;
});
});
Expand All @@ -125,9 +123,9 @@ void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
auto B = b.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class calc_div>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 BVal{B[index]};
sycl::ext::oneapi::bfloat16 CVal = AVal / BVal;
C[index] = CVal;
});
});
Expand All @@ -144,19 +142,18 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
auto B = b.get_access<access::mode::read>(cgh);
auto C = c.get_access<access::mode::write>(cgh);
cgh.parallel_for<class logic>(r, [=](id<1> index) {
sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
sycl::ext::oneapi::bfloat16 AVal{A[index]};
sycl::ext::oneapi::bfloat16 BVal{B[index]};
if (AVal) {
if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
!BVal) {
sycl::ext::oneapi::experimental::bfloat16 CVal =
AVal != BVal ? AVal : BVal;
sycl::ext::oneapi::bfloat16 CVal = AVal != BVal ? AVal : BVal;
CVal--;
CVal++;
if (AVal == BVal) {
CVal -= AVal;
CVal *= 3.0;
CVal /= 2.0;
CVal *= 3.0f;
CVal /= 2.0f;
} else
CVal += BVal;
C[index] = CVal;
Expand All @@ -179,9 +176,9 @@ int run_tests() {
return 0;
}

std::vector<float> vec_a(N, 5.0);
std::vector<float> vec_b(N, 2.0);
std::vector<float> vec_b_neg(N, -2.0);
std::vector<float> vec_a(N, 5.0f);
std::vector<float> vec_b(N, 2.0f);
std::vector<float> vec_b_neg(N, -2.0f);

range<1> r(N);
buffer<float, 1> a{vec_a.data(), r};
Expand All @@ -190,19 +187,32 @@ int run_tests() {

queue q{dev};

verify_conv_implicit(q, a, r, 5.0);
verify_conv_explicit(q, a, r, 5.0);
verify_add(q, a, b, r, 7.0);
verify_sub(q, a, b, r, 3.0);
verify_mul(q, a, b, r, 10.0);
verify_div(q, a, b, r, 2.5);
verify_logic(q, a, b, r, 7.0);
verify_add(q, a, b_neg, r, 3.0);
verify_sub(q, a, b_neg, r, 7.0);
verify_minus(q, a, r, -5.0);
verify_mul(q, a, b_neg, r, -10.0);
verify_div(q, a, b_neg, r, -2.5);
verify_logic(q, a, b_neg, r, 3.0);
verify_conv_implicit(q, a, r, 5.0f);
std::cout << "PASS verify_conv_implicit\n";
verify_conv_explicit(q, a, r, 5.0f);
std::cout << "PASS verify_conv_explicit\n";
verify_add(q, a, b, r, 7.0f);
std::cout << "PASS verify_add\n";
verify_sub(q, a, b, r, 3.0f);
std::cout << "PASS verify_sub\n";
verify_mul(q, a, b, r, 10.0f);
std::cout << "PASS verify_mul\n";
verify_div(q, a, b, r, 2.5f);
std::cout << "PASS verify_div\n";
verify_logic(q, a, b, r, 7.0f);
std::cout << "PASS verify_logic\n";
verify_add(q, a, b_neg, r, 3.0f);
std::cout << "PASS verify_add\n";
verify_sub(q, a, b_neg, r, 7.0f);
std::cout << "PASS verify_sub\n";
verify_minus(q, a, r, -5.0f);
std::cout << "PASS verify_minus\n";
verify_mul(q, a, b_neg, r, -10.0f);
std::cout << "PASS verify_mul\n";
verify_div(q, a, b_neg, r, -2.5f);
std::cout << "PASS verify_div\n";
verify_logic(q, a, b_neg, r, 3.0f);
std::cout << "PASS verify_logic\n";

return 0;
}
11 changes: 1 addition & 10 deletions SYCL/BFloat16/bfloat16_type_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,4 @@

#include "bfloat16_type.hpp"

int main() {
bool has_bfloat16_aspect = false;
for (const auto &plt : sycl::platform::get_platforms()) {
if (plt.has(aspect::ext_oneapi_bfloat16))
has_bfloat16_aspect = true;
}

if (has_bfloat16_aspect)
return run_tests();
}
// Entry point: run the shared bfloat16 type tests unconditionally
// (the bfloat16 aspect check was removed — presumably the aspect no
// longer exists now that bfloat16 left experimental; confirm upstream).
int main() { return run_tests(); }
Loading