This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit f2bec11

Merge remote-tracking branch 'upstream/intel' into filter

2 parents: 6ec4410 + d2abf74

38 files changed: 1,731 additions, 152 deletions

.github/CODEOWNERS (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
-* @vladimirlaz
+* @vladimirlaz @romanovvlad @bader
 
 # AOT compilation
 SYCL/AOT @AGindinson @dm-vodopyanov @AlexeySachkov @romanovvlad

SYCL/AOT/gpu.cpp (0 additions, 3 deletions)

@@ -10,8 +10,5 @@
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 //
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-//
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
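Note on the deleted directives (the same deletion recurs in several files below): in LLVM LIT test headers, XFAIL: <feature> still runs the test on matching configurations but inverts the expectation, so a pass is reported as an unexpected pass (XPASS), while UNSUPPORTED: <feature> skips the test entirely. Deleting an XFAIL: windows line therefore restores normal pass/fail reporting on Windows once the underlying driver bug is fixed. A hypothetical header (an illustration, not a file from this commit) showing how the directives combine:

// A hypothetical LIT test header; %clangxx and %GPU_RUN_PLACEHOLDER are
// substitutions that LIT expands when the test runs.
// UNSUPPORTED: cuda
// XFAIL: windows
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED skips the test outright on 'cuda' configurations; XFAIL runs
// it on 'windows' but treats failure as the expected outcome.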

SYCL/AOT/multiple-devices.cpp (0 additions, 3 deletions)

@@ -10,9 +10,6 @@
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-
 // 1-command compilation case
 // Targeting CPU, GPU, FPGA
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice,spir64_gen-unknown-unknown-sycldevice,spir64_fpga-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/aot.cpp -o %t_all.out

SYCL/Basic/multisource.cpp (0 additions, 2 deletions)

@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: cuda
-
 // Separate kernel sources and host code sources
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.kernel.o %s -DINIT_KERNEL -DCALC_KERNEL
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -c -o %t.main.o %s -DMAIN_APP

SYCL/Basic/reqd_work_group_size.cpp (0 additions, 4 deletions)

@@ -1,7 +1,3 @@
-// XFAIL: cuda
-// The negative test fails on CUDA. It's not clear whether the CUDA backend
-// respects the reqd_work_group_size attribute.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
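For context, the "negative test" the deleted comment referred to checks that launching a kernel with a local size that contradicts its required work-group size is rejected by the runtime. A minimal sketch of that pattern follows; the attribute spelling and the exception type are assumptions based on DPC++ conventions of this period, not text from the file:

#include <CL/sycl.hpp>
#include <iostream>

int main() {
  cl::sycl::queue Q;
  try {
    Q.submit([&](cl::sycl::handler &Cgh) {
      // The kernel requires a work-group size of 4 but is launched with 8.
      Cgh.parallel_for<class ReqdWGNeg>(
          cl::sycl::nd_range<1>{cl::sycl::range<1>{16}, cl::sycl::range<1>{8}},
          [=](cl::sycl::nd_item<1>) [[cl::reqd_work_group_size(1, 1, 4)]] {});
    });
    Q.wait_and_throw();
    std::cout << "No exception: the backend may ignore the attribute\n";
  } catch (cl::sycl::nd_range_error const &E) {
    // Expected: the launch configuration contradicts the required size.
    std::cout << "Expected mismatch error: " << E.what() << '\n';
  }
  return 0;
}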

SYCL/Basic/subsubdevice.cpp (new file, 171 additions, 0 deletions)

@@ -0,0 +1,171 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==------------ subdevice.cpp - SYCL subdevice basic test -----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <utility>
+
+using namespace cl::sycl;
+
+int main() {
+  try {
+    auto devices = device::get_devices();
+    for (const auto &dev : devices) {
+      // TODO: implement subdevices creation for host device
+      if (dev.is_host())
+        continue;
+
+      assert(dev.get_info<info::device::partition_type_property>() ==
+             info::partition_property::no_partition);
+
+      size_t MaxSubDevices =
+          dev.get_info<info::device::partition_max_sub_devices>();
+
+      if (MaxSubDevices == 0)
+        continue;
+
+      try {
+        auto SubDevicesEq =
+            dev.create_sub_devices<info::partition_property::partition_equally>(
+                1);
+        assert(SubDevicesEq.size() == MaxSubDevices &&
+               "Requested 1 compute unit in each subdevice, expected maximum "
+               "number of subdevices in output");
+        std::cout << "Created " << SubDevicesEq.size()
+                  << " subdevices using equal partition scheme" << std::endl;
+
+        assert(
+            SubDevicesEq[0].get_info<info::device::partition_type_property>() ==
+            info::partition_property::partition_equally);
+
+        assert(SubDevicesEq[0].get_info<info::device::parent_device>().get() ==
+               dev.get());
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        vector_class<size_t> Counts(MaxSubDevices, 1);
+        auto SubDevicesByCount = dev.create_sub_devices<
+            info::partition_property::partition_by_counts>(Counts);
+        assert(SubDevicesByCount.size() == MaxSubDevices &&
+               "Maximum number of subdevices was requested with 1 compute unit "
+               "on each");
+        std::cout << "Created " << SubDevicesByCount.size()
+                  << " subdevices using partition by counts scheme."
+                  << std::endl;
+        assert(SubDevicesByCount[0]
+                   .get_info<info::device::partition_type_property>() ==
+               info::partition_property::partition_by_counts);
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainNuma = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::numa);
+        std::cout
+            << "Created " << SubDevicesDomainNuma.size()
+            << " subdevices using partition by numa affinity domain scheme."
+            << std::endl;
+
+        auto SubSubDevicesDomainNuma =
+            SubDevicesDomainNuma[0]
+                .create_sub_devices<
+                    info::partition_property::partition_by_affinity_domain>(
+                    info::partition_affinity_domain::numa);
+
+        std::cout << "Created " << SubSubDevicesDomainNuma.size()
+                  << " sub-subdevices from subdevice 0 using partition by numa "
+                     "affinity domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL4 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L4_cache);
+        std::cout << "Created " << SubDevicesDomainL4.size()
+                  << " subdevices using partition by L4 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL3 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L3_cache);
+        std::cout << "Created " << SubDevicesDomainL3.size()
+                  << " subdevices using partition by L3 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL2 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L2_cache);
+        std::cout << "Created " << SubDevicesDomainL2.size()
+                  << " subdevices using partition by L2 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainL1 = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::L1_cache);
+        std::cout << "Created " << SubDevicesDomainL1.size()
+                  << " subdevices using partition by L1 cache domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+
+      try {
+        auto SubDevicesDomainNextPart = dev.create_sub_devices<
+            info::partition_property::partition_by_affinity_domain>(
+            info::partition_affinity_domain::next_partitionable);
+        std::cout << "Created " << SubDevicesDomainNextPart.size()
+                  << " subdevices using partition by next partitionable "
+                     "domain scheme."
+                  << std::endl;
+
+        auto SubSubDevicesDomainNextPart =
+            SubDevicesDomainNextPart[0]
+                .create_sub_devices<
+                    info::partition_property::partition_by_affinity_domain>(
+                    info::partition_affinity_domain::next_partitionable);
+        std::cout << "Created " << SubSubDevicesDomainNextPart.size()
+                  << " sub-subdevices from subdevice 0 using partition by next "
+                     "partitionable domain scheme."
+                  << std::endl;
+      } catch (feature_not_supported) {
+        // okay skip it
+      }
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what() << std::endl;
+    return 1;
+  }
+  return 0;
+}
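The new test only enumerates partitions and re-partitions (sub-sub-devices); the typical next step, not shown in the test, is to bind work to one of them. A minimal sketch under the same SYCL 1.2.1-era API, with everything beyond create_sub_devices assumed:

#include <CL/sycl.hpp>
using namespace cl::sycl;

int main() {
  device Root{default_selector{}};
  try {
    auto Parts = Root.create_sub_devices<
        info::partition_property::partition_by_affinity_domain>(
        info::partition_affinity_domain::next_partitionable);
    // A queue built on a sub-device confines submitted work to that partition.
    queue Q{Parts[0]};
    Q.submit([](handler &Cgh) { Cgh.single_task<class Touch>([] {}); }).wait();
  } catch (feature_not_supported const &) {
    // The device cannot be partitioned this way; nothing to demonstrate.
  }
  return 0;
}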

SYCL/DeviceCodeSplit/aot-gpu.cpp (0 additions, 3 deletions)

@@ -2,9 +2,6 @@
 // UNSUPPORTED: cuda
 // CUDA does neither support device code splitting nor SPIR.
 //
-// The test is failing with GPU RT 30.0.100.9667
-// XFAIL: windows
-//
 // RUN: %clangxx -fsycl -fsycl-device-code-split=per_source \
 // RUN:   -fsycl-targets=spir64_gen-unknown-unknown-sycldevice \
 // RUN:   -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice \

SYCL/ESIMD/PrefixSum.cpp (6 additions, 4 deletions)

@@ -184,7 +184,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
   simd<ushort, 32> p = elm32 < remaining;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
+                                                         p);
 
   auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
   cnt_table.column(0) += prev;
@@ -214,7 +215,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
     cnt_table.select<1, 1, 16, 1>(j, 16) +=
         cnt_table.replicate<1, 0, 16, 0>(j, 15);
   }
-  scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
+  scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
+                                                      p);
   elm32 += 32;
   element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
   prev = cnt_table.column(31);
@@ -252,7 +254,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
   unsigned n_iter = n_entries / 32;
   for (unsigned i = 0; i < n_iter; i++) {
 
-    S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+    S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
 
     auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
     cnt_table.column(0) += prev;
@@ -288,7 +290,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
     if (i == n_iter - 1)
      cnt_table.column(31) -= cnt_table.column(30);
 
-    scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
+    scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset);
 
    element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
    prev = cnt_table.column(31);
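This diff and the two Prefix_Local_sum diffs below are one mechanical rename: the experimental ESIMD intrinsics gather4/scatter4 became gather_rgba/scatter_rgba (the "4" stood for the up-to-four RGBA channels read or written per address). The call shape is unchanged, as the fragment below, distilled from the updated tests, shows. It assumes the tests' own surrounding definitions (simd, ushort, TUPLE_SZ, GATHER_SCATTER_MASK, elm32, remaining) are in scope:

// Inside an ESIMD kernel, per the updated tests:
simd<unsigned int, 32 * TUPLE_SZ> S;    // 32 lanes x TUPLE_SZ channels
simd<ushort, 32> p = elm32 < remaining; // per-lane predicate
// Predicated channel-wise gather; lanes with p == 0 are not accessed.
S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
// ... update S ...
// The scatter mirrors the gather, writing back only the active lanes.
scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);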

SYCL/ESIMD/Prefix_Local_sum2.cpp (2 additions, 2 deletions)

@@ -73,13 +73,13 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos,
 
   simd<unsigned int, 32 * TUPLE_SZ> S, T;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
 
 #pragma unroll
   for (int i = 1; i < PREFIX_ENTRIES / 32; i++) {
     element_offset += (stride_elems * 32 * TUPLE_SZ) * sizeof(unsigned);
     // scattered read, each inst reads 16 entries
-    T = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
+    T = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset);
     S += T;
   }
 

SYCL/ESIMD/Prefix_Local_sum3.cpp (4 additions, 2 deletions)

@@ -197,7 +197,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 
   simd<ushort, 32> p = elm32 < remaining;
 
-  S = gather4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset, p);
+  S = gather_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, element_offset,
+                                                         p);
 
   auto cnt_table = S.bit_cast_view<unsigned int, TUPLE_SZ, 32>();
   cnt_table.column(0) += prev;
@@ -226,7 +227,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
     cnt_table.select<1, 1, 16, 1>(j, 16) +=
         cnt_table.replicate<1, 0, 16, 0>(j, 15);
   }
-  scatter4<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset, p);
+  scatter_rgba<unsigned int, 32, GATHER_SCATTER_MASK>(buf, S, element_offset,
+                                                      p);
   elm32 += 32;
   element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32;
   prev = cnt_table.column(31);

SYCL/ESIMD/accessor_gather_scatter.cpp (1 addition, 0 deletions)

@@ -67,6 +67,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
       Kernel<T, VL, STRIDE> kernel(acc);
       cgh.parallel_for(glob_range, kernel);
    });
+    e.wait();
  } catch (cl::sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    delete[] A;
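The single added line is most likely a synchronization fix: queue::submit returns immediately, so waiting on the returned event makes the kernel's completion (and any resulting error) observable while execution is still inside the try block, where the existing catch clause can run its cleanup (delete[] A). A self-contained sketch of the pattern, with all names assumed rather than taken from the test:

#include <CL/sycl.hpp>
#include <iostream>

bool run(cl::sycl::queue &q) {
  int *A = new int[16]();
  try {
    cl::sycl::buffer<int, 1> buf(A, cl::sycl::range<1>(16));
    cl::sycl::event e = q.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class Inc>(cl::sycl::range<1>(16),
                                  [=](cl::sycl::id<1> i) { acc[i] += 1; });
    });
    e.wait(); // kernel completion observed while still inside the try block
  } catch (cl::sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    delete[] A; // the cleanup path the test relies on
    return false;
  }
  delete[] A;
  return true;
}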

SYCL/ESIMD/ext_math.cpp (31 additions, 13 deletions)

@@ -9,13 +9,16 @@
 // UNSUPPORTED: cuda
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// Enable when driver fixes will be propagated into the official release
+// XFAIL: windows
 
 // This test checks extended math operations.
 
 #include "esimd_test_utils.hpp"
 
 #include <CL/sycl.hpp>
 #include <CL/sycl/INTEL/esimd.hpp>
+#include <CL/sycl/builtins_esimd.hpp>
 #include <iostream>
 
 using namespace cl::sycl;
@@ -35,7 +38,16 @@ struct InitDataFuncWide {
 struct InitDataFuncNarrow {
   void operator()(float *In, float *Out, size_t Size) const {
     for (auto I = 0; I < Size; ++I) {
-      In[I] = 2.0f + 16.0f * ((float)I / (float)(Size - 1)); // in [2..16] range
+      In[I] = 2.0f + 16.0f * ((float)I / (float)(Size - 1)); // in [2..18] range
+      Out[I] = (float)0.0;
+    }
+  }
+};
+
+struct InitDataInRange0_5 {
+  void operator()(float *In, float *Out, size_t Size) const {
+    for (auto I = 0; I < Size; ++I) {
+      In[I] = 5.0f * ((float)I / (float)(Size - 1)); // in [0..5] range
       Out[I] = (float)0.0;
     }
   }
@@ -52,7 +64,7 @@ template <MathOp Op> float HostMathFunc(float X);
 
 // --- Specializations per each extended math operation
 
-#define DEFINE_OP(Op, HostOp)                                                  \
+#define DEFINE_ESIMD_OP(Op, HostOp)                                            \
   template <> float HostMathFunc<MathOp::Op>(float X) { return HostOp(X); }    \
   template <int VL> struct DeviceMathFunc<VL, MathOp::Op> {                    \
     simd<float, VL>                                                            \
@@ -61,13 +73,22 @@ template <MathOp Op> float HostMathFunc(float X);
     }                                                                          \
   }
 
-DEFINE_OP(sin, sin);
-DEFINE_OP(cos, cos);
-DEFINE_OP(exp, exp);
-DEFINE_OP(log, log);
-DEFINE_OP(inv, 1.0f /);
-DEFINE_OP(sqrt, sqrt);
-DEFINE_OP(rsqrt, 1.0f / sqrt);
+#define DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(Op, HostOp)                         \
+  template <> float HostMathFunc<MathOp::Op>(float X) { return HostOp(X); }    \
+  template <int VL> struct DeviceMathFunc<VL, MathOp::Op> {                    \
+    simd<float, VL>                                                            \
+    operator()(const simd<float, VL> &X) const SYCL_ESIMD_FUNCTION {           \
+      return sycl::Op<VL>(X);                                                  \
+    }                                                                          \
+  }
+
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(sin, sin);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(cos, cos);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(exp, exp);
+DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(log, log);
+DEFINE_ESIMD_OP(inv, 1.0f /);
+DEFINE_ESIMD_OP(sqrt, sqrt);
+DEFINE_ESIMD_OP(rsqrt, 1.0f / sqrt);
 
 // --- Generic kernel calculating an extended math operation on array elements
 
@@ -159,13 +180,10 @@ template <int VL> bool test(queue &Q) {
   Pass &= test<MathOp::sqrt, VL>(Q, "sqrt", InitDataFuncWide{});
   Pass &= test<MathOp::inv, VL>(Q, "inv");
   Pass &= test<MathOp::rsqrt, VL>(Q, "rsqrt");
-  // TODO enable these tests after the implementation is fixed
-#if ENABLE_SIN_COS_EXP_LOG
   Pass &= test<MathOp::sin, VL>(Q, "sin", InitDataFuncWide{});
   Pass &= test<MathOp::cos, VL>(Q, "cos", InitDataFuncWide{});
-  Pass &= test<MathOp::exp, VL>(Q, "exp");
+  Pass &= test<MathOp::exp, VL>(Q, "exp", InitDataInRange0_5{});
   Pass &= test<MathOp::log, VL>(Q, "log", InitDataFuncWide{});
-#endif
   return Pass;
 }
 
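The net effect of the macro split: sin, cos, exp and log now route through sycl:: math overloads that accept a simd<float, VL> argument directly (declared in the newly included <CL/sycl/builtins_esimd.hpp>), while inv, sqrt and rsqrt keep the original ESIMD path, and the four re-enabled operations leave the old #if ENABLE_SIN_COS_EXP_LOG guard. Expanding the new macro by hand for sin, using only tokens present in the diff (a reconstruction, not text from the file):

// DEFINE_SIMD_OVERLOADED_STD_SYCL_OP(sin, sin); expands to:
template <> float HostMathFunc<MathOp::sin>(float X) { return sin(X); }
template <int VL> struct DeviceMathFunc<VL, MathOp::sin> {
  simd<float, VL>
  operator()(const simd<float, VL> &X) const SYCL_ESIMD_FUNCTION {
    return sycl::sin<VL>(X); // simd overload from <CL/sycl/builtins_esimd.hpp>
  }
};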
