intel · vladimirlaz · Dec 29, 2020 · Dec 20, 2020 · Dec 20, 2020 · kbobrovs
@@ -598,8 +598,11 @@ int main(int argc, char *argv[]) {
   int size = 1 << LOG2_ELEMENTS;
   cout << "BitonicSort (" << size << ") Start..." << std::endl;
 
+  cl::sycl::property_list props{property::queue::enable_profiling{},
+                                property::queue::in_order()};
+
   queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
-          property::queue::enable_profiling{});
+          props);
 
   BitonicSort bitonicSort;
 

@@ -0,0 +1,94 @@
+//==------------------ dp4a.cpp  - DPC++ ESIMD on-device test --------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// TODO enable on Windows
+// REQUIRES: linux && gpu
+// RUN: %clangxx-esimd -fsycl %s -o %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
+// TODO : Enable test for new GPU device
+// XFAIL: *
+
+#include "esimd_test_utils.hpp"
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/INTEL/esimd.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+int main(void) {
+  constexpr unsigned SIZE = 16;
+  constexpr unsigned GROUPSIZE = 1;
+  using DTYPE = unsigned int;
+
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  auto ctxt = q.get_context();
+
+  DTYPE *S0 =
+      static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
+  DTYPE *S1 =
+      static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
+  DTYPE *S2 =
+      static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
+
+  DTYPE *RES =
+      static_cast<DTYPE *>(malloc_shared(SIZE * sizeof(DTYPE), dev, ctxt));
+
+  for (unsigned i = 0; i < SIZE; ++i) {
+    S0[i] = 0x32;
+    S1[i] = 0x0102037F;
+    S2[i] = 0x0102037F;
+    RES[i] = 0;
+  }
+
+  cl::sycl::range<1> GroupRange{1};
+
+  cl::sycl::range<1> TaskRange{GROUPSIZE};
+  cl::sycl::nd_range<1> Range(GroupRange, TaskRange);
+
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(
+          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
+
+            simd<DTYPE, SIZE> src0(0);
+            src0 = block_load<DTYPE, SIZE>(S0);
+
+            simd<DTYPE, SIZE> src1(0);
+            src1 = block_load<DTYPE, SIZE>(S1);
+
+            simd<DTYPE, SIZE> src2(0);
+            src2 = block_load<DTYPE, SIZE>(S2);
+
+            auto res =
+                esimd_dp4a<DTYPE, DTYPE, DTYPE, DTYPE, SIZE>(src0, src1, src2);
+            block_store<DTYPE, SIZE>(RES, res);
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
+
+  int err_cnt = 0;
+  for (unsigned i = 0; i < SIZE; ++i) {
+    if (RES[i] != 0x3F41) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << RES[i]
+                  << " != " << 0x3F41 << "\n";
+      }
+    }
+  }
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
@@ -547,8 +547,12 @@ int main(int argc, char *argv[]) {
     std::cerr << "Usage: kmeans.exe input_file" << std::endl;
     exit(1);
   }
+
+  cl::sycl::property_list props{property::queue::enable_profiling{},
+                                property::queue::in_order()};
   queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
-          property::queue::enable_profiling{});
+          props);
+
   auto dev = q.get_device();
   auto ctxt = q.get_context();
 
@@ -661,7 +665,7 @@ int main(int argc, char *argv[]) {
                                           cmk_accum_reduction((uint *)accum, i);
                                         });
     });
-    e.wait();
+    e1.wait();
     kernel2_time_in_ns += report_time("kernel2", e1);
 #endif
 

@@ -0,0 +1,166 @@
+//==----------- slm_split_barrier.cpp  - DPC++ ESIMD on-device test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// TODO enable on Windows
+// REQUIRES: linux && gpu
+// RUN: %clangxx-esimd -fsycl %s -o %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
+
+#include "esimd_test_utils.hpp"
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/INTEL/esimd.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+using namespace sycl::INTEL::gpu;
+
+#define LOCAL_SIZE 4
+#define GLOBAL_SIZE 6
+#define NUM_THREADS LOCAL_SIZE *GLOBAL_SIZE
+
+/// \brief transfer data from memory to SLM.
+///
+/// Load ::size bytes from memory pointer ::addr starting at ::offset to the
+/// SLM ::slmOffset. ::size must be a multiple of 256.
+///
+ESIMD_INLINE
+void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
+                 uint offset, uint size) {
+  simd<uint, 16> vOffset(0, 16);
+
+  uint numTotalBlocks = size / 256;
+  uint numBlocks = numTotalBlocks / grpSize;
+  uint numLeftOver = numTotalBlocks % grpSize;
+  numBlocks += (localId < numLeftOver) ? 1 : 0;
+
+  uint threadOffsetInSLM = slmOffset + localId * 256;
+  // in bytes
+  uint threadOffsetInMemory = offset + threadOffsetInSLM;
+  // in unit of bytes
+  simd<uint, 16> vOffsets = vOffset + threadOffsetInSLM;
+
+  for (uint block = 0; block < numBlocks; block++) {
+    simd<uint, 32> row0; // 32 floats or 128 Bytes or 4 GRF-registers
+    simd<uint, 32> row1;
+    simd<uint, 64> rowTrans;
+    row0 = block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory));
+    row1 =
+        block_load<uint, 32>((const uint *)(addr + threadOffsetInMemory + 128));
+
+    // Transpose
+    rowTrans.select<8, 1>(0) = row0.select<8, 4>(0);
+    rowTrans.select<8, 1>(16) = row0.select<8, 4>(1);
+    rowTrans.select<8, 1>(32) = row0.select<8, 4>(2);
+    rowTrans.select<8, 1>(48) = row0.select<8, 4>(3);
+
+    rowTrans.select<8, 1>(8) = row1.select<8, 4>(0);
+    rowTrans.select<8, 1>(24) = row1.select<8, 4>(1);
+    rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
+    rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
+
+    slm_store4<uint, 16, ESIMD_ABGR_ENABLE>(rowTrans, vOffsets);
+    threadOffsetInMemory += grpSize * 256;
+    vOffsets += (grpSize * 256);
+  }
+
+  esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
+  esimd_sbarrier(ESIMD_SBARRIER_SIGNAL);
+  esimd_sbarrier(ESIMD_SBARRIER_WAIT);
+}
+
+int main(void) {
+  constexpr unsigned VL = 16;
+  constexpr unsigned Size = NUM_THREADS * VL;
+
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+  auto ctxt = q.get_context();
+  // TODO: release memory in the end of the test
+  uint *A = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
+  uint *B = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
+
+  // Checking with specific inputs
+  for (int i = 0; i < NUM_THREADS; i++) {
+    uint *A_int = (uint *)(A + i * VL);
+    for (int j = 0; j < VL; j++) {
+      A_int[j] = i + j;
+      std::cout << A_int[j] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  // We need that many workitems
+  cl::sycl::range<1> GlobalRange{GLOBAL_SIZE};
+
+  // Number of workitems in a workgroup
+  cl::sycl::range<1> LocalRange{LOCAL_SIZE};
+  cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(
+          Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            simd<uint, VL> v_slmData;
+            simd<uint, VL> v_Off(0, 4);
+
+            uint localID = ndi.get_local_id(0);
+            uint groupSize = ndi.get_local_range(0);
+            uint globalID = ndi.get_global_id(0);
+            uint groupID = ndi.get_group(0);
+
+            slm_init(1024);
+
+            int grpMemOffset = groupID * groupSize * VL * 4;
+
+            load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset,
+                        groupSize * VL * 4);
+
+            auto shiftID = (localID + 1) % 4;
+
+            v_Off = v_Off + shiftID * 64;
+
+            v_slmData = slm_load<uint, VL>(v_Off);
+
+            block_store<uint, VL>(B + globalID * VL, v_slmData);
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
+
+  std::cout << "result" << std::endl;
+  int result = 0;
+  for (int i = 0; i < NUM_THREADS; i++) {
+    unsigned int *p = (unsigned int *)(B + i * VL);
+    if ((i % 4) != 3) {
+      for (int j = 0; j < VL; j++) {
+        std::cout << (*p) << " ";
+        if (*p != (i + 1 + j)) {
+          result = -1;
+        }
+        p++;
+      }
+    } else {
+      for (int j = 0; j < VL; j++) {
+        std::cout << (*p) << " ";
+        if (*p != (i - 3 + j)) {
+          result = -1;
+        }
+        p++;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+  std::cout << (result < 0 ? "FAILED\n" : "Passed\n");
+  return result < 0 ? 1 : 0;
+}