intel
diff --git a/‎sycl/test/esimd/on-device/BitonicSortK.cpp
Lines changed: 718 additions & 0 deletions b/‎sycl/test/esimd/on-device/BitonicSortK.cpp
Lines changed: 718 additions & 0 deletions
diff --git a/‎sycl/test/esimd/on-device/BitonicSortKv2.cpp
Lines changed: 633 additions & 0 deletions b/‎sycl/test/esimd/on-device/BitonicSortKv2.cpp
Lines changed: 633 additions & 0 deletions
diff --git a/‎sycl/test/esimd/on-device/SparseMatrixMul/Protein_csr.dat
33.3 MB b/‎sycl/test/esimd/on-device/SparseMatrixMul/Protein_csr.dat
33.3 MB
diff --git a/‎sycl/test/esimd/on-device/accessor.cpp
Lines changed: 122 additions & 0 deletions b/‎sycl/test/esimd/on-device/accessor.cpp
Lines changed: 122 additions & 0 deletions
diff --git a/‎sycl/test/esimd/on-device/histogram.cpp
Lines changed: 248 additions & 0 deletions b/‎sycl/test/esimd/on-device/histogram.cpp
Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,122 @@
+// TODO: enable on Windows
+// REQUIRES: linux
+// REQUIRES: gpu
+// RUN: %clangxx-esimd -fsycl -D_CRT_SECURE_NO_WARNINGS=1 %s -o %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
+
+// This test checks that accessor-based memory accesses work correctly in ESIMD.
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+class ESIMDSelector : public device_selector {
+  // Scores a device for queue construction. The SYCL_DEVICE_TYPE environment
+  // variable may request "GPU" or "HOST"; when it is unset only GPU devices
+  // are accepted. Any other value is reported on stderr and rejects every
+  // device.
+  virtual int operator()(const device &device) const {
+    constexpr int Accept = 1000;
+    constexpr int Reject = -1;
+    const char *requested = getenv("SYCL_DEVICE_TYPE");
+
+    // No override in the environment behaves exactly like an explicit "GPU".
+    if (requested == nullptr || strcmp(requested, "GPU") == 0)
+      return device.is_gpu() ? Accept : Reject;
+
+    if (strcmp(requested, "HOST") == 0)
+      return device.is_host() ? Accept : Reject;
+
+    std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
+                 "'HOST', '"
+              << requested << "' is not.\n";
+    return Reject;
+  }
+};
+
+auto exception_handler = [](exception_list l) {
+  for (auto ep : l) {
+    try {
+      std::rethrow_exception(ep);
+    } catch (cl::sycl::exception &e0) {
+      std::cout << "sycl::exception: " << e0.what() << std::endl;
+    } catch (std::exception &e) {
+      std::cout << "std::exception: " << e.what() << std::endl;
+    } catch (...) {
+      std::cout << "generic exception\n";
+    }
+  }
+};
+
+constexpr unsigned int VL = 1024 * 128;
+
+using Ty = float;
+
+// Test driver: fills data0 with its own indices, runs one ESIMD work-item that
+// adds VAL to every element of data0 in place and stores (result + 1) into
+// data1, then verifies both arrays on the host.
+// Returns 0 on success, 1 on a data mismatch, 2 if a SYCL exception is caught.
+int main() {
+  Ty data0[VL] = {0};
+  Ty data1[VL] = {0};
+  constexpr Ty VAL = 5;
+
+  // Seed the input so that element idx holds idx.
+  for (int idx = 0; idx < VL; ++idx)
+    data0[idx] = idx;
+
+  try {
+    queue q(ESIMDSelector{}, exception_handler);
+
+    buffer<Ty, 1> buf0(data0, range<1>(VL));
+    buffer<Ty, 1> buf1(data1, range<1>(VL));
+
+    q.submit([&](handler &cgh) {
+      std::cout << "Running on "
+                << q.get_device().get_info<cl::sycl::info::device::name>()
+                << "\n";
+
+      auto acc0 = buf0.get_access<access::mode::read_write>(cgh);
+      auto acc1 = buf1.get_access<access::mode::write>(cgh);
+
+      // One work-item walks the whole array, 16 elements per iteration.
+      cgh.parallel_for<class Test>(
+          range<1>(1), [=](sycl::id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::intel::gpu;
+            unsigned int offset = 0;
+            for (int chunk = 0; chunk < VL / 16; chunk++) {
+              simd<Ty, 16> var = block_load<Ty, 16>(acc0, offset);
+              var += VAL;
+              block_store(acc0, offset, var);
+              block_store(acc1, offset, var + 1);
+              // Advance by 64 per chunk of 16 elements (Ty is float).
+              offset += 64;
+            }
+          });
+    });
+
+    q.wait();
+
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return 2;
+  }
+
+  // Host-side verification; only the first few mismatches are printed, but
+  // all of them are counted.
+  int err_cnt = 0;
+
+  for (int idx = 0; idx < VL; ++idx) {
+    const Ty gold0 = idx + VAL;
+    const Ty gold1 = gold0 + 1;
+    const Ty val0 = data0[idx];
+    const Ty val1 = data1[idx];
+
+    if (val0 != gold0 && ++err_cnt < 10)
+      std::cerr << "*** ERROR at data0[" << idx << "]: " << val0
+                << " != " << gold0 << "(gold)\n";
+
+    if (val1 != gold1 && ++err_cnt < 10)
+      std::cerr << "*** ERROR at data1[" << idx << "]: " << val1
+                << " != " << gold1 << "(gold)\n";
+  }
+
+  if (err_cnt != 0) {
+    std::cout << "Failed: " << err_cnt << " of " << VL << " errors\n";
+    return 1;
+  }
+
+  std::cout << "Passed\n";
+  return 0;
+}
@@ -0,0 +1,248 @@
+// TODO: enable on Windows
+// REQUIRES: linux
+// REQUIRES: gpu
+// RUN: %clangxx-esimd -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/esimd.hpp>
+#include <array>
+#include <iostream>
+
+using namespace cl::sycl;
+
+class ESIMDSelector : public device_selector {
+  // Device scoring policy: honour SYCL_DEVICE_TYPE ("GPU" or "HOST") when it
+  // is set, otherwise accept GPU devices only. An unsupported value is
+  // reported on stderr and no device is accepted.
+  virtual int operator()(const device &device) const {
+    const char *const requested = getenv("SYCL_DEVICE_TYPE");
+
+    const bool want_gpu = (requested == nullptr) || !strcmp(requested, "GPU");
+    if (want_gpu)
+      return device.is_gpu() ? 1000 : -1;
+
+    const bool want_host = !strcmp(requested, "HOST");
+    if (!want_host) {
+      std::cerr << "Supported 'SYCL_DEVICE_TYPE' env var values are 'GPU' and "
+                   "'HOST', '"
+                << requested << "' is not.\n";
+      return -1;
+    }
+
+    return device.is_host() ? 1000 : -1;
+  }
+};
+
+// Queue-level handler for asynchronous errors. Each stored exception is
+// rethrown so the matching catch clause can print its description to stdout;
+// the handler itself never throws.
+auto exception_handler = [](exception_list async_errors) {
+  for (auto current : async_errors) {
+    try {
+      std::rethrow_exception(current);
+    } catch (cl::sycl::exception &from_sycl) {
+      std::cout << "sycl::exception: " << from_sycl.what() << std::endl;
+    } catch (std::exception &from_std) {
+      std::cout << "std::exception: " << from_std.what() << std::endl;
+    } catch (...) {
+      std::cout << "generic exception\n";
+    }
+  }
+};
+
+#define NUM_BINS 256
+#define IMG_WIDTH 1024
+#define IMG_HEIGHT 1024
+//
+// Each work-item handles one 64x32-byte block (BLOCK_HEIGHT x BLOCK_WIDTH).
+//
+#define BLOCK_WIDTH 32
+#define BLOCK_HEIGHT 64
+
+// Host reference histogram.
+//
+// Adds one to cpuHistogram[v] for every byte value v among the first
+// width * height bytes of srcY. Bins are accumulated into, not cleared, so
+// the caller must zero cpuHistogram beforehand if a fresh count is wanted.
+//
+// width, height - dimensions of the input in bytes; their product is the
+//                 number of bytes read from srcY.
+// srcY          - input bytes.
+// cpuHistogram  - output bins, indexed by byte value (0..255).
+void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
+                   unsigned int *cpuHistogram) {
+  // Compute the bound once and iterate with the same unsigned type: the
+  // original signed index was compared against an unsigned product and would
+  // overflow (undefined behaviour) for inputs larger than INT_MAX bytes.
+  const unsigned int pixel_count = width * height;
+  for (unsigned int px = 0; px < pixel_count; ++px) {
+    cpuHistogram[srcY[px]] += 1;
+  }
+}
+
+// Dumps a NUM_BINS-entry histogram to stderr, eight bins per row, followed by
+// the sum of all bins.
+void writeHist(unsigned int *hist) {
+  constexpr int BinsPerRow = 8;
+  int sum = 0;
+
+  std::cerr << "\nHistogram: \n";
+  for (int row_start = 0; row_start < NUM_BINS; row_start += BinsPerRow) {
+    const unsigned int *row = hist + row_start;
+    // Row label shows the inclusive bin range printed on this line.
+    std::cerr << "\n  [" << row_start << " - " << row_start + 7 << "]:";
+    for (int col = 0; col < BinsPerRow; ++col) {
+      std::cerr << "\t" << row[col];
+      sum += row[col];
+    }
+  }
+  std::cerr << "\nTotal = " << sum << " \n";
+}
+
+// Compares two NUM_BINS-entry histograms bin by bin.
+// Returns 1 when every bin matches and 0 as soon as a difference is found.
+int checkHistogram(unsigned int *refHistogram, unsigned int *hist) {
+  int bin = 0;
+  // Advance while the bins agree; stopping early means a mismatch was hit.
+  while (bin < NUM_BINS && refHistogram[bin] == hist[bin])
+    ++bin;
+  return bin == NUM_BINS ? 1 : 0;
+}
+
+// Test driver for the ESIMD histogram kernel.
+//
+// Fills a shared (USM) byte buffer either from the file named in argv[1] or
+// with pseudo-random values, computes a reference histogram on the host, runs
+// the ESIMD kernel that accumulates into the shared `bins` array, prints both
+// histograms and compares them.
+// Returns 0 when the histograms match and 1 on mismatch or on a setup error
+// (allocation failure, unreadable input file).
+int main(int argc, char *argv[]) {
+
+  const char *input_file = nullptr;
+  unsigned int width = IMG_WIDTH * sizeof(unsigned int);
+  unsigned int height = IMG_HEIGHT;
+
+  if (argc == 2) {
+    input_file = argv[1];
+  } else {
+    std::cerr << "Usage: Histogram.exe input_file" << std::endl;
+    std::cerr << "No input file specified. Use default random value ...."
+              << std::endl;
+  }
+
+  // ------------------------------------------------------------------------
+  // Read in image luma plane
+
+  // Allocate Input Buffer
+  queue q(ESIMDSelector{}, exception_handler);
+
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  unsigned char *srcY =
+      static_cast<unsigned char *>(malloc_shared(width * height, dev, ctxt));
+  unsigned int *bins = static_cast<unsigned int *>(
+      malloc_shared(NUM_BINS * sizeof(unsigned int), dev, ctxt));
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+  // Releases whichever shared allocations succeeded; used on every exit path
+  // so the USM memory is not leaked.
+  auto release_usm = [&]() {
+    if (srcY != nullptr)
+      cl::sycl::free(srcY, ctxt);
+    if (bins != nullptr)
+      cl::sycl::free(bins, ctxt);
+  };
+
+  uint range_width = width / BLOCK_WIDTH;
+  uint range_height = height / BLOCK_HEIGHT;
+
+  // Both allocations are dereferenced below, so both must be checked.
+  if (srcY == nullptr || bins == nullptr) {
+    std::cerr << "Out of memory\n";
+    release_usm();
+    return 1;
+  }
+
+  // Initializes input.
+  unsigned int input_size = width * height;
+  std::cerr << "Processing inputs\n";
+
+  if (input_file != nullptr) {
+    FILE *f = fopen(input_file, "rb");
+    if (f == nullptr) {
+      std::cerr << "Error opening file " << input_file << "\n";
+      release_usm();
+      return 1;
+    }
+
+    const size_t cnt = fread(srcY, sizeof(unsigned char), input_size, f);
+    // The file is no longer needed whether or not the read was complete.
+    fclose(f);
+    if (cnt != input_size) {
+      std::cerr << "Error reading input from " << input_file << "\n";
+      release_usm();
+      return 1;
+    }
+  } else {
+    srand(2009);
+    for (unsigned int i = 0; i < input_size; ++i) {
+      srcY[i] = rand() % 256;
+    }
+  }
+
+  for (int i = 0; i < NUM_BINS; i++) {
+    bins[i] = 0;
+  }
+
+  // ------------------------------------------------------------------------
+  // CPU Execution:
+
+  unsigned int cpuHistogram[NUM_BINS];
+  memset(cpuHistogram, 0, sizeof(cpuHistogram));
+  histogram_CPU(width, height, srcY, cpuHistogram);
+
+  {
+    // The image wraps srcY, so it lives inside this scope and is destroyed
+    // before the shared allocation is released further down.
+    cl::sycl::image<2> Img(srcY, image_channel_order::rgba,
+                           image_channel_type::unsigned_int32,
+                           range<2>{width / sizeof(uint4), height});
+
+    // create ranges
+    // We need that many workitems
+    auto GlobalRange = range<1>(range_width * range_height);
+    // Number of workitems in a workgroup
+    auto LocalRange = range<1>(1);
+    nd_range<1> Range(GlobalRange, LocalRange);
+
+    auto e = q.submit([&](handler &cgh) {
+      auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);
+
+      cgh.parallel_for<class Hist>(
+          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            using namespace sycl::intel::gpu;
+
+            // Get thread origin offsets
+            uint tid = ndi.get_group(0);
+            uint h_pos = (tid % range_width) * BLOCK_WIDTH;
+            uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
+
+            // Declare a 8x32 uchar matrix to store the input block pixel value
+            simd<unsigned char, 8 * 32> in;
+
+            // Declare a vector to store the local histogram
+            simd<unsigned int, NUM_BINS> histogram(0);
+
+            // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
+            for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
+              // Perform 2D media block read to load 8x32 pixel block
+              in =
+                  media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);
+
+          // Accumulate local histogram for each pixel value
+#pragma unroll
+              for (int i = 0; i < 8; i++) {
+#pragma unroll
+                for (int j = 0; j < 32; j++) {
+                  histogram.select<1, 1>(in[i * 32 + j]) += 1;
+                }
+              }
+
+              // Update starting offset for the next work block
+              v_pos += 8;
+            }
+
+            // Declare a vector to store the offset for atomic write operation
+            simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
+            offset *= sizeof(unsigned int);
+
+        // Update global sum by atomically adding each local histogram
+#pragma unroll
+            for (int i = 0; i < NUM_BINS; i += 8) {
+              // Declare a vector to store the source for atomic write operation
+              simd<unsigned int, 8> src;
+              src = histogram.select<8, 1>(i);
+
+#ifdef __SYCL_DEVICE_ONLY__
+              flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
+                  bins, offset, src, 1);
+              offset += 8 * sizeof(unsigned int);
+#else
+              auto vals = block_load<unsigned int, 8>(bins + i);
+              vals = vals + src;
+              block_store<unsigned int, 8>(bins + i, vals);
+#endif
+            }
+          });
+    });
+    e.wait();
+
+    // The kernel has completed here; the image is destroyed when this scope
+    // closes, after which srcY is no longer referenced by the runtime.
+  }
+
+  writeHist(bins);
+  writeHist(cpuHistogram);
+
+  // Checking Histogram. The verdict is taken before the shared memory is
+  // released because `bins` is one of the compared arrays.
+  const bool passed = checkHistogram(cpuHistogram, bins) != 0;
+  release_usm();
+
+  if (passed) {
+    std::cerr << "PASSED\n";
+    return 0;
+  }
+
+  std::cerr << "FAILED\n";
+  return 1;
+}