intel · vladimirlaz · Oct 16, 2020 · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020
@@ -0,0 +1,205 @@
+//==--------------- histogram_256_slm.cpp  - DPC++ ESIMD on-device test ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// TODO enable on Windows and Level Zero
+// REQUIRES: linux && gpu && opencl
+// RUN: %clangxx-esimd -fsycl %s -o %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out 16
+
+#include "esimd_test_utils.hpp"
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/INTEL/esimd.hpp>
+#include <iostream>
+
+static constexpr int NUM_BINS = 256; // Histogram bins; bin indices are byte values (masked with 0xFF on the host).
+static constexpr int SLM_SIZE = (NUM_BINS * 4); // Size handed to slm_init(): NUM_BINS counters of 4 bytes each.
+static constexpr int BLOCK_WIDTH = 32; // Input words the kernel advances by per loop iteration.
+static constexpr int NUM_BLOCKS = 32; // Default blocks per thread; main() replaces it when argv[1] is given.
+
+using namespace cl::sycl;
+using namespace sycl::INTEL::gpu;
+
+// Histogram kernel body: counts how often each byte value occurs in the
+// input.
+//
+// Every work-item first zeroes the 16 SLM bins it owns, then walks its share
+// of the input and bumps the matching SLM bins with atomic increments, and
+// finally adds its 16 SLM bins to the global histogram with atomic adds.
+//
+// input_ptr  - input data, loaded as blocks of 32 unsigned 32-bit words.
+// output     - global histogram that receives the atomic adds.
+// gid        - group index (main() passes ndi.get_group(0)).
+// lid        - local index (main() passes ndi.get_local_id(0)).
+// local_size - factor applied to gid when forming the linear thread id.
+// num_blocks - number of BLOCK_WIDTH-word blocks handled by this work-item.
+ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
+                                   uint32_t gid, uint32_t lid,
+                                   uint32_t local_size, uint32_t num_blocks) {
+  // Declare and initialize SLM
+  slm_init(SLM_SIZE);
+  uint thread_id = gid * local_size + lid;
+
+  // Byte offsets of the 16 bins owned by this work-item:
+  // (16 * lid + 0..15) * sizeof(int).
+  simd<uint, 16> bin_offsets(0, 1);
+  bin_offsets += 16 * lid;
+  bin_offsets *= sizeof(int);
+
+  // Clear the owned bins, then fence and barrier before anyone accumulates.
+  simd<uint, 16> zeros = 0;
+  slm_store<uint, 16>(zeros, bin_offsets);
+  esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
+  esimd_barrier();
+
+  // Walk num_blocks consecutive blocks of BLOCK_WIDTH words each.
+  auto word_off = (thread_id * BLOCK_WIDTH * num_blocks);
+  for (int block = 0; block < num_blocks; block++) {
+    auto block_addr = ((unsigned int *)input_ptr) + word_off;
+    auto words = block_load<uint, 32>(block_addr);
+    auto bytes = words.format<uchar>();
+
+#pragma unroll
+    for (int b = 0; b < BLOCK_WIDTH * sizeof(int); b += 16) {
+      // Widen 16 bytes to bin indices, scale them to byte offsets, and
+      // increment those bins in SLM.
+      auto targets = convert<uint, uchar, 16>(bytes.select<16, 1>(b).read());
+      targets *= sizeof(int);
+      slm_atomic<EsimdAtomicOpType::ATOMIC_INC, uint, 16>(targets, 1);
+    }
+    word_off += BLOCK_WIDTH;
+  }
+  esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE);
+  esimd_barrier();
+
+  // Fold the owned 16 bins into the global histogram, 8 lanes per atomic add.
+  simd<uint, 16> partial;
+  partial = slm_load<uint32_t, 16>(bin_offsets);
+  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
+      output, bin_offsets.select<8, 1>(0), partial.select<8, 1>(0), 1);
+  flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, uint32_t, 8>(
+      output, bin_offsets.select<8, 1>(8), partial.select<8, 1>(8), 1);
+}
+
+// This function calculates histogram of the image with the CPU.
+// Each 32-bit input word contributes four samples, one per byte, and every
+// sample increments the bin whose index equals the byte value.
+// @param size: the number of 32-bit words in the input array.
+// @param src: pointer to the input array.
+// @param cpu_histogram: pointer to the 256-entry histogram to update; the
+//        caller is responsible for zeroing it beforehand.
+void HistogramCPU(unsigned int size, unsigned int *src,
+                  unsigned int *cpu_histogram) {
+  // The index is unsigned to match `size`; a signed index would overflow for
+  // sizes above INT_MAX.
+  for (unsigned int i = 0; i < size; ++i) {
+    const unsigned int word = src[i];
+    // Visit the four bytes from least to most significant.
+    for (unsigned int shift = 0; shift < 32; shift += 8)
+      cpu_histogram[(word >> shift) & 0xFFU] += 1;
+  }
+}
+
+// This function compares the output data calculated by the CPU and the
+// GPU separately, bin by bin, and prints every bin whose counts differ.
+// @param cpu_histogram: pointer to the NUM_BINS reference counts.
+// @param gpu_histogram: pointer to the NUM_BINS counts under test.
+// @return 1 if all bins are identical, else 0.
+int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
+  // A flag is enough: the former mismatch counter only fed a cap that could
+  // never trigger within NUM_BINS iterations.
+  bool identical = true;
+  for (int i = 0; i < NUM_BINS; i++) {
+    if (cpu_histogram[i] == gpu_histogram[i])
+      continue;
+    std::cout << "At " << i << ": CPU = " << cpu_histogram[i]
+              << ", GPU = " << gpu_histogram[i] << std::endl;
+    identical = false;
+  }
+  return identical ? 1 : 0;
+}
+
+class NumBlocksConst; // ID type of the specialization constant that carries num_blocks.
+class histogram_slm;  // Kernel name type used when building the program and launching the kernel.
+
+// Test driver for the SLM histogram kernel.
+//
+// Fills a 1024x1024 array of 32-bit words with pseudo-random bytes, computes
+// the reference histogram on the host, runs histogram_atomic on the device and
+// compares both results.
+//
+// argv[1] (optional): number of blocks each thread processes; it replaces
+// NUM_BLOCKS and is passed to the kernel as a specialization constant.
+//
+// Returns 0 when the histograms match, -1 when they differ and 1 when an
+// allocation fails or the requested block count is not positive.
+int main(int argc, char **argv) {
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  // `width` is in bytes (1024 words per row), `height` in rows.
+  unsigned int width = 1024 * sizeof(unsigned int);
+  unsigned int height = 1024;
+
+  // Initializes input.
+  unsigned int input_size = width * height;
+  unsigned int *input_ptr =
+      (unsigned int *)malloc_shared(input_size, dev, ctxt);
+  if (input_ptr == nullptr) {
+    std::cerr << "Out of memory\n";
+    return 1;
+  }
+  printf("Processing %ux%u inputs\n",
+         (unsigned int)(width / sizeof(unsigned int)), height);
+
+  srand(2009);
+  // From here on input_size counts 32-bit words instead of bytes.
+  input_size = input_size / sizeof(int);
+  for (unsigned int i = 0; i < input_size; ++i) {
+    // Four rand() calls per word, lowest byte first. The casts keep the
+    // shifts in unsigned arithmetic.
+    unsigned int word = static_cast<unsigned int>(rand() % 256);
+    word |= static_cast<unsigned int>(rand() % 256) << 8;
+    word |= static_cast<unsigned int>(rand() % 256) << 16;
+    word |= static_cast<unsigned int>(rand() % 256) << 24;
+    input_ptr[i] = word;
+  }
+
+  // Host copy of the GPU result: exactly NUM_BINS counters, zero-initialized,
+  // with automatic storage so nothing has to be released.
+  unsigned int hist[NUM_BINS] = {};
+
+  // Uses the CPU to calculate the histogram output data.
+  unsigned int cpu_histogram[NUM_BINS] = {};
+
+  HistogramCPU(input_size, input_ptr, cpu_histogram);
+
+  std::cout << "finish cpu_histogram\n";
+
+  // Uses the GPU to calculate the histogram output data.
+  unsigned int *output_surface =
+      (uint32_t *)malloc_shared(sizeof(hist), dev, ctxt);
+  if (output_surface == nullptr) {
+    std::cerr << "Out of memory\n";
+    free(input_ptr, ctxt);
+    return 1;
+  }
+  memset(output_surface, 0, sizeof(hist));
+
+  unsigned int num_blocks{NUM_BLOCKS};
+  if (argc == 2) {
+    // Reject zero or negative counts: num_blocks is a divisor below.
+    const int requested = atoi(argv[1]);
+    if (requested <= 0) {
+      std::cerr << "num_blocks must be a positive integer\n";
+      free(output_surface, ctxt);
+      free(input_ptr, ctxt);
+      return 1;
+    }
+    num_blocks = requested;
+    std::cout << "new num_blocks = " << num_blocks << "\n";
+  }
+
+  // Publish num_blocks as a specialization constant and build the kernel.
+  cl::sycl::program prg(q.get_context());
+  sycl::ONEAPI::experimental::spec_constant<unsigned int, NumBlocksConst>
+      num_blocks_const = prg.set_spec_constant<NumBlocksConst>(num_blocks);
+  prg.build_with_kernel_type<histogram_slm>();
+
+  // One thread per num_blocks * BLOCK_WIDTH input words.
+  unsigned int num_threads;
+  num_threads = width * height / (num_blocks * BLOCK_WIDTH * sizeof(int));
+
+  // NUM_BINS / 16 work-items per group: each one owns 16 bins in the kernel.
+  auto GlobalRange = cl::sycl::range<1>(num_threads);
+  auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
+  cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
+
+  {
+    auto e = q.submit([&](cl::sycl::handler &cgh) {
+      cgh.parallel_for<histogram_slm>(
+          prg.get_kernel<histogram_slm>(), Range,
+          [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
+                             ndi.get_local_id(0), 16, num_blocks_const.get());
+          });
+    });
+    e.wait();
+  }
+
+  std::cout << "finish GPU histogram\n";
+
+  // Copy the result out before releasing the shared allocations.
+  memcpy(hist, output_surface, sizeof(hist));
+
+  free(output_surface, ctxt);
+
+  free(input_ptr, ctxt);
+
+  // Compares the CPU histogram output data with the
+  // GPU histogram output data.
+  // If there is no difference, the result is correct.
+  // Otherwise there is something wrong.
+  int res = CheckHistogram(cpu_histogram, hist);
+  if (res)
+    std::cout << "PASSED\n";
+  else
+    std::cout << "FAILED\n";
+
+  return res ? 0 : -1;
+}
@@ -0,0 +1,191 @@
+//==---------------- mandelbrot.cpp  - DPC++ ESIMD on-device test ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// TODO enable on Windows and Level Zero
+// REQUIRES: linux && gpu && opencl
+// RUN: %clangxx-esimd -fsycl %s -I%S/.. -o %t.out
+// RUN: %ESIMD_RUN_PLACEHOLDER %t.out %S/output.ppm %S/golden_hw.ppm 512 -2.09798 -1.19798 0.004 4.0
+
+#include "esimd_test_utils.hpp"
+#include <CL/sycl.hpp>
+#include <CL/sycl/INTEL/esimd.hpp>
+#include <array>
+#include <iostream>
+#include <memory>
+
+using namespace cl::sycl;
+using namespace sycl::INTEL::gpu;
+
+#ifdef _SIM_MODE_
+#define CRUNCH 32 // default iteration cap when _SIM_MODE_ is defined
+#else
+#define CRUNCH 512 // emu/hw modes
+#endif
+
+#define SCALE 0.004 // default coordinate step per pixel (overridable via argv[6])
+#define XOFF -2.09798 // default x offset added to the scaled column (overridable via argv[4])
+#define YOFF -1.19798 // default y offset added to the scaled row (overridable via argv[5])
+
+#define WIDTH 800 // image width in pixels
+#define HEIGHT 602 // image height in rows; only the first HEIGHT - 2 rows are written to the output file
+
+// Computes one 8x2 pixel tile of the Mandelbrot image and stores it.
+//
+// For each of the 16 pixels the escape iteration is run until either `crunch`
+// iterations were done or the squared magnitude reaches `thrs`; the iteration
+// count is then packed into a 4-byte color and the tile is written with a
+// single media block store.
+//
+// out_image - image accessor that receives the tile.
+// ix, iy    - tile indices; the tile starts at pixel (ix * 8, iy * 2).
+// crunch    - iteration cap.
+// xOff/yOff - offsets added to the scaled pixel coordinates.
+// scale     - coordinate step per pixel.
+// thrs      - squared-magnitude threshold that ends the iteration.
+template <typename ACC>
+ESIMD_INLINE void mandelbrot(ACC out_image, int ix, int iy, int crunch,
+                             float xOff, float yOff, float scale, float thrs) {
+  // Turn tile indices into the pixel coordinates of the tile's first pixel.
+  ix *= 8;
+  iy *= 2;
+
+  simd<int, 16> iterations = 0;
+
+  for (auto lane = 0; lane < 16; ++lane) {
+    // Lanes 0-7 form the first tile row, lanes 8-15 the second.
+    int px = ix + (lane & 0x7);
+    int py = iy + (lane >> 3);
+    float cRe = px * scale + xOff;
+    float cIm = py * scale + yOff;
+    float re = 0.0f;
+    float im = 0.0f;
+    float reSq = 0.0f;
+    float imSq = 0.0f;
+
+    int count = 0;
+    do {
+      // z = z^2 + c; the imaginary part is updated first because it needs
+      // the previous real part.
+      im = re * im * 2.0f + cIm;
+      re = reSq - imSq + cRe;
+      imSq = im * im;
+      reSq = re * re;
+      count += 1;
+    } while ((count < crunch) & (reSq + imSq < thrs));
+
+    iterations.select<1, 0>(lane) = count;
+  }
+
+  // Pack three scaled copies of the iteration count into bytes 0, 1 and 2.
+  simd<int, 16> packed = (((iterations * 15) & 0xff)) +
+                         (((iterations * 7) & 0xff) * 256) +
+                         (((iterations * 3) & 0xff) * 65536);
+
+  // because the output is a y-tile 2D surface
+  // we can only write 32-byte wide
+  media_block_store<unsigned char, 2, 32>(out_image, ix * sizeof(int), iy,
+                                          packed.format<unsigned char>());
+}
+
+class CrunchConst; // spec-constant ID: iteration cap
+class XoffConst;   // spec-constant ID: x offset
+class YoffConst;   // spec-constant ID: y offset
+class ScaleConst;  // spec-constant ID: coordinate step per pixel
+class ThrsConst;   // spec-constant ID: escape threshold
+
+class Test; // kernel name type used when building the program and launching the kernel
+
+// Test driver for the Mandelbrot kernel.
+//
+// Renders a WIDTH x HEIGHT RGBA image on the device, writes the first
+// HEIGHT - 2 rows as a binary PPM (RGB only) to argv[1] and compares that file
+// with the reference file argv[2].
+//
+// Usage: mandelbrot.exe output_file ref_file [crunch xoff yoff scale threshold]
+// The five optional values replace the defaults and reach the kernel as
+// specialization constants.
+//
+// Returns 0 on match, 1 on mismatch, -2 when the output file cannot be opened;
+// exits with 1 on a wrong argument count.
+int main(int argc, char *argv[]) {
+  if (argc != 3 && argc != 8) {
+    std::cerr << "Usage: mandelbrot.exe output_file ref_file [crunch xoff yoff "
+                 "scale threshold]"
+              << std::endl;
+    exit(1);
+  }
+
+  // Four bytes (RGBA) per pixel.
+  const unsigned img_size = WIDTH * HEIGHT * 4;
+  // Sets output to blank image: make_unique value-initializes the array, and
+  // owning it through unique_ptr releases it on every return path.
+  auto buf = std::make_unique<unsigned char[]>(img_size);
+
+  {
+    // The image wraps `buf`; the rendered pixels are read from `buf` once
+    // this scope, and with it the image object, has ended.
+    cl::sycl::image<2> imgOutput((unsigned int *)buf.get(),
+                                 image_channel_order::rgba,
+                                 image_channel_type::unsigned_int8,
+                                 range<2>{WIDTH, HEIGHT});
+
+    // We need that many workitems: each one renders an 8x2 pixel tile.
+    uint range_width = WIDTH / 8;
+    uint range_height = HEIGHT / 2;
+    cl::sycl::range<2> GlobalRange{range_width, range_height};
+
+    // Number of workitems in a workgroup
+    cl::sycl::range<2> LocalRange{1, 1};
+
+    queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    auto ctxt = q.get_context();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    int crunch{CRUNCH};
+    float xoff{XOFF}, yoff{YOFF}, scale{SCALE}, thrs{4.0f};
+    if (argc == 8) {
+      crunch = atoi(argv[3]);
+      xoff = (float)atof(argv[4]);
+      yoff = (float)atof(argv[5]);
+      scale = (float)atof(argv[6]);
+      thrs = (float)atof(argv[7]);
+      std::cout << "new crunch = " << crunch << ", xoff = " << xoff
+                << ", yoff = " << yoff << ", scale = " << scale
+                << ", thrs = " << thrs << "\n";
+    }
+
+    // Publish the parameters as specialization constants and build the kernel.
+    cl::sycl::program prg(q.get_context());
+    sycl::ONEAPI::experimental::spec_constant<int, CrunchConst> crunch_const =
+        prg.set_spec_constant<CrunchConst>(crunch);
+    sycl::ONEAPI::experimental::spec_constant<float, XoffConst> xoff_const =
+        prg.set_spec_constant<XoffConst>(xoff);
+    sycl::ONEAPI::experimental::spec_constant<float, YoffConst> yoff_const =
+        prg.set_spec_constant<YoffConst>(yoff);
+    sycl::ONEAPI::experimental::spec_constant<float, ScaleConst> scale_const =
+        prg.set_spec_constant<ScaleConst>(scale);
+    sycl::ONEAPI::experimental::spec_constant<float, ThrsConst> thrs_const =
+        prg.set_spec_constant<ThrsConst>(thrs);
+    prg.build_with_kernel_type<Test>();
+
+    auto e = q.submit([&](cl::sycl::handler &cgh) {
+      auto accOutput =
+          imgOutput.get_access<uint4, cl::sycl::access::mode::write>(cgh);
+
+      cgh.parallel_for<Test>(prg.get_kernel<Test>(), GlobalRange * LocalRange,
+                             [=](item<2> it) SYCL_ESIMD_KERNEL {
+                               uint h_pos = it.get_id(0);
+                               uint v_pos = it.get_id(1);
+                               mandelbrot(accOutput, h_pos, v_pos,
+                                          crunch_const.get(), xoff_const.get(),
+                                          yoff_const.get(), scale_const.get(),
+                                          thrs_const.get());
+                             });
+    });
+    e.wait();
+  }
+
+  // Write header and pixels through one binary-mode stream, so the explicit
+  // CR LF bytes of the header are emitted verbatim and there is no second,
+  // unchecked fopen.
+  char *out_file = argv[1];
+  FILE *dumpfile = fopen(out_file, "wb");
+  if (!dumpfile) {
+    std::cerr << "Cannot open " << out_file << std::endl;
+    return -2;
+  }
+  fprintf(dumpfile, "P6\x0d\x0a");
+  fprintf(dumpfile, "%u %u\x0d\x0a", WIDTH, (HEIGHT - 2));
+  fprintf(dumpfile, "%u\x0d\x0a", 255);
+  for (int32_t i = 0; i < WIDTH * (HEIGHT - 2); ++i) {
+    // Emit the first three bytes of each 4-byte pixel and skip the fourth.
+    fwrite(&buf[i * 4], sizeof(char), 3, dumpfile);
+  }
+  fclose(dumpfile);
+
+  bool passed = true;
+  if (!esimd_test::cmp_binary_files<unsigned char>(out_file, argv[2], 0)) {
+    std::cerr << out_file << " does not match the reference file " << argv[2]
+              << std::endl;
+    passed = false;
+  }
+
+  if (passed) {
+    std::cerr << "PASSED\n";
+    return 0;
+  } else {
+    std::cerr << "FAILED\n";
+    return 1;
+  }
+}