Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][ESIMD] refactor timing code #145

Merged
merged 1 commit into from
Feb 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions SYCL/ESIMD/BitonicSortK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,16 +506,6 @@ ESIMD_INLINE void cmk_bitonic_merge(AccTy buf, uint32_t n, uint32_t m,
}
}

static double report_time(const string &msg, event e0, event en) {
cl_ulong time_start =
e0.get_profiling_info<info::event_profiling::command_start>();
cl_ulong time_end =
en.get_profiling_info<info::event_profiling::command_end>();
double elapsed = (time_end - time_start) / 1e6;
cout << msg << elapsed << " milliseconds" << std::endl;
return elapsed;
}

struct BitonicSort {
enum {
base_sort_size_ = 256,
Expand Down Expand Up @@ -615,7 +605,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
});
});
e.wait();
total_time += report_time("kernel time", e, e);
total_time += esimd_test::report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
Expand Down Expand Up @@ -663,7 +653,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
total_time +=
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

cout << " Sorting Time = " << total_time << " msec " << std::endl;
return 1;
Expand Down
15 changes: 3 additions & 12 deletions SYCL/ESIMD/BitonicSortKv2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,16 +421,6 @@ ESIMD_INLINE void cmk_bitonic_merge(uint32_t *buf, uint32_t n, uint32_t m,
}
}

static double report_time(const string &msg, event e0, event en) {
cl_ulong time_start =
e0.get_profiling_info<info::event_profiling::command_start>();
cl_ulong time_end =
en.get_profiling_info<info::event_profiling::command_end>();
double elapsed = (time_end - time_start) / 1e6;
cout << msg << elapsed << " milliseconds" << std::endl;
return elapsed;
}

struct BitonicSort {
enum {
base_sort_size_ = 256,
Expand Down Expand Up @@ -532,7 +522,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
});
});
e.wait();
total_time += report_time("kernel time", e, e);
total_time += esimd_test::report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
Expand Down Expand Up @@ -580,7 +570,8 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
total_time +=
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

cout << " Sorting Time = " << total_time << " msec " << std::endl;
return 1;
Expand Down
29 changes: 29 additions & 0 deletions SYCL/ESIMD/esimd_test_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#define NOMINMAX

#include <algorithm>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
Expand Down Expand Up @@ -161,4 +162,32 @@ To bit_cast(const From &src) noexcept {
return dst;
}

// Timer class for measuring elasped time
class Timer {
public:
Timer() : start_(std::chrono::steady_clock::now()) {}

double Elapsed() {
auto now = std::chrono::steady_clock::now();
return std::chrono::duration_cast<Duration>(now - start_).count();
}

private:
using Duration = std::chrono::duration<double>;
std::chrono::steady_clock::time_point start_;
};

// e0 is the first event, en is the last event
// find the time difference between the starting time of the e0 and
// the ending time of en, return micro-second
inline double report_time(const std::string &msg, event e0, event en) {
cl_ulong time_start =
e0.get_profiling_info<info::event_profiling::command_start>();
cl_ulong time_end =
en.get_profiling_info<info::event_profiling::command_end>();
double elapsed = (time_end - time_start) / 1e6;
// cerr << msg << elapsed << " msecs" << std::endl;
return elapsed;
}

} // namespace esimd_test
139 changes: 76 additions & 63 deletions SYCL/ESIMD/histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ void histogram_CPU(unsigned int width, unsigned int height, unsigned char *srcY,
void writeHist(unsigned int *hist) {
int total = 0;

std::cerr << "\nHistogram: \n";
// std::cerr << "\nHistogram: \n";
for (int i = 0; i < NUM_BINS; i += 8) {
std::cerr << "\n [" << i << " - " << i + 7 << "]:";
// std::cerr << "\n [" << i << " - " << i + 7 << "]:";
for (int j = 0; j < 8; j++) {
std::cerr << "\t" << hist[i + j];
// std::cerr << "\t" << hist[i + j];
total += hist[i + j];
}
}
Expand Down Expand Up @@ -80,7 +80,8 @@ int main(int argc, char *argv[]) {
// Read in image luma plane

// Allocate Input Buffer
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});

auto dev = q.get_device();
auto ctxt = q.get_context();
Expand Down Expand Up @@ -121,10 +122,6 @@ int main(int argc, char *argv[]) {
}
}

for (int i = 0; i < NUM_BINS; i++) {
bins[i] = 0;
}

// ------------------------------------------------------------------------
// CPU Execution:

Expand All @@ -136,76 +133,89 @@ int main(int argc, char *argv[]) {
image_channel_type::unsigned_int32,
range<2>{width / sizeof(uint4), height});

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

try {
// create ranges
// We need that many workitems
auto GlobalRange = range<1>(range_width * range_height);
// Number of workitems in a workgroup
auto LocalRange = range<1>(1);
nd_range<1> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;

// Get thread origin offsets
uint tid = ndi.get_group(0);
uint h_pos = (tid % range_width) * BLOCK_WIDTH;
uint v_pos = (tid / range_width) * BLOCK_HEIGHT;

// Declare a 8x32 uchar matrix to store the input block pixel value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in =
media_block_load<unsigned char, 8, 32>(readAcc, h_pos, v_pos);

// Accumulate local histogram for each pixel value
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
for (int b = 0; b < NUM_BINS; b++)
bins[b] = 0;
// create ranges
// We need that many workitems
auto GlobalRange = range<1>(range_width * range_height);
// Number of workitems in a workgroup
auto LocalRange = range<1>(1);
nd_range<1> Range(GlobalRange, LocalRange);

auto e = q.submit([&](handler &cgh) {
auto readAcc = Img.get_access<uint4, cl::sycl::access::mode::read>(cgh);

cgh.parallel_for<class Hist>(
Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;

// Get thread origin offsets
uint tid = ndi.get_group(0);
uint h_pos = (tid % range_width) * BLOCK_WIDTH;
uint v_pos = (tid / range_width) * BLOCK_HEIGHT;

// Declare a 8x32 uchar matrix to store the input block pixel
// value
simd<unsigned char, 8 * 32> in;

// Declare a vector to store the local histogram
simd<unsigned int, NUM_BINS> histogram(0);

// Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
// Perform 2D media block read to load 8x32 pixel block
in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
v_pos);

// Accumulate local histogram for each pixel value
#pragma unroll
for (int i = 0; i < 8; i++) {
for (int i = 0; i < 8; i++) {
#pragma unroll
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
for (int j = 0; j < 32; j++) {
histogram.select<1, 1>(in[i * 32 + j]) += 1;
}
}
}

// Update starting offset for the next work block
v_pos += 8;
}
// Update starting offset for the next work block
v_pos += 8;
}

// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);
// Declare a vector to store the offset for atomic write operation
simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
offset *= sizeof(unsigned int);

// Update global sum by atomically adding each local histogram
// Update global sum by atomically adding each local histogram
#pragma unroll
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);
for (int i = 0; i < NUM_BINS; i += 8) {
// Declare a vector to store the source for atomic write
// operation
simd<unsigned int, 8> src;
src = histogram.select<8, 1>(i);

#ifdef __SYCL_DEVICE_ONLY__
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
bins, offset, src, 1);
offset += 8 * sizeof(unsigned int);
flat_atomic<EsimdAtomicOpType::ATOMIC_ADD, unsigned int, 8>(
bins, offset, src, 1);
offset += 8 * sizeof(unsigned int);
#else
auto vals = block_load<unsigned int, 8>(bins + i);
vals = vals + src;
block_store<unsigned int, 8>(bins + i, vals);
#endif
}
});
});
e.wait();

}
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
}
// SYCL will enqueue and run the kernel. Recall that the buffer's data is
// given back to the host at the end of scope.
// make sure data is given back to the host at the end of this scope
Expand All @@ -214,6 +224,9 @@ int main(int argc, char *argv[]) {
return e.get_cl_code();
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";

writeHist(bins);
writeHist(cpuHistogram);
// Checking Histogram
Expand Down
33 changes: 23 additions & 10 deletions SYCL/ESIMD/histogram_256_slm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ int CheckHistogram(unsigned int *cpu_histogram, unsigned int *gpu_histogram) {
}

int main() {
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(),
property::queue::enable_profiling{});
auto dev = q.get_device();
auto ctxt = q.get_context();

Expand Down Expand Up @@ -146,7 +147,6 @@ int main() {
// Uses the GPU to calculate the histogram output data.
unsigned int *output_surface =
(uint32_t *)malloc_shared(4 * NUM_BINS, dev, ctxt);
memset(output_surface, 0, 4 * NUM_BINS);

unsigned int num_threads;
num_threads = width * height / (NUM_BLOCKS * BLOCK_WIDTH * sizeof(int));
Expand All @@ -155,20 +155,33 @@ int main() {
auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;
try {
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.parallel_for<class histogram_slm>(
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16);
});
});
e.wait();
for (int iter = 0; iter <= num_iters; ++iter) {
double etime = 0;
memset(output_surface, 0, 4 * NUM_BINS);
auto e = q.submit([&](cl::sycl::handler &cgh) {
cgh.parallel_for<class histogram_slm>(
Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
histogram_atomic(input_ptr, output_surface, ndi.get_group(0),
ndi.get_local_id(0), 16);
});
});
e.wait();
etime = esimd_test::report_time("kernel time", e, e);
if (iter > 0)
kernel_times += etime;
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

float kernel_time = kernel_times / num_iters;
std::cerr << "GPU kernel time = " << kernel_time << " msec\n";

std::cout << "finish GPU histogram\n";

memcpy(hist, output_surface, 4 * NUM_BINS);
Expand Down
Loading