Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][ESIMD] More updates to print performance data in uniform way #528

Merged
merged 3 commits into from
Oct 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 82 additions & 62 deletions SYCL/ESIMD/BitonicSortK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,74 +592,94 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// Number of workitems in a workgroup
cl::sycl::range<1> SortLocalRange{1};

double total_time = 0;
try {
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
// enqueue sort265 kernel
auto e = pQueue_->submit([&](handler &cgh) {
auto acci = bufi.get_access<access::mode::read>(cgh);
auto acco = bufo.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_sort_256(acci, acco, i);
});
});
e.wait();
total_time += esimd_test::report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}
// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
try {
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
// enqueue sort265 kernel
auto e = pQueue_->submit([&](handler &cgh) {
auto acci = bufi.get_access<access::mode::read>(cgh);
auto acco = bufo.get_access<access::mode::write>(cgh);
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_sort_256(acci, acco, i);
});
});
e.wait();
double etime = esimd_test::report_time("kernel1 time", e, e);
if (iter > 0)
kernel_times += etime;
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return 0;
}

// Each HW thread swap two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
total_threads = size / (base_sort_size_ * 2);
// create ranges
// We need that many workitems
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
// Number of workitems in a workgroup
cl::sycl::range<1> MergeLocalRange{1};

// enqueue merge kernel multiple times
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(
MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
// Each HW thread swap two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
total_threads = size / (base_sort_size_ * 2);
// create ranges
// We need that many workitems
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
// Number of workitems in a workgroup
cl::sycl::range<1> MergeLocalRange{1};

// enqueue merge kernel multiple times
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(
MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return 0;
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();

mergeEvent[k - 1].wait();
double etime = esimd_test::report_time("kernel2 time", mergeEvent[0],
mergeEvent[k - 1]);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}

mergeEvent[k - 1].wait();
total_time +=
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
// End timer.
double end = timer.Elapsed();

cout << " Sorting Time = " << total_time << " msec " << std::endl;
esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);
return 1;
}

Expand Down
134 changes: 77 additions & 57 deletions SYCL/ESIMD/BitonicSortKv2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,69 +513,89 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// Number of workitems in a workgroup
cl::sycl::range<1> SortLocalRange{1};

// enqueue sort265 kernel
double total_time = 0;
try {
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
total_time += esimd_test::report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}
// Start Timer
esimd_test::Timer timer;
double start;

// Launches the task on the GPU.
double kernel_times = 0;
unsigned num_iters = 10;

// num_iters + 1, iteration#0 is for warmup
for (int iter = 0; iter <= num_iters; ++iter) {
// enqueue sort265 kernel
try {
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
double etime = esimd_test::report_time("kernel1 time", e, e);
if (iter > 0)
kernel_times += etime;
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return 0;
}

// Each HW thread swap two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
total_threads = size / (base_sort_size_ * 2);
// create ranges
// We need that many workitems
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
// Number of workitems in a workgroup
cl::sycl::range<1> MergeLocalRange{1};

// enqueue merge kernel multiple times
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(
MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_merge(pOutputs, j, i, tid);
});
});
// mergeEvent[k].wait();
k++;
// Each HW thread swap two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
total_threads = size / (base_sort_size_ * 2);
// create ranges
// We need that many workitems
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
// Number of workitems in a workgroup
cl::sycl::range<1> MergeLocalRange{1};

// enqueue merge kernel multiple times
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(
MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::experimental::esimd;
cmk_bitonic_merge(pOutputs, j, i, tid);
});
});
// mergeEvent[k].wait();
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return 0;
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();

mergeEvent[k - 1].wait();
double etime = esimd_test::report_time("kernel2 time", mergeEvent[0],
mergeEvent[k - 1]);
if (iter > 0)
kernel_times += etime;
else
start = timer.Elapsed();
}

mergeEvent[k - 1].wait();
total_time +=
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
// End timer.
double end = timer.Elapsed();

cout << " Sorting Time = " << total_time << " msec " << std::endl;
esimd_test::display_timing_stats(kernel_times, num_iters,
(end - start) * 1000);
return 1;
}

Expand Down
Loading