Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 8d98508

Browse files
authored
[SYCL][ESIMD] More updates to print performance data in uniform way (#528)
* [SYCL][ESIMD] More updates to print performance data in uniform way This patch changes stencil, linear, mandelbrot, BitonicSort and Prefix_local_sum tests to print performance data the same way as it was done in PR#524 for histogram tests. Signed-off-by: Sergey Dmitriev <[email protected]>
1 parent 1aa3199 commit 8d98508

File tree

10 files changed

+647
-412
lines changed

10 files changed

+647
-412
lines changed

SYCL/ESIMD/BitonicSortK.cpp

Lines changed: 82 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -592,74 +592,94 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
592592
// Number of workitems in a workgroup
593593
cl::sycl::range<1> SortLocalRange{1};
594594

595-
double total_time = 0;
596-
try {
597-
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
598-
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
599-
// enqueue sort265 kernel
600-
auto e = pQueue_->submit([&](handler &cgh) {
601-
auto acci = bufi.get_access<access::mode::read>(cgh);
602-
auto acco = bufo.get_access<access::mode::write>(cgh);
603-
cgh.parallel_for<class Sort256>(
604-
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
605-
using namespace sycl::ext::intel::experimental::esimd;
606-
cmk_bitonic_sort_256(acci, acco, i);
607-
});
608-
});
609-
e.wait();
610-
total_time += esimd_test::report_time("kernel time", e, e);
611-
} catch (cl::sycl::exception const &e) {
612-
std::cout << "SYCL exception caught: " << e.what() << '\n';
613-
return e.get_cl_code();
614-
}
595+
// Start Timer
596+
esimd_test::Timer timer;
597+
double start;
598+
599+
// Launches the task on the GPU.
600+
double kernel_times = 0;
601+
unsigned num_iters = 10;
602+
603+
// num_iters + 1, iteration#0 is for warmup
604+
for (int iter = 0; iter <= num_iters; ++iter) {
605+
try {
606+
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
607+
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
608+
// enqueue sort265 kernel
609+
auto e = pQueue_->submit([&](handler &cgh) {
610+
auto acci = bufi.get_access<access::mode::read>(cgh);
611+
auto acco = bufo.get_access<access::mode::write>(cgh);
612+
cgh.parallel_for<class Sort256>(
613+
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
614+
using namespace sycl::ext::intel::experimental::esimd;
615+
cmk_bitonic_sort_256(acci, acco, i);
616+
});
617+
});
618+
e.wait();
619+
double etime = esimd_test::report_time("kernel1 time", e, e);
620+
if (iter > 0)
621+
kernel_times += etime;
622+
} catch (cl::sycl::exception const &e) {
623+
std::cout << "SYCL exception caught: " << e.what() << '\n';
624+
return 0;
625+
}
615626

616-
// Each HW thread swap two 256-element chunks. Hence, we only need
617-
// to launch size/ (base_sort_size*2) HW threads
618-
total_threads = size / (base_sort_size_ * 2);
619-
// create ranges
620-
// We need that many workitems
621-
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
622-
// Number of workitems in a workgroup
623-
cl::sycl::range<1> MergeLocalRange{1};
624-
625-
// enqueue merge kernel multiple times
626-
// this loop is for stage 8 to stage LOG2_ELEMENTS.
627-
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
628-
int k = 0;
629-
try {
630-
for (int i = 8; i < LOG2_ELEMENTS; i++) {
631-
// each step halves the stride distance of its prior step.
632-
// 1<<j is the stride distance that the invoked step will handle.
633-
// The recursive steps continue until stride distance 1 is complete.
634-
// For stride distance less than 1<<8, no global synchronization
635-
// is needed, i.e., all work can be done locally within HW threads.
636-
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
637-
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
638-
// locally.
639-
for (int j = i; j >= 8; j--) {
640-
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
641-
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
642-
auto acc = buf.get_access<access::mode::read_write>(cgh);
643-
cgh.parallel_for<class Merge>(
644-
MergeGlobalRange * MergeLocalRange,
645-
[=](id<1> tid) SYCL_ESIMD_KERNEL {
646-
using namespace sycl::ext::intel::experimental::esimd;
647-
cmk_bitonic_merge(acc, j, i, tid);
648-
});
649-
});
650-
k++;
627+
// Each HW thread swap two 256-element chunks. Hence, we only need
628+
// to launch size/ (base_sort_size*2) HW threads
629+
total_threads = size / (base_sort_size_ * 2);
630+
// create ranges
631+
// We need that many workitems
632+
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
633+
// Number of workitems in a workgroup
634+
cl::sycl::range<1> MergeLocalRange{1};
635+
636+
// enqueue merge kernel multiple times
637+
// this loop is for stage 8 to stage LOG2_ELEMENTS.
638+
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
639+
int k = 0;
640+
try {
641+
for (int i = 8; i < LOG2_ELEMENTS; i++) {
642+
// each step halves the stride distance of its prior step.
643+
// 1<<j is the stride distance that the invoked step will handle.
644+
// The recursive steps continue until stride distance 1 is complete.
645+
// For stride distance less than 1<<8, no global synchronization
646+
// is needed, i.e., all work can be done locally within HW threads.
647+
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
648+
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
649+
// locally.
650+
for (int j = i; j >= 8; j--) {
651+
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
652+
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
653+
auto acc = buf.get_access<access::mode::read_write>(cgh);
654+
cgh.parallel_for<class Merge>(
655+
MergeGlobalRange * MergeLocalRange,
656+
[=](id<1> tid) SYCL_ESIMD_KERNEL {
657+
using namespace sycl::ext::intel::experimental::esimd;
658+
cmk_bitonic_merge(acc, j, i, tid);
659+
});
660+
});
661+
k++;
662+
}
651663
}
664+
} catch (cl::sycl::exception const &e) {
665+
std::cout << "SYCL exception caught: " << e.what() << '\n';
666+
return 0;
652667
}
653-
} catch (cl::sycl::exception const &e) {
654-
std::cout << "SYCL exception caught: " << e.what() << '\n';
655-
return e.get_cl_code();
668+
669+
mergeEvent[k - 1].wait();
670+
double etime = esimd_test::report_time("kernel2 time", mergeEvent[0],
671+
mergeEvent[k - 1]);
672+
if (iter > 0)
673+
kernel_times += etime;
674+
else
675+
start = timer.Elapsed();
656676
}
657677

658-
mergeEvent[k - 1].wait();
659-
total_time +=
660-
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
678+
// End timer.
679+
double end = timer.Elapsed();
661680

662-
cout << " Sorting Time = " << total_time << " msec " << std::endl;
681+
esimd_test::display_timing_stats(kernel_times, num_iters,
682+
(end - start) * 1000);
663683
return 1;
664684
}
665685

SYCL/ESIMD/BitonicSortKv2.cpp

Lines changed: 77 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -513,69 +513,89 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
513513
// Number of workitems in a workgroup
514514
cl::sycl::range<1> SortLocalRange{1};
515515

516-
// enqueue sort265 kernel
517-
double total_time = 0;
518-
try {
519-
auto e = pQueue_->submit([&](handler &cgh) {
520-
cgh.parallel_for<class Sort256>(
521-
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
522-
using namespace sycl::ext::intel::experimental::esimd;
523-
cmk_bitonic_sort_256(pInputs, pOutputs, i);
524-
});
525-
});
526-
e.wait();
527-
total_time += esimd_test::report_time("kernel time", e, e);
528-
} catch (cl::sycl::exception const &e) {
529-
std::cout << "SYCL exception caught: " << e.what() << '\n';
530-
return e.get_cl_code();
531-
}
516+
// Start Timer
517+
esimd_test::Timer timer;
518+
double start;
519+
520+
// Launches the task on the GPU.
521+
double kernel_times = 0;
522+
unsigned num_iters = 10;
523+
524+
// num_iters + 1, iteration#0 is for warmup
525+
for (int iter = 0; iter <= num_iters; ++iter) {
526+
// enqueue sort265 kernel
527+
try {
528+
auto e = pQueue_->submit([&](handler &cgh) {
529+
cgh.parallel_for<class Sort256>(
530+
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
531+
using namespace sycl::ext::intel::experimental::esimd;
532+
cmk_bitonic_sort_256(pInputs, pOutputs, i);
533+
});
534+
});
535+
e.wait();
536+
double etime = esimd_test::report_time("kernel1 time", e, e);
537+
if (iter > 0)
538+
kernel_times += etime;
539+
} catch (cl::sycl::exception const &e) {
540+
std::cout << "SYCL exception caught: " << e.what() << '\n';
541+
return 0;
542+
}
532543

533-
// Each HW thread swap two 256-element chunks. Hence, we only need
534-
// to launch size/ (base_sort_size*2) HW threads
535-
total_threads = size / (base_sort_size_ * 2);
536-
// create ranges
537-
// We need that many workitems
538-
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
539-
// Number of workitems in a workgroup
540-
cl::sycl::range<1> MergeLocalRange{1};
541-
542-
// enqueue merge kernel multiple times
543-
// this loop is for stage 8 to stage LOG2_ELEMENTS.
544-
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
545-
int k = 0;
546-
try {
547-
for (int i = 8; i < LOG2_ELEMENTS; i++) {
548-
// each step halves the stride distance of its prior step.
549-
// 1<<j is the stride distance that the invoked step will handle.
550-
// The recursive steps continue until stride distance 1 is complete.
551-
// For stride distance less than 1<<8, no global synchronization
552-
// is needed, i.e., all work can be done locally within HW threads.
553-
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
554-
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
555-
// locally.
556-
for (int j = i; j >= 8; j--) {
557-
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
558-
cgh.parallel_for<class Merge>(
559-
MergeGlobalRange * MergeLocalRange,
560-
[=](id<1> tid) SYCL_ESIMD_KERNEL {
561-
using namespace sycl::ext::intel::experimental::esimd;
562-
cmk_bitonic_merge(pOutputs, j, i, tid);
563-
});
564-
});
565-
// mergeEvent[k].wait();
566-
k++;
544+
// Each HW thread swap two 256-element chunks. Hence, we only need
545+
// to launch size/ (base_sort_size*2) HW threads
546+
total_threads = size / (base_sort_size_ * 2);
547+
// create ranges
548+
// We need that many workitems
549+
auto MergeGlobalRange = cl::sycl::range<1>(total_threads);
550+
// Number of workitems in a workgroup
551+
cl::sycl::range<1> MergeLocalRange{1};
552+
553+
// enqueue merge kernel multiple times
554+
// this loop is for stage 8 to stage LOG2_ELEMENTS.
555+
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
556+
int k = 0;
557+
try {
558+
for (int i = 8; i < LOG2_ELEMENTS; i++) {
559+
// each step halves the stride distance of its prior step.
560+
// 1<<j is the stride distance that the invoked step will handle.
561+
// The recursive steps continue until stride distance 1 is complete.
562+
// For stride distance less than 1<<8, no global synchronization
563+
// is needed, i.e., all work can be done locally within HW threads.
564+
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
565+
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
566+
// locally.
567+
for (int j = i; j >= 8; j--) {
568+
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
569+
cgh.parallel_for<class Merge>(
570+
MergeGlobalRange * MergeLocalRange,
571+
[=](id<1> tid) SYCL_ESIMD_KERNEL {
572+
using namespace sycl::ext::intel::experimental::esimd;
573+
cmk_bitonic_merge(pOutputs, j, i, tid);
574+
});
575+
});
576+
// mergeEvent[k].wait();
577+
k++;
578+
}
567579
}
580+
} catch (cl::sycl::exception const &e) {
581+
std::cout << "SYCL exception caught: " << e.what() << '\n';
582+
return 0;
568583
}
569-
} catch (cl::sycl::exception const &e) {
570-
std::cout << "SYCL exception caught: " << e.what() << '\n';
571-
return e.get_cl_code();
584+
585+
mergeEvent[k - 1].wait();
586+
double etime = esimd_test::report_time("kernel2 time", mergeEvent[0],
587+
mergeEvent[k - 1]);
588+
if (iter > 0)
589+
kernel_times += etime;
590+
else
591+
start = timer.Elapsed();
572592
}
573593

574-
mergeEvent[k - 1].wait();
575-
total_time +=
576-
esimd_test::report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
594+
// End timer.
595+
double end = timer.Elapsed();
577596

578-
cout << " Sorting Time = " << total_time << " msec " << std::endl;
597+
esimd_test::display_timing_stats(kernel_times, num_iters,
598+
(end - start) * 1000);
579599
return 1;
580600
}
581601

0 commit comments

Comments
 (0)