@@ -592,74 +592,94 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
592
592
// Number of workitems in a workgroup
593
593
cl::sycl::range<1 > SortLocalRange{1 };
594
594
595
- double total_time = 0 ;
596
- try {
597
- buffer<uint32_t , 1 > bufi (pInputs, range<1 >(size));
598
- buffer<uint32_t , 1 > bufo (pOutputs, range<1 >(size));
599
- // enqueue sort265 kernel
600
- auto e = pQueue_->submit ([&](handler &cgh) {
601
- auto acci = bufi.get_access <access::mode::read>(cgh);
602
- auto acco = bufo.get_access <access::mode::write>(cgh);
603
- cgh.parallel_for <class Sort256 >(
604
- SortGlobalRange * SortLocalRange, [=](id<1 > i) SYCL_ESIMD_KERNEL {
605
- using namespace sycl ::ext::intel::experimental::esimd;
606
- cmk_bitonic_sort_256 (acci, acco, i);
607
- });
608
- });
609
- e.wait ();
610
- total_time += esimd_test::report_time (" kernel time" , e, e);
611
- } catch (cl::sycl::exception const &e) {
612
- std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
613
- return e.get_cl_code ();
614
- }
595
+ // Start Timer
596
+ esimd_test::Timer timer;
597
+ double start;
598
+
599
+ // Launches the task on the GPU.
600
+ double kernel_times = 0 ;
601
+ unsigned num_iters = 10 ;
602
+
603
+ // Run num_iters + 1 iterations; iteration #0 is a warm-up and is excluded from the kernel timings
604
+ for (int iter = 0 ; iter <= num_iters; ++iter) {
605
+ try {
606
+ buffer<uint32_t , 1 > bufi (pInputs, range<1 >(size));
607
+ buffer<uint32_t , 1 > bufo (pOutputs, range<1 >(size));
608
+ // enqueue sort256 kernel
609
+ auto e = pQueue_->submit ([&](handler &cgh) {
610
+ auto acci = bufi.get_access <access::mode::read>(cgh);
611
+ auto acco = bufo.get_access <access::mode::write>(cgh);
612
+ cgh.parallel_for <class Sort256 >(
613
+ SortGlobalRange * SortLocalRange, [=](id<1 > i) SYCL_ESIMD_KERNEL {
614
+ using namespace sycl ::ext::intel::experimental::esimd;
615
+ cmk_bitonic_sort_256 (acci, acco, i);
616
+ });
617
+ });
618
+ e.wait ();
619
+ double etime = esimd_test::report_time (" kernel1 time" , e, e);
620
+ if (iter > 0 )
621
+ kernel_times += etime;
622
+ } catch (cl::sycl::exception const &e) {
623
+ std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
624
+ return 0 ;
625
+ }
615
626
616
- // Each HW thread swap two 256-element chunks. Hence, we only need
617
- // to launch size/ (base_sort_size*2) HW threads
618
- total_threads = size / (base_sort_size_ * 2 );
619
- // create ranges
620
- // We need that many workitems
621
- auto MergeGlobalRange = cl::sycl::range<1 >(total_threads);
622
- // Number of workitems in a workgroup
623
- cl::sycl::range<1 > MergeLocalRange{1 };
624
-
625
- // enqueue merge kernel multiple times
626
- // this loop is for stage 8 to stage LOG2_ELEMENTS.
627
- event mergeEvent[(LOG2_ELEMENTS - 8 ) * (LOG2_ELEMENTS - 7 ) / 2 ];
628
- int k = 0 ;
629
- try {
630
- for (int i = 8 ; i < LOG2_ELEMENTS; i++) {
631
- // each step halves the stride distance of its prior step.
632
- // 1<<j is the stride distance that the invoked step will handle.
633
- // The recursive steps continue until stride distance 1 is complete.
634
- // For stride distance less than 1<<8, no global synchronization
635
- // is needed, i.e., all work can be done locally within HW threads.
636
- // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
637
- // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
638
- // locally.
639
- for (int j = i; j >= 8 ; j--) {
640
- buffer<uint32_t , 1 > buf (pOutputs, range<1 >(size));
641
- mergeEvent[k] = pQueue_->submit ([&](handler &cgh) {
642
- auto acc = buf.get_access <access::mode::read_write>(cgh);
643
- cgh.parallel_for <class Merge >(
644
- MergeGlobalRange * MergeLocalRange,
645
- [=](id<1 > tid) SYCL_ESIMD_KERNEL {
646
- using namespace sycl ::ext::intel::experimental::esimd;
647
- cmk_bitonic_merge (acc, j, i, tid);
648
- });
649
- });
650
- k++;
627
+ // Each HW thread swaps two 256-element chunks. Hence, we only need
628
+ // to launch size / (base_sort_size_ * 2) HW threads
629
+ total_threads = size / (base_sort_size_ * 2 );
630
+ // create ranges
631
+ // We need that many workitems
632
+ auto MergeGlobalRange = cl::sycl::range<1 >(total_threads);
633
+ // Number of workitems in a workgroup
634
+ cl::sycl::range<1 > MergeLocalRange{1 };
635
+
636
+ // enqueue merge kernel multiple times
637
+ // this loop is for stage 8 to stage LOG2_ELEMENTS.
638
+ event mergeEvent[(LOG2_ELEMENTS - 8 ) * (LOG2_ELEMENTS - 7 ) / 2 ];
639
+ int k = 0 ;
640
+ try {
641
+ for (int i = 8 ; i < LOG2_ELEMENTS; i++) {
642
+ // each step halves the stride distance of its prior step.
643
+ // 1<<j is the stride distance that the invoked step will handle.
644
+ // The recursive steps continue until stride distance 1 is complete.
645
+ // For stride distance less than 1<<8, no global synchronization
646
+ // is needed, i.e., all work can be done locally within HW threads.
647
+ // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
648
+ // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
649
+ // locally.
650
+ for (int j = i; j >= 8 ; j--) {
651
+ buffer<uint32_t , 1 > buf (pOutputs, range<1 >(size));
652
+ mergeEvent[k] = pQueue_->submit ([&](handler &cgh) {
653
+ auto acc = buf.get_access <access::mode::read_write>(cgh);
654
+ cgh.parallel_for <class Merge >(
655
+ MergeGlobalRange * MergeLocalRange,
656
+ [=](id<1 > tid) SYCL_ESIMD_KERNEL {
657
+ using namespace sycl ::ext::intel::experimental::esimd;
658
+ cmk_bitonic_merge (acc, j, i, tid);
659
+ });
660
+ });
661
+ k++;
662
+ }
651
663
}
664
+ } catch (cl::sycl::exception const &e) {
665
+ std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
666
+ return 0 ;
652
667
}
653
- } catch (cl::sycl::exception const &e) {
654
- std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
655
- return e.get_cl_code ();
668
+
669
+ mergeEvent[k - 1 ].wait ();
670
+ double etime = esimd_test::report_time (" kernel2 time" , mergeEvent[0 ],
671
+ mergeEvent[k - 1 ]);
672
+ if (iter > 0 )
673
+ kernel_times += etime;
674
+ else
675
+ start = timer.Elapsed ();
656
676
}
657
677
658
- mergeEvent[k - 1 ].wait ();
659
- total_time +=
660
- esimd_test::report_time (" kernel time" , mergeEvent[0 ], mergeEvent[k - 1 ]);
678
+ // End timer.
679
+ double end = timer.Elapsed ();
661
680
662
- cout << " Sorting Time = " << total_time << " msec " << std::endl;
681
+ esimd_test::display_timing_stats (kernel_times, num_iters,
682
+ (end - start) * 1000 );
663
683
return 1 ;
664
684
}
665
685
0 commit comments