Skip to content

Commit 900aca8

Browse files
committed
Restrict rounding-up optimization to GPU devices.
1 parent 81b777c commit 900aca8

File tree

3 files changed

+21
-7
lines changed

3 files changed

+21
-7
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -760,17 +760,20 @@ class __SYCL_EXPORT handler {
760760
using NameT =
761761
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
762762

763+
// The work group size preferred by this device.
763764
// A reasonable choice for rounding up the range is 32.
764765
constexpr size_t GoodLocalSizeX = 32;
765766

766767
// Disable the rounding-up optimizations under these conditions:
767-
// 1. The env var SYCL_OPT_PFWGS_DISABLE is set
768-
// 2. When the string SYCL_OPT_PFWGS_DISABLE is in the kernel name.
769-
// 3. The kernel is provided via an interoperability method.
770-
// 4. The API "this_item" is used inside the kernel.
771-
// 5. The range is already a multiple of the rounding factor.
768+
// 1. The device is not a GPU. Only GPUs benefit from rounding.
769+
// 2. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
770+
// 3. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in
771+
// the kernel name.
772+
// 4. The kernel is provided via an interoperability method.
773+
// 5. The API "this_item" is used inside the kernel.
774+
// 6. The range is already a multiple of the rounding factor.
772775
//
773-
// Cases 3 and 4 could be supported with extra effort.
776+
// Cases 4 and 5 could be supported with extra effort.
774777
// As an optimization for the common case it is an
775778
// implementation choice to not support those scenarios.
776779
// Note that "this_item" is a free function, i.e. not tied to any
@@ -784,6 +787,7 @@ class __SYCL_EXPORT handler {
784787
std::string KName = typeid(NameT *).name();
785788
using KI = detail::KernelInfo<KernelName>;
786789
bool DisableRounding =
790+
!is_gpu(MQueue) ||
787791
(getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
788792
(KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
789793
std::string::npos) ||
@@ -1949,6 +1953,12 @@ class __SYCL_EXPORT handler {
19491953
/// \param Count is a number of bytes to be prefetched.
19501954
void prefetch(const void *Ptr, size_t Count);
19511955

1956+
/// Check whether the queue being used targets a GPU device.
1957+
///
1958+
/// \param Queue is the queue for this handler.
1959+
/// \return Whether the device is a GPU.
1960+
bool is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue);
1961+
19521962
private:
19531963
shared_ptr_class<detail::queue_impl> MQueue;
19541964
/// The storage for the arguments passed.

sycl/source/handler.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,5 +343,10 @@ void handler::prefetch(const void *Ptr, size_t Count) {
343343
MLength = Count;
344344
MCGType = detail::CG::PREFETCH_USM;
345345
}
346+
347+
bool handler::is_gpu(shared_ptr_class<sycl::detail::queue_impl> Queue) {
348+
device Dev = Queue->get_device();
349+
return Dev.is_gpu();
350+
}
346351
} // namespace sycl
347352
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/basic_tests/parallel_for_range_roundup.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
22
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
3-
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
43

54
#include <CL/sycl.hpp>
65

0 commit comments

Comments
 (0)