[SYCL][CUDA] atomic_ref.fetch_add used for fp64 reduction if device.has(atomic64) #3950

Merged · 11 commits · Jun 30, 2021
83 changes: 82 additions & 1 deletion sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -53,6 +53,28 @@ using IsReduOptForFastAtomicFetch =
sycl::detail::IsBitAND<T, BinaryOperation>::value)>;
#endif

// This type trait detects whether the atomic operation BinaryOperation applied
// to operands of type T can be used in a reduction, in addition to the cases
// covered by "IsReduOptForFastAtomicFetch", provided that the device has the
// atomic64 aspect. It should only be used if the device has the atomic64
// aspect. Note that this type trait is currently a subset of
// IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC disables this
// optimization (which relies on reduce_over_group() and atomic updates) so
// that reductions produce stable results across devices of the same type.
// TODO: 32-bit floating point atomics are eventually expected to be supported
// by the has_fast_atomics specialization. Once the reducer class is updated to
// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
// case should be removed here and handled by IsReduOptForFastAtomicFetch
// instead.
template <typename T, class BinaryOperation>
using IsReduOptForAtomic64Add =
#ifdef SYCL_REDUCTION_DETERMINISTIC
bool_constant<false>;
#else
bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
sycl::detail::is_sgenfloat<T>::value &&
(sizeof(T) == 4 || sizeof(T) == 8)>;
#endif

// This type trait is used to detect if the group algorithm reduce() used with
// operands of the type T and the operation BinaryOperation is available
// for using in reduction.
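
As a rough, standalone illustration of which combinations the new IsReduOptForAtomic64Add trait above accepts, the following self-contained model substitutes std::plus and std::is_floating_point for the internal sycl::detail traits (a sketch, not the library code):

#include <functional>
#include <type_traits>

// Simplified stand-in for IsReduOptForAtomic64Add: a plus-reduction over a
// 32- or 64-bit floating point type qualifies.
template <typename T, class BinaryOperation>
using IsAtomic64AddLike =
    std::bool_constant<std::is_same_v<BinaryOperation, std::plus<T>> &&
                       std::is_floating_point_v<T> &&
                       (sizeof(T) == 4 || sizeof(T) == 8)>;

static_assert(IsAtomic64AddLike<double, std::plus<double>>::value);
static_assert(IsAtomic64AddLike<float, std::plus<float>>::value);
static_assert(!IsAtomic64AddLike<int, std::plus<int>>::value);              // not a float type
static_assert(!IsAtomic64AddLike<double, std::multiplies<double>>::value);  // not plus
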
@@ -288,6 +310,18 @@ class reducer<T, BinaryOperation,
.fetch_max(MValue);
}

/// Atomic ADD operation for floating point types, implemented via atomic_ref.
template <typename _T = T, class _BinaryOperation = BinaryOperation>
enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
atomic_combine(_T *ReduVarPtr) const {

atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
sycl::ONEAPI::memory_scope::device,
access::address_space::global_space>(
*global_ptr<T>(ReduVarPtr)) += MValue;
}

T MValue;
};
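
In user-facing terms, the atomic_combine() overload added above performs a relaxed, device-scope fetch_add on the global reduction variable. A minimal equivalent sketch in SYCL 2020 spelling (sycl::atomic_ref; the in-tree class used above is still sycl::ONEAPI::atomic_ref, and the names ReduVarPtr/PartialSum are illustrative):

#include <CL/sycl.hpp>

// Device-callable helper: the work-group leader adds its partial sum to the
// global result with a relaxed, device-scope atomic add.
inline void atomic_add_fp64(double *ReduVarPtr, double PartialSum) {
  sycl::atomic_ref<double, sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::global_space>
      AtomicResult(*ReduVarPtr);
  AtomicResult.fetch_add(PartialSum); // same effect as the operator+= above
}
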

@@ -330,6 +364,8 @@ class reduction_impl : private reduction_impl_base {
using local_accessor_type =
accessor<T, buffer_dim, access::mode::read_write, access::target::local>;

static constexpr bool has_atomic_add_float64 =
IsReduOptForAtomic64Add<T, BinaryOperation>::value;
static constexpr bool has_fast_atomics =
IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
static constexpr bool has_fast_reduce =
@@ -636,7 +672,8 @@ class reduction_impl : private reduction_impl_base {
/// require initialization with identity value, then return user's read-write
/// accessor. Otherwise, create 1-element global buffer initialized with
/// identity value and return an accessor to that buffer.
template <bool HasFastAtomics = has_fast_atomics>

template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
std::enable_if_t<HasFastAtomics, rw_accessor_type>
getReadWriteAccessorToInitializedMem(handler &CGH) {
if (!is_usm && !initializeToIdentity())
@@ -1467,6 +1504,50 @@ void reduCGFunc(handler &CGH, KernelType KernelFunc,
}
}

// Specialization for devices with the atomic64 aspect, which guarantees
// 64-bit (and, temporarily, 32-bit) floating point support for atomic add.
// TODO: 32-bit floating point atomics are eventually expected to be supported
// by the has_fast_atomics specialization. Corresponding changes to
// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
// also be made.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
std::enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncImplAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &,
typename Reduction::rw_accessor_type Out) {
using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm,
Reduction::has_atomic_add_float64,
typename Reduction::rw_accessor_type>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's function. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
KernelFunc(NDIt, Reducer);

typename Reduction::binary_operation BOp;
Reducer.MValue = reduce_over_group(NDIt.get_group(), Reducer.MValue, BOp);
if (NDIt.get_local_linear_id() == 0) {
Reducer.atomic_combine(Reduction::getOutPointer(Out));
}
});
}

// Specialization for devices with the atomic64 aspect, which guarantees
// 64-bit (and, temporarily, 32-bit) floating point support for atomic add.
// TODO: 32-bit floating point atomics are eventually expected to be supported
// by the has_fast_atomics specialization. Corresponding changes to
// IsReduOptForAtomic64Add, as prescribed in its documentation, should then
// also be made.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu) {

auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
reduCGFuncImplAtomic64<KernelName, KernelType, Dims, Reduction>(
CGH, KernelFunc, Range, Redu, Out);
}

inline void associateReduAccsWithHandlerHelper(handler &) {}

template <typename ReductionT>
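For reference, a host-side reduction that would exercise the new atomic64 path (on a device reporting aspect::atomic64) might look roughly like this. The sketch assumes the ONEAPI::reduction and ONEAPI::plus spellings of the extension at this time; Data, N and the work-group size are illustrative:

#include <CL/sycl.hpp>
using namespace cl::sycl;

void sum_fp64(queue &Q, const double *Data, size_t N) {
  double *Sum = malloc_shared<double>(1, Q);
  *Sum = 0.0;
  Q.submit([&](handler &CGH) {
     CGH.parallel_for<class Fp64Sum>(
         nd_range<1>{range<1>{N}, range<1>{256}},
         ONEAPI::reduction(Sum, ONEAPI::plus<double>()),
         [=](nd_item<1> It, auto &Red) { Red += Data[It.get_global_id(0)]; });
   }).wait();
  // On devices with the atomic64 aspect, each work-group's partial sum is
  // combined into *Sum with atomic_ref::fetch_add instead of being reduced
  // across additional kernel launches.
  free(Sum, Q);
}
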
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/aspects.hpp
@@ -41,6 +41,7 @@ enum class aspect {
ext_intel_mem_channel = 25,
usm_atomic_host_allocations = 26,
usm_atomic_shared_allocations = 27,
atomic64 = 28
};

} // namespace sycl
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/detail/pi.h
@@ -285,7 +285,8 @@ typedef enum {
PI_DEVICE_INFO_GPU_SLICES = 0x10023,
PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 0x10024,
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
PI_DEVICE_INFO_ATOMIC_64 = 0x10110
} _pi_device_info;

typedef enum {
60 changes: 59 additions & 1 deletion sycl/include/CL/sycl/handler.hpp
@@ -210,6 +210,11 @@ class reduction_impl;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
@@ -1382,6 +1387,49 @@ class __SYCL_EXPORT handler {
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version is a specialization for the add operator.
/// It performs a runtime check for the device aspect "atomic64"; if the aspect
/// is present, fast sycl::atomic_ref operations are used to update the
/// reduction at the end of each work-group's work. Otherwise the default
/// implementation is used.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to the user's accessor or USM pointer after
// the kernel run.
// For USM pointers without the initialize_to_identity property, the same
// scheme with a working accessor is used, as re-using the user's USM pointer
// in the kernel would require creating another variant of the user's kernel,
// which does not seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
device D = detail::getDeviceFromHandler(*this);

if (D.has(aspect::atomic64)) {

ONEAPI::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
Redu);

if (Reduction::is_usm || Redu.initializeToIdentity()) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
}
} else {
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}
}

/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \p Redu.
///
@@ -1398,9 +1446,19 @@ class __SYCL_EXPORT handler {
/// optimized implementations waiting for their turn of code-review.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
detail::enable_if_t<!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}

template <typename KernelName, typename KernelType, int Dims,
typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) call user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
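The identity-initialization and copy-back scheme described in the comment above amounts to the following host-side pattern (a simplified sketch, not the handler internals; Q and UserResultPtr are illustrative):

#include <CL/sycl.hpp>

void copy_back_sketch(sycl::queue &Q, double *UserResultPtr) {
  // 1-element working buffer, initialized with the identity of plus.
  sycl::buffer<double, 1> WorkBuf{sycl::range<1>{1}};
  Q.submit([&](sycl::handler &CGH) {
    auto Work = WorkBuf.get_access<sycl::access::mode::discard_write>(CGH);
    CGH.single_task<class InitIdentity>([=] { Work[0] = 0.0; });
  });
  // ... the reduction kernel then accumulates atomically into WorkBuf ...
  Q.submit([&](sycl::handler &CGH) {
    auto Work = WorkBuf.get_access<sycl::access::mode::read>(CGH);
    CGH.copy(Work, UserResultPtr); // final result goes back to the user's memory
  });
  Q.wait();
}
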
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/info/device_traits.def
@@ -22,6 +22,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, max_clock_frequency, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, address_bits, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_mem_alloc_size, pi_uint64)
__SYCL_PARAM_TRAITS_SPEC(device, image_support, bool)
__SYCL_PARAM_TRAITS_SPEC(device, atomic64, bool)
__SYCL_PARAM_TRAITS_SPEC(device, max_read_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, max_write_image_args, pi_uint32)
__SYCL_PARAM_TRAITS_SPEC(device, image2d_max_width, size_t)
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/info/info_desc.hpp
@@ -141,7 +141,8 @@ enum class device : cl_device_info {
ext_intel_gpu_eu_count_per_subslice =
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE,
ext_intel_max_mem_bandwidth = PI_DEVICE_INFO_MAX_MEM_BANDWIDTH,
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL
ext_intel_mem_channel = PI_MEM_PROPERTIES_CHANNEL,
atomic64 = PI_DEVICE_INFO_ATOMIC_64
};

enum class device_type : pi_uint64 {
19 changes: 18 additions & 1 deletion sycl/plugins/cuda/pi_cuda.cpp
@@ -985,6 +985,19 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
bool ifp = (major >= 7);
return getInfo(param_value_size, param_value, param_value_size_ret, ifp);
}

case PI_DEVICE_INFO_ATOMIC_64: {
int major = 0;
cl::sycl::detail::pi::assertion(
cuDeviceGetAttribute(&major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
device->get()) == CUDA_SUCCESS);

bool atomic64 = (major >= 6);
return getInfo(param_value_size, param_value, param_value_size_ret,
atomic64);
}

case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// NVIDIA devices only support one sub-group size (the warp size)
int warpSize = 0;
@@ -1362,7 +1375,11 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
return getInfo(param_value_size, param_value, param_value_size_ret, "");
}
case PI_DEVICE_INFO_EXTENSIONS: {
return getInfo(param_value_size, param_value, param_value_size_ret, "");

std::string SupportedExtensions = "cl_khr_fp64 ";

return getInfo(param_value_size, param_value, param_value_size_ret,
SupportedExtensions.c_str());
}
case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
// The minimum value for the FULL profile is 1 MB.
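The compute-capability check added above mirrors what a standalone program can query through the CUDA driver API: native double-precision atomicAdd requires compute capability 6.0 (sm_60) or newer. A minimal sketch for device 0, with error handling omitted:

#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice Dev;
  cuDeviceGet(&Dev, 0);
  int Major = 0;
  cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Dev);
  // Native fp64 atomicAdd was introduced with Pascal (sm_60).
  const bool Atomic64 = (Major >= 6);
  std::printf("fp64 atomic add natively supported: %s\n", Atomic64 ? "yes" : "no");
  return 0;
}
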
1 change: 1 addition & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
@@ -178,6 +178,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
case PI_DEVICE_INFO_ATOMIC_64:
return PI_INVALID_VALUE;

default:
2 changes: 2 additions & 0 deletions sycl/source/detail/device_impl.cpp
@@ -239,6 +239,8 @@ bool device_impl::has(aspect Aspect) const {
return has_extension("cl_khr_int64_base_atomics");
case aspect::int64_extended_atomics:
return has_extension("cl_khr_int64_extended_atomics");
case aspect::atomic64:
return get_info<info::device::atomic64>();
case aspect::image:
return get_info<info::device::image_support>();
case aspect::online_compiler:
21 changes: 21 additions & 0 deletions sycl/source/detail/device_info.hpp
@@ -233,6 +233,23 @@ template <> struct get_device_info<bool, info::device::queue_profiling> {
}
};

// Specialization for atomic64, needed because PI_DEVICE_INFO_ATOMIC_64 is
// currently only implemented for the CUDA backend; other backends return an
// error from piDeviceGetInfo, which is treated here as "not supported".
template <> struct get_device_info<bool, info::device::atomic64> {
static bool get(RT::PiDevice dev, const plugin &Plugin) {

bool result = false;

RT::PiResult Err = Plugin.call_nocheck<PiApiKind::piDeviceGetInfo>(
dev, pi::cast<RT::PiDeviceInfo>(info::device::atomic64), sizeof(result),
&result, nullptr);
if (Err != PI_SUCCESS) {
return false;
}
return result;
}
};

// Specialization for exec_capabilities, OpenCL returns a bitfield
template <>
struct get_device_info<vector_class<info::execution_capability>,
@@ -613,6 +630,10 @@ template <> inline bool get_device_info_host<info::device::image_support>() {
return true;
}

template <> inline bool get_device_info_host<info::device::atomic64>() {
return false;
}

template <>
inline cl_uint get_device_info_host<info::device::max_read_image_args>() {
// current value is the required minimum
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
@@ -4127,6 +4127,7 @@ _ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65571EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65572EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65573EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65574EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device8get_infoILNS0_4info6deviceE65808EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl6device9getNativeEv
_ZNK2cl4sycl6kernel11get_contextEv
_ZNK2cl4sycl6kernel11get_programEv
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/on-device/basic_tests/aspects.cpp
@@ -57,6 +57,9 @@ int main() {
if (plt.has(aspect::int64_extended_atomics)) {
std::cout << " extended atomic operations" << std::endl;
}
if (plt.has(aspect::atomic64)) {
std::cout << " atomic64" << std::endl;
}
if (plt.has(aspect::image)) {
std::cout << " images" << std::endl;
}