[SYCL][CUDA] Add sub-group shuffles #2623

Merged: 2 commits, Oct 15, 2020
170 changes: 149 additions & 21 deletions sycl/include/CL/sycl/detail/spirv.hpp
@@ -35,29 +35,41 @@ template <> struct group_scope<::cl::sycl::ONEAPI::sub_group> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
};

// Generic shuffles and broadcasts may require multiple calls to SPIR-V
// Generic shuffles and broadcasts may require multiple calls to
// intrinsics, and should use the fewest broadcasts possible
// - Loop over 64-bit chunks until remaining bytes < 64-bit
// - Loop over chunks until remaining bytes < chunk size
// - At most one 32-bit, 16-bit and 8-bit chunk left over
#ifndef __NVPTX__
using ShuffleChunkT = uint64_t;
#else
using ShuffleChunkT = uint32_t;
#endif
template <typename T, typename Functor>
void GenericCall(const Functor &ApplyToBytes) {
if (sizeof(T) >= sizeof(uint64_t)) {
if (sizeof(T) >= sizeof(ShuffleChunkT)) {
#pragma unroll
for (size_t Offset = 0; Offset < sizeof(T); Offset += sizeof(uint64_t)) {
ApplyToBytes(Offset, sizeof(uint64_t));
for (size_t Offset = 0; Offset < sizeof(T);
Offset += sizeof(ShuffleChunkT)) {
ApplyToBytes(Offset, sizeof(ShuffleChunkT));
}
}
if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
ApplyToBytes(Offset, sizeof(uint32_t));
if (sizeof(ShuffleChunkT) >= sizeof(uint64_t)) {
if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
ApplyToBytes(Offset, sizeof(uint32_t));
}
}
if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
ApplyToBytes(Offset, sizeof(uint16_t));
if (sizeof(ShuffleChunkT) >= sizeof(uint32_t)) {
if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
ApplyToBytes(Offset, sizeof(uint16_t));
}
}
if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
ApplyToBytes(Offset, sizeof(uint8_t));
if (sizeof(ShuffleChunkT) >= sizeof(uint16_t)) {
if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
ApplyToBytes(Offset, sizeof(uint8_t));
}
}
}
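
To make the chunk decomposition described in the comments above concrete, here is a minimal host-side sketch (illustration only; `PrintChunkSchedule` is a made-up helper, not part of this patch) that prints the sequence of shuffle sizes for a 15-byte type on each backend:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Follows the decomposition described above: whole chunks first, then at most
// one 32-bit, 16-bit and 8-bit remainder.
template <typename ChunkT>
void PrintChunkSchedule(std::size_t TypeSize) {
  std::size_t Offset = 0;
  for (; Offset + sizeof(ChunkT) <= TypeSize; Offset += sizeof(ChunkT))
    std::printf("shuffle %zu bytes at offset %zu\n", sizeof(ChunkT), Offset);
  for (std::size_t Size : {sizeof(uint32_t), sizeof(uint16_t), sizeof(uint8_t)}) {
    if (Size < sizeof(ChunkT) && TypeSize - Offset >= Size) {
      std::printf("shuffle %zu bytes at offset %zu\n", Size, Offset);
      Offset += Size;
    }
  }
}

int main() {
  PrintChunkSchedule<uint64_t>(15); // SPIR-V path: 8 + 4 + 2 + 1
  PrintChunkSchedule<uint32_t>(15); // CUDA path:   4 + 4 + 4 + 2 + 1
}
```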

@@ -423,48 +435,134 @@ AtomicMax(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value);
}

// Native shuffles map directly to a SPIR-V SubgroupShuffle intrinsic
// Native shuffles map directly to a shuffle intrinsic:
// - The Intel SPIR-V extension natively supports all arithmetic types
// - The CUDA shfl intrinsics do not support vectors, and we use the _i32
// variants for all scalar types
#ifndef __NVPTX__
template <typename T>
using EnableIfNativeShuffle =
detail::enable_if_t<detail::is_arithmetic<T>::value, T>;
#else
template <typename T>
using EnableIfNativeShuffle = detail::enable_if_t<
std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t)), T>;

template <typename T>
using EnableIfVectorShuffle =
detail::enable_if_t<detail::is_vector_arithmetic<T>::value, T>;
#endif

#ifdef __NVPTX__
inline uint32_t membermask() {
uint32_t FULL_MASK = 0xFFFFFFFF;
uint32_t max_size = __spirv_SubgroupMaxSize();
uint32_t sg_size = __spirv_SubgroupSize();
return FULL_MASK >> (max_size - sg_size);
}
#endif
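
As a side note, here is a small host-side sketch (illustration only, with a made-up `membermask_sketch` helper) of the mask this computes when the maximum sub-group size is 32:

```cpp
#include <cstdint>
#include <cstdio>

// Same computation as membermask(): lowest sg_size bits set, one per lane.
uint32_t membermask_sketch(uint32_t max_size, uint32_t sg_size) {
  uint32_t FULL_MASK = 0xFFFFFFFF;
  return FULL_MASK >> (max_size - sg_size);
}

int main() {
  std::printf("%#010x\n", (unsigned)membermask_sketch(32, 32)); // 0xffffffff: all 32 lanes
  std::printf("%#010x\n", (unsigned)membermask_sketch(32, 16)); // 0x0000ffff: lanes 0-15
  std::printf("%#010x\n", (unsigned)membermask_sketch(32, 8));  // 0x000000ff: lanes 0-7
}
```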

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleINTEL(OCLT(x),
static_cast<uint32_t>(local_id.get(0)));
#else
return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f);
#endif
}

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleXorINTEL(
OCLT(x), static_cast<uint32_t>(local_id.get(0)));
#else
return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f);
#endif
}

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleDownINTEL(
OCLT(x), OCLT(x), static_cast<uint32_t>(local_id.get(0)));
#else
return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f);
#endif
}

template <typename T>
EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
#ifndef __NVPTX__
using OCLT = detail::ConvertToOpenCLType_t<T>;
return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x),
static_cast<uint32_t>(local_id.get(0)));
#else
return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0);
#endif
}
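
For readers more used to the CUDA-level builtins, the NVVM intrinsics above roughly correspond to the warp shuffle functions as sketched below (this mapping is my understanding, not something stated in the patch):

```cpp
// Approximate CUDA-level equivalents of the NVVM intrinsics used above:
//   __nvvm_shfl_sync_idx_i32(mask, x, id, 0x1f)     ~ __shfl_sync(mask, x, id)
//   __nvvm_shfl_sync_bfly_i32(mask, x, id, 0x1f)    ~ __shfl_xor_sync(mask, x, id)
//   __nvvm_shfl_sync_down_i32(mask, x, delta, 0x1f) ~ __shfl_down_sync(mask, x, delta)
//   __nvvm_shfl_sync_up_i32(mask, x, delta, 0)      ~ __shfl_up_sync(mask, x, delta)
// The last operand packs the segment bound: 0x1f (lane 31) for the idx, bfly
// and down variants, and 0 (lane 0) for the up variant.
```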

// Bitcast shuffles can be implemented using a single SPIR-V SubgroupShuffle
#ifdef __NVPTX__
template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffle(x[s], local_id);
}
return result;
}

template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffleXor(x[s], local_id);
}
return result;
}

template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffleDown(x[s], local_id);
}
return result;
}

template <typename T>
EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
T result;
for (int s = 0; s < x.get_size(); ++s) {
result[s] = SubgroupShuffleUp(x[s], local_id);
}
return result;
}
#endif

// Bitcast shuffles can be implemented using a single SubgroupShuffle
// intrinsic, but require type-punning via an appropriate integer type
#ifndef __NVPTX__
template <typename T>
using EnableIfBitcastShuffle =
detail::enable_if_t<!detail::is_arithmetic<T>::value &&
(std::is_trivially_copyable<T>::value &&
(sizeof(T) == 1 || sizeof(T) == 2 ||
sizeof(T) == 4 || sizeof(T) == 8)),
T>;
#else
template <typename T>
using EnableIfBitcastShuffle = detail::enable_if_t<
!(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
!detail::is_vector_arithmetic<T>::value &&
(std::is_trivially_copyable<T>::value &&
(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
T>;
#endif

template <typename T>
using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
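
As a rough illustration of the type-punning idea (a simplified host-side sketch; `bit_cast_sketch`, `fake_shuffle` and `RGBA` are made-up names, not the actual SYCL helpers), a 4-byte trivially copyable type can be shuffled with a single 32-bit shuffle:

```cpp
#include <cstdint>
#include <cstring>

// memcpy-based stand-in for detail::bit_cast.
template <typename To, typename From>
To bit_cast_sketch(const From &src) {
  static_assert(sizeof(To) == sizeof(From), "sizes must match");
  To dst;
  std::memcpy(&dst, &src, sizeof(To));
  return dst;
}

struct RGBA { uint8_t r, g, b, a; };            // 4 bytes, trivially copyable

uint32_t fake_shuffle(uint32_t x) { return x; } // stands in for the intrinsic

RGBA shuffle_rgba(RGBA x) {
  auto Bits = bit_cast_sketch<uint32_t>(x);     // pun to the unsigned shuffle type
  auto Out = fake_shuffle(Bits);                // one native 32-bit shuffle
  return bit_cast_sketch<RGBA>(Out);            // pun back to the user-visible type
}
```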
@@ -473,57 +571,87 @@ template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = detail::bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleINTEL(
ShuffleX, static_cast<uint32_t>(local_id.get(0)));
#else
ShuffleT Result =
__nvvm_shfl_sync_idx_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
#endif
return detail::bit_cast<T>(Result);
}

template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = detail::bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleXorINTEL(
ShuffleX, static_cast<uint32_t>(local_id.get(0)));
#else
ShuffleT Result =
__nvvm_shfl_sync_bfly_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
#endif
return detail::bit_cast<T>(Result);
}

template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = detail::bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(
ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
#else
ShuffleT Result =
__nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
#endif
return detail::bit_cast<T>(Result);
}

template <typename T>
EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
using ShuffleT = ConvertToNativeShuffleType_t<T>;
auto ShuffleX = detail::bit_cast<ShuffleT>(x);
#ifndef __NVPTX__
ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(
ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
#else
ShuffleT Result =
__nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0);
#endif
return detail::bit_cast<T>(Result);
}

// Generic shuffles may require multiple calls to SPIR-V SubgroupShuffle
// Generic shuffles may require multiple calls to SubgroupShuffle
// intrinsics, and should use the fewest shuffles possible:
// - Loop over 64-bit chunks until remaining bytes < 64-bit
// - At most one 32-bit, 16-bit and 8-bit chunk left over
#ifndef __NVPTX__
template <typename T>
using EnableIfGenericShuffle =
detail::enable_if_t<!detail::is_arithmetic<T>::value &&
!(std::is_trivially_copyable<T>::value &&
(sizeof(T) == 1 || sizeof(T) == 2 ||
sizeof(T) == 4 || sizeof(T) == 8)),
T>;
#else
template <typename T>
using EnableIfGenericShuffle = detail::enable_if_t<
!(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
!detail::is_vector_arithmetic<T>::value &&
!(std::is_trivially_copyable<T>::value &&
(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
T>;
#endif

template <typename T>
EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
T Result;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
uint64_t ShuffleX, ShuffleResult;
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
Expand All @@ -538,7 +666,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
uint64_t ShuffleX, ShuffleResult;
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -553,7 +681,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
uint64_t ShuffleX, ShuffleResult;
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -568,7 +696,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto ShuffleBytes = [=](size_t Offset, size_t Size) {
uint64_t ShuffleX, ShuffleResult;
ShuffleChunkT ShuffleX, ShuffleResult;
detail::memcpy(&ShuffleX, XBytes + Offset, Size);
ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
5 changes: 1 addition & 4 deletions sycl/test/sub_group/generic-shuffle.cpp
@@ -1,6 +1,3 @@
// UNSUPPORTED: cuda
// CUDA compilation and runtime do not yet support sub-groups.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -216,7 +213,7 @@ void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) {

int main() {
queue Queue;
if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
if (Queue.get_device().is_host()) {
Contributor:

Strictly speaking, this patch doesn't bring shuffle support to all non-host devices, and the "cl_intel_subgroups" extension is still required, but only for non-CUDA devices.

Contributor Author:

You're right. I wasn't sure what to do here, really -- the fact that "cl_intel_subgroups" is required is backend-specific.

@gmlueck: Do we have the necessary infrastructure implemented to query whether a particular extension from sycl/docs/extensions is supported for a given device?

Contributor:

I guess you are thinking of adding a new "aspect" and then checking it like:

if (Queue.get_device().has(aspect::ext_intel_has_subgroups)) {

Correct?

@glyons-intel has added support for some aspects already, so it probably wouldn't be too hard to add another.

Contributor Author:

Thanks, @gmlueck. That's the sort of thing I was thinking of. But as you noted elsewhere, we really have two options:

  1. Define a new SYCL aspect describing devices that implement the sub-group extension
  2. Commit to the path that all SYCL devices are intended to implement sub-groups

Option 1) addresses @AlexeySachkov's concern about whether the tests accurately reflect the current implementation. But I think I prefer 2) as the correct long-term direction. Sub-groups aren't an optional feature in the SYCL 2020 provisional specification, and we don't expect users to have to check any sort of extension before using them.

If we go with 1), we should probably define aspects for all of our device-specific extensions and update all the tests accordingly. If we go with 2), we should phase out our use of __spirv_SubgroupShuffleINTEL in favor of the standard __spirv_OpGroupNonUniformShuffle.

Both of these seem like fairly big jobs, and outside the scope of this PR. @AlexeySachkov, are you okay to defer the resolution here until a future PR?

Contributor Author:

While working on the group algorithms, I also stumbled across this check: https://github.com/intel/llvm/blob/sycl/sycl/test/group-algorithm/reduce.cpp#L67. I'd be okay with implementing something similar here in the short term, if that's preferred.

Contributor:

> @AlexeySachkov, are you okay to defer the resolution here until a future PR?

Sure, no objections

std::cout << "Skipping test\n";
return 0;
}
11 changes: 3 additions & 8 deletions sycl/test/sub_group/shuffle.cpp
@@ -1,6 +1,3 @@
// UNSUPPORTED: cuda
// CUDA compilation and runtime do not yet support sub-groups.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,14 +16,12 @@

int main() {
queue Queue;
if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
if (Queue.get_device().is_host()) {
std::cout << "Skipping test\n";
return 0;
}
if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
check<short>(Queue);
check<unsigned short>(Queue);
}
check<short>(Queue);
check<unsigned short>(Queue);
check<int>(Queue);
check<int, 2>(Queue);
check<int, 4>(Queue);
5 changes: 1 addition & 4 deletions sycl/test/sub_group/shuffle_fp16.cpp
@@ -1,6 +1,3 @@
// UNSUPPORTED: cuda
// CUDA compilation and runtime do not yet support sub-groups.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
@@ -16,7 +13,7 @@

int main() {
queue Queue;
if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
if (Queue.get_device().is_host()) {
std::cout << "Skipping test\n";
return 0;
}
5 changes: 1 addition & 4 deletions sycl/test/sub_group/shuffle_fp64.cpp
@@ -1,6 +1,3 @@
// UNSUPPORTED: cuda
// CUDA compilation and runtime do not yet support sub-groups.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,7 +16,7 @@

int main() {
queue Queue;
if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
if (Queue.get_device().is_host()) {
std::cout << "Skipping test\n";
return 0;
}