Skip to content

Commit 551d706

Browse files
authored
[SYCL][CUDA] Add sub-group barrier (#2606)
Uses __nvvm_bar_warp_sync, which is equivalent to CUDA __syncwarp(). Because sub-group functions must always be called in converged control flow, the membermask is always set to represent all active work-items in the warp. Enabling this functionality requires that we switch to PTX 6.4, which is consistent with the existing requirement to use CUDA 10.1. Signed-off-by: John Pennycook <[email protected]>
1 parent b9d72a9 commit 551d706

File tree

4 files changed

+20
-11
lines changed

4 files changed

+20
-11
lines changed

libclc/CMakeLists.txt

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -261,10 +261,13 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
261261
foreach( d ${${t}_devices} )
262262
# Some targets don't have a specific GPU to target
263263
if( ${d} STREQUAL "none" OR ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
264-
set( mcpu )
264+
# FIXME: Ideally we would not be tied to a specific PTX ISA version
265+
if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
  # NVPTX builds request the PTX 6.4 ISA feature from clang.
  set( flags "SHELL:-Xclang -target-feature" "SHELL:-Xclang +ptx64")
else()
  # Clear any value left in `flags` by an earlier loop iteration so that
  # other device-less targets do not inherit unrelated options (for example
  # a previous device's -mcpu setting).
  set( flags )
endif()
265268
set( arch_suffix "${t}" )
266269
else()
267-
set( mcpu "-mcpu=${d}" )
270+
set( flags "-mcpu=${d}" )
268271
set( arch_suffix "${d}-${t}" )
269272
endif()
270273
message( " DEVICE: ${d} ( ${${d}_aliases} )" )
@@ -276,14 +279,14 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
276279
if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
277280
add_libclc_sycl_binding(libspirv_files
278281
TRIPLE ${t}
279-
COMPILE_OPT ${mcpu}
282+
COMPILE_OPT ${flags}
280283
FILES generic/libspirv/sycldevice-binding.cpp)
281284
endif()
282285

283286
add_libclc_builtin_set(libspirv-${arch_suffix}
284287
TRIPLE ${t}
285288
TARGET_ENV libspirv
286-
COMPILE_OPT ${mcpu}
289+
COMPILE_OPT ${flags}
287290
FILES ${libspirv_files}
288291
ALIASES ${${d}_aliases}
289292
GENERATE_TARGET "generate_convert_spirv.cl" "generate_convert_core.cl"
@@ -292,7 +295,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
292295
add_libclc_builtin_set(clc-${arch_suffix}
293296
TRIPLE ${t}
294297
TARGET_ENV clc
295-
COMPILE_OPT ${mcpu}
298+
COMPILE_OPT ${flags}
296299
FILES ${lib_files}
297300
LIB_DEP libspirv-${arch_suffix}
298301
ALIASES ${${d}_aliases}

libclc/cmake/modules/AddLibclc.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ function(add_libclc_sycl_binding OUT_LIST)
214214
file( TO_CMAKE_PATH ${LIBCLC_ROOT_DIR}/${file} SYCLDEVICE_BINDING )
215215
if( EXISTS ${SYCLDEVICE_BINDING} )
216216
set( SYCLDEVICE_BINDING_OUT ${CMAKE_CURRENT_BINARY_DIR}/sycldevice-binding-${ARG_TRIPLE}/sycldevice-binding.bc )
217+
# Turn the compile options into plain command-line arguments for the custom
# command below: drop the "SHELL:" prefix, then split each option group on
# spaces so every token becomes its own list element.
# The input is passed quoted because string() concatenates multiple unquoted
# inputs (which would fuse adjacent options together) and because an empty
# option list must not be an error.
# NOTE(review): assumes no individual option contains an embedded space.
string( REPLACE "SHELL:" "" SYCLDEVICE_OPT "${ARG_COMPILE_OPT}" )
string( REPLACE " " ";" SYCLDEVICE_OPT "${SYCLDEVICE_OPT}" )
217218
add_custom_command( OUTPUT ${SYCLDEVICE_BINDING_OUT}
218219
COMMAND ${CMAKE_COMMAND} -E make_directory
219220
${CMAKE_CURRENT_BINARY_DIR}/sycldevice-binding-${ARG_TRIPLE}
@@ -223,7 +224,7 @@ function(add_libclc_sycl_binding OUT_LIST)
223224
-fsycl-device-only
224225
-Dcl_khr_fp64
225226
-I${LIBCLC_ROOT_DIR}/generic/include
226-
${ARG_COMPILE_OPT}
227+
${SYCLDEVICE_OPT}
227228
${SYCLDEVICE_BINDING}
228229
-o ${SYCLDEVICE_BINDING_OUT}
229230
MAIN_DEPENDENCY ${SYCLDEVICE_BINDING}

libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include <spirv/spirv.h>
10+
#include <spirv/spirv_types.h>
1011

1112
_CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
1213
unsigned int semantics) {
@@ -16,5 +17,12 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
1617
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
1718
__spirv_ControlBarrier(unsigned int scope, unsigned int memory,
1819
unsigned int semantics) {
19-
__syncthreads();
20+
if (scope == Subgroup) {
21+
uint FULL_MASK = 0xFFFFFFFF;
22+
uint max_size = __spirv_SubgroupMaxSize();
23+
uint sg_size = __spirv_SubgroupSize();
24+
__nvvm_bar_warp_sync(FULL_MASK >> (max_size - sg_size));
25+
} else {
26+
__syncthreads();
27+
}
2028
}

sycl/test/sub_group/barrier.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
// UNSUPPORTED: cuda
2-
// CUDA compilation and runtime do not yet support sub-groups.
3-
//
41
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
52
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
63
// RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -73,7 +70,7 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
7370
}
7471
int main() {
7572
queue Queue;
76-
if (!core_sg_supported(Queue.get_device())) {
73+
if (Queue.get_device().is_host()) {
7774
std::cout << "Skipping test\n";
7875
return 0;
7976
}

0 commit comments

Comments
 (0)