Skip to content

Commit 6429ab6

Browse files
committed
Avoid unnecessary calls to cuFuncSetAttribute
Calling cuFuncSetAttribute to set CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES is required to launch kernels that use more than 48 kB of local memory[1] (CUDA dynamic shared memory). Without it, cuLaunchKernel fails with CUDA_ERROR_INVALID_VALUE. However, calling cuFuncSetAttribute introduces synchronisation in the CUDA runtime, which blocks execution until all H2D/D2H memory copies have finished (the reason for this is unclear), effectively preventing kernel launches from overlapping with memory copies. This causes significant performance degradation in some workflows, specifically in applications that launch overlapping memory copies and kernels from multiple host threads into multiple CUDA streams on the same GPU. Avoid the CUDA runtime synchronisation that causes this poor performance by removing the cuFuncSetAttribute call unless it is strictly necessary. Call it only when a specific carveout is requested by the user (via environment variables) or when the kernel launch would fail without it (local memory size > 48 kB). Good performance is recovered for default settings with kernels that use little or no local memory. No effect on kernel execution time was observed after removing the attribute across a wide range of tested kernels using various amounts of local memory. [1] Related to the 48 kB static shared memory limit; see the footnote for "Maximum amount of shared memory per thread block" in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
1 parent 14bafb8 commit 6429ab6

File tree

3 files changed

+24
-8
lines changed

3 files changed

+24
-8
lines changed

sycl/cmake/modules/FetchUnifiedRuntime.cmake

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,19 @@ cmake_path(NORMAL_PATH UR_INTREE_SOURCE_DIR OUTPUT_VARIABLE UR_INTREE_SOURCE_DIR
7575

7676
if(IS_DIRECTORY "${UR_INTREE_SOURCE_DIR}")
7777
set(UR_INTREE_BINARY_DIR ${LLVM_BINARY_DIR}/unified-runtime)
78-
add_subdirectory(${UR_INTREE_SOURCE_DIR} ${UR_INTREE_BINARY_DIR})
78+
set(UNIFIED_RUNTIME_SOURCE_DIR
79+
"${UR_INTREE_SOURCE_DIR}" CACHE PATH
80+
"Path to Unified Runtime Headers" FORCE)
81+
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
82+
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need
83+
# to link statically on windows
84+
if(WIN32)
85+
set(UMF_BUILD_SHARED_LIBRARY OFF CACHE INTERNAL "Build UMF shared library")
86+
set(UMF_LINK_HWLOC_STATICALLY ON CACHE INTERNAL "static HWLOC")
87+
else()
88+
set(UMF_DISABLE_HWLOC ${SYCL_UMF_DISABLE_HWLOC} CACHE INTERNAL "Disable hwloc for UMF")
89+
endif()
90+
add_subdirectory(${UNIFIED_RUNTIME_SOURCE_DIR} ${UR_INTREE_BINARY_DIR})
7991
elseif(SYCL_UR_USE_FETCH_CONTENT)
8092
include(FetchContent)
8193

@@ -122,7 +134,7 @@ elseif(SYCL_UR_USE_FETCH_CONTENT)
122134
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
123135
endfunction()
124136

125-
set(UNIFIED_RUNTIME_REPO "https://github.com/rafbiels/unified-runtime.git")
137+
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
126138
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
127139

128140
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
# commit 4ae15957e0c7575cde45d9d3ae01bd334e052e95
2-
# Author: Rafal Bielski <[email protected]>
3-
# Date: Fri Feb 7 19:22:22 2025 +0000
4-
# Avoid unnecessary calls to cuFuncSetAttribute
5-
set(UNIFIED_RUNTIME_TAG cuda-avoid-cuFuncSetAttribute)
1+
# commit d03f19a88e42cb98be9604ff24b61190d1e48727
2+
# Merge: 3ce6fcc9 84454b0e
3+
# Author: Kenneth Benzie (Benie) <[email protected]>
4+
# Date: Thu Feb 13 11:43:34 2025 +0000
5+
# Merge pull request #2680 from ldorau/Set_UMF_CUDA_INCLUDE_DIR_to_not_fetch_cudart_from_gitlab
6+
# Do not fetch cudart from gitlab for UMF
7+
set(UNIFIED_RUNTIME_TAG d03f19a88e42cb98be9604ff24b61190d1e48727)

unified-runtime/source/adapters/cuda/enqueue.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,9 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
290290
CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
291291
Device->getMaxChosenLocalMem()));
292292

293-
} else {
293+
} else if (LocalSize > 48 * 1024) {
294+
// CUDA requires explicit carveout of dynamic shared memory size if larger
295+
// than 48 kB, otherwise cuLaunchKernel fails.
294296
UR_CHECK_ERROR(cuFuncSetAttribute(
295297
CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, LocalSize));
296298
}

0 commit comments

Comments
 (0)