Skip to content

[UR][CUDA][HIP] Align local memory size env var #18489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 20, 2025
Merged
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// REQUIRES: cuda
// REQUIRES: cuda || hip

// RUN: %{build} -o %t.out
// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s
// RUN: %{run} %if cuda %{UR_CUDA_MAX_LOCAL_MEM_SIZE%} %else %{UR_HIP_MAX_LOCAL_MEM_SIZE%}=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
// RUN: %{run} %if cuda %{UR_CUDA_MAX_LOCAL_MEM_SIZE%} %else %{UR_HIP_MAX_LOCAL_MEM_SIZE%}=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s

//==---------------------- cuda-max-local-mem-size.cpp --------------------===//
//==--- SYCL test to test SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE env var----------===//
//==------------------------ max-local-mem-size.cpp -----------------------===//
//==--- SYCL test to test UR_{CUDA,HIP}_MAX_LOCAL_MEM_SIZE env var---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
Expand All @@ -21,14 +21,14 @@ int main() {
auto LocalSize =
Q.get_device().get_info<sycl::info::device::local_mem_size>();
Q.submit([&](sycl::handler &cgh) {
auto LocalAcc = sycl::local_accessor<float>(LocalSize + 1, cgh);
auto LocalAcc = sycl::local_accessor<char>(LocalSize + 1, cgh);
cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) {
LocalAcc[idx.get_global_linear_id()] *= 2;
});
}).wait();
} catch (const std::exception &e) {
std::puts(e.what());
}
// CHECK-ZERO: Local memory for kernel exceeds the amount requested using SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE
// CHECK-ZERO: Invalid value specified for
// CHECK-OVERALLOCATE: Excessive allocation of local memory on the device
}
2 changes: 1 addition & 1 deletion unified-runtime/source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
// OpenCL's "local memory" maps most closely to CUDA's "shared memory".
// CUDA has its own definition of "local memory", which maps to OpenCL's
// "private memory".
if (hDevice->maxLocalMemSizeChosen()) {
if (hDevice->getMaxChosenLocalMem()) {
return ReturnValue(
static_cast<uint64_t>(hDevice->getMaxChosenLocalMem()));
} else {
Expand Down
19 changes: 13 additions & 6 deletions unified-runtime/source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {
int MaxRegsPerBlock{0};
int MaxCapacityLocalMem{0};
int MaxChosenLocalMem{0};
bool MaxLocalMemSizeChosen{false};
uint32_t NumComputeUnits{0};
std::once_flag NVMLInitFlag;
std::optional<nvmlDevice_t> NVMLDevice;
Expand Down Expand Up @@ -69,12 +68,22 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {
static const char *LocalMemSizePtrPI =
std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
static const char *LocalMemSizePtr =
LocalMemSizePtrUR ? LocalMemSizePtrUR
: (LocalMemSizePtrPI ? LocalMemSizePtrPI : nullptr);
LocalMemSizePtrUR ? LocalMemSizePtrUR : LocalMemSizePtrPI;

if (LocalMemSizePtr) {
MaxChosenLocalMem = std::atoi(LocalMemSizePtr);
MaxLocalMemSizeChosen = true;
if (MaxChosenLocalMem <= 0) {
setErrorMessage(LocalMemSizePtrUR ? "Invalid value specified for "
"UR_CUDA_MAX_LOCAL_MEM_SIZE"
: "Invalid value specified for "
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
UR_RESULT_ERROR_INVALID_VALUE);
throw UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// Cap chosen local mem size to device capacity, kernel enqueue will fail
// if it actually needs more.
MaxChosenLocalMem = std::min(MaxChosenLocalMem, MaxCapacityLocalMem);
}

// Max size of memory object allocation in bytes.
Expand Down Expand Up @@ -151,8 +160,6 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {

int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };

bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };

uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };

// bookkeeping for mipmappedArray leaks in Mapping external Memory
Expand Down
35 changes: 10 additions & 25 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
size_t (&BlocksPerGrid)[3]) {
size_t MaxWorkGroupSize = 0u;
bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr;
uint32_t LocalSize = Kernel->getLocalSize();

try {
// Set the active context here as guessLocalWorkSize needs an active context
Expand Down Expand Up @@ -248,42 +247,28 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
CudaImplicitOffset);
}

uint32_t LocalSize = Kernel->getLocalSize();
if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
setErrorMessage("Excessive allocation of local memory on the device",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

if (Device->maxLocalMemSizeChosen()) {
// Set up local memory requirements for kernel.
if (Device->getMaxChosenLocalMem() < 0) {
bool EnvVarHasURPrefix =
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr;
setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for "
"UR_CUDA_MAX_LOCAL_MEM_SIZE"
: "Invalid value specified for "
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
UR_RESULT_ERROR_INVALID_VALUE);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
if (LocalSize > static_cast<uint32_t>(Device->getMaxChosenLocalMem())) {
bool EnvVarHasURPrefix =
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr;
if (int MaxLocalMem = Device->getMaxChosenLocalMem()) {
if (LocalSize > static_cast<uint32_t>(MaxLocalMem)) {
setErrorMessage(
EnvVarHasURPrefix
? "Local memory for kernel exceeds the amount requested using "
"UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of "
"UR_CUDA_MAX_LOCAL_MEM_SIZE."
: "Local memory for kernel exceeds the amount requested using "
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the the "
"value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.",
"Local memory for kernel exceeds the amount requested using "
"UR_CUDA_MAX_LOCAL_MEM_SIZE (or deprecated "
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE). Try increasing the maximum "
"local memory.",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// Set up local memory requirements for kernel.
UR_CHECK_ERROR(cuFuncSetAttribute(
CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
Device->getMaxChosenLocalMem()));

MaxLocalMem));
} else if (LocalSize > 48 * 1024) {
// CUDA requires explicit carveout of dynamic shared memory size if larger
// than 48 kB, otherwise cuLaunchKernel fails.
Expand Down
13 changes: 7 additions & 6 deletions unified-runtime/source/adapters/hip/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,12 +382,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
// OpenCL's "local memory" maps most closely to HIP's "shared memory".
// HIP has its own definition of "local memory", which maps to OpenCL's
// "private memory".
int LocalMemSize = 0;
UR_CHECK_ERROR(hipDeviceGetAttribute(
&LocalMemSize, hipDeviceAttributeMaxSharedMemoryPerBlock,
hDevice->get()));
assert(LocalMemSize >= 0);
return ReturnValue(static_cast<uint64_t>(LocalMemSize));
if (hDevice->getMaxChosenLocalMem()) {
return ReturnValue(
static_cast<uint64_t>(hDevice->getMaxChosenLocalMem()));
} else {
return ReturnValue(
static_cast<uint64_t>(hDevice->getMaxCapacityLocalMem()));
}
}
case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
int EccEnabled = 0;
Expand Down
33 changes: 30 additions & 3 deletions unified-runtime/source/adapters/hip/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
int MaxBlockDimX{0};
int MaxBlockDimY{0};
int MaxBlockDimZ{0};
int DeviceMaxLocalMem{0};
int MaxCapacityLocalMem{0};
int MaxChosenLocalMem{0};
int ManagedMemSupport{0};
int ConcurrentManagedAccess{0};
bool HardwareImageSupport{false};
Expand All @@ -51,7 +52,7 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
UR_CHECK_ERROR(hipDeviceGetAttribute(
&MaxBlockDimZ, hipDeviceAttributeMaxBlockDimZ, HIPDevice));
UR_CHECK_ERROR(hipDeviceGetAttribute(
&DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
&MaxCapacityLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
HIPDevice));
UR_CHECK_ERROR(hipDeviceGetAttribute(
&ManagedMemSupport, hipDeviceAttributeManagedMemory, HIPDevice));
Expand All @@ -64,6 +65,30 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
hipDeviceGetAttribute(&Ret, hipDeviceAttributeImageSupport, HIPDevice));
assert(Ret == 0 || Ret == 1);
HardwareImageSupport = Ret == 1;

// Set local mem max size if env var is present
static const char *LocalMemSzPtrUR =
std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
static const char *LocalMemSzPtrPI =
std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
static const char *LocalMemSzPtr =
LocalMemSzPtrUR ? LocalMemSzPtrUR : LocalMemSzPtrPI;

if (LocalMemSzPtr) {
MaxChosenLocalMem = std::atoi(LocalMemSzPtr);
if (MaxChosenLocalMem <= 0) {
setErrorMessage(LocalMemSzPtrUR ? "Invalid value specified for "
"UR_HIP_MAX_LOCAL_MEM_SIZE"
: "Invalid value specified for "
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
throw UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// Cap chosen local mem size to device capacity, kernel enqueue will fail
// if it actually needs more.
MaxChosenLocalMem = std::min(MaxChosenLocalMem, MaxCapacityLocalMem);
}
}

~ur_device_handle_t_() noexcept(false) {}
Expand All @@ -88,7 +113,9 @@ struct ur_device_handle_t_ : ur::hip::handle_base {

int getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; };

int getDeviceMaxLocalMem() const noexcept { return DeviceMaxLocalMem; };
int getMaxCapacityLocalMem() const noexcept { return MaxCapacityLocalMem; };

int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };

int getManagedMemSupport() const noexcept { return ManagedMemSupport; };

Expand Down
36 changes: 17 additions & 19 deletions unified-runtime/source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1818,28 +1818,26 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
Kernel->setImplicitOffsetArg(sizeof(ImplicitOffset), ImplicitOffset);
}

// Set local mem max size if env var is present
static const char *LocalMemSzPtrUR =
std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
static const char *LocalMemSzPtrPI =
std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
static const char *LocalMemSzPtr =
LocalMemSzPtrUR ? LocalMemSzPtrUR
: (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr);

if (LocalMemSzPtr) {
int DeviceMaxLocalMem = Device->getDeviceMaxLocalMem();
static const int EnvVal = std::atoi(LocalMemSzPtr);
if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) {
setErrorMessage(LocalMemSzPtrUR ? "Invalid value specified for "
"UR_HIP_MAX_LOCAL_MEM_SIZE"
: "Invalid value specified for "
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
uint32_t LocalSize = Kernel->getLocalSize();
if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
setErrorMessage("Excessive allocation of local memory on the device",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

if (int MaxLocalMem = Device->getMaxChosenLocalMem()) {
if (LocalSize > static_cast<uint32_t>(MaxLocalMem)) {
setErrorMessage(
"Local memory for kernel exceeds the amount requested using "
"UR_HIP_MAX_LOCAL_MEM_SIZE (or deprecated "
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE). Try increasing the maximum "
"local memory.",
UR_RESULT_ERROR_OUT_OF_RESOURCES);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

UR_CHECK_ERROR(hipFuncSetAttribute(
HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, EnvVal));
HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, MaxLocalMem));
}
} catch (ur_result_t Err) {
return Err;
Expand Down
Loading