Skip to content

Commit 343802d

Browse files
authored
[UR][CUDA][HIP] Align local memory size env var (#18489)
- Simplify the env vars check in the CUDA plugin - Move env var checks in the device in the HIP plugin - Align HIP and CUDA plugins - Allow requesting values larger than device capacity, but treat it as device capacity - Update test to work on both cuda and hip
1 parent 95fa847 commit 343802d

File tree

7 files changed

+85
-67
lines changed

7 files changed

+85
-67
lines changed

sycl/test-e2e/Adapters/cuda-max-local-mem-size.cpp renamed to sycl/test-e2e/Adapters/max-local-mem-size.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
// REQUIRES: cuda
1+
// REQUIRES: cuda || hip
22

33
// RUN: %{build} -o %t.out
4-
// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
5-
// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s
4+
// RUN: %{run} %if cuda %{UR_CUDA_MAX_LOCAL_MEM_SIZE%} %else %{UR_HIP_MAX_LOCAL_MEM_SIZE%}=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
5+
// RUN: %{run} %if cuda %{UR_CUDA_MAX_LOCAL_MEM_SIZE%} %else %{UR_HIP_MAX_LOCAL_MEM_SIZE%}=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s
66

7-
//==---------------------- cuda-max-local-mem-size.cpp --------------------===//
8-
//==--- SYCL test to test SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE env var----------===//
7+
//==------------------------ max-local-mem-size.cpp -----------------------===//
8+
//==--- SYCL test to test UR_{CUDA,HIP}_MAX_LOCAL_MEM_SIZE env var---------===//
99
//
1010
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
1111
// See https://llvm.org/LICENSE.txt for license information.
@@ -21,14 +21,14 @@ int main() {
2121
auto LocalSize =
2222
Q.get_device().get_info<sycl::info::device::local_mem_size>();
2323
Q.submit([&](sycl::handler &cgh) {
24-
auto LocalAcc = sycl::local_accessor<float>(LocalSize + 1, cgh);
24+
auto LocalAcc = sycl::local_accessor<char>(LocalSize + 1, cgh);
2525
cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) {
2626
LocalAcc[idx.get_global_linear_id()] *= 2;
2727
});
2828
}).wait();
2929
} catch (const std::exception &e) {
3030
std::puts(e.what());
3131
}
32-
// CHECK-ZERO: Local memory for kernel exceeds the amount requested using SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE
32+
// CHECK-ZERO: Invalid value specified for
3333
// CHECK-OVERALLOCATE: Excessive allocation of local memory on the device
3434
}

unified-runtime/source/adapters/cuda/device.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
505505
// OpenCL's "local memory" maps most closely to CUDA's "shared memory".
506506
// CUDA has its own definition of "local memory", which maps to OpenCL's
507507
// "private memory".
508-
if (hDevice->maxLocalMemSizeChosen()) {
508+
if (hDevice->getMaxChosenLocalMem()) {
509509
return ReturnValue(
510510
static_cast<uint64_t>(hDevice->getMaxChosenLocalMem()));
511511
} else {

unified-runtime/source/adapters/cuda/device.hpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {
3434
int MaxRegsPerBlock{0};
3535
int MaxCapacityLocalMem{0};
3636
int MaxChosenLocalMem{0};
37-
bool MaxLocalMemSizeChosen{false};
3837
uint32_t NumComputeUnits{0};
3938
std::once_flag NVMLInitFlag;
4039
std::optional<nvmlDevice_t> NVMLDevice;
@@ -69,12 +68,22 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {
6968
static const char *LocalMemSizePtrPI =
7069
std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE");
7170
static const char *LocalMemSizePtr =
72-
LocalMemSizePtrUR ? LocalMemSizePtrUR
73-
: (LocalMemSizePtrPI ? LocalMemSizePtrPI : nullptr);
71+
LocalMemSizePtrUR ? LocalMemSizePtrUR : LocalMemSizePtrPI;
7472

7573
if (LocalMemSizePtr) {
7674
MaxChosenLocalMem = std::atoi(LocalMemSizePtr);
77-
MaxLocalMemSizeChosen = true;
75+
if (MaxChosenLocalMem <= 0) {
76+
setErrorMessage(LocalMemSizePtrUR ? "Invalid value specified for "
77+
"UR_CUDA_MAX_LOCAL_MEM_SIZE"
78+
: "Invalid value specified for "
79+
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
80+
UR_RESULT_ERROR_INVALID_VALUE);
81+
throw UR_RESULT_ERROR_ADAPTER_SPECIFIC;
82+
}
83+
84+
// Cap chosen local mem size to device capacity, kernel enqueue will fail
85+
// if it actually needs more.
86+
MaxChosenLocalMem = std::min(MaxChosenLocalMem, MaxCapacityLocalMem);
7887
}
7988

8089
// Max size of memory object allocation in bytes.
@@ -151,8 +160,6 @@ struct ur_device_handle_t_ : ur::cuda::handle_base {
151160

152161
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };
153162

154-
bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };
155-
156163
uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
157164

158165
// bookkeeping for mipmappedArray leaks in Mapping external Memory

unified-runtime/source/adapters/cuda/enqueue.cpp

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,6 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
163163
size_t (&BlocksPerGrid)[3]) {
164164
size_t MaxWorkGroupSize = 0u;
165165
bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr;
166-
uint32_t LocalSize = Kernel->getLocalSize();
167166

168167
try {
169168
// Set the active context here as guessLocalWorkSize needs an active context
@@ -248,42 +247,28 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
248247
CudaImplicitOffset);
249248
}
250249

250+
uint32_t LocalSize = Kernel->getLocalSize();
251251
if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
252252
setErrorMessage("Excessive allocation of local memory on the device",
253253
UR_RESULT_ERROR_OUT_OF_RESOURCES);
254254
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
255255
}
256256

257-
if (Device->maxLocalMemSizeChosen()) {
258-
// Set up local memory requirements for kernel.
259-
if (Device->getMaxChosenLocalMem() < 0) {
260-
bool EnvVarHasURPrefix =
261-
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr;
262-
setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for "
263-
"UR_CUDA_MAX_LOCAL_MEM_SIZE"
264-
: "Invalid value specified for "
265-
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
266-
UR_RESULT_ERROR_INVALID_VALUE);
267-
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
268-
}
269-
if (LocalSize > static_cast<uint32_t>(Device->getMaxChosenLocalMem())) {
270-
bool EnvVarHasURPrefix =
271-
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr;
257+
if (int MaxLocalMem = Device->getMaxChosenLocalMem()) {
258+
if (LocalSize > static_cast<uint32_t>(MaxLocalMem)) {
272259
setErrorMessage(
273-
EnvVarHasURPrefix
274-
? "Local memory for kernel exceeds the amount requested using "
275-
"UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of "
276-
"UR_CUDA_MAX_LOCAL_MEM_SIZE."
277-
: "Local memory for kernel exceeds the amount requested using "
278-
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the the "
279-
"value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.",
260+
"Local memory for kernel exceeds the amount requested using "
261+
"UR_CUDA_MAX_LOCAL_MEM_SIZE (or deprecated "
262+
"SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE). Try increasing the maximum "
263+
"local memory.",
280264
UR_RESULT_ERROR_OUT_OF_RESOURCES);
281265
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
282266
}
267+
268+
// Set up local memory requirements for kernel.
283269
UR_CHECK_ERROR(cuFuncSetAttribute(
284270
CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
285-
Device->getMaxChosenLocalMem()));
286-
271+
MaxLocalMem));
287272
} else if (LocalSize > 48 * 1024) {
288273
// CUDA requires explicit carveout of dynamic shared memory size if larger
289274
// than 48 kB, otherwise cuLaunchKernel fails.

unified-runtime/source/adapters/hip/device.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -382,12 +382,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
382382
// OpenCL's "local memory" maps most closely to HIP's "shared memory".
383383
// HIP has its own definition of "local memory", which maps to OpenCL's
384384
// "private memory".
385-
int LocalMemSize = 0;
386-
UR_CHECK_ERROR(hipDeviceGetAttribute(
387-
&LocalMemSize, hipDeviceAttributeMaxSharedMemoryPerBlock,
388-
hDevice->get()));
389-
assert(LocalMemSize >= 0);
390-
return ReturnValue(static_cast<uint64_t>(LocalMemSize));
385+
if (hDevice->getMaxChosenLocalMem()) {
386+
return ReturnValue(
387+
static_cast<uint64_t>(hDevice->getMaxChosenLocalMem()));
388+
} else {
389+
return ReturnValue(
390+
static_cast<uint64_t>(hDevice->getMaxCapacityLocalMem()));
391+
}
391392
}
392393
case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: {
393394
int EccEnabled = 0;

unified-runtime/source/adapters/hip/device.hpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
3131
int MaxBlockDimX{0};
3232
int MaxBlockDimY{0};
3333
int MaxBlockDimZ{0};
34-
int DeviceMaxLocalMem{0};
34+
int MaxCapacityLocalMem{0};
35+
int MaxChosenLocalMem{0};
3536
int ManagedMemSupport{0};
3637
int ConcurrentManagedAccess{0};
3738
bool HardwareImageSupport{false};
@@ -51,7 +52,7 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
5152
UR_CHECK_ERROR(hipDeviceGetAttribute(
5253
&MaxBlockDimZ, hipDeviceAttributeMaxBlockDimZ, HIPDevice));
5354
UR_CHECK_ERROR(hipDeviceGetAttribute(
54-
&DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
55+
&MaxCapacityLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
5556
HIPDevice));
5657
UR_CHECK_ERROR(hipDeviceGetAttribute(
5758
&ManagedMemSupport, hipDeviceAttributeManagedMemory, HIPDevice));
@@ -64,6 +65,30 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
6465
hipDeviceGetAttribute(&Ret, hipDeviceAttributeImageSupport, HIPDevice));
6566
assert(Ret == 0 || Ret == 1);
6667
HardwareImageSupport = Ret == 1;
68+
69+
// Set local mem max size if env var is present
70+
static const char *LocalMemSzPtrUR =
71+
std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
72+
static const char *LocalMemSzPtrPI =
73+
std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
74+
static const char *LocalMemSzPtr =
75+
LocalMemSzPtrUR ? LocalMemSzPtrUR : LocalMemSzPtrPI;
76+
77+
if (LocalMemSzPtr) {
78+
MaxChosenLocalMem = std::atoi(LocalMemSzPtr);
79+
if (MaxChosenLocalMem <= 0) {
80+
setErrorMessage(LocalMemSzPtrUR ? "Invalid value specified for "
81+
"UR_HIP_MAX_LOCAL_MEM_SIZE"
82+
: "Invalid value specified for "
83+
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
84+
UR_RESULT_ERROR_OUT_OF_RESOURCES);
85+
throw UR_RESULT_ERROR_ADAPTER_SPECIFIC;
86+
}
87+
88+
// Cap chosen local mem size to device capacity, kernel enqueue will fail
89+
// if it actually needs more.
90+
MaxChosenLocalMem = std::min(MaxChosenLocalMem, MaxCapacityLocalMem);
91+
}
6792
}
6893

6994
~ur_device_handle_t_() noexcept(false) {}
@@ -88,7 +113,9 @@ struct ur_device_handle_t_ : ur::hip::handle_base {
88113

89114
int getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; };
90115

91-
int getDeviceMaxLocalMem() const noexcept { return DeviceMaxLocalMem; };
116+
int getMaxCapacityLocalMem() const noexcept { return MaxCapacityLocalMem; };
117+
118+
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };
92119

93120
int getManagedMemSupport() const noexcept { return ManagedMemSupport; };
94121

unified-runtime/source/adapters/hip/enqueue.cpp

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1818,28 +1818,26 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
18181818
Kernel->setImplicitOffsetArg(sizeof(ImplicitOffset), ImplicitOffset);
18191819
}
18201820

1821-
// Set local mem max size if env var is present
1822-
static const char *LocalMemSzPtrUR =
1823-
std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
1824-
static const char *LocalMemSzPtrPI =
1825-
std::getenv("SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE");
1826-
static const char *LocalMemSzPtr =
1827-
LocalMemSzPtrUR ? LocalMemSzPtrUR
1828-
: (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr);
1829-
1830-
if (LocalMemSzPtr) {
1831-
int DeviceMaxLocalMem = Device->getDeviceMaxLocalMem();
1832-
static const int EnvVal = std::atoi(LocalMemSzPtr);
1833-
if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) {
1834-
setErrorMessage(LocalMemSzPtrUR ? "Invalid value specified for "
1835-
"UR_HIP_MAX_LOCAL_MEM_SIZE"
1836-
: "Invalid value specified for "
1837-
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE",
1838-
UR_RESULT_ERROR_OUT_OF_RESOURCES);
1821+
uint32_t LocalSize = Kernel->getLocalSize();
1822+
if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
1823+
setErrorMessage("Excessive allocation of local memory on the device",
1824+
UR_RESULT_ERROR_OUT_OF_RESOURCES);
1825+
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
1826+
}
1827+
1828+
if (int MaxLocalMem = Device->getMaxChosenLocalMem()) {
1829+
if (LocalSize > static_cast<uint32_t>(MaxLocalMem)) {
1830+
setErrorMessage(
1831+
"Local memory for kernel exceeds the amount requested using "
1832+
"UR_HIP_MAX_LOCAL_MEM_SIZE (or deprecated "
1833+
"SYCL_PI_HIP_MAX_LOCAL_MEM_SIZE). Try increasing the maximum "
1834+
"local memory.",
1835+
UR_RESULT_ERROR_OUT_OF_RESOURCES);
18391836
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
18401837
}
1838+
18411839
UR_CHECK_ERROR(hipFuncSetAttribute(
1842-
HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, EnvVal));
1840+
HIPFunc, hipFuncAttributeMaxDynamicSharedMemorySize, MaxLocalMem));
18431841
}
18441842
} catch (ur_result_t Err) {
18451843
return Err;

0 commit comments

Comments
 (0)