Skip to content

Commit 28590a8

Browse files
author
Jaime Arteaga
committed
[UR][L0] Unify use of large allocation in L0 adapter
Intel(R) GPUs have two modes of operation in terms of allocations: stateful and stateless. Stateful mode optimizes memory accesses through pointer arithmetic. This can be done as long as the allocations used by the kernel are smaller than 4GB. Stateless mode disables such pointer-arithmetic optimization to allow the kernel to use allocations larger than 4GB. Currently, the L0 adapter dynamically and automatically requests large allocations from the L0 driver if it detects that an allocation size is larger than 4GB. This creates a problem if a kernel has previously been compiled for stateful access. This ultimately means the adapter mixes stateful and stateless behavior, which is not a user-friendly experience. This patch aims at correcting this behavior by defining a default one. On Intel(R) GPUs previous to Intel(R) Data Center GPU Max, the default behavior is now stateless, meaning allocations larger than 4GB are allowed by default. Users can opt in to stateful mode by setting a new environment variable, UR_L0_USE_OPTIMIZED_32BIT_ACCESS=1. Addresses: https://stackoverflow.com/questions/75621264/sycl-dot-product-code-gives-wrong-results Signed-off-by: Jaime Arteaga <[email protected]>
1 parent 40c8da9 commit 28590a8

File tree

4 files changed

+70
-6
lines changed

4 files changed

+70
-6
lines changed

source/adapters/level_zero/device.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
267267
return ReturnValue(uint32_t{64});
268268
}
269269
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
270-
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
270+
// if not optimized for 32-bit access, return total memory size.
271+
// otherwise, return only maximum allocatable size.
272+
if (Device->useOptimized32bitAccess() == 0) {
273+
return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
274+
} else {
275+
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
276+
}
271277
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
272278
// Support to read physicalSize depends on kernel,
273279
// so fallback into reading totalSize if physicalSize
@@ -911,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() {
911917
}
912918
}
913919

920+
int32_t ur_device_handle_t_::useOptimized32bitAccess() {
921+
static const int32_t Optimize32bitAccessMode = [this] {
922+
// If device is Intel(R) Data Center GPU Max,
923+
// use default provided by L0 driver.
924+
// TODO: Use IP versioning to select based on range of devices
925+
if (this->isPVC())
926+
return -1;
927+
const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
928+
if (!UrRet)
929+
return 0;
930+
return std::atoi(UrRet);
931+
}();
932+
933+
return Optimize32bitAccessMode;
934+
}
935+
914936
ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
915937
int SubSubDeviceIndex) {
916938
// Maintain various device properties cache.

source/adapters/level_zero/device.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object {
145145
// Returns whether immediate command lists are used on this device.
146146
ImmCmdlistMode ImmCommandListUsed{};
147147

148+
// Returns whether large allocations are being used
149+
// or not to have a consistent behavior throughout
150+
// the adapter between the creation of large allocations
151+
// and the compilation of kernels into stateful and
152+
// stateless modes.
153+
// With stateful mode, kernels are compiled with
154+
// pointer-arithmetic optimizations for optimized
155+
// access of allocations smaller than 4GB.
156+
// In stateless mode, such optimizations are not
157+
// applied.
158+
// Even if a GPU supports both modes, L0 driver may
159+
// provide support for only one, like for Intel(R)
160+
// Data Center GPU Max, for which L0 driver only
161+
// supports stateless.
162+
int32_t useOptimized32bitAccess();
163+
148164
bool isSubDevice() { return RootDevice != nullptr; }
149165

150166
// Is this a Data Center GPU Max series (aka PVC)?

source/adapters/level_zero/program.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
148148
ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
149149
? ZE_MODULE_FORMAT_IL_SPIRV
150150
: ZE_MODULE_FORMAT_NATIVE;
151+
151152
ZeModuleDesc.inputSize = hProgram->CodeLength;
152153
ZeModuleDesc.pInputModule = hProgram->Code.get();
153-
ZeModuleDesc.pBuildFlags = pOptions;
154+
155+
// if large allocations are selected, then pass
156+
// ze-opt-greater-than-4GB-buffer-required to disable
157+
// stateful optimizations and be able to use larger than
158+
// 4GB allocations on these kernels.
159+
std::string ZeBuildOptions{};
160+
if (pOptions) {
161+
ZeBuildOptions += pOptions;
162+
}
163+
164+
if (phDevices[0]->useOptimized32bitAccess() == 0) {
165+
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
166+
}
167+
168+
ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
154169
ZeModuleDesc.pConstants = Shim.ze();
155170

156171
ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
@@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
234249
// This produces better code because the driver can do cross-module
235250
// optimizations. Therefore, we just remember the compilation flags, so we
236251
// can use them later.
237-
if (Options)
252+
if (Options) {
238253
Program->BuildFlags = Options;
254+
255+
// if large allocations are selected, then pass
256+
// ze-opt-greater-than-4GB-buffer-required to disable
257+
// stateful optimizations and be able to use larger than
258+
// 4GB allocations on these kernels.
259+
if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
260+
Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
261+
}
262+
}
239263
Program->State = ur_program_handle_t_::Object;
240264

241265
return UR_RESULT_SUCCESS;

source/adapters/level_zero/usm.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
178178
ZeDesc.flags = 0;
179179
ZeDesc.ordinal = 0;
180180

181-
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
182-
if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
183-
// Tell Level-Zero to accept Size > maxMemAllocSize
181+
if (Device->useOptimized32bitAccess() == 0 &&
182+
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
183+
// Tell Level-Zero to accept Size > maxMemAllocSize if
184+
// large allocations are used.
185+
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
184186
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
185187
ZeDesc.pNext = &RelaxedDesc;
186188
}

0 commit comments

Comments
 (0)