Skip to content

Commit ce4acbc

Browse files
authored
Merge pull request #1099 from jandres742/largeallocations
[UR][L0] Unify use of large allocation in L0 adapter
2 parents 76aaf05 + 28590a8 commit ce4acbc

File tree

4 files changed

+70
-6
lines changed

4 files changed

+70
-6
lines changed

source/adapters/level_zero/device.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
267267
return ReturnValue(uint32_t{64});
268268
}
269269
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
270-
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
270+
// if not optimized for 32-bit access, return total memory size.
271+
// otherwise, return only maximum allocatable size.
272+
if (Device->useOptimized32bitAccess() == 0) {
273+
return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
274+
} else {
275+
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
276+
}
271277
case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
272278
// Support to read physicalSize depends on kernel,
273279
// so fallback into reading totalSize if physicalSize
@@ -911,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() {
911917
}
912918
}
913919

920+
int32_t ur_device_handle_t_::useOptimized32bitAccess() {
921+
static const int32_t Optimize32bitAccessMode = [this] {
922+
// If device is Intel(R) Data Center GPU Max,
923+
// use default provided by L0 driver.
924+
// TODO: Use IP versioning to select based on range of devices
925+
if (this->isPVC())
926+
return -1;
927+
const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
928+
if (!UrRet)
929+
return 0;
930+
return std::atoi(UrRet);
931+
}();
932+
933+
return Optimize32bitAccessMode;
934+
}
935+
914936
ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
915937
int SubSubDeviceIndex) {
916938
// Maintain various device properties cache.

source/adapters/level_zero/device.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object {
145145
// Returns whether immediate command lists are used on this device.
146146
ImmCmdlistMode ImmCommandListUsed{};
147147

148+
// Returns whether large allocations are being used
149+
// or not to have a consistent behavior throughout
150+
// the adapter between the creation of large allocations
151+
// and the compilation of kernels into stateful and
152+
// stateless modes.
153+
// With stateful mode, kernels are compiled with
154+
// pointer-arithmetic optimizations for optimized
155+
// access of allocations smaller than 4GB.
156+
// In stateless mode, such optimizations are not
157+
// applied.
158+
// Even if a GPU supports both modes, L0 driver may
159+
// provide support for only one, like for Intel(R)
160+
// Data Center GPU Max, for which L0 driver only
161+
// supports stateless.
162+
int32_t useOptimized32bitAccess();
163+
148164
bool isSubDevice() { return RootDevice != nullptr; }
149165

150166
// Is this a Data Center GPU Max series (aka PVC)?

source/adapters/level_zero/program.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
148148
ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
149149
? ZE_MODULE_FORMAT_IL_SPIRV
150150
: ZE_MODULE_FORMAT_NATIVE;
151+
151152
ZeModuleDesc.inputSize = hProgram->CodeLength;
152153
ZeModuleDesc.pInputModule = hProgram->Code.get();
153-
ZeModuleDesc.pBuildFlags = pOptions;
154+
155+
// if large allocations are selected, then pass
156+
// ze-opt-greater-than-4GB-buffer-required to disable
157+
// stateful optimizations and be able to use larger than
158+
// 4GB allocations on these kernels.
159+
std::string ZeBuildOptions{};
160+
if (pOptions) {
161+
ZeBuildOptions += pOptions;
162+
}
163+
164+
if (phDevices[0]->useOptimized32bitAccess() == 0) {
165+
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
166+
}
167+
168+
ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
154169
ZeModuleDesc.pConstants = Shim.ze();
155170

156171
ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
@@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
234249
// This produces better code because the driver can do cross-module
235250
// optimizations. Therefore, we just remember the compilation flags, so we
236251
// can use them later.
237-
if (Options)
252+
if (Options) {
238253
Program->BuildFlags = Options;
254+
255+
// if large allocations are selected, then pass
256+
// ze-opt-greater-than-4GB-buffer-required to disable
257+
// stateful optimizations and be able to use larger than
258+
// 4GB allocations on these kernels.
259+
if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
260+
Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
261+
}
262+
}
239263
Program->State = ur_program_handle_t_::Object;
240264

241265
return UR_RESULT_SUCCESS;

source/adapters/level_zero/usm.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
178178
ZeDesc.flags = 0;
179179
ZeDesc.ordinal = 0;
180180

181-
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
182-
if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
183-
// Tell Level-Zero to accept Size > maxMemAllocSize
181+
if (Device->useOptimized32bitAccess() == 0 &&
182+
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
183+
// Tell Level-Zero to accept Size > maxMemAllocSize if
184+
// large allocations are used.
185+
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
184186
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
185187
ZeDesc.pNext = &RelaxedDesc;
186188
}

0 commit comments

Comments
 (0)