Skip to content

Commit c145959

Browse files
[SYCL] Perform eager initialization on demand (#6430)
1 parent 616ecf7 commit c145959

File tree

4 files changed

+82
-18
lines changed

4 files changed

+82
-18
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ compiler and runtime.
2323
| `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/supported/sycl_ext_oneapi_default_context.asciidoc) extension to learn more. Enabled by default on Linux and disabled on Windows. |
2424
| `SYCL_RT_WARNING_LEVEL` | Positive integer | The higher the warning level, the more warnings and performance hints the runtime library may print. The default value is '0', which means no warning/hint messages from the runtime library are allowed. The value '1' enables performance warnings from device runtime/codegen. Values greater than 1 are reserved for future use. |
2525
| `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enable by specifying a non-zero value. Buffers created with a host pointer will result in host data promotion to USM, improving data transfer performance. To use this feature, also set SYCL_HOST_UNIFIED_MEMORY=1. |
26+
| `SYCL_EAGER_INIT` | Integer | Enable by specifying a non-zero value. Tells the SYCL runtime to perform as much initialization as possible at object construction, as opposed to initializing lazily on the fly. This may mean doing some redundant work at warmup, but it ensures the fastest possible execution on the subsequent hot and reportable paths. It also instructs PI plugins to do the same. Default is "0". |
2627

2728
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
2829

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,15 @@ static void zePrint(const char *Format, ...) {
215215
}
216216
}
217217

218+
// Controls whether we should do eager initialization
219+
// to make it happen on warmup paths and have the reportable
220+
// paths be less likely to be affected.
221+
//
222+
static bool doEagerInit = [] {
223+
const char *EagerInit = std::getenv("SYCL_EAGER_INIT");
224+
return EagerInit ? std::atoi(EagerInit) != 0 : false;
225+
}();
226+
218227
// Controls whether device-scope events are used, and how.
219228
static const enum EventsScope {
220229
// All events are created host-visible.
@@ -1230,7 +1239,7 @@ pi_result _pi_context::getAvailableCommandList(
12301239
// Each command list is paired with an associated fence to track when the
12311240
// command list is available for reuse.
12321241
_pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES;
1233-
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1242+
12341243
// Initially, we need to check if a command list has already been created
12351244
// on this device that is available for use. If so, then reuse that
12361245
// Level-Zero Command List and Fence for this PI call.
@@ -1270,6 +1279,7 @@ pi_result _pi_context::getAvailableCommandList(
12701279
QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
12711280

12721281
ze_fence_handle_t ZeFence;
1282+
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
12731283
ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
12741284
CommandList =
12751285
Queue->CommandListMap
@@ -1310,15 +1320,28 @@ pi_result _pi_context::getAvailableCommandList(
13101320
}
13111321
}
13121322

1313-
// If there are no available command lists nor signalled command lists, then
1314-
// we must create another command list.
1315-
// Once created, this command list & fence are added to the command list fence
1316-
// map.
1317-
ze_command_list_handle_t ZeCommandList;
1323+
// If there are no available command lists nor signalled command lists,
1324+
// then we must create another command list.
1325+
pi_result = Queue->createCommandList(UseCopyEngine, CommandList);
1326+
CommandList->second.ZeFenceInUse = true;
1327+
return pi_result;
1328+
}
1329+
1330+
// Helper function to create a new command-list for this queue and an associated
1331+
// fence tracking its completion. This command list & fence are added to the
1332+
// map of command lists in this queue with ZeFenceInUse = false.
1333+
// The caller must already hold the lock of the queue.
1334+
pi_result
1335+
_pi_queue::createCommandList(bool UseCopyEngine,
1336+
pi_command_list_ptr_t &CommandList,
1337+
ze_command_queue_handle_t *ForcedCmdQueue) {
1338+
13181339
ze_fence_handle_t ZeFence;
1340+
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1341+
ze_command_list_handle_t ZeCommandList;
13191342

1320-
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
13211343
uint32_t QueueGroupOrdinal;
1344+
auto &QGroup = getQueueGroup(UseCopyEngine);
13221345
auto &ZeCommandQueue =
13231346
ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal);
13241347
if (ForcedCmdQueue)
@@ -1327,19 +1350,16 @@ pi_result _pi_context::getAvailableCommandList(
13271350
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
13281351
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
13291352

1330-
ZE_CALL(zeCommandListCreate,
1331-
(Queue->Context->ZeContext, Queue->Device->ZeDevice,
1332-
&ZeCommandListDesc, &ZeCommandList));
1353+
ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
1354+
&ZeCommandListDesc, &ZeCommandList));
13331355

13341356
ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1335-
std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
1357+
std::tie(CommandList, std::ignore) = CommandListMap.insert(
13361358
std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
1337-
ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
1338-
if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
1339-
return Res;
1340-
pi_result = PI_SUCCESS;
1359+
ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal}));
13411360

1342-
return pi_result;
1361+
PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine));
1362+
return PI_SUCCESS;
13431363
}
13441364

13451365
void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) {
@@ -3396,6 +3416,41 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
33963416
} catch (...) {
33973417
return PI_ERROR_UNKNOWN;
33983418
}
3419+
3420+
// Do eager initialization of Level Zero handles on request.
3421+
if (doEagerInit) {
3422+
pi_queue Q = *Queue;
3423+
// Creates said number of command-lists.
3424+
auto warmupQueueGroup = [Q](bool UseCopyEngine,
3425+
uint32_t RepeatCount) -> pi_result {
3426+
pi_command_list_ptr_t CommandList;
3427+
while (RepeatCount--) {
3428+
if (UseImmediateCommandLists) {
3429+
CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList();
3430+
} else {
3431+
// Heuristically create some number of regular command-lists to reuse.
3432+
for (int I = 0; I < 10; ++I) {
3433+
PI_CALL(Q->createCommandList(UseCopyEngine, CommandList));
3434+
// Immediately return them to the cache of available command-lists.
3435+
std::vector<pi_event> EventsUnused;
3436+
PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */,
3437+
EventsUnused));
3438+
}
3439+
}
3440+
}
3441+
return PI_SUCCESS;
3442+
};
3443+
// Create as many command-lists as there are queues in the group.
3444+
// With this the underlying round-robin logic would initialize all
3445+
// native queues, and create command-lists and their fences.
3446+
PI_CALL(warmupQueueGroup(false, Q->ComputeQueueGroup.UpperIndex -
3447+
Q->ComputeQueueGroup.LowerIndex + 1));
3448+
if (Q->useCopyEngine()) {
3449+
PI_CALL(warmupQueueGroup(true, Q->CopyQueueGroup.UpperIndex -
3450+
Q->CopyQueueGroup.LowerIndex + 1));
3451+
}
3452+
// TODO: warmup event pools. Both host-visible and device-only.
3453+
}
33993454
return PI_SUCCESS;
34003455
}
34013456

sycl/plugins/level_zero/pi_level_zero.hpp

100755100644
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,14 @@ struct _pi_queue : _pi_object {
954954
// For non-copy commands, IsCopy is set to 'false'.
955955
void adjustBatchSizeForPartialBatch(bool IsCopy);
956956

957+
// Helper function to create a new command-list for this queue and an associated
958+
// fence tracking its completion. This command list & fence are added to the
959+
// map of command lists in this queue with ZeFenceInUse = false.
960+
// The caller must already hold the lock of the queue.
961+
pi_result
962+
createCommandList(bool UseCopyEngine, pi_command_list_ptr_t &CommandList,
963+
ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
964+
957965
// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
958966
// If the reset command list should be made available, then MakeAvailable
959967
// needs to be set to true. The caller must verify that this command list and

sycl/source/detail/plugin.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct array_fill_helper<Kind, Idx, T> {
5555

5656
template <PiApiKind Kind, size_t Idx, typename T, typename... Args>
5757
struct array_fill_helper<Kind, Idx, T, Args...> {
58-
static void fill(unsigned char *Dst, const T &&Arg, Args &&... Rest) {
58+
static void fill(unsigned char *Dst, const T &&Arg, Args &&...Rest) {
5959
using ArgsTuple = typename PiApiArgTuple<Kind>::type;
6060
// C-style cast is required here.
6161
auto RealArg = (std::tuple_element_t<Idx, ArgsTuple>)(Arg);
@@ -71,7 +71,7 @@ constexpr size_t totalSize(const std::tuple<Ts...> &) {
7171
}
7272

7373
template <PiApiKind Kind, typename... ArgsT>
74-
auto packCallArguments(ArgsT &&... Args) {
74+
auto packCallArguments(ArgsT &&...Args) {
7575
using ArgsTuple = typename PiApiArgTuple<Kind>::type;
7676

7777
constexpr size_t TotalSize = totalSize(ArgsTuple{});

0 commit comments

Comments
 (0)