@@ -89,6 +89,16 @@ static const bool UseImmediateCommandLists = [] {
89
89
return std::stoi (ImmediateFlag) > 0 ;
90
90
}();
91
91
92
// This is an experimental option that allows the use of multiple command lists
// when submitting barriers. It is controlled by the environment variable
// SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS and is enabled only when
// that variable parses to a positive integer. The default is 0 (disabled).
static const bool UseMultipleCmdlistBarriers = [] {
  // Note: the variable name must not carry any surrounding whitespace,
  // otherwise the lookup can never match what the user exported.
  const char *UseMultipleCmdlistBarriersFlag =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
  if (!UseMultipleCmdlistBarriersFlag)
    return false;
  // std::strtol is used rather than std::stoi because std::stoi throws on
  // malformed input and an exception escaping this initializer would not be
  // handled. A value that does not start with a number parses as 0, which
  // leaves the option disabled.
  return std::strtol(UseMultipleCmdlistBarriersFlag, nullptr, 10) > 0;
}();
101
+
92
102
// This class encapsulates actions taken along with a call to Level Zero API.
93
103
class ZeCall {
94
104
private:
@@ -1182,13 +1192,14 @@ pi_result resetCommandLists(pi_queue Queue) {
1182
1192
}
1183
1193
1184
1194
// Retrieve an available command list to be used in a PI call.
1185
- pi_result
1186
- _pi_context::getAvailableCommandList (pi_queue Queue,
1187
- pi_command_list_ptr_t &CommandList,
1188
- bool UseCopyEngine, bool AllowBatching) {
1195
+ pi_result _pi_context::getAvailableCommandList (
1196
+ pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
1197
+ bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {
1189
1198
// Immediate commandlists have been pre-allocated and are always available.
1190
1199
if (UseImmediateCommandLists) {
1191
1200
CommandList = Queue->getQueueGroup (UseCopyEngine).getImmCmdList ();
1201
+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1202
+ return Res;
1192
1203
return PI_SUCCESS;
1193
1204
}
1194
1205
@@ -1198,16 +1209,20 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1198
1209
// First see if there is a command-list open for batching commands
1199
1210
// for this queue.
1200
1211
if (Queue->hasOpenCommandList (UseCopyEngine)) {
1201
- if (AllowBatching) {
1212
+ if (AllowBatching &&
1213
+ (!ForcedCmdQueue ||
1214
+ *ForcedCmdQueue == CommandBatch.OpenCommandList ->second .ZeQueue )) {
1202
1215
CommandList = CommandBatch.OpenCommandList ;
1203
1216
return PI_SUCCESS;
1204
1217
}
1205
- // If this command isn't allowed to be batched, then we need to
1206
- // go ahead and execute what is already in the batched list,
1207
- // and then go on to process this. On exit from executeOpenCommandList
1208
- // OpenCommandList will be invalidated.
1218
+ // If this command isn't allowed to be batched or doesn't match the forced
1219
+ // command queue, then we need to go ahead and execute what is already in
1220
+ // the batched list, and then go on to process this. On exit from
1221
+ // executeOpenCommandList OpenCommandList will be invalidated.
1209
1222
if (auto Res = Queue->executeOpenCommandList (UseCopyEngine))
1210
1223
return Res;
1224
+ // Note that active barriers do not need to be inserted here as they will
1225
+ // have been enqueued into the command-list when they were created.
1211
1226
}
1212
1227
1213
1228
// Create/Reuse the command list, because in Level Zero commands are added to
@@ -1231,10 +1246,13 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1231
1246
: Queue->Context
1232
1247
->ZeComputeCommandListCache [Queue->Device ->ZeDevice ];
1233
1248
1234
- if (ZeCommandListCache.size () > 0 ) {
1235
- auto &ZeCommandList = ZeCommandListCache.front ();
1249
+ for (auto ZeCommandListIt = ZeCommandListCache.begin ();
1250
+ ZeCommandListIt != ZeCommandListCache.end (); ++ZeCommandListIt) {
1251
+ auto &ZeCommandList = *ZeCommandListIt;
1236
1252
auto it = Queue->CommandListMap .find (ZeCommandList);
1237
1253
if (it != Queue->CommandListMap .end ()) {
1254
+ if (ForcedCmdQueue && *ForcedCmdQueue != it->second .ZeQueue )
1255
+ continue ;
1238
1256
CommandList = it;
1239
1257
if (CommandList->second .ZeFence != nullptr )
1240
1258
CommandList->second .ZeFenceInUse = true ;
@@ -1243,9 +1261,13 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1243
1261
// wasn't yet used in this queue then create a new entry in this
1244
1262
// queue's map to hold the fence and other associated command
1245
1263
// list information.
1264
+ auto &QGroup = Queue->getQueueGroup (UseCopyEngine);
1246
1265
uint32_t QueueGroupOrdinal;
1247
- auto &ZeCommandQueue =
1248
- Queue->getQueueGroup (UseCopyEngine).getZeQueue (&QueueGroupOrdinal);
1266
+ auto &ZeCommandQueue = ForcedCmdQueue
1267
+ ? *ForcedCmdQueue
1268
+ : QGroup.getZeQueue (&QueueGroupOrdinal);
1269
+ if (ForcedCmdQueue)
1270
+ QueueGroupOrdinal = QGroup.getCmdQueueOrdinal (ZeCommandQueue);
1249
1271
1250
1272
ze_fence_handle_t ZeFence;
1251
1273
ZE_CALL (zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
@@ -1256,7 +1278,9 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1256
1278
QueueGroupOrdinal})
1257
1279
.first ;
1258
1280
}
1259
- ZeCommandListCache.pop_front ();
1281
+ ZeCommandListCache.erase (ZeCommandListIt);
1282
+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1283
+ return Res;
1260
1284
return PI_SUCCESS;
1261
1285
}
1262
1286
}
@@ -1293,9 +1317,12 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1293
1317
ze_command_list_handle_t ZeCommandList;
1294
1318
ze_fence_handle_t ZeFence;
1295
1319
1320
+ auto &QGroup = Queue->getQueueGroup (UseCopyEngine);
1296
1321
uint32_t QueueGroupOrdinal;
1297
1322
auto &ZeCommandQueue =
1298
- Queue->getQueueGroup (UseCopyEngine).getZeQueue (&QueueGroupOrdinal);
1323
+ ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue (&QueueGroupOrdinal);
1324
+ if (ForcedCmdQueue)
1325
+ QueueGroupOrdinal = QGroup.getCmdQueueOrdinal (ZeCommandQueue);
1299
1326
1300
1327
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
1301
1328
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
@@ -1308,6 +1335,8 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
1308
1335
std::tie (CommandList, std::ignore) = Queue->CommandListMap .insert (
1309
1336
std::pair<ze_command_list_handle_t , pi_command_list_info_t >(
1310
1337
ZeCommandList, {ZeFence, true , ZeCommandQueue, QueueGroupOrdinal}));
1338
+ if (auto Res = Queue->insertActiveBarriers (CommandList, UseCopyEngine))
1339
+ return Res;
1311
1340
pi_result = PI_SUCCESS;
1312
1341
1313
1342
return pi_result;
@@ -1585,6 +1614,18 @@ uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal,
1585
1614
return CurrentIndex;
1586
1615
}
1587
1616
1617
+ int32_t _pi_queue::pi_queue_group_t::getCmdQueueOrdinal (
1618
+ ze_command_queue_handle_t CmdQueue) {
1619
+ // Find out the right queue group ordinal (first queue might be "main" or
1620
+ // "link")
1621
+ auto QueueType = Type;
1622
+ if (QueueType != queue_type::Compute)
1623
+ QueueType = (ZeQueues[0 ] == CmdQueue && Queue->Device ->hasMainCopyEngine ())
1624
+ ? queue_type::MainCopy
1625
+ : queue_type::LinkCopy;
1626
+ return Queue->Device ->QueueGroup [QueueType].ZeOrdinal ;
1627
+ }
1628
+
1588
1629
// This function will return one of possibly multiple available native
1589
1630
// queues and the value of the queue group ordinal.
1590
1631
ze_command_queue_handle_t &
@@ -1697,6 +1738,36 @@ pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) {
1697
1738
return CommandListMap.end ();
1698
1739
}
1699
1740
1741
+ pi_result _pi_queue::insertActiveBarriers (pi_command_list_ptr_t &CmdList,
1742
+ bool UseCopyEngine) {
1743
+ // Early exit if there are no active barriers.
1744
+ if (ActiveBarriers.empty ())
1745
+ return PI_SUCCESS;
1746
+
1747
+ // Create a wait-list and retain events. This will filter out finished events.
1748
+ _pi_ze_event_list_t ActiveBarriersWaitList;
1749
+ if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList (
1750
+ ActiveBarriers.size (), ActiveBarriers.data (), this , UseCopyEngine))
1751
+ return Res;
1752
+
1753
+ // We can now release all the active barriers and replace them with the ones
1754
+ // in the wait list.
1755
+ for (pi_event &BarrierEvent : ActiveBarriers)
1756
+ PI_CALL (piEventRelease (BarrierEvent));
1757
+ ActiveBarriers.clear ();
1758
+ ActiveBarriers.insert (
1759
+ ActiveBarriers.end (), ActiveBarriersWaitList.PiEventList ,
1760
+ ActiveBarriersWaitList.PiEventList + ActiveBarriersWaitList.Length );
1761
+
1762
+ // If there are more active barriers, insert a barrier on the command-list. We
1763
+ // do not need an event for finishing so we pass nullptr.
1764
+ if (!ActiveBarriers.empty ())
1765
+ ZE_CALL (zeCommandListAppendBarrier,
1766
+ (CmdList->first , nullptr , ActiveBarriersWaitList.Length ,
1767
+ ActiveBarriersWaitList.ZeEventList ));
1768
+ return PI_SUCCESS;
1769
+ }
1770
+
1700
1771
pi_result _pi_queue::executeOpenCommandList (bool IsCopy) {
1701
1772
auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
1702
1773
// If there are any commands still in the open command list for this
@@ -6013,35 +6084,154 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
6013
6084
// Lock automatically releases when this goes out of scope.
6014
6085
std::scoped_lock lock (Queue->Mutex );
6015
6086
6016
- bool UseCopyEngine = false ;
6017
-
6018
- _pi_ze_event_list_t TmpWaitList;
6019
- if (auto Res = TmpWaitList.createAndRetainPiZeEventList (
6020
- NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
6021
- return Res;
6087
+ // Helper function for appending a barrier to a command list.
6088
+ auto insertBarrierIntoCmdList =
6089
+ [&Queue](pi_command_list_ptr_t CmdList,
6090
+ const _pi_ze_event_list_t &EventWaitList, pi_event &Event) {
6091
+ if (auto Res = createEventAndAssociateQueue (
6092
+ Queue, &Event, PI_COMMAND_TYPE_USER, CmdList))
6093
+ return Res;
6094
+ Event->WaitList = EventWaitList;
6095
+ ZE_CALL (zeCommandListAppendBarrier,
6096
+ (CmdList->first , Event->ZeEvent , EventWaitList.Length ,
6097
+ EventWaitList.ZeEventList ));
6098
+ return PI_SUCCESS;
6099
+ };
6022
6100
6023
- // Get a new command list to be used on this call
6101
+ // Indicator for whether batching is allowed. This may be changed later in
6102
+ // this function, but allow it by default.
6024
6103
bool OkToBatch = true ;
6025
- pi_command_list_ptr_t CommandList{};
6026
- if (auto Res = Queue->Context ->getAvailableCommandList (
6027
- Queue, CommandList, UseCopyEngine, OkToBatch))
6028
- return Res;
6029
6104
6030
- ze_event_handle_t ZeEvent = nullptr ;
6031
- auto Res = createEventAndAssociateQueue (Queue, Event, PI_COMMAND_TYPE_USER,
6032
- CommandList);
6033
- if (Res != PI_SUCCESS)
6034
- return Res;
6035
- ZeEvent = (*Event)->ZeEvent ;
6036
- (*Event)->WaitList = TmpWaitList;
6105
+ // If we have a list of events to make the barrier from, then we can create a
6106
+ // barrier on these and use the resulting event as our future barrier.
6107
+ // We use the same approach if
6108
+ // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
6109
+ // positive value.
6110
+ if (NumEventsInWaitList || !UseMultipleCmdlistBarriers) {
6111
+ // Retain the events as they will be owned by the result event.
6112
+ _pi_ze_event_list_t TmpWaitList;
6113
+ if (auto Res = TmpWaitList.createAndRetainPiZeEventList (
6114
+ NumEventsInWaitList, EventWaitList, Queue,
6115
+ /* UseCopyEngine=*/ false ))
6116
+ return Res;
6037
6117
6038
- ZE_CALL (zeCommandListAppendBarrier,
6039
- (CommandList->first , ZeEvent, (*Event)->WaitList .Length ,
6040
- (*Event)->WaitList .ZeEventList ));
6118
+ // Get an arbitrary command-list in the queue.
6119
+ pi_command_list_ptr_t CmdList;
6120
+ if (auto Res = Queue->Context ->getAvailableCommandList (
6121
+ Queue, CmdList,
6122
+ /* UseCopyEngine=*/ false , OkToBatch))
6123
+ return Res;
6041
6124
6042
- // Execute command list asynchronously as the event will be used
6043
- // to track down its completion.
6044
- return Queue->executeCommandList (CommandList, false , OkToBatch);
6125
+ // Insert the barrier into the command-list and execute.
6126
+ if (auto Res = insertBarrierIntoCmdList (CmdList, TmpWaitList, *Event))
6127
+ return Res;
6128
+ if (auto Res = Queue->executeCommandList (CmdList, false , OkToBatch))
6129
+ return Res;
6130
+
6131
+ if (UseMultipleCmdlistBarriers) {
6132
+ // Retain and save the resulting event for future commands.
6133
+ PI_CALL (piEventRetain (*Event));
6134
+ Queue->ActiveBarriers .push_back (*Event);
6135
+ }
6136
+ return PI_SUCCESS;
6137
+ }
6138
+
6139
+ // Since there are no events to explicitly create a barrier for, we are
6140
+ // inserting a queue-wide barrier. As such, the barrier will also encapsulate
6141
+ // the active barriers, so we can release and clear the active barriers list.
6142
+ // Doing it early prevents potential additional barriers from implicitly being
6143
+ // appended.
6144
+ for (pi_event &E : Queue->ActiveBarriers )
6145
+ PI_CALL (piEventRelease (E));
6146
+ Queue->ActiveBarriers .clear ();
6147
+
6148
+ // Get command lists for each command queue.
6149
+ std::vector<pi_command_list_ptr_t > CmdLists;
6150
+ if (UseImmediateCommandLists) {
6151
+ // If immediate command lists are being used, each will act as their own
6152
+ // queue, so we must insert a barrier into each.
6153
+ CmdLists.reserve (Queue->CommandListMap .size ());
6154
+ for (auto It = Queue->CommandListMap .begin ();
6155
+ It != Queue->CommandListMap .end (); ++It)
6156
+ CmdLists.push_back (It);
6157
+ } else if (Queue->ComputeQueueGroup .ZeQueues .empty () &&
6158
+ Queue->CopyQueueGroup .ZeQueues .empty ()) {
6159
+ // If there are no queues, we get any available command list.
6160
+ pi_command_list_ptr_t CmdList;
6161
+ if (auto Res = Queue->Context ->getAvailableCommandList (
6162
+ Queue, CmdList,
6163
+ /* UseCopyEngine=*/ false , OkToBatch))
6164
+ return Res;
6165
+ CmdLists.push_back (CmdList);
6166
+ } else {
6167
+ size_t NumQueues = Queue->ComputeQueueGroup .ZeQueues .size () +
6168
+ Queue->CopyQueueGroup .ZeQueues .size ();
6169
+ // Only allow batching if there is only a single queue as otherwise the
6170
+ // following availability command list lookups will prematurely push
6171
+ // open batch command lists out.
6172
+ OkToBatch = NumQueues == 1 ;
6173
+ // Get an available command list tied to each command queue. We need these
6174
+ // so a queue-wide barrier can be inserted into each command queue.
6175
+ CmdLists.reserve (NumQueues);
6176
+ for (auto QueueGroup : {Queue->ComputeQueueGroup , Queue->CopyQueueGroup }) {
6177
+ bool UseCopyEngine = QueueGroup.Type != _pi_queue::queue_type::Compute;
6178
+ for (ze_command_queue_handle_t ZeQueue : QueueGroup.ZeQueues ) {
6179
+ pi_command_list_ptr_t CmdList;
6180
+ if (auto Res = Queue->Context ->getAvailableCommandList (
6181
+ Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue))
6182
+ return Res;
6183
+ CmdLists.push_back (CmdList);
6184
+ }
6185
+ }
6186
+ }
6187
+
6188
+ // Insert a barrier into each unique command queue using the available
6189
+ // command-lists.
6190
+ std::vector<pi_event> EventWaitVector (CmdLists.size ());
6191
+ for (size_t I = 0 ; I < CmdLists.size (); ++I)
6192
+ if (auto Res = insertBarrierIntoCmdList (CmdLists[I], _pi_ze_event_list_t {},
6193
+ EventWaitVector[I]))
6194
+ return Res;
6195
+
6196
+ if (CmdLists.size () > 1 ) {
6197
+ // If there were multiple queues we need to create a "convergence" event to
6198
+ // be our active barrier. This convergence event is signalled by a barrier
6199
+ // on all the events from the barriers we have inserted into each queue.
6200
+ // Use the first command list as our convergence command list.
6201
+ pi_command_list_ptr_t &ConvergenceCmdList = CmdLists[0 ];
6202
+
6203
+ // Create an event list. It will take ownership over all relevant events so
6204
+ // we relinquish ownership and let it keep all events it needs.
6205
+ _pi_ze_event_list_t BaseWaitList;
6206
+ if (auto Res = BaseWaitList.createAndRetainPiZeEventList (
6207
+ EventWaitVector.size (), EventWaitVector.data (), Queue,
6208
+ ConvergenceCmdList->second .isCopy (Queue)))
6209
+ return Res;
6210
+ for (pi_event &E : EventWaitVector)
6211
+ PI_CALL (piEventRelease (E));
6212
+
6213
+ // Insert a barrier with the events from each command-queue into the
6214
+ // convergence command list. The resulting event signals the convergence of
6215
+ // all barriers.
6216
+ if (auto Res =
6217
+ insertBarrierIntoCmdList (ConvergenceCmdList, BaseWaitList, *Event))
6218
+ return Res;
6219
+ } else {
6220
+ // If there is only a single queue we have inserted all the barriers we need
6221
+ // and the single result event can be used as our active barrier and used as
6222
+ // the return event.
6223
+ *Event = EventWaitVector[0 ];
6224
+ }
6225
+
6226
+ // Execute each command list so the barriers can be encountered.
6227
+ for (pi_command_list_ptr_t &CmdList : CmdLists)
6228
+ if (auto Res = Queue->executeCommandList (CmdList, false , OkToBatch))
6229
+ return Res;
6230
+
6231
+ // We must keep the event internally to use if new command lists are created.
6232
+ PI_CALL (piEventRetain (*Event));
6233
+ Queue->ActiveBarriers .push_back (*Event);
6234
+ return PI_SUCCESS;
6045
6235
}
6046
6236
6047
6237
pi_result piEnqueueMemBufferRead (pi_queue Queue, pi_mem Src,
@@ -6134,6 +6324,13 @@ pi_result _pi_queue::synchronize() {
6134
6324
if (ZeQueue)
6135
6325
ZE_CALL (zeHostSynchronize, (ZeQueue));
6136
6326
}
6327
+
6328
+ // With the entire queue synchronized, the active barriers must be done so we
6329
+ // can remove them.
6330
+ for (pi_event &BarrierEvent : ActiveBarriers)
6331
+ PI_CALL (piEventRelease (BarrierEvent));
6332
+ ActiveBarriers.clear ();
6333
+
6137
6334
return PI_SUCCESS;
6138
6335
}
6139
6336
0 commit comments