45
45
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
46
46
└──────────┴────────────────────────────────────────────────┴─────────┘
47
47
48
- ┌───────────────────┬──────────────────────────────┐
49
- Prefix │Reset signal event │ Barrier waiting on wait event│
50
- └───────────────────┴──────────────────────────────┘
48
+ ┌───────────────────┬──────────────┐──────────────────────────────┐
49
+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50
+ └───────────────────┴──────────────┘──────────────────────────────┘
51
51
52
52
┌─────────────────────────────────────────────┐──────────────┐
53
- Suffix │Barrier waiting on sync-point event, │ Reset events │
54
- │signalling the UR command-buffer signal event│ │
53
+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54
+ │signalling the UR command-buffer signal event│ Timestamps │
55
55
└─────────────────────────────────────────────┘──────────────┘
56
56
57
57
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433
433
434
434
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
435
435
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436
+ // Dependencies between commands are explicitly enforced by sync points when
437
+ // enqueuing. Consequently, relaxing the command ordering in the command list
438
+ // can enable the backend to further optimize the workload
439
+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436
440
437
441
ze_command_list_handle_t ZeCommandList;
438
442
// TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -494,18 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
494
498
(CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
495
499
NumEvents, WaitEventList.data ()));
496
500
497
- // Reset the wait-event for the UR command-buffer that is signalled when its
498
- // submission dependencies have been satisfied.
499
- ZE2UR_CALL (zeCommandListAppendEventReset,
500
- (CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
501
-
502
- // Reset the L0 events we use for command-buffer internal sync-points to the
503
- // non-signalled state
504
- for (auto Event : WaitEventList) {
505
- ZE2UR_CALL (zeCommandListAppendEventReset,
506
- (CommandBuffer->ZeCommandList , Event));
507
- }
508
-
509
501
// Close the command list and have it ready for dispatch.
510
502
ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
511
503
return UR_RESULT_SUCCESS;
@@ -899,14 +891,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899
891
// Create a command-list that executes before `CommandListPtr` and signals
900
892
// when `EventWaitList` dependencies are complete.
901
893
ur_command_list_ptr_t WaitCommandList{};
894
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
895
+ false ));
896
+
897
+ // Create a list of all the events that compose the command buffer
898
+ // workload.
899
+ // This loop also resets the L0 events we use for command-buffer internal
900
+ // sync-points to the non-signalled state.
901
+ // This is required for multiple submissions.
902
+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
903
+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
904
+ for (size_t i = 0 ; i < NumEvents; i++) {
905
+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
906
+ WaitEventList[i] = ZeEvent;
907
+ ZE2UR_CALL (zeCommandListAppendEventReset,
908
+ (WaitCommandList->first , ZeEvent));
909
+ }
910
+
902
911
if (NumEventsInWaitList) {
903
912
_ur_ze_event_list_t TmpWaitList;
904
913
UR_CALL (TmpWaitList.createAndRetainUrZeEventList (
905
914
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906
915
907
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
908
- false , false ))
909
-
910
916
// Update the WaitList of the Wait Event
911
917
// Events are appended to the WaitList if the WaitList is not empty
912
918
if (CommandBuffer->WaitEvent ->WaitList .isEmpty ())
@@ -919,9 +925,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919
925
CommandBuffer->WaitEvent ->WaitList .Length ,
920
926
CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
921
927
} else {
922
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
923
- false , false ));
924
-
925
928
ZE2UR_CALL (zeCommandListAppendSignalEvent,
926
929
(WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
927
930
}
@@ -930,17 +933,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
930
933
ur_event_handle_t RetEvent{};
931
934
// Create a command-list to signal RetEvent on completion
932
935
ur_command_list_ptr_t SignalCommandList{};
933
- if (Event) {
934
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
935
- false , false ));
936
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
937
+ false , false ));
938
+ // Reset the wait-event for the UR command-buffer that is signalled when its
939
+ // submission dependencies have been satisfied.
940
+ ZE2UR_CALL (zeCommandListAppendEventReset,
941
+ (SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
936
942
943
+ if (Event) {
937
944
UR_CALL (createEventAndAssociateQueue (Queue, &RetEvent,
938
945
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
939
946
SignalCommandList, false ));
940
947
941
- ZE2UR_CALL (zeCommandListAppendBarrier,
942
- (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
943
- &(CommandBuffer->SignalEvent ->ZeEvent )));
948
+ if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
949
+ // Multiple submissions of a command buffer implies that we need to save
950
+ // the event timestamps before resubmitting the command buffer. We
951
+ // therefore copy these timestamps into a dedicated USM memory section
952
+ // before completing the command buffer execution, and then attach this
953
+ // memory to the event returned to users to allow the profiling
954
+ // engine to recover these timestamps.
955
+ command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
956
+
957
+ Profiling->NumEvents = WaitEventList.size ();
958
+ Profiling->Timestamps =
959
+ new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
960
+
961
+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
962
+ (SignalCommandList->first , WaitEventList.size (),
963
+ WaitEventList.data (), (void *)Profiling->Timestamps , 0 ,
964
+ RetEvent->ZeEvent , 1 ,
965
+ &(CommandBuffer->SignalEvent ->ZeEvent )));
966
+
967
+ RetEvent->CommandData = static_cast <void *>(Profiling);
968
+ } else {
969
+ ZE2UR_CALL (zeCommandListAppendBarrier,
970
+ (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
971
+ &(CommandBuffer->SignalEvent ->ZeEvent )));
972
+ }
944
973
}
945
974
946
975
// Execute our command-lists asynchronously
0 commit comments