Skip to content

Commit d06ba9d

Browse files
authored
Merge pull request #1035 from Bensuo/cmd-buffer-profiling-l0
[EXP][CMDBUF] Add support for recovering CommandBuffer profiling information
2 parents e1414e1 + 35b6a5e commit d06ba9d

File tree

3 files changed

+138
-29
lines changed

3 files changed

+138
-29
lines changed

source/adapters/level_zero/command_buffer.cpp

Lines changed: 58 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@
4545
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
4646
└──────────┴────────────────────────────────────────────────┴─────────┘
4747
48-
┌───────────────────┬──────────────────────────────┐
49-
Prefix │Reset signal event │ Barrier waiting on wait event│
50-
└───────────────────┴──────────────────────────────┘
48+
┌───────────────────┬──────────────┐──────────────────────────────┐
49+
Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50+
└───────────────────┴──────────────┘──────────────────────────────┘
5151
5252
┌─────────────────────────────────────────────┐──────────────┐
53-
Suffix │Barrier waiting on sync-point event, │ Reset events
54-
│signalling the UR command-buffer signal event│
53+
Suffix │Barrier waiting on sync-point event, │ Query CMD
54+
│signalling the UR command-buffer signal event│ Timestamps
5555
└─────────────────────────────────────────────┘──────────────┘
5656
5757
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433433

434434
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
435435
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436+
// Dependencies between commands are explicitly enforced by sync points when
437+
// enqueuing. Consequently, relax the command ordering in the command list
438+
// can enable the backend to further optimize the workload
439+
ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436440

437441
ze_command_list_handle_t ZeCommandList;
438442
// TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -494,18 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
494498
(CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent,
495499
NumEvents, WaitEventList.data()));
496500

497-
// Reset the wait-event for the UR command-buffer that is signalled when its
498-
// submission dependencies have been satisfied.
499-
ZE2UR_CALL(zeCommandListAppendEventReset,
500-
(CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent));
501-
502-
// Reset the L0 events we use for command-buffer internal sync-points to the
503-
// non-signalled state
504-
for (auto Event : WaitEventList) {
505-
ZE2UR_CALL(zeCommandListAppendEventReset,
506-
(CommandBuffer->ZeCommandList, Event));
507-
}
508-
509501
// Close the command list and have it ready for dispatch.
510502
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
511503
return UR_RESULT_SUCCESS;
@@ -899,14 +891,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899891
// Create command-list to execute before `CommandListPtr` and will signal
900892
// when `EventWaitList` dependencies are complete.
901893
ur_command_list_ptr_t WaitCommandList{};
894+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false,
895+
false));
896+
897+
// Create a list of events of all the events that compose the command buffer
898+
// workload.
899+
// This loop also resets the L0 events we use for command-buffer internal
900+
// sync-points to the non-signalled state.
901+
// This is required for multiple submissions.
902+
const size_t NumEvents = CommandBuffer->SyncPoints.size();
903+
std::vector<ze_event_handle_t> WaitEventList{NumEvents};
904+
for (size_t i = 0; i < NumEvents; i++) {
905+
auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent;
906+
WaitEventList[i] = ZeEvent;
907+
ZE2UR_CALL(zeCommandListAppendEventReset,
908+
(WaitCommandList->first, ZeEvent));
909+
}
910+
902911
if (NumEventsInWaitList) {
903912
_ur_ze_event_list_t TmpWaitList;
904913
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
905914
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906915

907-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
908-
false, false))
909-
910916
// Update the WaitList of the Wait Event
911917
// Events are appended to the WaitList if the WaitList is not empty
912918
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
@@ -919,9 +925,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919925
CommandBuffer->WaitEvent->WaitList.Length,
920926
CommandBuffer->WaitEvent->WaitList.ZeEventList));
921927
} else {
922-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
923-
false, false));
924-
925928
ZE2UR_CALL(zeCommandListAppendSignalEvent,
926929
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
927930
}
@@ -930,17 +933,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
930933
ur_event_handle_t RetEvent{};
931934
// Create a command-list to signal RetEvent on completion
932935
ur_command_list_ptr_t SignalCommandList{};
933-
if (Event) {
934-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
935-
false, false));
936+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
937+
false, false));
938+
// Reset the wait-event for the UR command-buffer that is signalled when its
939+
// submission dependencies have been satisfied.
940+
ZE2UR_CALL(zeCommandListAppendEventReset,
941+
(SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
936942

943+
if (Event) {
937944
UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent,
938945
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
939946
SignalCommandList, false));
940947

941-
ZE2UR_CALL(zeCommandListAppendBarrier,
942-
(SignalCommandList->first, RetEvent->ZeEvent, 1,
943-
&(CommandBuffer->SignalEvent->ZeEvent)));
948+
if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
949+
// Multiple submissions of a command buffer implies that we need to save
950+
// the event timestamps before resubmiting the command buffer. We
951+
// therefore copy the these timestamps in a dedicated USM memory section
952+
// before completing the command buffer execution, and then attach this
953+
// memory to the event returned to users to allow to allow the profiling
954+
// engine to recover these timestamps.
955+
command_buffer_profiling_t *Profiling = new command_buffer_profiling_t();
956+
957+
Profiling->NumEvents = WaitEventList.size();
958+
Profiling->Timestamps =
959+
new ze_kernel_timestamp_result_t[Profiling->NumEvents];
960+
961+
ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps,
962+
(SignalCommandList->first, WaitEventList.size(),
963+
WaitEventList.data(), (void *)Profiling->Timestamps, 0,
964+
RetEvent->ZeEvent, 1,
965+
&(CommandBuffer->SignalEvent->ZeEvent)));
966+
967+
RetEvent->CommandData = static_cast<void *>(Profiling);
968+
} else {
969+
ZE2UR_CALL(zeCommandListAppendBarrier,
970+
(SignalCommandList->first, RetEvent->ZeEvent, 1,
971+
&(CommandBuffer->SignalEvent->ZeEvent)));
972+
}
944973
}
945974

946975
// Execution our command-lists asynchronously

source/adapters/level_zero/command_buffer.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
#include "context.hpp"
2020
#include "queue.hpp"
2121

22+
struct command_buffer_profiling_t {
23+
ur_exp_command_buffer_sync_point_t NumEvents;
24+
ze_kernel_timestamp_result_t *Timestamps;
25+
};
26+
2227
struct ur_exp_command_buffer_handle_t_ : public _ur_object {
2328
ur_exp_command_buffer_handle_t_(ur_context_handle_t Context,
2429
ur_device_handle_t Device,

source/adapters/level_zero/event.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <mutex>
1414
#include <string.h>
1515

16+
#include "command_buffer.hpp"
1617
#include "common.hpp"
1718
#include "event.hpp"
1819
#include "ur_level_zero.hpp"
@@ -454,6 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
454455
///< bytes returned in propValue
455456
) {
456457
std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);
458+
457459
if (Event->UrQueue &&
458460
(Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
459461
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
@@ -470,6 +472,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
470472

471473
ze_kernel_timestamp_result_t tsResult;
472474

475+
// A Command-buffer consists of three command-lists for which only a single
476+
// event is returned to users. The actual profiling information related to the
477+
// command-buffer should therefore be extrated from graph events themsleves.
478+
// The timestamps of these events are saved in a memory region attached to
479+
// event usning CommandData field. The timings must therefore be recovered
480+
// from this memory.
481+
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) {
482+
if (Event->CommandData) {
483+
command_buffer_profiling_t *ProfilingsPtr;
484+
switch (PropName) {
485+
case UR_PROFILING_INFO_COMMAND_START: {
486+
ProfilingsPtr =
487+
static_cast<command_buffer_profiling_t *>(Event->CommandData);
488+
// Sync-point order does not necessarily match to the order of
489+
// execution. We therefore look for the first command executed.
490+
uint64_t MinStart = ProfilingsPtr->Timestamps[0].global.kernelStart;
491+
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
492+
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart;
493+
if (Timestamp < MinStart) {
494+
MinStart = Timestamp;
495+
}
496+
}
497+
uint64_t ContextStartTime =
498+
(MinStart & TimestampMaxValue) * ZeTimerResolution;
499+
return ReturnValue(ContextStartTime);
500+
}
501+
case UR_PROFILING_INFO_COMMAND_END: {
502+
ProfilingsPtr =
503+
static_cast<command_buffer_profiling_t *>(Event->CommandData);
504+
// Sync-point order does not necessarily match to the order of
505+
// execution. We therefore look for the last command executed.
506+
uint64_t MaxEnd = ProfilingsPtr->Timestamps[0].global.kernelEnd;
507+
uint64_t LastStart = ProfilingsPtr->Timestamps[0].global.kernelStart;
508+
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
509+
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd;
510+
if (Timestamp > MaxEnd) {
511+
MaxEnd = Timestamp;
512+
LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart;
513+
}
514+
}
515+
uint64_t ContextStartTime = (LastStart & TimestampMaxValue);
516+
uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue);
517+
518+
//
519+
// Handle a possible wrap-around (the underlying HW counter is <
520+
// 64-bit). Note, it will not report correct time if there were multiple
521+
// wrap arounds, and the longer term plan is to enlarge the capacity of
522+
// the HW timestamps.
523+
//
524+
if (ContextEndTime <= ContextStartTime) {
525+
ContextEndTime += TimestampMaxValue;
526+
}
527+
ContextEndTime *= ZeTimerResolution;
528+
return ReturnValue(ContextEndTime);
529+
}
530+
default:
531+
urPrint("urEventGetProfilingInfo: not supported ParamName\n");
532+
return UR_RESULT_ERROR_INVALID_VALUE;
533+
}
534+
} else {
535+
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
536+
}
537+
}
538+
473539
switch (PropName) {
474540
case UR_PROFILING_INFO_COMMAND_START: {
475541
ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
@@ -763,6 +829,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
763829
return Res;
764830
Event->CommandData = nullptr;
765831
}
832+
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP &&
833+
Event->CommandData) {
834+
// Free the memory extra event allocated for profiling purposed.
835+
command_buffer_profiling_t *ProfilingPtr =
836+
static_cast<command_buffer_profiling_t *>(Event->CommandData);
837+
delete[] ProfilingPtr->Timestamps;
838+
delete ProfilingPtr;
839+
Event->CommandData = nullptr;
840+
}
766841
if (Event->OwnNativeHandle) {
767842
if (DisableEventsCaching) {
768843
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));

0 commit comments

Comments
 (0)