intel · sarnex · Jan 18, 2024 · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023
@@ -241,11 +241,22 @@ created on UR command-buffer enqueue.
 
 There is also a *WaitEvent* used by the `ur_exp_command_buffer_handle_t` class
 in the prefix to wait on any dependencies passed in the enqueue wait-list.
-This WaitEvent is reset at the end of the suffix, along with reset commands
-to reset the L0 events used to implement the UR sync-points back to the
-non-signaled state.
-
-![L0 command-buffer diagram](images/L0_UR_command-buffer.svg)
+This WaitEvent is reset in the suffix.
+
+A command-buffer is expected to be submitted multiple times. Consequently,
+we need to ensure that L0 events associated with graph commands have not
+been signaled by a previous execution. These events are therefore reset to the
+non-signaled state before running the actual graph associated commands. Note
+that this reset is performed in the prefix and not in the suffix to avoid
+additional synchronization w.r.t profiling data extraction.
+
+If a command-buffer is about to be submitted to a queue with the profiling
+property enabled, an extra command that copies timestamps of L0 events
+associated with graph commands into a dedicated memory which is attached to the
+returned UR event. This memory stores the profiling information that
+corresponds to the current submission of the command-buffer.
+
+![L0 command-buffer diagram](images/L0_UR_command-buffer-v3.jpg)
 
 For a call to `urCommandBufferEnqueueExp` with an `event_list` *EL*,
 command-buffer *CB*, and return event *RE* our implementation has to submit two

@@ -278,17 +278,33 @@ void event_impl::checkProfilingPreconditions() const {
         "Profiling information is unavailable as the queue associated with "
         "the event does not have the 'enable_profiling' property.");
   }
-  if (MEventFromSubmitedExecCommandBuffer) {
-    throw sycl::exception(make_error_code(sycl::errc::invalid),
-                          "Profiling information is unavailable for events "
-                          "returned by a graph submission.");
-  }
 }
 
 template <>
 uint64_t
 event_impl::get_profiling_info<info::event_profiling::command_submit>() {
   checkProfilingPreconditions();
+  // The delay between the submission and the actual start of a CommandBuffer
+  // can be short. Consequently, the submission time, which is based on
+  // an estimated clock and not on the real device clock, may be ahead of the
+  // start time, which is based on the actual device clock.
+  // MSubmitTime is set in a critical performance path.
+  // Force reading the device clock when setting MSubmitTime may deteriorate
+  // the performance.
+  // Since submit time is an estimated time, we implement this little hack
+  // that allows all profiled time to be meaningful.
+  // (Note that the observed time deviation between the estimated clock and
+  // the real device clock is typically less than 0.5ms. The approximation we
+  // made by forcing the re-sync of submit time to start time is less than
+  // 0.5ms. These timing values were obtained empirically using an integrated
+  // Intel GPU).
+  if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) {
+    uint64_t StartTime =
+        get_event_profiling_info<info::event_profiling::command_start>(
+            this->getHandleRef(), this->getPlugin());
+    if (StartTime < MSubmitTime)
+      MSubmitTime = StartTime;
+  }
   return MSubmitTime;
 }
 

@@ -282,12 +282,12 @@ class event_impl {
     return MGraph.lock();
   }
 
-  void setEventFromSubmitedExecCommandBuffer(bool value) {
-    MEventFromSubmitedExecCommandBuffer = value;
+  void setEventFromSubmittedExecCommandBuffer(bool value) {
+    MEventFromSubmittedExecCommandBuffer = value;
   }
 
-  bool isEventFromSubmitedExecCommandBuffer() const {
-    return MEventFromSubmitedExecCommandBuffer;
+  bool isEventFromSubmittedExecCommandBuffer() const {
+    return MEventFromSubmittedExecCommandBuffer;
   }
 
 protected:
@@ -340,8 +340,8 @@ class event_impl {
   /// Store the command graph associated with this event, if any.
   /// This event is also be stored in the graph so a weak_ptr is used.
   std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
-  /// Indicates that the event results from a command graph submission
-  bool MEventFromSubmitedExecCommandBuffer = false;
+  /// Indicates that the event results from a command graph submission.
+  bool MEventFromSubmittedExecCommandBuffer = false;
 
   // If this event represents a submission to a
   // sycl::detail::pi::PiExtCommandBuffer the sync point for that submission is

@@ -757,7 +757,6 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
     auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
     NewEvent->setContextImpl(Queue->getContextImplPtr());
     NewEvent->setStateIncomplete();
-    NewEvent->setEventFromSubmitedExecCommandBuffer(true);
     return NewEvent;
   });
 
@@ -840,7 +839,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
         NewEvent = sycl::detail::Scheduler::getInstance().addCG(
             std::move(CommandGroup), Queue);
       }
-
+      NewEvent->setEventFromSubmittedExecCommandBuffer(true);
     } else if ((CurrentPartition->MSchedule.size() > 0) &&
                (CurrentPartition->MSchedule.front()->MCGType ==
                 sycl::detail::CG::CGTYPE::CodeplayHostTask)) {

@@ -0,0 +1,190 @@
+// REQUIRES: level_zero || cuda, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out 2>&1
+// RUN: %if ext_oneapi_level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck --implicit-check-not=LEAK %s %}
+
+// This test checks the profiling of an event returned
+// from graph submission with event::get_profiling_info().
+// It first tests a graph made exclusively of memory operations,
+// then tests a graph made of kernels.
+// The second run is to check that there are no leaks reported with the embedded
+// UR_L0_LEAKS_DEBUG testing capability.
+
+#include "graph_common.hpp"
+
+#define GRAPH_TESTS_VERBOSE_PRINT 0
+
+#if GRAPH_TESTS_VERBOSE_PRINT
+#include <chrono>
+#endif
+
+bool verifyProfiling(event Event) {
+  auto Submit =
+      Event.get_profiling_info<sycl::info::event_profiling::command_submit>();
+  auto Start =
+      Event.get_profiling_info<sycl::info::event_profiling::command_start>();
+  auto End =
+      Event.get_profiling_info<sycl::info::event_profiling::command_end>();
+
+#if GRAPH_TESTS_VERBOSE_PRINT
+  std::cout << "Submit = " << Submit << std::endl;
+  std::cout << "Start = " << Start << std::endl;
+  std::cout << "End = " << End << " ( " << (End - Start) << " ) "
+            << " => full ( " << (End - Submit) << " ) " << std::endl;
+#endif
+
+  assert((Submit && Start && End) && "Profiling information failed.");
+  assert(Submit <= Start);
+  assert(Start < End);
+
+  bool Pass = sycl::info::event_command_status::complete ==
+              Event.get_info<sycl::info::event::command_execution_status>();
+
+  return Pass;
+}
+
+bool compareProfiling(event Event1, event Event2) {
+  assert(Event1 != Event2);
+
+  auto SubmitEvent1 =
+      Event1.get_profiling_info<sycl::info::event_profiling::command_submit>();
+  auto StartEvent1 =
+      Event1.get_profiling_info<sycl::info::event_profiling::command_start>();
+  auto EndEvent1 =
+      Event1.get_profiling_info<sycl::info::event_profiling::command_end>();
+  assert((SubmitEvent1 && StartEvent1 && EndEvent1) &&
+         "Profiling information failed.");
+
+  auto SubmitEvent2 =
+      Event2.get_profiling_info<sycl::info::event_profiling::command_submit>();
+  auto StartEvent2 =
+      Event2.get_profiling_info<sycl::info::event_profiling::command_start>();
+  auto EndEvent2 =
+      Event2.get_profiling_info<sycl::info::event_profiling::command_end>();
+  assert((SubmitEvent2 && StartEvent2 && EndEvent2) &&
+         "Profiling information failed.");
+
+  assert(SubmitEvent1 != SubmitEvent2);
+  assert(StartEvent1 != StartEvent2);
+  assert(EndEvent1 != EndEvent2);
+
+  bool Pass1 = sycl::info::event_command_status::complete ==
+               Event1.get_info<sycl::info::event::command_execution_status>();
+  bool Pass2 = sycl::info::event_command_status::complete ==
+               Event2.get_info<sycl::info::event::command_execution_status>();
+
+  return (Pass1 && Pass2);
+}
+
+// The test checks that get_profiling_info waits for command asccociated with
+// event to complete execution.
+int main() {
+  device Dev;
+  queue Queue{Dev,
+              {sycl::ext::intel::property::queue::no_immediate_command_list{},
+               sycl::property::queue::enable_profiling()}};
+
+  const size_t Size = 100000;
+  int Data[Size] = {0};
+  for (size_t I = 0; I < Size; ++I) {
+    Data[I] = I;
+  }
+  int Values[Size] = {0};
+
+  buffer<int, 1> BufferFrom(Data, range<1>(Size));
+  buffer<int, 1> BufferTo(Values, range<1>(Size));
+
+  buffer<int, 1> BufferA(Data, range<1>(Size));
+  buffer<int, 1> BufferB(Values, range<1>(Size));
+  buffer<int, 1> BufferC(Values, range<1>(Size));
+
+  BufferFrom.set_write_back(false);
+  BufferTo.set_write_back(false);
+  BufferA.set_write_back(false);
+  BufferB.set_write_back(false);
+  BufferC.set_write_back(false);
+  { // buffer copy
+    exp_ext::command_graph CopyGraph{
+        Queue.get_context(),
+        Queue.get_device(),
+        {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+    CopyGraph.begin_recording(Queue);
+
+    Queue.submit([&](sycl::handler &Cgh) {
+      accessor<int, 1, access::mode::read, access::target::device> AccessorFrom(
+          BufferFrom, Cgh, range<1>(Size));
+      accessor<int, 1, access::mode::write, access::target::device> AccessorTo(
+          BufferTo, Cgh, range<1>(Size));
+      Cgh.copy(AccessorFrom, AccessorTo);
+    });
+
+    CopyGraph.end_recording(Queue);
+
+    // kernel launch
+    exp_ext::command_graph KernelGraph{
+        Queue.get_context(),
+        Queue.get_device(),
+        {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
+    KernelGraph.begin_recording(Queue);
+
+    run_kernels(Queue, Size, BufferA, BufferB, BufferC);
+
+    KernelGraph.end_recording(Queue);
+
+    auto CopyGraphExec = CopyGraph.finalize();
+    auto KernelGraphExec = KernelGraph.finalize();
+
+    event CopyEvent, KernelEvent1, KernelEvent2;
+    // Run graphs
+#if GRAPH_TESTS_VERBOSE_PRINT
+    auto StartCopyGraph = std::chrono::high_resolution_clock::now();
+#endif
+    CopyEvent = Queue.submit(
+        [&](handler &CGH) { CGH.ext_oneapi_graph(CopyGraphExec); });
+    Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+    auto EndCopyGraph = std::chrono::high_resolution_clock::now();
+    auto StartKernelSubmit1 = std::chrono::high_resolution_clock::now();
+#endif
+    KernelEvent1 = Queue.submit(
+        [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); });
+    Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+    auto endKernelSubmit1 = std::chrono::high_resolution_clock::now();
+    auto StartKernelSubmit2 = std::chrono::high_resolution_clock::now();
+#endif
+    KernelEvent2 = Queue.submit(
+        [&](handler &CGH) { CGH.ext_oneapi_graph(KernelGraphExec); });
+    Queue.wait_and_throw();
+#if GRAPH_TESTS_VERBOSE_PRINT
+    auto endKernelSubmit2 = std::chrono::high_resolution_clock::now();
+
+    double DelayCopy = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                           EndCopyGraph - StartCopyGraph)
+                           .count();
+    std::cout << "Copy Graph delay (in ns) : " << DelayCopy << std::endl;
+    double DelayKernel1 = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                              endKernelSubmit1 - StartKernelSubmit1)
+                              .count();
+    std::cout << "Kernel 1st Execution delay (in ns) : " << DelayKernel1
+              << std::endl;
+    double DelayKernel2 = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                              endKernelSubmit2 - StartKernelSubmit2)
+                              .count();
+    std::cout << "Kernel 2nd Execution delay (in ns) : " << DelayKernel2
+              << std::endl;
+#endif
+
+    // Checks profiling times
+    assert(verifyProfiling(CopyEvent) && verifyProfiling(KernelEvent1) &&
+           verifyProfiling(KernelEvent2) &&
+           compareProfiling(KernelEvent1, KernelEvent2));
+  }
+
+  host_accessor HostData(BufferTo);
+  for (size_t I = 0; I < Size; ++I) {
+    assert(HostData[I] == Values[I]);
+  }
+
+  return 0;
+}