[SYCL][PI][CUDA] Implement profiling info queries (#1298)

steffenlarsen · web-flow · commit 656e769549b8 · 2020-03-14T08:28:48.000+03:00
Implements profiling info queries for PI_PROFILING_INFO_COMMAND_QUEUED,
PI_PROFILING_INFO_COMMAND_SUBMIT, and PI_PROFILING_INFO_COMMAND_START in
piEventGetProfilingInfo for the CUDA backend.

Signed-off-by: Steffen Larsen <steffen.larsen@codeplay.com>
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
@@ -440,6 +440,13 @@ constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE =
 constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_FILTER_MODE =
     CL_SAMPLER_FILTER_MODE;
 
+typedef enum {
+  PI_PROFILING_INFO_COMMAND_QUEUED = CL_PROFILING_COMMAND_QUEUED,
+  PI_PROFILING_INFO_COMMAND_SUBMIT = CL_PROFILING_COMMAND_SUBMIT,
+  PI_PROFILING_INFO_COMMAND_START = CL_PROFILING_COMMAND_START,
+  PI_PROFILING_INFO_COMMAND_END = CL_PROFILING_COMMAND_END
+} _pi_profiling_info;
+
 // NOTE: this is made 64-bit to match the size of cl_mem_flags to
 // make the translation to OpenCL transparent.
 // TODO: populate
@@ -488,6 +495,7 @@ using pi_event_status = _pi_event_status;
 using pi_program_build_info = _pi_program_build_info;
 using pi_program_build_status = _pi_program_build_status;
 using pi_kernel_info = _pi_kernel_info;
+using pi_profiling_info = _pi_profiling_info;
 
 // For compatibility with OpenCL define this not as enum.
 using pi_device_partition_property = intptr_t;
@@ -915,11 +923,9 @@ pi_result piEventGetInfo(pi_event event,
                          size_t param_value_size, void *param_value,
                          size_t *param_value_size_ret);
 
-pi_result
-piEventGetProfilingInfo(pi_event event,
-                        cl_profiling_info param_name, // TODO: untie from OpenCL
-                        size_t param_value_size, void *param_value,
-                        size_t *param_value_size_ret);
+pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret);
 
 pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list);
 
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
@@ -100,16 +100,19 @@ pi_result cuda_piEventRetain(pi_event event);
 } // extern "C"
 
 _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
-    : commandType_{type}, refCount_{1}, isCompleted_{false},
-      isRecorded_{false},
-      isStarted_{false}, event_{nullptr}, queue_{queue}, context_{context} {
+    : commandType_{type}, refCount_{1}, isCompleted_{false}, isRecorded_{false},
+      isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
+      queue_{queue}, context_{context} {
 
   if (is_native_event()) {
-    PI_CHECK_ERROR(cuEventCreate(&event_, 0));
-    PI_CHECK_ERROR(cuEventCreate(&evStart_, 0));
+    PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
+
+    if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
+      PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
+      PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
+    }
   }
 
-  
   if (queue_ != nullptr) {
     cuda_piQueueRetain(queue_);
   }
@@ -130,7 +133,9 @@ pi_result _pi_event::start() {
   pi_result result;
 
   try {
-    if (is_native_event()) {
+    if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
+      // NOTE: This relies on the default stream to be unused.
+      result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
       result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
     }
   } catch (pi_result error) {
@@ -141,11 +146,28 @@ pi_result _pi_event::start() {
   return result;
 }
 
+pi_uint64 _pi_event::get_queued_time() const {
+  float miliSeconds = 0.0f;
+  assert(is_started());
+
+  PI_CHECK_ERROR(
+      cuEventElapsedTime(&miliSeconds, context_->evBase_, evQueued_));
+  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
+}
+
+pi_uint64 _pi_event::get_start_time() const {
+  float miliSeconds = 0.0f;
+  assert(is_started());
+
+  PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, context_->evBase_, evStart_));
+  return static_cast<pi_uint64>(miliSeconds * 1.0e6);
+}
+
 pi_uint64 _pi_event::get_end_time() const {
   float miliSeconds = 0.0f;
   assert(is_started() && is_recorded());
 
-  PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evStart_, event_));
+  PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, context_->evBase_, evEnd_));
   return static_cast<pi_uint64>(miliSeconds * 1.0e6);
 }
 
@@ -166,7 +188,7 @@ pi_result _pi_event::record() {
     CUstream cuStream = queue_->get();
 
     try {
-      result = PI_CHECK_ERROR(cuEventRecord(event_, cuStream));
+      result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
     } catch (pi_result error) {
       result = error;
     }
@@ -186,7 +208,7 @@ pi_result _pi_event::wait() {
   pi_result retErr;
   if (is_native_event()) {
     try {
-      retErr = PI_CHECK_ERROR(cuEventSynchronize(event_));
+      retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
     } catch (pi_result error) {
       retErr = error;
     }
@@ -1241,6 +1263,10 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
       }
     }
 
+    // Use default stream to record base event counter
+    PI_CHECK_ERROR(cuEventCreate(&piContextPtr->evBase_, CU_EVENT_DEFAULT));
+    PI_CHECK_ERROR(cuEventRecord(piContextPtr->evBase_, 0));
+
     *retcontext = piContextPtr.release();
   } catch (pi_result err) {
     errcode_ret = err;
@@ -1261,6 +1287,8 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
 
   std::unique_ptr<_pi_context> context{ctxt};
 
+  PI_CHECK_ERROR(cuEventDestroy(context->evBase_));
+
   if (!ctxt->is_primary()) {
     CUcontext cuCtxt = ctxt->get();
     CUcontext current = nullptr;
@@ -2373,18 +2401,22 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
 
 pi_result cuda_piEventGetProfilingInfo(
     pi_event event,
-    cl_profiling_info param_name, // TODO: untie from OpenCL
+    pi_profiling_info param_name, // TODO: untie from OpenCL
     size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
 
   assert(event != nullptr);
 
   // TODO: CUDA only implements elapsed time, PI interface requires changing
   //
   switch (param_name) {
-  case CL_PROFILING_COMMAND_START:
+  case PI_PROFILING_INFO_COMMAND_QUEUED:
+  case PI_PROFILING_INFO_COMMAND_SUBMIT:
+    return getInfo<pi_uint64>(param_value_size, param_value,
+                              param_value_size_ret, event->get_queued_time());
+  case PI_PROFILING_INFO_COMMAND_START:
     return getInfo<pi_uint64>(param_value_size, param_value,
-                              param_value_size_ret, 0);
-  case CL_PROFILING_COMMAND_END:
+                              param_value_size_ret, event->get_start_time());
+  case PI_PROFILING_INFO_COMMAND_END:
     return getInfo<pi_uint64>(param_value_size, param_value,
                               param_value_size_ret, event->get_end_time());
   default:
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
@@ -72,12 +72,14 @@ struct _pi_context {
   _pi_device *deviceId_;
   std::atomic_uint32_t refCount_;
 
+  CUevent evBase_; // CUDA event used as base counter
+
   _pi_context(kind k, CUcontext ctxt, _pi_device *devId)
-      : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1} {
+      : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
+        evBase_(nullptr) {
     cuda_piDeviceRetain(deviceId_);
   };
 
-
   ~_pi_context() { cuda_piDeviceRelease(deviceId_); }
 
   void invoke_callback()
@@ -238,7 +240,7 @@ class _pi_event {
 
   pi_result start();
 
-  native_type get() const noexcept { return event_; };
+  native_type get() const noexcept { return evEnd_; };
 
   pi_result set_user_event_complete() noexcept {
 
@@ -280,8 +282,15 @@ class _pi_event {
 
   pi_uint32 decrement_reference_count() { return --refCount_; }
 
-  // Returns the elapsed time in nano-seconds since the command(s)
-  // associated with the event have completed
+  // Returns the counter time when the associated command(s) were enqueued
+  //
+  pi_uint64 get_queued_time() const;
+
+  // Returns the counter time when the associated command(s) started execution
+  //
+  pi_uint64 get_start_time() const;
+
+  // Returns the counter time when the associated command(s) completed
   //
   pi_uint64 get_end_time() const;
 
@@ -315,11 +324,14 @@ class _pi_event {
   bool isStarted_; // Signifies wether the operation associated with the
                    // PI event has started or not
 
-  native_type event_; // CUDA event handle. If this _pi_event represents a user
+  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
                       // event, this will be nullptr.
 
   native_type evStart_; // CUDA event handle associated with the start
 
+  native_type evQueued_; // CUDA event handle associated with the time
+                         // the command was enqueued
+
   pi_queue queue_; // pi_queue associated with the event. If this is a user
                    // event, this will be nullptr.
 
diff --git a/sycl/source/detail/event_info.hpp b/sycl/source/detail/event_info.hpp
@@ -25,7 +25,7 @@ template <info::event_profiling Param> struct get_event_profiling_info {
     RetType Result = 0;
     // TODO catch an exception and put it to list of asynchronous exceptions
     Plugin.call<PiApiKind::piEventGetProfilingInfo>(
-        Event, cl_profiling_info(Param), sizeof(Result), &Result, nullptr);
+        Event, pi_profiling_info(Param), sizeof(Result), &Result, nullptr);
     return Result;
   }
 };
diff --git a/sycl/test/basic_tests/event_profiling_info.cpp b/sycl/test/basic_tests/event_profiling_info.cpp
@@ -6,9 +6,6 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-// XFAIL: cuda
-// TODO: fails cuda due to unimplemented param_name 4737 in
-//       cuda_piEventGetProfilingInfo
 //==------------------- event_profiling_info.cpp ---------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ template <info::event_profiling Param> struct get_event_profiling_info {`
`25`	`25`	`RetType Result = 0;`
`26`	`26`	`// TODO catch an exception and put it to list of asynchronous exceptions`
`27`	`27`	`Plugin.call<PiApiKind::piEventGetProfilingInfo>(`
`28`		`- Event, cl_profiling_info(Param), sizeof(Result), &Result, nullptr);`
	`28`	`+ Event, pi_profiling_info(Param), sizeof(Result), &Result, nullptr);`
`29`	`29`	`return Result;`
`30`	`30`	`}`
`31`	`31`	`};`