Skip to content

Commit 4016a97

Browse files
author
Steffen Larsen
committed
[SYCL][PI][CUDA] Implement profiling info queries
Implements profiling info queries for PI_PROFILING_INFO_COMMAND_QUEUED, PI_PROFILING_INFO_COMMAND_SUBMIT, and PI_PROFILING_INFO_COMMAND_START in piEventGetProfilingInfo for the CUDA backend. Signed-off-by: Steffen Larsen <[email protected]>
1 parent 52a63fa commit 4016a97

File tree

5 files changed

+76
-29
lines changed

5 files changed

+76
-29
lines changed

sycl/include/CL/sycl/detail/pi.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,13 @@ constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_ADDRESSING_MODE =
440440
constexpr pi_sampler_properties PI_SAMPLER_PROPERTIES_FILTER_MODE =
441441
CL_SAMPLER_FILTER_MODE;
442442

443+
typedef enum {
444+
PI_PROFILING_INFO_COMMAND_QUEUED = CL_PROFILING_COMMAND_QUEUED,
445+
PI_PROFILING_INFO_COMMAND_SUBMIT = CL_PROFILING_COMMAND_SUBMIT,
446+
PI_PROFILING_INFO_COMMAND_START = CL_PROFILING_COMMAND_START,
447+
PI_PROFILING_INFO_COMMAND_END = CL_PROFILING_COMMAND_END
448+
} _pi_profiling_info;
449+
443450
// NOTE: this is made 64-bit to match the size of cl_mem_flags to
444451
// make the translation to OpenCL transparent.
445452
// TODO: populate
@@ -488,6 +495,7 @@ using pi_event_status = _pi_event_status;
488495
using pi_program_build_info = _pi_program_build_info;
489496
using pi_program_build_status = _pi_program_build_status;
490497
using pi_kernel_info = _pi_kernel_info;
498+
using pi_profiling_info = _pi_profiling_info;
491499

492500
// For compatibility with OpenCL define this not as enum.
493501
using pi_device_partition_property = intptr_t;
@@ -915,11 +923,9 @@ pi_result piEventGetInfo(pi_event event,
915923
size_t param_value_size, void *param_value,
916924
size_t *param_value_size_ret);
917925

918-
pi_result
919-
piEventGetProfilingInfo(pi_event event,
920-
cl_profiling_info param_name, // TODO: untie from OpenCL
921-
size_t param_value_size, void *param_value,
922-
size_t *param_value_size_ret);
926+
pi_result piEventGetProfilingInfo(pi_event event, pi_profiling_info param_name,
927+
size_t param_value_size, void *param_value,
928+
size_t *param_value_size_ret);
923929

924930
pi_result piEventsWait(pi_uint32 num_events, const pi_event *event_list);
925931

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,19 @@ pi_result cuda_piEventRetain(pi_event event);
100100
} // extern "C"
101101

102102
_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
103-
: commandType_{type}, refCount_{1}, isCompleted_{false},
104-
isRecorded_{false},
105-
isStarted_{false}, event_{nullptr}, queue_{queue}, context_{context} {
103+
: commandType_{type}, refCount_{1}, isCompleted_{false}, isRecorded_{false},
104+
isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
105+
queue_{queue}, context_{context} {
106106

107107
if (is_native_event()) {
108-
PI_CHECK_ERROR(cuEventCreate(&event_, 0));
109-
PI_CHECK_ERROR(cuEventCreate(&evStart_, 0));
108+
PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));
109+
110+
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
111+
PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
112+
PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
113+
}
110114
}
111115

112-
113116
if (queue_ != nullptr) {
114117
cuda_piQueueRetain(queue_);
115118
}
@@ -130,7 +133,9 @@ pi_result _pi_event::start() {
130133
pi_result result;
131134

132135
try {
133-
if (is_native_event()) {
136+
if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
137+
// NOTE: This relies on the default stream to be unused.
138+
result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
134139
result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
135140
}
136141
} catch (pi_result error) {
@@ -141,11 +146,28 @@ pi_result _pi_event::start() {
141146
return result;
142147
}
143148

149+
pi_uint64 _pi_event::get_queued_time() const {
150+
float miliSeconds = 0.0f;
151+
assert(is_started());
152+
153+
PI_CHECK_ERROR(
154+
cuEventElapsedTime(&miliSeconds, context_->evBase_, evQueued_));
155+
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
156+
}
157+
158+
pi_uint64 _pi_event::get_start_time() const {
159+
float miliSeconds = 0.0f;
160+
assert(is_started());
161+
162+
PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, context_->evBase_, evStart_));
163+
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
164+
}
165+
144166
pi_uint64 _pi_event::get_end_time() const {
145167
float miliSeconds = 0.0f;
146168
assert(is_started() && is_recorded());
147169

148-
PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evStart_, event_));
170+
PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, context_->evBase_, evEnd_));
149171
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
150172
}
151173

@@ -166,7 +188,7 @@ pi_result _pi_event::record() {
166188
CUstream cuStream = queue_->get();
167189

168190
try {
169-
result = PI_CHECK_ERROR(cuEventRecord(event_, cuStream));
191+
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
170192
} catch (pi_result error) {
171193
result = error;
172194
}
@@ -186,7 +208,7 @@ pi_result _pi_event::wait() {
186208
pi_result retErr;
187209
if (is_native_event()) {
188210
try {
189-
retErr = PI_CHECK_ERROR(cuEventSynchronize(event_));
211+
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
190212
} catch (pi_result error) {
191213
retErr = error;
192214
}
@@ -1241,6 +1263,10 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
12411263
}
12421264
}
12431265

1266+
// Use default stream to record base event counter
1267+
PI_CHECK_ERROR(cuEventCreate(&piContextPtr->evBase_, CU_EVENT_DEFAULT));
1268+
PI_CHECK_ERROR(cuEventRecord(piContextPtr->evBase_, 0));
1269+
12441270
*retcontext = piContextPtr.release();
12451271
} catch (pi_result err) {
12461272
errcode_ret = err;
@@ -1261,6 +1287,8 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
12611287

12621288
std::unique_ptr<_pi_context> context{ctxt};
12631289

1290+
PI_CHECK_ERROR(cuEventDestroy(context->evBase_));
1291+
12641292
if (!ctxt->is_primary()) {
12651293
CUcontext cuCtxt = ctxt->get();
12661294
CUcontext current = nullptr;
@@ -2307,18 +2335,22 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
23072335

23082336
pi_result cuda_piEventGetProfilingInfo(
23092337
pi_event event,
2310-
cl_profiling_info param_name, // TODO: untie from OpenCL
2338+
pi_profiling_info param_name, // TODO: untie from OpenCL
23112339
size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
23122340

23132341
assert(event != nullptr);
23142342

23152343
// TODO: CUDA only implements elapsed time, PI interface requires changing
23162344
//
23172345
switch (param_name) {
2318-
case CL_PROFILING_COMMAND_START:
2346+
case PI_PROFILING_INFO_COMMAND_QUEUED:
2347+
case PI_PROFILING_INFO_COMMAND_SUBMIT:
2348+
return getInfo<pi_uint64>(param_value_size, param_value,
2349+
param_value_size_ret, event->get_queued_time());
2350+
case PI_PROFILING_INFO_COMMAND_START:
23192351
return getInfo<pi_uint64>(param_value_size, param_value,
2320-
param_value_size_ret, 0);
2321-
case CL_PROFILING_COMMAND_END:
2352+
param_value_size_ret, event->get_start_time());
2353+
case PI_PROFILING_INFO_COMMAND_END:
23222354
return getInfo<pi_uint64>(param_value_size, param_value,
23232355
param_value_size_ret, event->get_end_time());
23242356
default:

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,14 @@ struct _pi_context {
7272
_pi_device *deviceId_;
7373
std::atomic_uint32_t refCount_;
7474

75+
CUevent evBase_; // CUDA event used as base counter
76+
7577
_pi_context(kind k, CUcontext ctxt, _pi_device *devId)
76-
: kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1} {
78+
: kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
79+
evBase_(nullptr) {
7780
cuda_piDeviceRetain(deviceId_);
7881
};
7982

80-
8183
~_pi_context() { cuda_piDeviceRelease(deviceId_); }
8284

8385
void invoke_callback()
@@ -238,7 +240,7 @@ class _pi_event {
238240

239241
pi_result start();
240242

241-
native_type get() const noexcept { return event_; };
243+
native_type get() const noexcept { return evEnd_; };
242244

243245
pi_result set_user_event_complete() noexcept {
244246

@@ -280,8 +282,15 @@ class _pi_event {
280282

281283
pi_uint32 decrement_reference_count() { return --refCount_; }
282284

283-
// Returns the elapsed time in nano-seconds since the command(s)
284-
// associated with the event have completed
285+
// Returns the counter time when the associated command(s) were enqueued
286+
//
287+
pi_uint64 get_queued_time() const;
288+
289+
// Returns the counter time when the associated command(s) started execution
290+
//
291+
pi_uint64 get_start_time() const;
292+
293+
// Returns the counter time when the associated command(s) completed
285294
//
286295
pi_uint64 get_end_time() const;
287296

@@ -315,11 +324,14 @@ class _pi_event {
315324
bool isStarted_; // Signifies whether the operation associated with the
316325
// PI event has started or not
317326

318-
native_type event_; // CUDA event handle. If this _pi_event represents a user
327+
native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
319328
// event, this will be nullptr.
320329

321330
native_type evStart_; // CUDA event handle associated with the start
322331

332+
native_type evQueued_; // CUDA event handle associated with the time
333+
// the command was enqueued
334+
323335
pi_queue queue_; // pi_queue associated with the event. If this is a user
324336
// event, this will be nullptr.
325337

sycl/source/detail/event_info.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ template <info::event_profiling Param> struct get_event_profiling_info {
2525
RetType Result = 0;
2626
// TODO catch an exception and put it to list of asynchronous exceptions
2727
Plugin.call<PiApiKind::piEventGetProfilingInfo>(
28-
Event, cl_profiling_info(Param), sizeof(Result), &Result, nullptr);
28+
Event, pi_profiling_info(Param), sizeof(Result), &Result, nullptr);
2929
return Result;
3030
}
3131
};

sycl/test/basic_tests/event_profiling_info.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@
66
// RUN: %CPU_RUN_PLACEHOLDER %t.out
77
// RUN: %GPU_RUN_PLACEHOLDER %t.out
88
// RUN: %ACC_RUN_PLACEHOLDER %t.out
9-
// XFAIL: cuda
10-
// TODO: fails cuda due to unimplemented param_name 4737 in
11-
// cuda_piEventGetProfilingInfo
129
//==------------------- event_profiling_info.cpp ---------------------------==//
1310
//
1411
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

0 commit comments

Comments
 (0)