@@ -100,16 +100,19 @@ pi_result cuda_piEventRetain(pi_event event);
100
100
} // extern "C"
101
101
102
102
_pi_event::_pi_event (pi_command_type type, pi_context context, pi_queue queue)
103
- : commandType_{type}, refCount_{1 }, isCompleted_{false },
104
- isRecorded_ {false },
105
- isStarted_{ false }, event_{ nullptr }, queue_{queue}, context_{context} {
103
+ : commandType_{type}, refCount_{1 }, isCompleted_{false }, isRecorded_{ false },
104
+ isStarted_ {false }, evEnd_{ nullptr }, evStart_{ nullptr }, evQueued_{ nullptr },
105
+ queue_{queue}, context_{context} {
106
106
107
107
if (is_native_event ()) {
108
- PI_CHECK_ERROR (cuEventCreate (&event_, 0 ));
109
- PI_CHECK_ERROR (cuEventCreate (&evStart_, 0 ));
108
+ PI_CHECK_ERROR (cuEventCreate (&evEnd_, CU_EVENT_DEFAULT));
109
+
110
+ if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
111
+ PI_CHECK_ERROR (cuEventCreate (&evQueued_, CU_EVENT_DEFAULT));
112
+ PI_CHECK_ERROR (cuEventCreate (&evStart_, CU_EVENT_DEFAULT));
113
+ }
110
114
}
111
115
112
-
113
116
if (queue_ != nullptr ) {
114
117
cuda_piQueueRetain (queue_);
115
118
}
@@ -130,7 +133,9 @@ pi_result _pi_event::start() {
130
133
pi_result result;
131
134
132
135
try {
133
- if (is_native_event ()) {
136
+ if (is_native_event () && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
137
+ // NOTE: This relies on the default stream to be unused.
138
+ result = PI_CHECK_ERROR (cuEventRecord (evQueued_, 0 ));
134
139
result = PI_CHECK_ERROR (cuEventRecord (evStart_, queue_->get ()));
135
140
}
136
141
} catch (pi_result error) {
@@ -141,11 +146,28 @@ pi_result _pi_event::start() {
141
146
return result;
142
147
}
143
148
149
+ pi_uint64 _pi_event::get_queued_time () const {
150
+ float miliSeconds = 0 .0f ;
151
+ assert (is_started ());
152
+
153
+ PI_CHECK_ERROR (
154
+ cuEventElapsedTime (&miliSeconds, context_->evBase_ , evQueued_));
155
+ return static_cast <pi_uint64>(miliSeconds * 1.0e6 );
156
+ }
157
+
158
+ pi_uint64 _pi_event::get_start_time () const {
159
+ float miliSeconds = 0 .0f ;
160
+ assert (is_started ());
161
+
162
+ PI_CHECK_ERROR (cuEventElapsedTime (&miliSeconds, context_->evBase_ , evStart_));
163
+ return static_cast <pi_uint64>(miliSeconds * 1.0e6 );
164
+ }
165
+
144
166
// Returns the elapsed time between the context's base event
// (context_->evBase_) and this event's evEnd_, converted from the
// milliseconds reported by cuEventElapsedTime to nanoseconds.
//
// Precondition (checked with assert): the event has been both started
// and recorded. The CUDA status code is passed through PI_CHECK_ERROR.
pi_uint64 _pi_event::get_end_time() const {
  assert(is_started() && is_recorded());

  float elapsedMs = 0.0f;
  PI_CHECK_ERROR(cuEventElapsedTime(&elapsedMs, context_->evBase_, evEnd_));

  // Milliseconds -> nanoseconds; the product is evaluated in double.
  const double elapsedNs = elapsedMs * 1.0e6;
  return static_cast<pi_uint64>(elapsedNs);
}
151
173
@@ -166,7 +188,7 @@ pi_result _pi_event::record() {
166
188
CUstream cuStream = queue_->get ();
167
189
168
190
try {
169
- result = PI_CHECK_ERROR (cuEventRecord (event_ , cuStream));
191
+ result = PI_CHECK_ERROR (cuEventRecord (evEnd_ , cuStream));
170
192
} catch (pi_result error) {
171
193
result = error;
172
194
}
@@ -186,7 +208,7 @@ pi_result _pi_event::wait() {
186
208
pi_result retErr;
187
209
if (is_native_event ()) {
188
210
try {
189
- retErr = PI_CHECK_ERROR (cuEventSynchronize (event_ ));
211
+ retErr = PI_CHECK_ERROR (cuEventSynchronize (evEnd_ ));
190
212
} catch (pi_result error) {
191
213
retErr = error;
192
214
}
@@ -1241,6 +1263,10 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
1241
1263
}
1242
1264
}
1243
1265
1266
+ // Use default stream to record base event counter
1267
+ PI_CHECK_ERROR (cuEventCreate (&piContextPtr->evBase_ , CU_EVENT_DEFAULT));
1268
+ PI_CHECK_ERROR (cuEventRecord (piContextPtr->evBase_ , 0 ));
1269
+
1244
1270
*retcontext = piContextPtr.release ();
1245
1271
} catch (pi_result err) {
1246
1272
errcode_ret = err;
@@ -1261,6 +1287,8 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
1261
1287
1262
1288
std::unique_ptr<_pi_context> context{ctxt};
1263
1289
1290
+ PI_CHECK_ERROR (cuEventDestroy (context->evBase_ ));
1291
+
1264
1292
if (!ctxt->is_primary ()) {
1265
1293
CUcontext cuCtxt = ctxt->get ();
1266
1294
CUcontext current = nullptr ;
@@ -2373,18 +2401,22 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
2373
2401
2374
2402
pi_result cuda_piEventGetProfilingInfo (
2375
2403
pi_event event,
2376
- cl_profiling_info param_name, // TODO: untie from OpenCL
2404
+ pi_profiling_info param_name, // TODO: untie from OpenCL
2377
2405
size_t param_value_size, void *param_value, size_t *param_value_size_ret) {
2378
2406
2379
2407
assert (event != nullptr );
2380
2408
2381
2409
// TODO: CUDA only implements elapsed time, PI interface requires changing
2382
2410
//
2383
2411
switch (param_name) {
2384
- case CL_PROFILING_COMMAND_START:
2412
+ case PI_PROFILING_INFO_COMMAND_QUEUED:
2413
+ case PI_PROFILING_INFO_COMMAND_SUBMIT:
2414
+ return getInfo<pi_uint64>(param_value_size, param_value,
2415
+ param_value_size_ret, event->get_queued_time ());
2416
+ case PI_PROFILING_INFO_COMMAND_START:
2385
2417
return getInfo<pi_uint64>(param_value_size, param_value,
2386
- param_value_size_ret, 0 );
2387
- case CL_PROFILING_COMMAND_END :
2418
+ param_value_size_ret, event-> get_start_time () );
2419
+ case PI_PROFILING_INFO_COMMAND_END :
2388
2420
return getInfo<pi_uint64>(param_value_size, param_value,
2389
2421
param_value_size_ret, event->get_end_time ());
2390
2422
default :
0 commit comments