Skip to content

Commit 71d7797

Browse files
raaiq1romanovvladsmaslov-intel
authored
[SYCL] Fix event profiling for command_submit in L0 and other backends (#7526)
According to the SYCL 2020 specification, the command submission time must be taken within the following timeframe: > ... always some time after the [command group function object](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#command-group-function-object) returns and before the associated call to queue::submit returns. Currently, command submission time is calculated when a command is submitted to the underlying device, which may not necessarily be before `queue::submit` returns, e.g. when a `host_accessor` blocks command submission until it is destroyed. This patch changes that timeframe to always be before `queue::submit` returns, specifically right after the command is persisted by the graph builder and before it is enqueued by the graph processor. --------- Signed-off-by: Rauf, Rana <[email protected]> Co-authored-by: Romanov Vlad <[email protected]> Co-authored-by: smaslov-intel <[email protected]>
1 parent 03e70b4 commit 71d7797

File tree

20 files changed

+395
-33
lines changed

20 files changed

+395
-33
lines changed

sycl/include/sycl/detail/pi.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,10 @@ _PI_API(piPluginGetLastError)
141141

142142
_PI_API(piTearDown)
143143

144+
144145
_PI_API(piextUSMEnqueueFill2D)
145146
_PI_API(piextUSMEnqueueMemset2D)
146147
_PI_API(piextUSMEnqueueMemcpy2D)
147148

149+
_PI_API(piGetDeviceAndHostTimer)
148150
#undef _PI_API

sycl/include/sycl/detail/pi.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,10 @@
7474
// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT, and
7575
// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT context info query
7676
// descriptors.
77+
// 12.22 Add piGetDeviceAndHostTimer to query device wall-clock timestamp
7778

7879
#define _PI_H_VERSION_MAJOR 12
79-
#define _PI_H_VERSION_MINOR 21
80+
#define _PI_H_VERSION_MINOR 22
8081

8182
#define _PI_STRING_HELPER(a) #a
8283
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1898,9 +1899,24 @@ __SYCL_EXPORT pi_result piTearDown(void *PluginParameter);
18981899
///
18991900
/// \return PI_SUCCESS if plugin is indicating non-fatal warning. Any other
19001901
/// error code indicates that plugin considers this to be a fatal error and the
1901-
/// runtime must handle it or end the application.
1902+
/// Returns the global timestamp from \param device , and synchronized host
1903+
/// timestamp
19021904
__SYCL_EXPORT pi_result piPluginGetLastError(char **message);
19031905

1906+
/// Queries device for its global timestamp in nanoseconds, and updates
1907+
/// HostTime with the value of the host timer at the closest possible point in
1908+
/// time to that at which DeviceTime was returned.
1909+
///
1910+
/// \param Device device to query for timestamp
1911+
/// \param DeviceTime pointer to store device timestamp in nanoseconds. Optional
1912+
/// argument, can be nullptr
1913+
/// \param HostTime pointer to store host timestamp in
1914+
/// nanoseconds. Optional argument, can be nullptr in which case timestamp will
1915+
/// not be written
1916+
__SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device,
1917+
uint64_t *DeviceTime,
1918+
uint64_t *HostTime);
1919+
19041920
struct _pi_plugin {
19051921
// PI version supported by host passed to the plugin. The Plugin
19061922
// checks and writes the appropriate Function Pointers in

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <algorithm>
2020
#include <cassert>
21+
#include <chrono>
2122
#include <cuda.h>
2223
#include <cuda_device_runtime_api.h>
2324
#include <limits>
@@ -2150,7 +2151,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
21502151
piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
21512152
_pi_context::kind::user_defined, newContext, *devices});
21522153
}
2153-
21542154
static std::once_flag initFlag;
21552155
std::call_once(
21562156
initFlag,
@@ -3905,6 +3905,7 @@ pi_result cuda_piEventGetProfilingInfo(pi_event event,
39053905
switch (param_name) {
39063906
case PI_PROFILING_INFO_COMMAND_QUEUED:
39073907
case PI_PROFILING_INFO_COMMAND_SUBMIT:
3908+
// Note: No user for this case
39083909
return getInfo<pi_uint64>(param_value_size, param_value,
39093910
param_value_size_ret, event->get_queued_time());
39103911
case PI_PROFILING_INFO_COMMAND_START:
@@ -5502,6 +5503,35 @@ pi_result cuda_piTearDown(void *) {
55025503
return PI_SUCCESS;
55035504
}
55045505

5506+
pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
5507+
uint64_t *HostTime) {
5508+
_pi_event::native_type event;
5509+
ScopedContext active(Device->get_context());
5510+
5511+
if (DeviceTime) {
5512+
PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT));
5513+
PI_CHECK_ERROR(cuEventRecord(event, 0));
5514+
}
5515+
if (HostTime) {
5516+
5517+
using namespace std::chrono;
5518+
*HostTime =
5519+
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5520+
.count();
5521+
}
5522+
5523+
if (DeviceTime) {
5524+
PI_CHECK_ERROR(cuEventSynchronize(event));
5525+
5526+
float elapsedTime = 0.0f;
5527+
PI_CHECK_ERROR(
5528+
cuEventElapsedTime(&elapsedTime, _pi_platform::evBase_, event));
5529+
*DeviceTime = (uint64_t)(elapsedTime * (double)1e6);
5530+
}
5531+
5532+
return PI_SUCCESS;
5533+
}
5534+
55055535
const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING;
55065536

55075537
pi_result piPluginInit(pi_plugin *PluginInit) {
@@ -5650,6 +5680,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
56505680
_PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler)
56515681
_PI_CL(piPluginGetLastError, cuda_piPluginGetLastError)
56525682
_PI_CL(piTearDown, cuda_piTearDown)
5683+
_PI_CL(piGetDeviceAndHostTimer, cuda_piGetDeviceAndHostTimer)
56535684

56545685
#undef _PI_CL
56555686

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ struct _pi_device {
8888
native_type cuDevice_;
8989
std::atomic_uint32_t refCount_;
9090
pi_platform platform_;
91+
pi_context context_;
9192

9293
static constexpr pi_uint32 max_work_item_dimensions = 3u;
9394
size_t max_work_item_sizes[max_work_item_dimensions];
@@ -103,6 +104,10 @@ struct _pi_device {
103104

104105
pi_platform get_platform() const noexcept { return platform_; };
105106

107+
void set_context(pi_context ctx) { context_ = ctx; };
108+
109+
pi_context get_context() { return context_; };
110+
106111
void save_max_work_item_sizes(size_t size,
107112
size_t *save_max_work_item_sizes) noexcept {
108113
memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
@@ -178,6 +183,7 @@ struct _pi_context {
178183
bool backend_owns = true)
179184
: kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
180185
has_ownership{backend_owns} {
186+
deviceId_->set_context(this);
181187
cuda_piDeviceRetain(deviceId_);
182188
};
183189

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,12 @@ pi_result piTearDown(void *) {
20492049
return PI_SUCCESS;
20502050
}
20512051

2052+
pi_result piGetDeviceAndHostTimer(pi_device device, uint64_t *deviceTime,
2053+
uint64_t *hostTime) {
2054+
PiTrace(
2055+
"Warning : Querying device clock not supported under PI_ESIMD_EMULATOR");
2056+
return PI_SUCCESS;
2057+
}
20522058
const char SupportedVersion[] = _PI_ESIMD_PLUGIN_VERSION_STRING;
20532059

20542060
pi_result piPluginInit(pi_plugin *PluginInit) {

sycl/plugins/hip/pi_hip.cpp

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <algorithm>
2020
#include <cassert>
21+
#include <chrono>
2122
#include <hip/hip_runtime.h>
2223
#include <limits>
2324
#include <memory>
@@ -605,15 +606,16 @@ pi_uint64 _pi_event::get_start_time() const {
605606
assert(is_started());
606607

607608
PI_CHECK_ERROR(
608-
hipEventElapsedTime(&miliSeconds, context_->evBase_, evStart_));
609+
hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evStart_));
609610
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
610611
}
611612

612613
pi_uint64 _pi_event::get_end_time() const {
613614
float miliSeconds = 0.0f;
614615
assert(is_started() && is_recorded());
615616

616-
PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, context_->evBase_, evEnd_));
617+
PI_CHECK_ERROR(
618+
hipEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evEnd_));
617619
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
618620
}
619621

@@ -1988,10 +1990,16 @@ pi_result hip_piContextCreate(const pi_context_properties *properties,
19881990
_pi_context::kind::user_defined, newContext, *devices});
19891991
}
19901992

1991-
// Use default stream to record base event counter
1992-
PI_CHECK_ERROR(
1993-
hipEventCreateWithFlags(&piContextPtr->evBase_, hipEventDefault));
1994-
PI_CHECK_ERROR(hipEventRecord(piContextPtr->evBase_, 0));
1993+
static std::once_flag initFlag;
1994+
std::call_once(
1995+
initFlag,
1996+
[](pi_result &err) {
1997+
// Use default stream to record base event counter
1998+
PI_CHECK_ERROR(
1999+
hipEventCreateWithFlags(&_pi_platform::evBase_, hipEventDefault));
2000+
PI_CHECK_ERROR(hipEventRecord(_pi_platform::evBase_, 0));
2001+
},
2002+
errcode_ret);
19952003

19962004
// For non-primary scoped contexts keep the last active on top of the stack
19972005
// as `cuCtxCreate` replaces it implicitly otherwise.
@@ -2021,8 +2029,6 @@ pi_result hip_piContextRelease(pi_context ctxt) {
20212029

20222030
std::unique_ptr<_pi_context> context{ctxt};
20232031

2024-
PI_CHECK_ERROR(hipEventDestroy(context->evBase_));
2025-
20262032
if (!ctxt->is_primary()) {
20272033
hipCtx_t hipCtxt = ctxt->get();
20282034
// hipCtxSynchronize is not supported for AMD platform so we can just
@@ -3728,6 +3734,7 @@ pi_result hip_piEventGetProfilingInfo(pi_event event,
37283734
switch (param_name) {
37293735
case PI_PROFILING_INFO_COMMAND_QUEUED:
37303736
case PI_PROFILING_INFO_COMMAND_SUBMIT:
3737+
// Note: No user for this case
37313738
return getInfo<pi_uint64>(param_value_size, param_value,
37323739
param_value_size_ret, event->get_queued_time());
37333740
case PI_PROFILING_INFO_COMMAND_START:
@@ -5229,6 +5236,34 @@ pi_result hip_piTearDown(void *PluginParameter) {
52295236
return PI_SUCCESS;
52305237
}
52315238

5239+
pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
5240+
uint64_t *HostTime) {
5241+
_pi_event::native_type event;
5242+
5243+
ScopedContext active(Device->get_context());
5244+
5245+
if (DeviceTime) {
5246+
PI_CHECK_ERROR(hipEventCreateWithFlags(&event, hipEventDefault));
5247+
PI_CHECK_ERROR(hipEventRecord(event));
5248+
}
5249+
if (HostTime) {
5250+
using namespace std::chrono;
5251+
*HostTime =
5252+
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
5253+
.count();
5254+
}
5255+
5256+
if (DeviceTime) {
5257+
PI_CHECK_ERROR(hipEventSynchronize(event));
5258+
5259+
float elapsedTime = 0.0f;
5260+
PI_CHECK_ERROR(
5261+
hipEventElapsedTime(&elapsedTime, _pi_platform::evBase_, event));
5262+
*DeviceTime = (uint64_t)(elapsedTime * (double)1e6);
5263+
}
5264+
return PI_SUCCESS;
5265+
}
5266+
52325267
const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING;
52335268

52345269
pi_result piPluginInit(pi_plugin *PluginInit) {
@@ -5371,10 +5406,13 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
53715406
_PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler)
53725407
_PI_CL(piPluginGetLastError, hip_piPluginGetLastError)
53735408
_PI_CL(piTearDown, hip_piTearDown)
5409+
_PI_CL(piGetDeviceAndHostTimer, hip_piGetDeviceAndHostTimer)
53745410

53755411
#undef _PI_CL
53765412

53775413
return PI_SUCCESS;
53785414
}
53795415

53805416
} // extern "C"
5417+
5418+
hipEvent_t _pi_platform::evBase_{nullptr};

sycl/plugins/hip/pi_hip.hpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ using _pi_stream_guard = std::unique_lock<std::mutex>;
6565
/// when devices are used.
6666
///
6767
struct _pi_platform {
68+
static hipEvent_t evBase_; // HIP event used as base counter
6869
std::vector<std::unique_ptr<_pi_device>> devices_;
6970
};
7071

@@ -80,6 +81,7 @@ struct _pi_device {
8081
native_type cuDevice_;
8182
std::atomic_uint32_t refCount_;
8283
pi_platform platform_;
84+
pi_context context_;
8385

8486
public:
8587
_pi_device(native_type cuDevice, pi_platform platform)
@@ -90,6 +92,10 @@ struct _pi_device {
9092
pi_uint32 get_reference_count() const noexcept { return refCount_; }
9193

9294
pi_platform get_platform() const noexcept { return platform_; };
95+
96+
void set_context(pi_context ctx) { context_ = ctx; };
97+
98+
pi_context get_context() { return context_; };
9399
};
94100

95101
/// PI context mapping to a HIP context object.
@@ -146,11 +152,9 @@ struct _pi_context {
146152
_pi_device *deviceId_;
147153
std::atomic_uint32_t refCount_;
148154

149-
hipEvent_t evBase_; // HIP event used as base counter
150-
151155
_pi_context(kind k, hipCtx_t ctxt, _pi_device *devId)
152-
: kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1},
153-
evBase_(nullptr) {
156+
: kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1} {
157+
deviceId_->set_context(this);
154158
hip_piDeviceRetain(deviceId_);
155159
};
156160

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5949,7 +5949,10 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
59495949
}
59505950
case PI_PROFILING_INFO_COMMAND_QUEUED:
59515951
case PI_PROFILING_INFO_COMMAND_SUBMIT:
5952-
// TODO: Support these when Level Zero supported is added.
5952+
// Note: No users for this case
5953+
// TODO: Implement command submission time when needed,
5954+
// by recording device timestamp (using zeDeviceGetGlobalTimestamps)
5955+
// before submitting command to device
59535956
return ReturnValue(uint64_t{0});
59545957
default:
59555958
zePrint("piEventGetProfilingInfo: not supported ParamName\n");
@@ -9321,4 +9324,22 @@ pi_result _pi_buffer::free() {
93219324
return PI_SUCCESS;
93229325
}
93239326

9327+
pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
9328+
uint64_t *HostTime) {
9329+
const uint64_t &ZeTimerResolution =
9330+
Device->ZeDeviceProperties->timerResolution;
9331+
const uint64_t TimestampMaxCount =
9332+
((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL);
9333+
uint64_t DeviceClockCount, Dummy;
9334+
9335+
ZE_CALL(zeDeviceGetGlobalTimestamps,
9336+
(Device->ZeDevice, HostTime == nullptr ? &Dummy : HostTime,
9337+
&DeviceClockCount));
9338+
9339+
if (DeviceTime != nullptr) {
9340+
9341+
*DeviceTime = (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution;
9342+
}
9343+
return PI_SUCCESS;
9344+
}
93249345
} // extern "C"

0 commit comments

Comments
 (0)