
[SYCL] Cache and reuse events in the Level Zero plugin. #6484


Merged: 3 commits, Aug 2, 2022
sycl/plugins/level_zero/pi_level_zero.cpp (108 changes: 93 additions & 15 deletions; file mode 100755 → 100644)
@@ -99,6 +99,16 @@ static const bool UseMultipleCmdlistBarriers = [] {
return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
}();

// This is an experimental option that allows one to disable caching of events
// in the context.
static const bool DisableEventsCaching = [] {
const char *DisableEventsCachingFlag =
std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
if (!DisableEventsCachingFlag)
return false;
return std::stoi(DisableEventsCachingFlag) != 0;
}();
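For reference, the flag semantics reduce to: unset or "0" keeps event caching enabled (the default), while any other integer disables it, e.g. running an application with SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING=1. A standalone restatement of the parse rule, with a hypothetical helper name and atoi in place of stoi for brevity:

#include <cassert>
#include <cstdlib>

// Toy restatement of the flag parsing above; illustrative only.
static bool parseDisableFlag(const char *Val) {
  if (!Val)
    return false;             // unset: caching stays enabled
  return std::atoi(Val) != 0; // "0" keeps caching; any nonzero disables it
}

int main() {
  assert(!parseDisableFlag(nullptr)); // default: caching on
  assert(!parseDisableFlag("0"));     // explicit "0": caching on
  assert(parseDisableFlag("1"));      // nonzero: caching off
}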

// This class encapsulates actions taken along with a call to the Level Zero API.
class ZeCall {
private:
@@ -468,10 +478,18 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
std::list<ze_event_pool_handle_t> *ZePoolCache =
getZeEventPoolCache(HostVisible, ProfilingEnabled);

// Remove full pool from the cache.
if (!ZePoolCache->empty()) {
if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
ZePoolCache->erase(ZePoolCache->begin());
if (DisableEventsCaching) {
// Remove full pool from the cache if events caching is disabled.
ZePoolCache->erase(ZePoolCache->begin());
} else {
// If event caching is enabled then we don't destroy events, so there is
// no need to remove the pool from the cache and add it back when it has
// available slots. Pushing a placeholder in front keeps the full pool
// toward the tail of the cache so that all pools can be destroyed during
// context destruction.
ZePoolCache->push_front(nullptr);
}
}
}
if (ZePoolCache->empty()) {
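To see the branch above in isolation: when the pool at the head of the cache runs out of free slots, it is either dropped outright (caching disabled) or left in the list behind a nullptr placeholder that signals "create a fresh pool". A self-contained sketch with toy types, assuming, as the surrounding code suggests, that a null head means a new pool must be allocated:

#include <list>
#include <unordered_map>

using ToyPool = int *; // stand-in for ze_event_pool_handle_t

static std::list<ToyPool> PoolCache;
static std::unordered_map<ToyPool, int> FreeSlots;
static bool CachingDisabled = false;

ToyPool getPoolWithFreeSlot() {
  if (!PoolCache.empty() && FreeSlots[PoolCache.front()] == 0) {
    if (CachingDisabled)
      PoolCache.erase(PoolCache.begin()); // drop the full pool now
    else
      PoolCache.push_front(nullptr);      // keep it behind a placeholder
  }
  if (PoolCache.empty())
    PoolCache.push_back(nullptr);
  ToyPool &Head = PoolCache.front();
  if (!Head) {            // placeholder: allocate a fresh pool
    Head = new int[16];   // stands in for zeEventPoolCreate
    FreeSlots[Head] = 16; // slots available in the new pool
  }
  --FreeSlots[Head];      // hand out one slot
  return Head;
}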
@@ -868,7 +886,18 @@ pi_result _pi_context::initialize() {
pi_result _pi_context::finalize() {
// This function is called when pi_context is deallocated, i.e. from
// piContextRelease. There could be some memory that has not been
// deallocated yet.
// For example, event pool caches would be still alive.
// For example, event and event pool caches would still be alive.

if (!DisableEventsCaching) {
std::scoped_lock Lock(EventCacheMutex);
for (auto &EventCache : EventCaches) {
for (auto Event : EventCache) {
ZE_CALL(zeEventDestroy, (Event->ZeEvent));
delete Event;
}
EventCache.clear();
}
}
{
std::scoped_lock Lock(ZeEventPoolCacheMutex);
for (auto &ZePoolCache : ZeEventPoolCache) {
@@ -5430,24 +5459,67 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) {
return PI_SUCCESS;
}

pi_result _pi_event::reset() {
Queue = nullptr;
CleanedUp = false;
Completed = false;
CommandData = nullptr;
CommandType = PI_COMMAND_TYPE_USER;
WaitList = {};
RefCount.reset();

if (!isHostVisible())
HostVisibleEvent = nullptr;

ZE_CALL(zeEventHostReset, (ZeEvent));
return PI_SUCCESS;
}
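The zeEventHostReset call is the essential step here: a signaled Level Zero event stays signaled until it is explicitly reset, so without it a recycled event would look already complete to its next user. A minimal sketch of that device-side part, with a hypothetical helper name:

#include <level_zero/ze_api.h>

// Return a signaled event to the unsignaled state so it can be reused.
ze_result_t prepareForReuse(ze_event_handle_t ZeEvent) {
  return zeEventHostReset(ZeEvent);
}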

pi_event _pi_context::getEventFromCache(bool HostVisible, bool WithProfiling) {
std::scoped_lock Lock(EventCacheMutex);
auto Cache = getEventCache(HostVisible, WithProfiling);
if (Cache->empty())
return nullptr;

auto It = Cache->begin();
pi_event Event = *It;
Cache->erase(It);
return Event;
}

void _pi_context::addEventToCache(pi_event Event) {
std::scoped_lock Lock(EventCacheMutex);
auto Cache =
getEventCache(Event->isHostVisible(), Event->isProfilingEnabled());
Event->reset();
Cache->emplace_back(Event);
}
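Together with EventCreate below, these two helpers form the reuse loop: creation consults the matching cache first, and release parks the event instead of deleting it. A toy model of the round trip, with hypothetical names and no locking or resetting:

#include <list>

struct ToyEvent { bool HostVisible, Profiling; };
static std::list<ToyEvent *> Caches[4];

static std::list<ToyEvent *> &bucket(bool HostVisible, bool Profiling) {
  return Caches[(HostVisible ? 0 : 2) + (Profiling ? 0 : 1)];
}

ToyEvent *createEvent(bool HostVisible, bool Profiling) {
  auto &C = bucket(HostVisible, Profiling);
  if (!C.empty()) { // fast path: recycle a previously released event
    ToyEvent *E = C.front();
    C.pop_front();
    return E;
  }
  return new ToyEvent{HostVisible, Profiling}; // slow path: allocate
}

void releaseEvent(ToyEvent *E) {
  bucket(E->HostVisible, E->Profiling).push_back(E); // park for reuse
}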

// Helper function for creating a PI event.
// The "Queue" argument specifies the PI queue where a command is submitted.
// The "HostVisible" argument specifies if event needs to be allocated from
// a host-visible pool.
//
static pi_result EventCreate(pi_context Context, pi_queue Queue,
bool HostVisible, pi_event *RetEvent) {

bool ProfilingEnabled =
!Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;

size_t Index = 0;
if (auto CachedEvent =
Context->getEventFromCache(HostVisible, ProfilingEnabled)) {
*RetEvent = CachedEvent;
return PI_SUCCESS;
}

ze_event_handle_t ZeEvent;
ze_event_pool_handle_t ZeEventPool = {};

size_t Index = 0;

if (auto Res = Context->getFreeSlotInExistingOrNewPool(
ZeEventPool, Index, HostVisible, ProfilingEnabled))
return Res;

ze_event_handle_t ZeEvent;
ZeStruct<ze_event_desc_t> ZeEventDesc;
ZeEventDesc.index = Index;
ZeEventDesc.wait = 0;
@@ -5456,9 +5528,9 @@ static pi_result EventCreate(pi_context Context, pi_queue Queue,
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
} else {
//
// Set the scope to "device" for every event. This is sufficient for global
// device access and peer device access. If the event needs to be visible on
// the host, we do special handling; see the EventsScope options.
//
// TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
// used in some circumstances.
@@ -5819,7 +5891,12 @@ pi_result piEventRelease(pi_event Event) {
Event->CommandData = nullptr;
}
if (Event->OwnZeEvent) {
ZE_CALL(zeEventDestroy, (Event->ZeEvent));
if (DisableEventsCaching) {
ZE_CALL(zeEventDestroy, (Event->ZeEvent));
auto Context = Event->Context;
if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
return Res;
}
}
// It is possible that host-visible event was never created.
// In case it was check if that's different from this same event
@@ -5829,18 +5906,19 @@ pi_result piEventRelease(pi_event Event) {
PI_CALL(piEventRelease(Event->HostVisibleEvent));
}

auto Context = Event->Context;
if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
return Res;

// We intentionally incremented the reference counter when an event is
// created so that the pi_queue cannot be released before the associated
// pi_event is released. Here we have to decrement it so the pi_queue
// can be released successfully.
if (Event->Queue) {
PI_CALL(piQueueReleaseInternal(Event->Queue));
}
delete Event;

if (DisableEventsCaching || !Event->OwnZeEvent) {
delete Event;
} else {
Event->Context->addEventToCache(Event);
}

return PI_SUCCESS;
}
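Note the division of labor in the final form above: with caching enabled, neither the native event nor its pool slot is reclaimed at release time; the event is parked via addEventToCache, and everything is torn down later in _pi_context::finalize. A condensed sketch of the release policy, with a hypothetical helper name and error handling elided:

void onRelease(pi_event Event) {
  if (Event->OwnZeEvent && DisableEventsCaching) {
    // Eager path: destroy the native event and free its pool slot now.
    zeEventDestroy(Event->ZeEvent);
    Event->Context->decrementUnreleasedEventsInPool(Event);
  }
  if (DisableEventsCaching || !Event->OwnZeEvent)
    delete Event; // nothing worth recycling
  else
    Event->Context->addEventToCache(Event); // park; slot freed at finalize
}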
sycl/plugins/level_zero/pi_level_zero.hpp (30 changes: 28 additions & 2 deletions)
@@ -274,7 +274,10 @@ template <class T> struct ZeCache : private T {
// thread can reach ref count equal to zero, i.e. only a single thread can pass
// through this check.
struct ReferenceCounter {
ReferenceCounter(pi_uint32 InitVal) : RefCount{InitVal} {}
ReferenceCounter() : RefCount{1} {}

// Reset the counter to the initial value.
void reset() { RefCount = 1; }

// Used when retaining an object.
void increment() { RefCount++; }
@@ -306,7 +309,7 @@ struct ReferenceCounter {

// Base class to store common data
struct _pi_object {
_pi_object() : RefCount{1} {}
_pi_object() : RefCount{} {}

// Level Zero doesn't do reference counting, so we have to do it ourselves.
// Must be atomic to prevent data races when incrementing/decrementing.
@@ -750,6 +753,12 @@ struct _pi_context : _pi_object {
// when kernel has finished execution.
std::unordered_map<void *, MemAllocRecord> MemAllocs;

// Get pi_event from cache.
pi_event getEventFromCache(bool HostVisible, bool WithProfiling);

// Add pi_event to cache.
void addEventToCache(pi_event);

private:
// If context contains one device then return this device.
// If context contains sub-devices of the same device, then return this parent
@@ -798,6 +807,20 @@ struct _pi_context : _pi_object {
// Mutex to control operations on event pool caches and the helper maps
// holding the current pool usage counts.
pi_mutex ZeEventPoolCacheMutex;

// Mutex to control operations on event caches.
pi_mutex EventCacheMutex;

// Caches for events.
std::vector<std::list<pi_event>> EventCaches{4};

// Get the cache of events for the given host-visibility and profiling mode.
auto getEventCache(bool HostVisible, bool WithProfiling) {
if (HostVisible)
return WithProfiling ? &EventCaches[0] : &EventCaches[1];
else
return WithProfiling ? &EventCaches[2] : &EventCaches[3];
}
};
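Spelled out, the bucket mapping used by getEventCache is:

// (HostVisible, WithProfiling) -> bucket
//   (true,  true ) -> EventCaches[0]
//   (true,  false) -> EventCaches[1]
//   (false, true ) -> EventCaches[2]
//   (false, false) -> EventCaches[3]
// An equivalent branch-free form, for illustration only:
static inline size_t eventCacheIndex(bool HostVisible, bool WithProfiling) {
  return (HostVisible ? 0 : 2) + (WithProfiling ? 0 : 1);
}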

struct _pi_queue : _pi_object {
@@ -1350,6 +1373,9 @@ struct _pi_event : _pi_object {
// L0 event (if any) is not guaranteed to have been signalled, or
// to be visible to the host at all.
bool Completed = {false};

// Reset _pi_event object.
pi_result reset();
};

struct _pi_program : _pi_object {