Skip to content

[SYCL] Fixed bug regarding device caching #2566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 164 additions & 113 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -560,8 +560,7 @@ static pi_result copyModule(ze_context_handle_t ZeContext,

static bool setEnvVar(const char *var, const char *value);

static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
pi_platform *Platform);
static pi_result populateDeviceCacheIfNeeded(pi_platform Platform);

// Forward declarations for mock implementations of Level Zero APIs that
// do not yet work in the driver.
Expand Down Expand Up @@ -649,40 +648,22 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
return mapError(ZeResult);
}

// Level Zero does not have concept of Platforms, but Level Zero driver is the
// closest match.
if (Platforms && NumEntries > 0) {
uint32_t ZeDriverCount = 0;
ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr));
if (ZeDriverCount == 0) {
assert(NumPlatforms != 0);
*NumPlatforms = 0;
return PI_SUCCESS;
}
ze_driver_handle_t ZeDriver;
assert(ZeDriverCount == 1);
ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));

pi_result Res = getOrCreatePlatform(ZeDriver, Platforms);
if (Res != PI_SUCCESS) {
return Res;
}
}

if (NumPlatforms)
*NumPlatforms = 1;

return PI_SUCCESS;
}

// Retrieve a cached Platform that has a matching driver handle or use the
// driver handle to create and initialize a new Platform.
static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
pi_platform *Platform) {

// We will retrieve the Max CommandList Cache in this lamda function so that
// it only has to be executed once
static pi_uint32 CommandListCacheSizeValue = ([] {
// Cache pi_platforms for reuse in the future
// It solves two problems;
// 1. sycl::platform equality issue; we always return the same pi_platform.
// 2. performance; we can save time by immediately return from cache.
//
// Note: The memory for "PiPlatformsCache" and "PiPlatformsCacheMutex" is
// intentionally leaked because the application may call into the SYCL
// runtime from a global destructor, and such a call could eventually
// access these variables. Therefore, there is no safe time when
// "PiPlatformsCache" and "PiPlatformsCacheMutex" could be deleted.
static auto PiPlatformsCache = new std::vector<pi_platform>;
static auto PiPlatformsCacheMutex = new std::mutex;
static bool PiPlatformCachePopulated = false;

std::lock_guard<std::mutex> Lock(*PiPlatformsCacheMutex);
if (!PiPlatformCachePopulated) {
const char *CommandListCacheSize =
std::getenv("SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE");
pi_uint32 CommandListCacheSizeValue;
Expand All @@ -694,62 +675,69 @@ static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
"default set.\n");
CommandListCacheSizeValue = 20000;
}
return CommandListCacheSizeValue;
})();

try {
// Cache pi_platforms for reuse in the future
// It solves two problems;
// 1. sycl::device equality issue; we always return the same pi_device.
// 2. performance; we can save time by immediately return from cache.
//
// Note: The memory for "PiPlatformsCache" and "PiPlatformsCacheMutex" is
// intentionally leaked because the application may call into the SYCL
// runtime from a global destructor, and such a call could eventually
// access these variables. Therefore, there is no safe time when
// "PiPlatformsCache" and "PiPlatformsCacheMutex" could be deleted.
static auto PiPlatformsCache = new std::vector<pi_platform>;
static auto PiPlatformsCacheMutex = new std::mutex;

std::lock_guard<std::mutex> Lock(*PiPlatformsCacheMutex);
for (const pi_platform &CachedPlatform : *PiPlatformsCache) {
if (CachedPlatform->ZeDriver == ZeDriver) {
Platform[0] = CachedPlatform;
return PI_SUCCESS;
try {

// Level Zero does not have concept of Platforms, but Level Zero driver is
// the closest match.
uint32_t ZeDriverCount = 0;
ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr));
if (ZeDriverCount == 0) {
PiPlatformCachePopulated = true;
} else {
ze_driver_handle_t ZeDriver;
assert(ZeDriverCount == 1);
ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));
pi_platform Platform = new _pi_platform(ZeDriver);

// Cache driver properties
ze_driver_properties_t ZeDriverProperties;
ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties));
uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion;
// Intel Level-Zero GPU driver stores version as:
// | 31 - 24 | 23 - 16 | 15 - 0 |
// | Major | Minor | Build |
auto VersionMajor =
std::to_string((ZeDriverVersion & 0xFF000000) >> 24);
auto VersionMinor =
std::to_string((ZeDriverVersion & 0x00FF0000) >> 16);
auto VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF);
Platform->ZeDriverVersion =
VersionMajor + "." + VersionMinor + "." + VersionBuild;

ze_api_version_t ZeApiVersion;
ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion));
Platform->ZeDriverApiVersion =
std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
std::to_string(ZE_MINOR_VERSION(ZeApiVersion));

Platform->ZeMaxCommandListCache = CommandListCacheSizeValue;
// Save a copy in the cache for future uses.
PiPlatformsCache->push_back(Platform);
PiPlatformCachePopulated = true;
}
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
return PI_ERROR_UNKNOWN;
}
}

// TODO: figure out how/when to release this memory
*Platform = new _pi_platform(ZeDriver);

// Cache driver properties
ze_driver_properties_t ZeDriverProperties;
ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties));
uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion;
// Intel Level-Zero GPU driver stores version as:
// | 31 - 24 | 23 - 16 | 15 - 0 |
// | Major | Minor | Build |
auto VersionMajor = std::to_string((ZeDriverVersion & 0xFF000000) >> 24);
auto VersionMinor = std::to_string((ZeDriverVersion & 0x00FF0000) >> 16);
auto VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF);
Platform[0]->ZeDriverVersion =
VersionMajor + "." + VersionMinor + "." + VersionBuild;

ze_api_version_t ZeApiVersion;
ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion));
Platform[0]->ZeDriverApiVersion =
std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
std::to_string(ZE_MINOR_VERSION(ZeApiVersion));

Platform[0]->ZeMaxCommandListCache = CommandListCacheSizeValue;
// save a copy in the cache for future uses.
PiPlatformsCache->push_back(Platform[0]);
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
return PI_ERROR_UNKNOWN;
if (Platforms && NumEntries > 0) {
uint32_t I = 0;
for (const pi_platform &CachedPlatform : *PiPlatformsCache) {
if (I < NumEntries) {
*Platforms++ = CachedPlatform;
I++;
} else {
break;
}
}
}

if (NumPlatforms)
*NumPlatforms = PiPlatformsCache->size();

return PI_SUCCESS;
}

Expand Down Expand Up @@ -818,10 +806,35 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
assert(NativeHandle);
assert(Platform);

// Create PI platform from the given Level Zero driver handle or retrieve it
// from the cache.
auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
return getOrCreatePlatform(ZeDriver, Platform);

pi_uint32 NumPlatforms = 0;
pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms);
if (Res != PI_SUCCESS) {
return Res;
}

if (NumPlatforms) {
std::vector<pi_platform> Platforms(NumPlatforms);
Res = piPlatformsGet(NumPlatforms, Platforms.data(), nullptr);
if (Res != PI_SUCCESS) {
return Res;
}

// The SYCL spec requires that the set of platforms must remain fixed for
// the duration of the application's execution. We assume that we found all
// of the Level Zero drivers when we initialized the platform cache, so the
// "NativeHandle" must already be in the cache. If it is not, this must not
// be a valid Level Zero driver.
for (const pi_platform &CachedPlatform : Platforms) {
if (CachedPlatform->ZeDriver == ZeDriver) {
*Platform = CachedPlatform;
return PI_SUCCESS;
}
}
}

return PI_INVALID_VALUE;
}

// Get the cahched PI device created for the L0 device handle.
Expand All @@ -830,9 +843,11 @@ pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {

std::lock_guard<std::mutex> Lock(this->PiDevicesCacheMutex);
auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(),
[&](pi_device &D) { return D->ZeDevice == ZeDevice; });
[&](std::unique_ptr<_pi_device> &D) {
return D.get()->ZeDevice == ZeDevice;
});
if (it != PiDevicesCache.end()) {
return *it;
return (*it).get();
}
return nullptr;
}
Expand All @@ -842,20 +857,20 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
pi_uint32 *NumDevices) {

assert(Platform);
ze_driver_handle_t ZeDriver = Platform->ZeDriver;

// Get number of devices supporting Level Zero
uint32_t ZeDeviceCount = 0;
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
ZeDeviceCount = Platform->PiDevicesCache.size();

pi_result Res = populateDeviceCacheIfNeeded(Platform);
if (Res != PI_SUCCESS) {
return Res;
}

ZeDeviceCount = Platform->PiDevicesCache.size();
const bool AskingForGPU = (DeviceType & PI_DEVICE_TYPE_GPU);
const bool AskingForDefault = (DeviceType == PI_DEVICE_TYPE_DEFAULT);

if (ZeDeviceCount == 0) {
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
}

if (ZeDeviceCount == 0 || !(AskingForGPU || AskingForDefault)) {
if (NumDevices)
*NumDevices = 0;
Expand All @@ -871,34 +886,53 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
return PI_SUCCESS;
}

// if devices are already captured in cache, return them from the cache.
for (const pi_device CachedDevice : Platform->PiDevicesCache) {
*Devices++ = CachedDevice;
// Return the devices from the cache.
uint32_t I = 0;
for (const std::unique_ptr<_pi_device> &CachedDevice :
Platform->PiDevicesCache) {
if (I < NumEntries) {
*Devices++ = CachedDevice.get();
I++;
} else {
break;
}
}
if (!Platform->PiDevicesCache.empty()) {

return PI_SUCCESS;
}

// Check the device cache and load it if necessary. The PiDevicesCacheMutex must
// be locked before calling this function to prevent any synchronization issues.
static pi_result populateDeviceCacheIfNeeded(pi_platform Platform) {

if (Platform->DeviceCachePopulated) {
return PI_SUCCESS;
}

ze_driver_handle_t ZeDriver = Platform->ZeDriver;
uint32_t ZeDeviceCount = 0;
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));

try {
std::vector<ze_device_handle_t> ZeDevices(ZeDeviceCount);
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices.data()));

for (uint32_t I = 0; I < ZeDeviceCount; ++I) {
if (I < NumEntries) {
Devices[I] = new _pi_device(ZeDevices[I], Platform);
pi_result Result = Devices[I]->initialize();
if (Result != PI_SUCCESS) {
return Result;
}
// save a copy in the cache for future uses.
Platform->PiDevicesCache.push_back(Devices[I]);
std::unique_ptr<_pi_device> Device(
new _pi_device(ZeDevices[I], Platform));
pi_result Result = Device->initialize();
if (Result != PI_SUCCESS) {
return Result;
}
// save a copy in the cache for future uses.
Platform->PiDevicesCache.push_back(std::move(Device));
}
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
return PI_ERROR_UNKNOWN;
}
Platform->DeviceCachePopulated = true;
return PI_SUCCESS;
}

Expand Down Expand Up @@ -1473,11 +1507,28 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
assert(Device);
assert(Platform);

// Create PI device from the given Level Zero device handle.
// TODO: get the device from the devices' cache.
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
pi_result Res = populateDeviceCacheIfNeeded(Platform);
if (Res != PI_SUCCESS) {
return Res;
}

auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
*Device = new _pi_device(ZeDevice, Platform);
return (*Device)->initialize();

// The SYCL spec requires that the set of devices must remain fixed for the
// duration of the application's execution. We assume that we found all of the
// Level Zero devices when we initialized the device cache, so the
// "NativeHandle" must already be in the cache. If it is not, this must not be
// a valid Level Zero device.
for (const std::unique_ptr<_pi_device> &CachedDevice :
Platform->PiDevicesCache) {
if (CachedDevice->ZeDevice == ZeDevice) {
*Device = CachedDevice.get();
return PI_SUCCESS;
}
}

return PI_INVALID_VALUE;
}

pi_result piContextCreate(const pi_context_properties *Properties,
Expand Down
3 changes: 2 additions & 1 deletion sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ struct _pi_platform {
std::string ZeDriverApiVersion;

// Cache pi_devices for reuse
std::vector<pi_device> PiDevicesCache;
std::vector<std::unique_ptr<_pi_device>> PiDevicesCache;
std::mutex PiDevicesCacheMutex;
pi_device getDeviceFromNativeHandle(ze_device_handle_t);
bool DeviceCachePopulated = false;

// Maximum Number of Command Lists that can be created.
// This Value is initialized to 20000, but can be changed by the user
Expand Down
Loading