Skip to content

[SYCL][Level Zero] Enable multi-CCS support. #4038

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 106 additions & 39 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ static const pi_uint32 ZeSerialize = [] {
return SerializeModeValue;
}();

static const bool CopyEngineRequested = [] {
const char *CopyEngine = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
bool UseCopyEngine = (!CopyEngine || (std::stoi(CopyEngine) != 0));
return UseCopyEngine;
}();

// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
Expand Down Expand Up @@ -529,7 +535,8 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
return PI_SUCCESS;
}

pi_result _pi_device::initialize() {
pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
int SubSubDeviceIndex) {
uint32_t numQueueGroups = 0;
ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
(ZeDevice, &numQueueGroups, nullptr));
Expand All @@ -542,44 +549,54 @@ pi_result _pi_device::initialize() {
(ZeDevice, &numQueueGroups, QueueProperties.data()));

int ComputeGroupIndex = -1;
for (uint32_t i = 0; i < numQueueGroups; i++) {
if (QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
ComputeGroupIndex = i;
break;
}
}
// How is it possible that there are no "compute" capabilities?
if (ComputeGroupIndex < 0) {
return PI_ERROR_UNKNOWN;
}
ZeComputeQueueGroupIndex = ComputeGroupIndex;
ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];

int CopyGroupIndex = -1;
const char *CopyEngine = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
bool UseCopyEngine = (!CopyEngine || (std::stoi(CopyEngine) != 0));
if (UseCopyEngine) {
// Initialize a sub-sub-device with its own ordinal and index
if (SubSubDeviceOrdinal >= 0) {
ComputeGroupIndex = SubSubDeviceOrdinal;
ZeComputeEngineIndex = SubSubDeviceIndex;
} else { // This is a root or a sub-device
for (uint32_t i = 0; i < numQueueGroups; i++) {
if (((QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
(QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
CopyGroupIndex = i;
if (QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
ComputeGroupIndex = i;
break;
}
}
if (CopyGroupIndex < 0)
zePrint("NOTE: blitter/copy engine is not available though it was "
"requested\n");
else
zePrint("NOTE: blitter/copy engine is available\n");
}
ZeCopyQueueGroupIndex = CopyGroupIndex;
if (CopyGroupIndex >= 0) {
ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
// How is it possible that there are no "compute" capabilities?
if (ComputeGroupIndex < 0) {
return PI_ERROR_UNKNOWN;
}

// The index for a root or a sub-device is always 0.
ZeComputeEngineIndex = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This deserves a comment why we are using "0" and how it scales to HW


int CopyGroupIndex = -1;
if (CopyEngineRequested) {
for (uint32_t i = 0; i < numQueueGroups; i++) {
if (((QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
(QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
CopyGroupIndex = i;
break;
}
}
if (CopyGroupIndex < 0)
zePrint("NOTE: blitter/copy engine is not available though it was "
"requested\n");
else
zePrint("NOTE: blitter/copy engine is available\n");
}

ZeCopyQueueGroupIndex = CopyGroupIndex;
if (CopyGroupIndex >= 0) {
ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
}
}

ZeComputeQueueGroupIndex = ComputeGroupIndex;
ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];

// Cache device properties
ZeDeviceProperties = {};
ZE_CALL(zeDeviceGetProperties, (ZeDevice, &ZeDeviceProperties));
Expand All @@ -598,7 +615,7 @@ pi_result _pi_context::initialize() {
pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0];
ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex;
ZeCommandQueueDesc.index = 0;
ZeCommandQueueDesc.index = Device->ZeComputeEngineIndex;
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ZE_CALL(
zeCommandListCreateImmediate,
Expand Down Expand Up @@ -1547,6 +1564,50 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() {
delete[] ZeSubdevices;
return Result;
}

// collect all the ordinals for the sub-sub-devices
std::vector<int> Ordinals;

uint32_t numQueueGroups = 0;
ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
(PiSubDevice->ZeDevice, &numQueueGroups, nullptr));
if (numQueueGroups == 0) {
return PI_ERROR_UNKNOWN;
}
std::vector<ze_command_queue_group_properties_t> QueueProperties(
numQueueGroups);
ZE_CALL(
zeDeviceGetCommandQueueGroupProperties,
(PiSubDevice->ZeDevice, &numQueueGroups, QueueProperties.data()));

for (uint32_t i = 0; i < numQueueGroups; i++) {
if (QueueProperties[i].flags &
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE &&
QueueProperties[i].numQueues > 1) {
Ordinals.push_back(i);
}
}

// Create PI sub-sub-devices with the sub-device for all the ordinals.
// Each {ordinal, index} points to a specific CCS which constructs
// a sub-sub-device at this point.
for (uint32_t J = 0; J < Ordinals.size(); ++J) {
for (uint32_t K = 0; K < QueueProperties[Ordinals[J]].numQueues;
++K) {
std::unique_ptr<_pi_device> PiSubSubDevice(
new _pi_device(ZeSubdevices[I], this, PiSubDevice.get()));
pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K);
if (Result != PI_SUCCESS) {
return Result;
}

// save pointers to sub-sub-devices for quick retrieval in the
// future.
PiSubDevice->SubDevices.push_back(PiSubSubDevice.get());
PiDevicesCache.push_back(std::move(PiSubSubDevice));
}
}

// save pointers to sub-devices for quick retrieval in the future.
Device->SubDevices.push_back(PiSubDevice.get());
PiDevicesCache.push_back(std::move(PiSubDevice));
Expand Down Expand Up @@ -1777,17 +1838,23 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
case PI_DEVICE_INFO_VERSION:
return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
uint32_t ZeSubDeviceCount = 0;
ZE_CALL(zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr));
return ReturnValue(pi_uint32{ZeSubDeviceCount});
pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
if (Res != PI_SUCCESS) {
return Res;
}
return ReturnValue(pi_uint32{(unsigned int)(Device->SubDevices.size())});
}
case PI_DEVICE_INFO_REFERENCE_COUNT:
return ReturnValue(pi_uint32{Device->RefCount});
case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
// SYCL spec says: if this SYCL device cannot be partitioned into at least
// two sub devices then the returned vector must be empty.
uint32_t ZeSubDeviceCount = 0;
ZE_CALL(zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr));
pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
if (Res != PI_SUCCESS) {
return Res;
}

uint32_t ZeSubDeviceCount = Device->SubDevices.size();
if (ZeSubDeviceCount < 2) {
return ReturnValue(pi_device_partition_property{0});
}
Expand Down Expand Up @@ -2402,7 +2469,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
ZeDevice = Device->ZeDevice;
ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex;
ZeCommandQueueDesc.index = 0;
ZeCommandQueueDesc.index = Device->ZeComputeEngineIndex;
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

ZE_CALL(zeCommandQueueCreate,
Expand Down
17 changes: 16 additions & 1 deletion sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ struct _pi_device : _pi_object {
int32_t ZeComputeQueueGroupIndex;
int32_t ZeCopyQueueGroupIndex;

// Keep the index of the compute engine
int32_t ZeComputeEngineIndex = 0;

// Cache the properties of the compute/copy queue groups.
ZeStruct<ze_command_queue_group_properties_t> ZeComputeQueueGroupProperties;
ZeStruct<ze_command_queue_group_properties_t> ZeCopyQueueGroupProperties;
Expand All @@ -299,7 +302,11 @@ struct _pi_device : _pi_object {
bool hasCopyEngine() const { return ZeCopyQueueGroupIndex >= 0; }

// Initialize the entire PI device.
pi_result initialize();
// Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute
// command queue ordinal and index respectively, used to initialize
// sub-sub-devices.
pi_result initialize(int SubSubDeviceOrdinal = -1,
int SubSubDeviceIndex = -1);

// Level Zero device handle.
ze_device_handle_t ZeDevice;
Expand Down Expand Up @@ -358,6 +365,14 @@ struct _pi_context : _pi_object {
// include root device itself as well)
SingleRootDevice =
Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];

// For context with sub subdevices, the SingleRootDevice might still
// not be the root device.
// Check whether the SingleRootDevice is the subdevice or root device.
if (SingleRootDevice->isSubDevice()) {
SingleRootDevice = SingleRootDevice->RootDevice;
}

for (auto &Device : Devices) {
if ((!Device->RootDevice && Device != SingleRootDevice) ||
(Device->RootDevice && Device->RootDevice != SingleRootDevice)) {
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@ std::vector<device> device_impl::create_sub_devices(
!is_affinity_supported(AffinityDomain)) {
throw cl::sycl::feature_not_supported();
}
const cl_device_partition_property Properties[3] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
(cl_device_partition_property)AffinityDomain, 0};
const pi_device_partition_property Properties[3] = {
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
(pi_device_partition_property)AffinityDomain, 0};
size_t SubDevicesCount = get_info<info::device::partition_max_sub_devices>();
return create_sub_devices(Properties, SubDevicesCount);
}
Expand Down