Skip to content

Commit 9bc3bc4

Browse files
authored
[SYCL][Level Zero] Enable multi-CCS support. (#4038)
Signed-off-by: rehana begam <[email protected]>
1 parent 351af24 commit 9bc3bc4

File tree

3 files changed

+125
-43
lines changed

3 files changed

+125
-43
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 106 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ static const pi_uint32 ZeSerialize = [] {
4646
return SerializeModeValue;
4747
}();
4848

49+
static const bool CopyEngineRequested = [] {
50+
const char *CopyEngine = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
51+
bool UseCopyEngine = (!CopyEngine || (std::stoi(CopyEngine) != 0));
52+
return UseCopyEngine;
53+
}();
54+
4955
// This class encapsulates actions taken along with a call to Level Zero API.
5056
class ZeCall {
5157
private:
@@ -529,7 +535,8 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
529535
return PI_SUCCESS;
530536
}
531537

532-
pi_result _pi_device::initialize() {
538+
pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
539+
int SubSubDeviceIndex) {
533540
uint32_t numQueueGroups = 0;
534541
ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
535542
(ZeDevice, &numQueueGroups, nullptr));
@@ -542,44 +549,54 @@ pi_result _pi_device::initialize() {
542549
(ZeDevice, &numQueueGroups, QueueProperties.data()));
543550

544551
int ComputeGroupIndex = -1;
545-
for (uint32_t i = 0; i < numQueueGroups; i++) {
546-
if (QueueProperties[i].flags &
547-
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
548-
ComputeGroupIndex = i;
549-
break;
550-
}
551-
}
552-
// How is it possible that there are no "compute" capabilities?
553-
if (ComputeGroupIndex < 0) {
554-
return PI_ERROR_UNKNOWN;
555-
}
556-
ZeComputeQueueGroupIndex = ComputeGroupIndex;
557-
ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];
558552

559-
int CopyGroupIndex = -1;
560-
const char *CopyEngine = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
561-
bool UseCopyEngine = (!CopyEngine || (std::stoi(CopyEngine) != 0));
562-
if (UseCopyEngine) {
553+
// Initialize a sub-sub-device with its own ordinal and index
554+
if (SubSubDeviceOrdinal >= 0) {
555+
ComputeGroupIndex = SubSubDeviceOrdinal;
556+
ZeComputeEngineIndex = SubSubDeviceIndex;
557+
} else { // This is a root or a sub-device
563558
for (uint32_t i = 0; i < numQueueGroups; i++) {
564-
if (((QueueProperties[i].flags &
565-
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
566-
(QueueProperties[i].flags &
567-
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
568-
CopyGroupIndex = i;
559+
if (QueueProperties[i].flags &
560+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
561+
ComputeGroupIndex = i;
569562
break;
570563
}
571564
}
572-
if (CopyGroupIndex < 0)
573-
zePrint("NOTE: blitter/copy engine is not available though it was "
574-
"requested\n");
575-
else
576-
zePrint("NOTE: blitter/copy engine is available\n");
577-
}
578-
ZeCopyQueueGroupIndex = CopyGroupIndex;
579-
if (CopyGroupIndex >= 0) {
580-
ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
565+
// How is it possible that there are no "compute" capabilities?
566+
if (ComputeGroupIndex < 0) {
567+
return PI_ERROR_UNKNOWN;
568+
}
569+
570+
// The index for a root or a sub-device is always 0.
571+
ZeComputeEngineIndex = 0;
572+
573+
int CopyGroupIndex = -1;
574+
if (CopyEngineRequested) {
575+
for (uint32_t i = 0; i < numQueueGroups; i++) {
576+
if (((QueueProperties[i].flags &
577+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) &&
578+
(QueueProperties[i].flags &
579+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
580+
CopyGroupIndex = i;
581+
break;
582+
}
583+
}
584+
if (CopyGroupIndex < 0)
585+
zePrint("NOTE: blitter/copy engine is not available though it was "
586+
"requested\n");
587+
else
588+
zePrint("NOTE: blitter/copy engine is available\n");
589+
}
590+
591+
ZeCopyQueueGroupIndex = CopyGroupIndex;
592+
if (CopyGroupIndex >= 0) {
593+
ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
594+
}
581595
}
582596

597+
ZeComputeQueueGroupIndex = ComputeGroupIndex;
598+
ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];
599+
583600
// Cache device properties
584601
ZeDeviceProperties = {};
585602
ZE_CALL(zeDeviceGetProperties, (ZeDevice, &ZeDeviceProperties));
@@ -598,7 +615,7 @@ pi_result _pi_context::initialize() {
598615
pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0];
599616
ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
600617
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex;
601-
ZeCommandQueueDesc.index = 0;
618+
ZeCommandQueueDesc.index = Device->ZeComputeEngineIndex;
602619
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
603620
ZE_CALL(
604621
zeCommandListCreateImmediate,
@@ -1547,6 +1564,50 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() {
15471564
delete[] ZeSubdevices;
15481565
return Result;
15491566
}
1567+
1568+
// collect all the ordinals for the sub-sub-devices
1569+
std::vector<int> Ordinals;
1570+
1571+
uint32_t numQueueGroups = 0;
1572+
ZE_CALL(zeDeviceGetCommandQueueGroupProperties,
1573+
(PiSubDevice->ZeDevice, &numQueueGroups, nullptr));
1574+
if (numQueueGroups == 0) {
1575+
return PI_ERROR_UNKNOWN;
1576+
}
1577+
std::vector<ze_command_queue_group_properties_t> QueueProperties(
1578+
numQueueGroups);
1579+
ZE_CALL(
1580+
zeDeviceGetCommandQueueGroupProperties,
1581+
(PiSubDevice->ZeDevice, &numQueueGroups, QueueProperties.data()));
1582+
1583+
for (uint32_t i = 0; i < numQueueGroups; i++) {
1584+
if (QueueProperties[i].flags &
1585+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE &&
1586+
QueueProperties[i].numQueues > 1) {
1587+
Ordinals.push_back(i);
1588+
}
1589+
}
1590+
1591+
// Create PI sub-sub-devices with the sub-device for all the ordinals.
1592+
// Each {ordinal, index} points to a specific CCS which constructs
1593+
// a sub-sub-device at this point.
1594+
for (uint32_t J = 0; J < Ordinals.size(); ++J) {
1595+
for (uint32_t K = 0; K < QueueProperties[Ordinals[J]].numQueues;
1596+
++K) {
1597+
std::unique_ptr<_pi_device> PiSubSubDevice(
1598+
new _pi_device(ZeSubdevices[I], this, PiSubDevice.get()));
1599+
pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K);
1600+
if (Result != PI_SUCCESS) {
1601+
return Result;
1602+
}
1603+
1604+
// save pointers to sub-sub-devices for quick retrieval in the
1605+
// future.
1606+
PiSubDevice->SubDevices.push_back(PiSubSubDevice.get());
1607+
PiDevicesCache.push_back(std::move(PiSubSubDevice));
1608+
}
1609+
}
1610+
15501611
// save pointers to sub-devices for quick retrieval in the future.
15511612
Device->SubDevices.push_back(PiSubDevice.get());
15521613
PiDevicesCache.push_back(std::move(PiSubDevice));
@@ -1777,17 +1838,23 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
17771838
case PI_DEVICE_INFO_VERSION:
17781839
return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
17791840
case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
1780-
uint32_t ZeSubDeviceCount = 0;
1781-
ZE_CALL(zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr));
1782-
return ReturnValue(pi_uint32{ZeSubDeviceCount});
1841+
pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
1842+
if (Res != PI_SUCCESS) {
1843+
return Res;
1844+
}
1845+
return ReturnValue(pi_uint32{(unsigned int)(Device->SubDevices.size())});
17831846
}
17841847
case PI_DEVICE_INFO_REFERENCE_COUNT:
17851848
return ReturnValue(pi_uint32{Device->RefCount});
17861849
case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
17871850
// SYCL spec says: if this SYCL device cannot be partitioned into at least
17881851
// two sub devices then the returned vector must be empty.
1789-
uint32_t ZeSubDeviceCount = 0;
1790-
ZE_CALL(zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr));
1852+
pi_result Res = Device->Platform->populateDeviceCacheIfNeeded();
1853+
if (Res != PI_SUCCESS) {
1854+
return Res;
1855+
}
1856+
1857+
uint32_t ZeSubDeviceCount = Device->SubDevices.size();
17911858
if (ZeSubDeviceCount < 2) {
17921859
return ReturnValue(pi_device_partition_property{0});
17931860
}
@@ -2402,7 +2469,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
24022469
ZeDevice = Device->ZeDevice;
24032470
ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
24042471
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex;
2405-
ZeCommandQueueDesc.index = 0;
2472+
ZeCommandQueueDesc.index = Device->ZeComputeEngineIndex;
24062473
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
24072474

24082475
ZE_CALL(zeCommandQueueCreate,

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,9 @@ struct _pi_device : _pi_object {
291291
int32_t ZeComputeQueueGroupIndex;
292292
int32_t ZeCopyQueueGroupIndex;
293293

294+
// Keep the index of the compute engine
295+
int32_t ZeComputeEngineIndex = 0;
296+
294297
// Cache the properties of the compute/copy queue groups.
295298
ZeStruct<ze_command_queue_group_properties_t> ZeComputeQueueGroupProperties;
296299
ZeStruct<ze_command_queue_group_properties_t> ZeCopyQueueGroupProperties;
@@ -299,7 +302,11 @@ struct _pi_device : _pi_object {
299302
bool hasCopyEngine() const { return ZeCopyQueueGroupIndex >= 0; }
300303

301304
// Initialize the entire PI device.
302-
pi_result initialize();
305+
// Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute
306+
// command queue ordinal and index respectively, used to initialize
307+
// sub-sub-devices.
308+
pi_result initialize(int SubSubDeviceOrdinal = -1,
309+
int SubSubDeviceIndex = -1);
303310

304311
// Level Zero device handle.
305312
ze_device_handle_t ZeDevice;
@@ -358,6 +365,14 @@ struct _pi_context : _pi_object {
358365
// include root device itself as well)
359366
SingleRootDevice =
360367
Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];
368+
369+
// For context with sub subdevices, the SingleRootDevice might still
370+
// not be the root device.
371+
// Check whether the SingleRootDevice is the subdevice or root device.
372+
if (SingleRootDevice->isSubDevice()) {
373+
SingleRootDevice = SingleRootDevice->RootDevice;
374+
}
375+
361376
for (auto &Device : Devices) {
362377
if ((!Device->RootDevice && Device != SingleRootDevice) ||
363378
(Device->RootDevice && Device->RootDevice != SingleRootDevice)) {

sycl/source/detail/device_impl.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,9 @@ std::vector<device> device_impl::create_sub_devices(
199199
!is_affinity_supported(AffinityDomain)) {
200200
throw cl::sycl::feature_not_supported();
201201
}
202-
const cl_device_partition_property Properties[3] = {
203-
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
204-
(cl_device_partition_property)AffinityDomain, 0};
202+
const pi_device_partition_property Properties[3] = {
203+
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
204+
(pi_device_partition_property)AffinityDomain, 0};
205205
size_t SubDevicesCount = get_info<info::device::partition_max_sub_devices>();
206206
return create_sub_devices(Properties, SubDevicesCount);
207207
}

0 commit comments

Comments
 (0)