@@ -46,6 +46,12 @@ static const pi_uint32 ZeSerialize = [] {
46
46
return SerializeModeValue;
47
47
}();
48
48
49
// Tells whether use of the blitter/copy engine was requested by the user.
// Controlled by the SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE environment variable:
//   - unset                        -> requested (the default);
//   - parses to a non-zero integer -> requested;
//   - parses to zero               -> not requested;
//   - not a number at all          -> requested (falls back to the default).
// The value is computed once, during static initialization.
static const bool CopyEngineRequested = [] {
  const char *CopyEngine = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
  if (!CopyEngine)
    return true;

  // std::strtol is used instead of std::stoi because the latter throws on
  // malformed or out-of-range input, and an exception escaping a static
  // initializer cannot be caught and terminates the process.
  char *ParseEnd = nullptr;
  const long Value = std::strtol(CopyEngine, &ParseEnd, 10);
  if (ParseEnd == CopyEngine)
    return true; // No digits were consumed; keep the default.
  return Value != 0;
}();
54
+
49
55
// This class encapsulates actions taken along with a call to Level Zero API.
50
56
class ZeCall {
51
57
private:
@@ -529,7 +535,8 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
529
535
return PI_SUCCESS;
530
536
}
531
537
532
- pi_result _pi_device::initialize () {
538
+ pi_result _pi_device::initialize (int SubSubDeviceOrdinal,
539
+ int SubSubDeviceIndex) {
533
540
uint32_t numQueueGroups = 0 ;
534
541
ZE_CALL (zeDeviceGetCommandQueueGroupProperties,
535
542
(ZeDevice, &numQueueGroups, nullptr ));
@@ -542,44 +549,54 @@ pi_result _pi_device::initialize() {
542
549
(ZeDevice, &numQueueGroups, QueueProperties.data ()));
543
550
544
551
int ComputeGroupIndex = -1 ;
545
- for (uint32_t i = 0 ; i < numQueueGroups; i++) {
546
- if (QueueProperties[i].flags &
547
- ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
548
- ComputeGroupIndex = i;
549
- break ;
550
- }
551
- }
552
- // How is it possible that there are no "compute" capabilities?
553
- if (ComputeGroupIndex < 0 ) {
554
- return PI_ERROR_UNKNOWN;
555
- }
556
- ZeComputeQueueGroupIndex = ComputeGroupIndex;
557
- ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];
558
552
559
- int CopyGroupIndex = -1 ;
560
- const char *CopyEngine = std::getenv (" SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE" );
561
- bool UseCopyEngine = (!CopyEngine || (std::stoi (CopyEngine) != 0 ));
562
- if (UseCopyEngine) {
553
+ // Initialize a sub-sub-device with its own ordinal and index
554
+ if (SubSubDeviceOrdinal >= 0 ) {
555
+ ComputeGroupIndex = SubSubDeviceOrdinal;
556
+ ZeComputeEngineIndex = SubSubDeviceIndex;
557
+ } else { // This is a root or a sub-device
563
558
for (uint32_t i = 0 ; i < numQueueGroups; i++) {
564
- if (((QueueProperties[i].flags &
565
- ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0 ) &&
566
- (QueueProperties[i].flags &
567
- ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
568
- CopyGroupIndex = i;
559
+ if (QueueProperties[i].flags &
560
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
561
+ ComputeGroupIndex = i;
569
562
break ;
570
563
}
571
564
}
572
- if (CopyGroupIndex < 0 )
573
- zePrint (" NOTE: blitter/copy engine is not available though it was "
574
- " requested\n " );
575
- else
576
- zePrint (" NOTE: blitter/copy engine is available\n " );
577
- }
578
- ZeCopyQueueGroupIndex = CopyGroupIndex;
579
- if (CopyGroupIndex >= 0 ) {
580
- ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
565
+ // How is it possible that there are no "compute" capabilities?
566
+ if (ComputeGroupIndex < 0 ) {
567
+ return PI_ERROR_UNKNOWN;
568
+ }
569
+
570
+ // The index for a root or a sub-device is always 0.
571
+ ZeComputeEngineIndex = 0 ;
572
+
573
+ int CopyGroupIndex = -1 ;
574
+ if (CopyEngineRequested) {
575
+ for (uint32_t i = 0 ; i < numQueueGroups; i++) {
576
+ if (((QueueProperties[i].flags &
577
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0 ) &&
578
+ (QueueProperties[i].flags &
579
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) {
580
+ CopyGroupIndex = i;
581
+ break ;
582
+ }
583
+ }
584
+ if (CopyGroupIndex < 0 )
585
+ zePrint (" NOTE: blitter/copy engine is not available though it was "
586
+ " requested\n " );
587
+ else
588
+ zePrint (" NOTE: blitter/copy engine is available\n " );
589
+ }
590
+
591
+ ZeCopyQueueGroupIndex = CopyGroupIndex;
592
+ if (CopyGroupIndex >= 0 ) {
593
+ ZeCopyQueueGroupProperties = QueueProperties[CopyGroupIndex];
594
+ }
581
595
}
582
596
597
+ ZeComputeQueueGroupIndex = ComputeGroupIndex;
598
+ ZeComputeQueueGroupProperties = QueueProperties[ComputeGroupIndex];
599
+
583
600
// Cache device properties
584
601
ZeDeviceProperties = {};
585
602
ZE_CALL (zeDeviceGetProperties, (ZeDevice, &ZeDeviceProperties));
@@ -598,7 +615,7 @@ pi_result _pi_context::initialize() {
598
615
pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0 ];
599
616
ZeStruct<ze_command_queue_desc_t > ZeCommandQueueDesc;
600
617
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex ;
601
- ZeCommandQueueDesc.index = 0 ;
618
+ ZeCommandQueueDesc.index = Device-> ZeComputeEngineIndex ;
602
619
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
603
620
ZE_CALL (
604
621
zeCommandListCreateImmediate,
@@ -1547,6 +1564,50 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() {
1547
1564
delete[] ZeSubdevices;
1548
1565
return Result;
1549
1566
}
1567
+
1568
+ // collect all the ordinals for the sub-sub-devices
1569
+ std::vector<int > Ordinals;
1570
+
1571
+ uint32_t numQueueGroups = 0 ;
1572
+ ZE_CALL (zeDeviceGetCommandQueueGroupProperties,
1573
+ (PiSubDevice->ZeDevice , &numQueueGroups, nullptr ));
1574
+ if (numQueueGroups == 0 ) {
1575
+ return PI_ERROR_UNKNOWN;
1576
+ }
1577
+ std::vector<ze_command_queue_group_properties_t > QueueProperties (
1578
+ numQueueGroups);
1579
+ ZE_CALL (
1580
+ zeDeviceGetCommandQueueGroupProperties,
1581
+ (PiSubDevice->ZeDevice , &numQueueGroups, QueueProperties.data ()));
1582
+
1583
+ for (uint32_t i = 0 ; i < numQueueGroups; i++) {
1584
+ if (QueueProperties[i].flags &
1585
+ ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE &&
1586
+ QueueProperties[i].numQueues > 1 ) {
1587
+ Ordinals.push_back (i);
1588
+ }
1589
+ }
1590
+
1591
+ // Create PI sub-sub-devices with the sub-device for all the ordinals.
1592
+ // Each {ordinal, index} points to a specific CCS which constructs
1593
+ // a sub-sub-device at this point.
1594
+ for (uint32_t J = 0 ; J < Ordinals.size (); ++J) {
1595
+ for (uint32_t K = 0 ; K < QueueProperties[Ordinals[J]].numQueues ;
1596
+ ++K) {
1597
+ std::unique_ptr<_pi_device> PiSubSubDevice (
1598
+ new _pi_device (ZeSubdevices[I], this , PiSubDevice.get ()));
1599
+ pi_result Result = PiSubSubDevice->initialize (Ordinals[J], K);
1600
+ if (Result != PI_SUCCESS) {
1601
+ return Result;
1602
+ }
1603
+
1604
+ // save pointers to sub-sub-devices for quick retrieval in the
1605
+ // future.
1606
+ PiSubDevice->SubDevices .push_back (PiSubSubDevice.get ());
1607
+ PiDevicesCache.push_back (std::move (PiSubSubDevice));
1608
+ }
1609
+ }
1610
+
1550
1611
// save pointers to sub-devices for quick retrieval in the future.
1551
1612
Device->SubDevices .push_back (PiSubDevice.get ());
1552
1613
PiDevicesCache.push_back (std::move (PiSubDevice));
@@ -1777,17 +1838,23 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
1777
1838
case PI_DEVICE_INFO_VERSION:
1778
1839
return ReturnValue (Device->Platform ->ZeDriverApiVersion .c_str ());
1779
1840
case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
1780
- uint32_t ZeSubDeviceCount = 0 ;
1781
- ZE_CALL (zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr ));
1782
- return ReturnValue (pi_uint32{ZeSubDeviceCount});
1841
+ pi_result Res = Device->Platform ->populateDeviceCacheIfNeeded ();
1842
+ if (Res != PI_SUCCESS) {
1843
+ return Res;
1844
+ }
1845
+ return ReturnValue (pi_uint32{(unsigned int )(Device->SubDevices .size ())});
1783
1846
}
1784
1847
case PI_DEVICE_INFO_REFERENCE_COUNT:
1785
1848
return ReturnValue (pi_uint32{Device->RefCount });
1786
1849
case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
1787
1850
// SYCL spec says: if this SYCL device cannot be partitioned into at least
1788
1851
// two sub devices then the returned vector must be empty.
1789
- uint32_t ZeSubDeviceCount = 0 ;
1790
- ZE_CALL (zeDeviceGetSubDevices, (ZeDevice, &ZeSubDeviceCount, nullptr ));
1852
+ pi_result Res = Device->Platform ->populateDeviceCacheIfNeeded ();
1853
+ if (Res != PI_SUCCESS) {
1854
+ return Res;
1855
+ }
1856
+
1857
+ uint32_t ZeSubDeviceCount = Device->SubDevices .size ();
1791
1858
if (ZeSubDeviceCount < 2 ) {
1792
1859
return ReturnValue (pi_device_partition_property{0 });
1793
1860
}
@@ -2402,7 +2469,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
2402
2469
ZeDevice = Device->ZeDevice ;
2403
2470
ZeStruct<ze_command_queue_desc_t > ZeCommandQueueDesc;
2404
2471
ZeCommandQueueDesc.ordinal = Device->ZeComputeQueueGroupIndex ;
2405
- ZeCommandQueueDesc.index = 0 ;
2472
+ ZeCommandQueueDesc.index = Device-> ZeComputeEngineIndex ;
2406
2473
ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
2407
2474
2408
2475
ZE_CALL (zeCommandQueueCreate,
0 commit comments