Skip to content

Commit 7b207d5

Browse files
feature: CPU copy path for in-order CommandList
Related-To: NEO-7966 Signed-off-by: Dunajski, Bartosz <[email protected]>
1 parent 5a908f6 commit 7b207d5

File tree

5 files changed

+148
-0
lines changed

5 files changed

+148
-0
lines changed

level_zero/core/source/cmdlist/cmdlist_hw_immediate.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
4040
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
4141
using BaseClass = CommandListCoreFamily<gfxCoreFamily>;
4242
using BaseClass::BaseClass;
43+
using BaseClass::copyThroughLockedPtrEnabled;
4344
using BaseClass::executeCommandListImmediate;
4445
using BaseClass::isCopyOnly;
4546
using BaseClass::isInOrderExecutionEnabled;
@@ -174,6 +175,8 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
174175
using BaseClass::inOrderDependencyCounterAllocation;
175176

176177
void printKernelsPrintfOutput(bool hangDetected);
178+
ze_result_t synchronizeInOrderExecution() const;
179+
177180
MOCKABLE_VIRTUAL void checkAssert();
178181
std::atomic<bool> dependenciesPresent{false};
179182
};

level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "shared/source/memory_manager/internal_allocation_storage.h"
2121
#include "shared/source/memory_manager/unified_memory_manager.h"
2222
#include "shared/source/os_interface/os_context.h"
23+
#include "shared/source/utilities/wait_util.h"
2324

2425
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
2526
#include "level_zero/core/source/cmdqueue/cmdqueue_hw.h"
@@ -787,6 +788,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
787788
return ZE_RESULT_ERROR_UNKNOWN;
788789
}
789790

791+
if (isInOrderExecutionEnabled()) {
792+
this->dependenciesPresent = false; // wait only for waitlist and in-order sync value
793+
}
794+
790795
if (numWaitEvents > 0) {
791796
uint32_t numEventsThreshold = 5;
792797
if (NEO::DebugManager.flags.ExperimentalCopyThroughLockWaitlistSizeThreshold.get() != -1) {
@@ -826,6 +831,13 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
826831
this->dependenciesPresent = false;
827832
}
828833

834+
if (isInOrderExecutionEnabled()) {
835+
auto status = synchronizeInOrderExecution();
836+
if (status != ZE_RESULT_SUCCESS) {
837+
return status;
838+
}
839+
}
840+
829841
if (signalEvent) {
830842
signalEvent->setGpuStartTimestamp();
831843
}
@@ -1028,4 +1040,24 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAll
10281040
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*this->csr, numEvents);
10291041
}
10301042

1043+
// Host-side wait for in-order execution.
// Reads a uint32_t counter at the start of inOrderDependencyCounterAllocation's underlying buffer
// and loops until it is >= inOrderDependencyCounter (the wait value is sampled once, before the loop).
// Each iteration asks the CSR to download the counter allocation and then polls the counter through
// NEO::WaitUtils::waitFunctionWithPredicate with std::greater_equal.
// Returns ZE_RESULT_ERROR_DEVICE_LOST when a poll reports failure and the CSR detects a GPU hang;
// returns ZE_RESULT_SUCCESS once the counter has reached the wait value.
// NOTE(review): hostAddress is a plain (non-volatile) pointer; the re-read in the loop condition
// presumably relies on the intervening calls to observe updates - confirm.
template <GFXCORE_FAMILY gfxCoreFamily>
1044+
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::synchronizeInOrderExecution() const {
1045+
auto hostAddress = static_cast<uint32_t *>(this->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
1046+
auto waitValue = this->inOrderDependencyCounter;
1047+
1048+
auto lastHangCheckTime = std::chrono::high_resolution_clock::now();
1049+
1050+
while (*hostAddress < waitValue) {
1051+
// Refresh the counter allocation through the CSR before polling it.
this->csr->downloadAllocation(*this->inOrderDependencyCounterAllocation);
1052+
1053+
bool status = NEO::WaitUtils::waitFunctionWithPredicate<const uint32_t>(hostAddress, waitValue, std::greater_equal<uint32_t>());
1054+
1055+
// The hang check is only evaluated when the poll did not succeed (short-circuit on !status).
if (!status && this->csr->checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime)) {
1056+
return ZE_RESULT_ERROR_DEVICE_LOST;
1057+
}
1058+
}
1059+
1060+
return ZE_RESULT_SUCCESS;
1061+
}
1062+
10311063
} // namespace L0

level_zero/core/test/unit_tests/mocks/mock_cmdlist.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
173173
using BaseClass::signalAllEventPackets;
174174
using BaseClass::stateBaseAddressTracking;
175175
using BaseClass::stateComputeModeTracking;
176+
using BaseClass::synchronizeInOrderExecution;
176177

177178
WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {}
178179
};

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,6 +1115,107 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierThenS
11151115
EXPECT_EQ(2u, pcCmd->getImmediateData());
11161116
}
11171117

1118+
// Calls synchronizeInOrderExecution() directly after one appended kernel launch.
// The counter in host memory starts at 0 and the mocked download hook sets it to 1 on its third call,
// so the test expects three downloads and two GPU-hang checks before the wait completes.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenCallingSyncThenHandleCompletion, IsAtLeastXeHpCore) {
1119+
auto immCmdList = createImmCmdList<gfxCoreFamily>();
1120+
1121+
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
1122+
1123+
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
1124+
1125+
auto hostAddress = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
1126+
// Reset the counter so the wait cannot complete until the hook below updates it.
*hostAddress = 0;
1127+
1128+
const uint32_t failCounter = 3;
1129+
uint32_t callCounter = 0;
1130+
1131+
// Download hook: counts invocations and writes 1 to the counter from the third call onwards.
ultCsr->downloadAllocationImpl = [&](GraphicsAllocation &graphicsAllocation) {
1132+
callCounter++;
1133+
if (callCounter >= failCounter) {
1134+
*hostAddress = 1;
1135+
}
1136+
};
1137+
1138+
// NOTE(review): the returned ze_result_t is not checked here.
immCmdList->synchronizeInOrderExecution();
1139+
1140+
EXPECT_EQ(3u, callCounter);
1141+
EXPECT_EQ(2u, ultCsr->checkGpuHangDetectedCalled);
1142+
EXPECT_EQ(1u, *hostAddress);
1143+
}
1144+
1145+
// Exercises appendMemoryCopy with copyThroughLockedPtrEnabled set on an in-order immediate command list.
// A kernel launch is appended first and its signal event is marked completed; the copy then waits on that event.
// The mocked download hook starts incrementing the counter on its third call, and the test expects
// three downloads, two GPU-hang checks, no waitForCompletionWithTimeoutTaskCount calls and no tag-update flush.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenDoingCpuCopyThenSynchronize, IsAtLeastXeHpCore) {
1146+
auto immCmdList = createImmCmdList<gfxCoreFamily>();
1147+
immCmdList->copyThroughLockedPtrEnabled = true;
1148+
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
1149+
1150+
auto eventPool = createEvents(1, false);
1151+
1152+
auto eventHandle = events[0]->toHandle();
1153+
1154+
auto hostAddress = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
1155+
*hostAddress = 0;
1156+
1157+
const uint32_t failCounter = 3;
1158+
uint32_t callCounter = 0;
1159+
1160+
// Download hook: counts invocations and increments the counter on every call from the third onwards.
ultCsr->downloadAllocationImpl = [&](GraphicsAllocation &graphicsAllocation) {
1161+
callCounter++;
1162+
if (callCounter >= failCounter) {
1163+
(*hostAddress)++;
1164+
}
1165+
};
1166+
1167+
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, eventHandle, 0, nullptr, launchParams, false);
1168+
events[0]->setIsCompleted();
1169+
1170+
// Clear CSR bookkeeping so the expectations below only reflect the copy call.
ultCsr->waitForCompletionWithTimeoutTaskCountCalled = 0;
1171+
ultCsr->flushTagUpdateCalled = false;
1172+
1173+
void *deviceAlloc = nullptr;
1174+
ze_device_mem_alloc_desc_t deviceDesc = {};
1175+
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 128, 128, &deviceAlloc);
1176+
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
1177+
1178+
uint32_t hostCopyData = 0;
1179+
1180+
// One-byte copy from host memory to the device allocation, with the completed event in the wait list.
immCmdList->appendMemoryCopy(deviceAlloc, &hostCopyData, 1, nullptr, 1, &eventHandle, false);
1181+
1182+
EXPECT_EQ(3u, callCounter);
1183+
EXPECT_EQ(1u, *hostAddress);
1184+
EXPECT_EQ(2u, ultCsr->checkGpuHangDetectedCalled);
1185+
EXPECT_EQ(0u, ultCsr->waitForCompletionWithTimeoutTaskCountCalled);
1186+
EXPECT_FALSE(ultCsr->flushTagUpdateCalled);
1187+
1188+
context->freeMem(deviceAlloc);
1189+
}
1190+
1191+
// With copyThroughLockedPtrEnabled set and the counter left at 0 after a kernel launch,
// forces the mock CSR to report a GPU hang and expects appendMemoryCopy to return ZE_RESULT_ERROR_DEVICE_LOST.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenGpuHangDetectedInCpuCopyPathThenReportError, IsAtLeastXeHpCore) {
1192+
auto immCmdList = createImmCmdList<gfxCoreFamily>();
1193+
immCmdList->copyThroughLockedPtrEnabled = true;
1194+
1195+
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
1196+
1197+
auto hostAddress = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
1198+
*hostAddress = 0;
1199+
1200+
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
1201+
1202+
void *deviceAlloc = nullptr;
1203+
ze_device_mem_alloc_desc_t deviceDesc = {};
1204+
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 128, 128, &deviceAlloc);
1205+
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
1206+
1207+
uint32_t hostCopyData = 0;
1208+
1209+
// Make every checkGpuHangDetected() call on the mock CSR return true.
ultCsr->forceReturnGpuHang = true;
1210+
1211+
auto status = immCmdList->appendMemoryCopy(deviceAlloc, &hostCopyData, 1, nullptr, 0, nullptr, false);
1212+
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, status);
1213+
1214+
// Restore the default before cleanup.
ultCsr->forceReturnGpuHang = false;
1215+
1216+
context->freeMem(deviceAlloc);
1217+
}
1218+
11181219
struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKernel {
11191220
template <typename FamilyType>
11201221
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::Kernel> &kernel) {

shared/test/common/libult/ult_command_stream_receiver.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,15 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
379379
pollForCompletionCalled++;
380380
}
381381

382+
// Test override of checkGpuHangDetected: counts every call in checkGpuHangDetectedCalled,
// returns true unconditionally while forceReturnGpuHang is set, and otherwise defers to BaseClass.
bool checkGpuHangDetected(CommandStreamReceiver::TimeType currentTime, CommandStreamReceiver::TimeType &lastHangCheckTime) const override {
383+
// Incremented from a const method; the counter is declared mutable in this class.
checkGpuHangDetectedCalled++;
384+
if (forceReturnGpuHang) {
385+
return true;
386+
}
387+
388+
return BaseClass::checkGpuHangDetected(currentTime, lastHangCheckTime);
389+
}
390+
382391
SubmissionStatus sendRenderStateCacheFlush() override {
383392
if (callBaseSendRenderStateCacheFlush) {
384393
return BaseClass::sendRenderStateCacheFlush();
@@ -404,6 +413,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
404413
uint32_t initDirectSubmissionCalled = 0;
405414
uint32_t fillReusableAllocationsListCalled = 0;
406415
uint32_t pollForCompletionCalled = 0;
416+
mutable uint32_t checkGpuHangDetectedCalled = 0;
407417
int ensureCommandBufferAllocationCalled = 0;
408418
DispatchFlags recordedDispatchFlags;
409419
BlitPropertiesContainer receivedBlitProperties = {};
@@ -435,6 +445,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
435445
bool callBaseFillReusableAllocationsList = false;
436446
bool callBaseFlushBcsTask{true};
437447
bool callBaseSendRenderStateCacheFlush = true;
448+
bool forceReturnGpuHang = false;
438449
};
439450

440451
} // namespace NEO

0 commit comments

Comments
 (0)