Skip to content

Commit 6de77db

Browse files
feature: initial support for zeCommandListHostSynchronize
Related-To: LOCI-4191 Signed-off-by: Joshua Santosh Ranjan <[email protected]>
1 parent fb306c8 commit 6de77db

File tree

10 files changed

+151
-14
lines changed

10 files changed

+151
-14
lines changed

level_zero/api/core/ze_barrier_api_entrypoints.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2022 Intel Corporation
2+
* Copyright (C) 2020-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -36,6 +36,12 @@ ze_result_t zeDeviceSystemBarrier(
3636
return L0::Device::fromHandle(hDevice)->systemBarrier();
3737
}
3838

39+
// Driver-internal implementation of zeCommandListHostSynchronize: resolves the
// API handle to its command list object and forwards the timeout to it.
ze_result_t ZE_APICALL zeCommandListHostSynchronize(
    ze_command_list_handle_t hCommandList,
    uint64_t timeout) {
    auto *commandList = L0::CommandList::fromHandle(hCommandList);
    return commandList->hostSynchronize(timeout);
}
44+
3945
} // namespace L0
4046

4147
extern "C" {
@@ -74,4 +80,12 @@ ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceSystemBarrier(
7480
return L0::zeDeviceSystemBarrier(
7581
hDevice);
7682
}
83+
84+
// Exported C entry point; simply forwards both arguments to the L0 namespace implementation.
ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListHostSynchronize(
    ze_command_list_handle_t hCommandList,
    uint64_t timeout) {
    const ze_result_t result = L0::zeCommandListHostSynchronize(hCommandList, timeout);
    return result;
}
7791
}

level_zero/api/core/ze_core_loader.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ zeGetCommandListProcAddrTable(
339339
pDdiTable->pfnAppendWriteGlobalTimestamp = L0::zeCommandListAppendWriteGlobalTimestamp;
340340
pDdiTable->pfnAppendMemoryCopyFromContext = L0::zeCommandListAppendMemoryCopyFromContext;
341341
pDdiTable->pfnAppendQueryKernelTimestamps = L0::zeCommandListAppendQueryKernelTimestamps;
342+
pDdiTable->pfnHostSynchronize = L0::zeCommandListHostSynchronize;
342343
driverDdiTable.coreDdiTable.CommandList = *pDdiTable;
343344
if (driverDdiTable.enableTracing) {
344345
pDdiTable->pfnAppendBarrier = zeCommandListAppendBarrierTracing;

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ struct CommandList : _ze_command_list_handle_t {
175175
uint32_t data, ze_event_handle_t signalEventHandle) = 0;
176176
virtual ze_result_t appendWriteToMemory(void *desc, void *ptr,
177177
uint64_t data) = 0;
178+
virtual ze_result_t hostSynchronize(uint64_t timeout) = 0;
178179

179180
static CommandList *create(uint32_t productFamily, Device *device, NEO::EngineGroupType engineGroupType,
180181
ze_command_list_flags_t flags, ze_result_t &resultValue);

level_zero/core/source/cmdlist/cmdlist_hw.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ struct CommandListCoreFamily : CommandListImp {
165165
ze_result_t appendQueryKernelTimestamps(uint32_t numEvents, ze_event_handle_t *phEvents, void *dstptr,
166166
const size_t *pOffsets, ze_event_handle_t hSignalEvent,
167167
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override;
168+
ze_result_t hostSynchronize(uint64_t timeout) override;
168169

169170
ze_result_t appendSignalEvent(ze_event_handle_t hEvent) override;
170171
ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override;

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2472,6 +2472,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
24722472
return ZE_RESULT_SUCCESS;
24732473
}
24742474

2475+
template <GFXCORE_FAMILY gfxCoreFamily>
2476+
ze_result_t CommandListCoreFamily<gfxCoreFamily>::hostSynchronize(uint64_t timeout) {
2477+
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
2478+
}
2479+
24752480
template <GFXCORE_FAMILY gfxCoreFamily>
24762481
ze_result_t CommandListCoreFamily<gfxCoreFamily>::reserveSpace(size_t size, void **ptr) {
24772482
auto availableSpace = commandContainer.getCommandStream()->getAvailableSpace();

level_zero/core/source/cmdlist/cmdlist_hw_immediate.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
143143
uint32_t numWaitEvents,
144144
ze_event_handle_t *waitEventHandles, bool relaxedOrderingDispatch) override;
145145

146+
ze_result_t hostSynchronize(uint64_t timeout) override;
147+
146148
MOCKABLE_VIRTUAL ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
147149
ze_result_t executeCommandListImmediateWithFlushTaskImpl(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, CommandQueue *cmdQ);
148150

level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,26 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchCooperati
688688
return flushImmediate(ret, true, false, relaxedOrderingDispatch, hSignalEvent);
689689
}
690690

691+
template <GFXCORE_FAMILY gfxCoreFamily>
692+
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint64_t timeout) {
693+
694+
if (this->isFlushTaskSubmissionEnabled && !this->isSyncModeQueue) {
695+
const int64_t timeoutInMicroSeconds = timeout / 1000;
696+
auto syncTaskCount = this->csr->peekTaskCount();
697+
const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutInMicroSeconds},
698+
syncTaskCount);
699+
if (waitStatus == NEO::WaitStatus::GpuHang) {
700+
this->printKernelsPrintfOutput(true);
701+
this->checkAssert();
702+
return ZE_RESULT_ERROR_DEVICE_LOST;
703+
}
704+
this->csr->getInternalAllocationStorage()->cleanAllocationList(syncTaskCount, NEO::AllocationUsage::TEMPORARY_ALLOCATION);
705+
this->printKernelsPrintfOutput(false);
706+
this->checkAssert();
707+
}
708+
return ZE_RESULT_SUCCESS;
709+
}
710+
691711
template <GFXCORE_FAMILY gfxCoreFamily>
692712
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds,
693713
bool hasRelaxedOrderingDependencies, ze_event_handle_t hSignalEvent) {

level_zero/core/test/black_box_tests/zello_immediate.cpp

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "zello_common.h"
1111

12+
#include <chrono>
1213
#include <cstring>
1314
#include <fstream>
1415
#include <iomanip>
@@ -31,12 +32,13 @@ void createImmediateCommandList(ze_device_handle_t &device,
3132
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device, &cmdQueueDesc, &cmdList));
3233
}
3334

34-
void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, int32_t copyQueueGroup, bool &validRet) {
35+
void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, int32_t copyQueueGroup, bool &validRet, bool useEventBasedSync) {
3536
const size_t allocSize = 4096 + 7; // +7 to break alignment and make it harder
3637
char *hostBuffer = nullptr;
3738
void *deviceBuffer = nullptr;
3839
char *stackBuffer = new char[allocSize];
3940
ze_command_list_handle_t cmdList;
41+
const bool isEventsUsed = useEventBasedSync && !syncMode;
4042

4143
createImmediateCommandList(device, context, copyQueueGroup, syncMode, cmdList);
4244

@@ -80,13 +82,17 @@ void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_
8082

8183
// Copy from device-allocated memory to stack
8284
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, stackBuffer, deviceBuffer, allocSize,
83-
syncMode ? nullptr : hostEvents[0],
85+
isEventsUsed ? hostEvents[0] : nullptr,
8486
syncMode ? 0 : 1,
8587
syncMode ? nullptr : &deviceEvents[0]));
8688

8789
if (!syncMode) {
88-
// If Async mode, use event for sync
89-
SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits<uint64_t>::max()));
90+
if (isEventsUsed) {
91+
// If Async mode, use event for sync
92+
SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits<uint64_t>::max()));
93+
} else {
94+
SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, std::numeric_limits<uint64_t>::max()));
95+
}
9096
}
9197

9298
// Validate stack and xe deviceBuffers have the original data from hostBuffer
@@ -108,11 +114,12 @@ void testCopyBetweenHostMemAndDeviceMem(ze_context_handle_t &context, ze_device_
108114
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList));
109115
}
110116

111-
void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, bool &outputValidationSuccessful) {
117+
void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_t &device, bool syncMode, bool &outputValidationSuccessful, bool useEventBasedSync) {
112118
ze_command_list_handle_t cmdList;
113119

114120
uint32_t computeOrdinal = getCommandQueueOrdinal(device);
115121
createImmediateCommandList(device, context, computeOrdinal, syncMode, cmdList);
122+
const auto isEventsUsed = useEventBasedSync && !syncMode;
116123

117124
// Create two shared buffers
118125
constexpr size_t allocSize = 4096;
@@ -202,17 +209,25 @@ void executeGpuKernelAndValidate(ze_context_handle_t &context, ze_device_handle_
202209
dispatchTraits.groupCountZ = 1u;
203210

204211
SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(cmdList, kernel, &dispatchTraits,
205-
syncMode ? nullptr : hostEvents[0], 0, nullptr));
212+
isEventsUsed ? hostEvents[0] : nullptr, 0, nullptr));
206213
file.close();
207214
} else {
208215
// Perform a GPU copy
209216
SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmdList, dstBuffer, srcBuffer, allocSize,
210-
syncMode ? nullptr : hostEvents[0], 0, nullptr));
217+
isEventsUsed ? hostEvents[0] : nullptr, 0, nullptr));
211218
}
212219

213220
if (!syncMode) {
214-
// If Async mode, use event for sync
215-
SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits<uint64_t>::max()));
221+
std::chrono::high_resolution_clock::time_point start, end;
222+
start = std::chrono::high_resolution_clock::now();
223+
if (isEventsUsed) {
224+
// If Async mode, use event for sync
225+
SUCCESS_OR_TERMINATE(zeEventHostSynchronize(hostEvents[0], std::numeric_limits<uint64_t>::max()));
226+
} else {
227+
SUCCESS_OR_TERMINATE(zeCommandListHostSynchronize(cmdList, std::numeric_limits<uint64_t>::max()));
228+
}
229+
end = std::chrono::high_resolution_clock::now();
230+
std::cout << "Time to synchronize : " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
216231
}
217232

218233
// Validate
@@ -245,6 +260,7 @@ int main(int argc, char *argv[]) {
245260

246261
verbose = isVerbose(argc, argv);
247262
bool aubMode = isAubMode(argc, argv);
263+
int useEventBasedSync = getParamValue(argc, argv, "-e", "--useEventsBasedSync", 1);
248264

249265
ze_context_handle_t context = nullptr;
250266
ze_driver_handle_t driverHandle = nullptr;
@@ -259,12 +275,12 @@ int main(int argc, char *argv[]) {
259275
if (outputValidationSuccessful || aubMode) {
260276
// Sync mode with Compute queue
261277
std::cout << "Test case: Sync mode compute queue with Kernel launch \n";
262-
executeGpuKernelAndValidate(context, device, true, outputValidationSuccessful);
278+
executeGpuKernelAndValidate(context, device, true, outputValidationSuccessful, useEventBasedSync);
263279
}
264280
if (outputValidationSuccessful || aubMode) {
265281
// Async mode with Compute queue
266282
std::cout << "\nTest case: Async mode compute queue with Kernel launch \n";
267-
executeGpuKernelAndValidate(context, device, false, outputValidationSuccessful);
283+
executeGpuKernelAndValidate(context, device, false, outputValidationSuccessful, useEventBasedSync);
268284
}
269285

270286
// Find copy queue in root device, if not found, try subdevices
@@ -321,12 +337,12 @@ int main(int argc, char *argv[]) {
321337
if (outputValidationSuccessful || aubMode) {
322338
// Sync mode with Copy queue
323339
std::cout << "\nTest case: Sync mode copy queue for memory copy\n";
324-
testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, true, copyQueueGroup, outputValidationSuccessful);
340+
testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, true, copyQueueGroup, outputValidationSuccessful, useEventBasedSync);
325341
}
326342
if (outputValidationSuccessful || aubMode) {
327343
// Async mode with Copy queue
328344
std::cout << "\nTest case: Async mode copy queue for memory copy\n";
329-
testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, false, copyQueueGroup, outputValidationSuccessful);
345+
testCopyBetweenHostMemAndDeviceMem(context, copyQueueDev, false, copyQueueGroup, outputValidationSuccessful, useEventBasedSync);
330346
}
331347
}
332348

level_zero/core/test/unit_tests/mocks/mock_cmdlist.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,8 @@ struct MockCommandList : public CommandList {
478478

479479
ADDMETHOD_NOBASE_VOIDRETURN(appendMultiPartitionPrologue, (uint32_t partitionDataSize));
480480
ADDMETHOD_NOBASE_VOIDRETURN(appendMultiPartitionEpilogue, (void));
481+
ADDMETHOD_NOBASE(hostSynchronize, ze_result_t, ZE_RESULT_SUCCESS,
482+
(uint64_t timeout));
481483

482484
uint8_t *batchBuffer = nullptr;
483485
NEO::GraphicsAllocation *mockAllocation = nullptr;

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2949,5 +2949,80 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenAlignePtrToFillWhenAppendMemoryFil
29492949
context->freeMem(dstBuffer);
29502950
}
29512951

2952+
using ImmediateCommandListHostSynchronize = Test<DeviceFixture>;
2953+
2954+
// Asynchronous immediate list with flush-task submission: hostSynchronize must
// wait on the CSR exactly once and report success.
HWTEST2_F(ImmediateCommandListHostSynchronize, givenFlushTaskEnabledAndNotSyncModeThenWaitForCompletionIsCalled, IsAtLeastSkl) {
    // given
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.copyThroughLockedPtrEnabled = true;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.isFlushTaskSubmissionEnabled = true;
    cmdList.isSyncModeQueue = false;
    auto *ultCsr = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr);

    // when
    const auto syncResult = cmdList.hostSynchronize(0);

    // then
    EXPECT_EQ(syncResult, ZE_RESULT_SUCCESS);
    const uint32_t waitCallCount = ultCsr->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitCallCount, 1u);
}
2967+
2968+
// Synchronous immediate list: hostSynchronize must return success without
// issuing any wait on the CSR.
HWTEST2_F(ImmediateCommandListHostSynchronize, givenSyncModeThenWaitForCompletionIsNotCalled, IsAtLeastSkl) {
    // given
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.copyThroughLockedPtrEnabled = true;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.isFlushTaskSubmissionEnabled = true;
    cmdList.isSyncModeQueue = true;
    auto *ultCsr = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr);

    // when
    const auto syncResult = cmdList.hostSynchronize(0);

    // then
    EXPECT_EQ(syncResult, ZE_RESULT_SUCCESS);
    const uint32_t waitCallCount = ultCsr->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitCallCount, 0u);
}
2981+
2982+
// Flush-task submission disabled: hostSynchronize must return success without
// issuing any wait on the CSR, even for an asynchronous list.
HWTEST2_F(ImmediateCommandListHostSynchronize, givenFlushTaskSubmissionIsDisabledThenWaitForCompletionIsNotCalled, IsAtLeastSkl) {
    // given
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.copyThroughLockedPtrEnabled = true;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.isFlushTaskSubmissionEnabled = false;
    cmdList.isSyncModeQueue = false;
    auto *ultCsr = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr);

    // when
    const auto syncResult = cmdList.hostSynchronize(0);

    // then
    EXPECT_EQ(syncResult, ZE_RESULT_SUCCESS);
    const uint32_t waitCallCount = ultCsr->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitCallCount, 0u);
}
2995+
2996+
// The CSR wait is stubbed to report a GPU hang: hostSynchronize must translate
// that into ZE_RESULT_ERROR_DEVICE_LOST after a single wait call.
HWTEST2_F(ImmediateCommandListHostSynchronize, givenGpuStatusIsHangThenDeviceLostIsReturned, IsAtLeastSkl) {
    // given
    MockCommandListImmediateHw<gfxCoreFamily> cmdList;
    cmdList.copyThroughLockedPtrEnabled = true;
    cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
    cmdList.isFlushTaskSubmissionEnabled = true;
    cmdList.isSyncModeQueue = false;

    // Bypass the base wait implementation and force the returned status.
    auto *ultCsr = reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr);
    ultCsr->callBaseWaitForCompletionWithTimeout = false;
    ultCsr->returnWaitForCompletionWithTimeout = WaitStatus::GpuHang;

    // when
    const auto syncResult = cmdList.hostSynchronize(0);

    // then
    EXPECT_EQ(syncResult, ZE_RESULT_ERROR_DEVICE_LOST);
    const uint32_t waitCallCount = ultCsr->waitForCompletionWithTimeoutTaskCountCalled;
    EXPECT_EQ(waitCallCount, 1u);
}
3011+
3012+
using CommandListHostSynchronize = Test<DeviceFixture>;
3013+
3014+
// A command list created through Context::createCommandList (not an immediate
// one) must reject hostSynchronize with ZE_RESULT_ERROR_INVALID_ARGUMENT.
HWTEST2_F(CommandListHostSynchronize, whenHostSychronizeIsCalledReturnInvalidArgument, IsAtLeastSkl) {
    // given
    ze_command_list_desc_t desc = {};
    ze_command_list_handle_t hCommandList = {};

    const ze_result_t createResult = context->createCommandList(device, &desc, &hCommandList);
    EXPECT_EQ(ZE_RESULT_SUCCESS, createResult);

    auto *listContext = Context::fromHandle(CommandList::fromHandle(hCommandList)->getCmdListContext());
    EXPECT_EQ(listContext, context);

    // when
    L0::CommandList *commandList = L0::CommandList::fromHandle(hCommandList);
    const ze_result_t syncResult = commandList->hostSynchronize(0);

    // then
    EXPECT_EQ(syncResult, ZE_RESULT_ERROR_INVALID_ARGUMENT);
    commandList->destroy();
}
3026+
29523027
} // namespace ult
29533028
} // namespace L0

0 commit comments

Comments
 (0)