Skip to content

[UR] [V2] Fix synchronization between command_list_manager usages #17297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
Expand Down
61 changes: 41 additions & 20 deletions unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(

ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() {
// It is not allowed to append to command list from multiple threads.
std::scoped_lock<ur_shared_mutex> guard(this->Mutex);
auto commandListLocked = commandListManager.lock();
UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION);
// Close the command lists and have them ready for dispatch.
ZE2UR_CALL(zeCommandListClose, (this->commandListManager.getZeCommandList()));
ZE2UR_CALL(zeCommandListClose, (commandListLocked->getZeCommandList()));
isFinalized = true;
return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -130,7 +130,8 @@ ur_result_t urCommandBufferAppendKernelLaunchExp(
std::ignore = numKernelAlternatives;
std::ignore = kernelAlternatives;
std::ignore = command;
UR_CALL(commandBuffer->commandListManager.appendKernelLaunch(
auto commandListLocked = commandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendKernelLaunch(
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0,
nullptr, nullptr));
return UR_RESULT_SUCCESS;
Expand All @@ -157,8 +158,9 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp(

std::ignore = phCommand;
// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendUSMMemcpy(
false, pDst, pSrc, size, 0, nullptr, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendUSMMemcpy(false, pDst, pSrc, size, 0,
nullptr, nullptr));

return UR_RESULT_SUCCESS;
} catch (...) {
Expand All @@ -185,7 +187,8 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp(

std::ignore = phCommand;
// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferCopy(
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferCopy(
hSrcMem, hDstMem, srcOffset, dstOffset, size, 0, nullptr, nullptr));

return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -213,8 +216,9 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp(

std::ignore = phCommand;
// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferWrite(
hBuffer, false, offset, size, pSrc, 0, nullptr, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferWrite(hBuffer, false, offset, size,
pSrc, 0, nullptr, nullptr));

return UR_RESULT_SUCCESS;
} catch (...) {
Expand All @@ -241,8 +245,9 @@ ur_result_t urCommandBufferAppendMemBufferReadExp(
std::ignore = phCommand;

// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferRead(
hBuffer, false, offset, size, pDst, 0, nullptr, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferRead(hBuffer, false, offset, size,
pDst, 0, nullptr, nullptr));

return UR_RESULT_SUCCESS;
} catch (...) {
Expand Down Expand Up @@ -271,7 +276,8 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp(

std::ignore = phCommand;
// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferCopyRect(
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferCopyRect(
hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch,
srcSlicePitch, dstRowPitch, dstSlicePitch, 0, nullptr, nullptr));

Expand Down Expand Up @@ -303,7 +309,8 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp(
std::ignore = phCommand;

// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferWriteRect(
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferWriteRect(
hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch,
bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, 0, nullptr,
nullptr));
Expand Down Expand Up @@ -336,7 +343,8 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp(
std::ignore = phCommand;

// Responsibility of UMD to offload to copy engine
UR_CALL(hCommandBuffer->commandListManager.appendMemBufferReadRect(
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferReadRect(
hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch,
bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, 0, nullptr,
nullptr));
Expand Down Expand Up @@ -366,8 +374,9 @@ ur_result_t urCommandBufferAppendUSMFillExp(

std::ignore = phCommand;

UR_CALL(hCommandBuffer->commandListManager.appendUSMFill(
pMemory, patternSize, pPattern, size, 0, nullptr, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendUSMFill(pMemory, patternSize, pPattern, size,
0, nullptr, nullptr));
return UR_RESULT_SUCCESS;
} catch (...) {
return exceptionToResult(std::current_exception());
Expand All @@ -393,7 +402,8 @@ ur_result_t urCommandBufferAppendMemBufferFillExp(

std::ignore = phCommand;

UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill(
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendMemBufferFill(
hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr));
return UR_RESULT_SUCCESS;
} catch (...) {
Expand All @@ -420,8 +430,9 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp(

std::ignore = phCommand;

UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch(
pMemory, size, flags, 0, nullptr, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendUSMPrefetch(pMemory, size, flags, 0, nullptr,
nullptr));

return UR_RESULT_SUCCESS;
} catch (...) {
Expand All @@ -447,8 +458,8 @@ ur_result_t urCommandBufferAppendUSMAdviseExp(

std::ignore = phCommand;

UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size,
advice, nullptr));
auto commandListLocked = hCommandBuffer->commandListManager.lock();
UR_CALL(commandListLocked->appendUSMAdvise(pMemory, size, advice, nullptr));

return UR_RESULT_SUCCESS;
} catch (...) {
Expand Down Expand Up @@ -483,4 +494,14 @@ urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer,
return exceptionToResult(std::current_exception());
}

ur_result_t urCommandBufferEnqueueExp(
ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue,
uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
ur_event_handle_t *Event) try {
return UrQueue->get().enqueueCommandBufferExp(
CommandBuffer, NumEventsInWaitList, EventWaitList, Event);
} catch (...) {
return exceptionToResult(std::current_exception());
}

Comment on lines +497 to +506
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this function reintroduced? I think it might be a rebase error

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whoops, let me know if i should revert this

Copy link
Contributor

@EwanC EwanC Mar 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened a PR to fix this #17476 (not worth a revert, it's dead code but not breaking anything)

} // namespace ur::level_zero
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common.hpp"
#include "context.hpp"
#include "kernel.hpp"
#include "lockable.hpp"
#include "queue_api.hpp"
#include <ze_api.h>

Expand All @@ -24,7 +25,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {

~ur_exp_command_buffer_handle_t_() = default;

ur_command_list_manager commandListManager;
lockable<ur_command_list_manager> commandListManager;

ur_result_t finalizeCommandBuffer();
// Indicates if command-buffer commands can be updated after it is closed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch(

ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> Lock(this->Mutex,
hKernel->Mutex);
std::scoped_lock<ur_shared_mutex> Lock(hKernel->Mutex);

ze_group_count_t zeThreadGroupDimensions{1, 1, 1};
uint32_t WG[3]{};
Expand Down Expand Up @@ -235,8 +234,6 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy(
ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy");

std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY);

auto [pWaitEvents, numWaitEvents] =
Expand All @@ -262,8 +259,7 @@ ur_result_t ur_command_list_manager::appendMemBufferFill(
auto hBuffer = hMem->getBuffer();
UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
hBuffer->getMutex());
std::scoped_lock<ur_shared_mutex> lock(hBuffer->getMutex());

return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size,
numEventsInWaitList, phEventWaitList,
Expand All @@ -276,8 +272,6 @@ ur_result_t ur_command_list_manager::appendUSMFill(
ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill");

std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

ur_usm_handle_t dstHandle(context, size, pMem);
return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size,
numEventsInWaitList, phEventWaitList,
Expand All @@ -292,8 +286,6 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch(

std::ignore = flags;

std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH);

auto [pWaitEvents, numWaitEvents] =
Expand All @@ -320,8 +312,6 @@ ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size,
ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise");

std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

auto zeAdvice = ur_cast<ze_memory_advice_t>(advice);

auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE);
Expand Down Expand Up @@ -354,8 +344,7 @@ ur_result_t ur_command_list_manager::appendMemBufferRead(

ur_usm_handle_t dstHandle(context, size, pDst);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
hBuffer->getMutex());
std::scoped_lock<ur_shared_mutex> lock(hBuffer->getMutex());

return appendGenericCopyUnlocked(hBuffer, &dstHandle, blockingRead, offset, 0,
size, numEventsInWaitList, phEventWaitList,
Expand All @@ -373,8 +362,7 @@ ur_result_t ur_command_list_manager::appendMemBufferWrite(

ur_usm_handle_t srcHandle(context, size, pSrc);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
hBuffer->getMutex());
std::scoped_lock<ur_shared_mutex> lock(hBuffer->getMutex());

return appendGenericCopyUnlocked(
&srcHandle, hBuffer, blockingWrite, 0, offset, size, numEventsInWaitList,
Expand All @@ -395,8 +383,8 @@ ur_result_t ur_command_list_manager::appendMemBufferCopy(
UR_ASSERT(dstOffset + size <= hBufferDst->getSize(),
UR_RESULT_ERROR_INVALID_SIZE);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> lock(
this->Mutex, hBufferSrc->getMutex(), hBufferDst->getMutex());
std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(
hBufferSrc->getMutex(), hBufferDst->getMutex());

return appendGenericCopyUnlocked(hBufferSrc, hBufferDst, false, srcOffset,
dstOffset, size, numEventsInWaitList,
Expand All @@ -415,8 +403,7 @@ ur_result_t ur_command_list_manager::appendMemBufferReadRect(
auto hBuffer = hMem->getBuffer();
ur_usm_handle_t dstHandle(context, 0, pDst);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
hBuffer->getMutex());
std::scoped_lock<ur_shared_mutex> lock(hBuffer->getMutex());

return appendRegionCopyUnlocked(
hBuffer, &dstHandle, blockingRead, bufferOrigin, hostOrigin, region,
Expand All @@ -436,8 +423,7 @@ ur_result_t ur_command_list_manager::appendMemBufferWriteRect(
auto hBuffer = hMem->getBuffer();
ur_usm_handle_t srcHandle(context, 0, pSrc);

std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
hBuffer->getMutex());
std::scoped_lock<ur_shared_mutex> lock(hBuffer->getMutex());

return appendRegionCopyUnlocked(
&srcHandle, hBuffer, blockingWrite, hostOrigin, bufferOrigin, region,
Expand All @@ -457,8 +443,8 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect(
auto hBufferSrc = hSrc->getBuffer();
auto hBufferDst = hDst->getBuffer();

std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> lock(
this->Mutex, hBufferSrc->getMutex(), hBufferDst->getMutex());
std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(
hBufferSrc->getMutex(), hBufferDst->getMutex());

return appendRegionCopyUnlocked(
hBufferSrc, hBufferDst, false, srcOrigin, dstOrigin, region, srcRowPitch,
Expand All @@ -475,8 +461,6 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy2D(
ur_rect_offset_t zeroOffset{0, 0, 0};
ur_rect_region_t region{width, height, 0};

std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

ur_usm_handle_t srcHandle(context, 0, pSrc);
ur_usm_handle_t dstHandle(context, 0, pDst);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@ struct wait_list_view {
}
};

struct ur_command_list_manager : public _ur_object {
struct ur_command_list_manager {

ur_command_list_manager(ur_context_handle_t context,
ur_device_handle_t device,
v2::raii::command_list_unique_handle &&commandList,
v2::event_flags_t flags = v2::EVENT_FLAGS_COUNTER,
ur_queue_t_ *queue = nullptr);
ur_command_list_manager(ur_command_list_manager &&src) = default;
~ur_command_list_manager();

ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim,
Expand Down
58 changes: 58 additions & 0 deletions unified-runtime/source/adapters/level_zero/v2/lockable.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//===--------- memory.hpp - Level Zero Adapter ---------------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once
#include <mutex>

template <typename T> struct locked {
public:
locked(T *object, std::unique_lock<std::mutex> &&lock)
: lock_(std::move(lock)) {
object_ = object;
}
T *operator->() { return object_; }

private:
std::unique_lock<std::mutex> lock_;
T *object_;
};

/*
lockable<T> wraps T class object in exclusive access lock, similar to one used
in rust

construction:
lockable<X> l(arguments, to, construct, X);

access without synchronization:
X* obj_ptr = l.get_no_lock();
obj_ptr->print_name();

exclusive access to object kept in l:
// as long as lock exists, thread has exclusive access to underlaying object
locked<X> lock = l.lock();
// that object is accessed through ->() operator on lock object
lock->print_name();
*/

template <typename T> struct lockable {
public:
template <typename... Args>
lockable(Args &&...args) : object_(std::forward<Args>(args)...) {}
locked<T> lock() {
std::unique_lock lock{mut_};
return locked<T>(&object_, std::move(lock));
}
T *get_no_lock() { return &object_; }

private:
T object_;
std::mutex mut_;
};
Loading
Loading