Skip to content

Commit 9f755b5

Browse files
performance: Bidirectional BCS split implementation for OCL
Related-To: NEO-7877
Signed-off-by: Lukasz Jobczyk <[email protected]>
1 parent 7f24a4b commit 9f755b5

19 files changed: 287 additions and 12 deletions.

level_zero/core/source/device/bcs_split.h

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -56,11 +56,9 @@ struct BcsSplit {
5656
std::vector<CommandQueue *> h2dCmdQs;
5757
std::vector<CommandQueue *> d2hCmdQs;
5858

59-
inline static constexpr size_t h2dEngineMask = 0b000001010;
60-
inline static constexpr size_t d2hEngineMask = 0b010100000;
6159
NEO::BcsInfoMask engines = NEO::EngineHelpers::oddLinkedCopyEnginesMask;
62-
NEO::BcsInfoMask h2dEngines = h2dEngineMask;
63-
NEO::BcsInfoMask d2hEngines = d2hEngineMask;
60+
NEO::BcsInfoMask h2dEngines = NEO::EngineHelpers::h2dCopyEngineMask;
61+
NEO::BcsInfoMask d2hEngines = NEO::EngineHelpers::d2hCopyEngineMask;
6462

6563
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
6664
ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,

opencl/source/built_ins/builtins_dispatch_builder.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
#pragma once
99
#include "shared/source/built_ins/built_in_ops_base.h"
10+
#include "shared/source/command_stream/transfer_direction.h"
1011
#include "shared/source/helpers/vec.h"
1112

1213
#include "opencl/source/kernel/multi_device_kernel.h"
@@ -50,6 +51,7 @@ struct BuiltinOpParams {
5051
uint32_t dstMipLevel = 0;
5152
void *userPtrForPostOperationCpuCopy = nullptr;
5253
bool bcsSplit = false;
54+
TransferDirection direction = TransferDirection::LocalToLocal;
5355
};
5456

5557
class BuiltinDispatchInfoBuilder {

opencl/source/command_queue/command_queue.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -371,6 +371,13 @@ void CommandQueue::constructBcsEnginesForSplit() {
371371
}
372372
}
373373

374+
if (DebugManager.flags.SplitBcsMaskD2H.get() > 0) {
375+
this->d2hEngines = DebugManager.flags.SplitBcsMaskD2H.get();
376+
}
377+
if (DebugManager.flags.SplitBcsMaskH2D.get() > 0) {
378+
this->h2dEngines = DebugManager.flags.SplitBcsMaskH2D.get();
379+
}
380+
374381
this->bcsSplitInitialized = true;
375382
}
376383

opencl/source/command_queue/command_queue.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -437,6 +437,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
437437

438438
bool bcsSplitInitialized = false;
439439
BcsInfoMask splitEngines = EngineHelpers::oddLinkedCopyEnginesMask;
440+
BcsInfoMask h2dEngines = NEO::EngineHelpers::h2dCopyEngineMask;
441+
BcsInfoMask d2hEngines = NEO::EngineHelpers::d2hCopyEngineMask;
440442

441443
LinearStream *commandStream = nullptr;
442444

opencl/source/command_queue/enqueue_common.h

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1245,11 +1245,18 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
12451245
auto ret = CL_SUCCESS;
12461246
this->releaseMainCopyEngine();
12471247

1248-
StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 4u> locks;
1249-
StackVec<CommandStreamReceiver *, 4u> copyEngines;
1248+
StackVec<std::unique_lock<CommandStreamReceiver::MutexType>, 2u> locks;
1249+
StackVec<CommandStreamReceiver *, 2u> copyEngines;
1250+
1251+
auto splitEngines = this->splitEngines;
1252+
if (dispatchInfo.peekBuiltinOpParams().direction == NEO::TransferDirection::HostToLocal) {
1253+
splitEngines = this->h2dEngines;
1254+
} else if (dispatchInfo.peekBuiltinOpParams().direction == NEO::TransferDirection::LocalToHost) {
1255+
splitEngines = this->d2hEngines;
1256+
}
12501257

12511258
for (uint32_t i = 0; i < bcsInfoMaskSize; i++) {
1252-
if (this->splitEngines.test(i)) {
1259+
if (splitEngines.test(i)) {
12531260
auto engineType = EngineHelpers::mapBcsIndexToEngineType(i, true);
12541261
auto bcs = getBcsCommandStreamReceiver(engineType);
12551262
if (bcs) {

opencl/source/command_queue/enqueue_copy_buffer.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBuffer(
4040
dc.dstOffset = {dstOffset, 0, 0};
4141
dc.size = {size, 0, 0};
4242
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
43+
dc.direction = csrSelectionArgs.direction;
4344

4445
MultiDispatchInfo dispatchInfo(dc);
4546

opencl/source/command_queue/enqueue_copy_buffer_rect.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferRect(
5353
dc.dstRowPitch = dstRowPitch;
5454
dc.dstSlicePitch = dstSlicePitch;
5555
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, getTotalSizeFromRectRegion(region), csr);
56+
dc.direction = csrSelectionArgs.direction;
5657

5758
MultiDispatchInfo dispatchInfo(dc);
5859
return dispatchBcsOrGpgpuEnqueue<CL_COMMAND_COPY_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, false, csr);

opencl/source/command_queue/enqueue_copy_image.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2022 Intel Corporation
2+
* Copyright (C) 2018-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -53,6 +53,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueCopyImage(
5353
dc.dstMipLevel = findMipLevel(dstImage->getImageDesc().image_type, dstOrigin);
5454
}
5555
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, getTotalSizeFromRectRegion(region), csr);
56+
dc.direction = csrSelectionArgs.direction;
5657

5758
MultiDispatchInfo dispatchInfo(dc);
5859

opencl/source/command_queue/enqueue_read_buffer.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2022 Intel Corporation
2+
* Copyright (C) 2018-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -110,6 +110,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBuffer(
110110
dc.size = {size, 0, 0};
111111
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
112112
dc.bcsSplit = bcsSplit;
113+
dc.direction = csrSelectionArgs.direction;
113114

114115
MultiDispatchInfo dispatchInfo(dc);
115116

opencl/source/command_queue/enqueue_read_buffer_rect.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2022 Intel Corporation
2+
* Copyright (C) 2018-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -100,6 +100,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferRect(
100100
dc.dstRowPitch = hostRowPitch;
101101
dc.dstSlicePitch = hostSlicePitch;
102102
dc.bcsSplit = bcsSplit;
103+
dc.direction = csrSelectionArgs.direction;
103104

104105
MultiDispatchInfo dispatchInfo(dc);
105106
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingRead, csr);

opencl/source/command_queue/enqueue_read_image.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadImage(
114114
dc.userPtrForPostOperationCpuCopy = ptr;
115115
}
116116
dc.bcsSplit = bcsSplit;
117+
dc.direction = csrSelectionArgs.direction;
117118

118119
auto eBuiltInOps = EBuiltInOps::CopyImage3dToBuffer;
119120
MultiDispatchInfo dispatchInfo(dc);

opencl/source/command_queue/enqueue_svm.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMap(cl_bool blockingMap,
125125
dc.size = {size, 0, 0};
126126
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
127127
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
128+
dc.direction = csrSelectionArgs.direction;
128129

129130
MultiDispatchInfo dispatchInfo(dc);
130131
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, blocking, csr);
@@ -212,6 +213,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMUnmap(void *svmPtr,
212213
dc.size = {svmOperation->regionSize, 0, 0};
213214
dc.unifiedMemoryArgsRequireMemSync = externalAppCall;
214215
dc.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, svmOperation->regionSize, csr);
216+
dc.direction = csrSelectionArgs.direction;
215217

216218
MultiDispatchInfo dispatchInfo(dc);
217219
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, EBuiltInOps::CopyBufferToBuffer, numEventsInWaitList, eventWaitList, event, false, csr);
@@ -385,6 +387,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
385387
surfaces[1] = &dstHostPtrSurf;
386388

387389
operationParams.bcsSplit = bcsSplit;
390+
operationParams.direction = csrSelectionArgs.direction;
388391
dispatchInfo.setBuiltinOpParams(operationParams);
389392
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_READ_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
390393
} else if (copyType == HostToSvm) {
@@ -409,6 +412,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
409412
surfaces[1] = &srcHostPtrSurf;
410413

411414
operationParams.bcsSplit = bcsSplit;
415+
operationParams.direction = csrSelectionArgs.direction;
412416
dispatchInfo.setBuiltinOpParams(operationParams);
413417
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
414418
} else if (copyType == SvmToSvm) {
@@ -422,6 +426,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
422426
surfaces[1] = &dstSvmSurf;
423427

424428
operationParams.bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, size, csr);
429+
operationParams.direction = csrSelectionArgs.direction;
425430
dispatchInfo.setBuiltinOpParams(operationParams);
426431
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_SVM_MEMCPY>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
427432
} else {
@@ -449,6 +454,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueSVMMemcpy(cl_bool blockingCopy,
449454
surfaces[1] = &dstHostPtrSurf;
450455

451456
operationParams.bcsSplit = bcsSplit;
457+
operationParams.direction = csrSelectionArgs.direction;
452458
dispatchInfo.setBuiltinOpParams(operationParams);
453459
dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, builtInType, numEventsInWaitList, eventWaitList, event, blockingCopy, csr);
454460
}

opencl/source/command_queue/enqueue_write_buffer.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2022 Intel Corporation
2+
* Copyright (C) 2018-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -99,6 +99,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBuffer(
9999
dc.size = {size, 0, 0};
100100
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
101101
dc.bcsSplit = bcsSplit;
102+
dc.direction = csrSelectionArgs.direction;
102103

103104
MultiDispatchInfo dispatchInfo(dc);
104105
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);

opencl/source/command_queue/enqueue_write_buffer_rect.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2022 Intel Corporation
2+
* Copyright (C) 2018-2023 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -105,6 +105,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteBufferRect(
105105
dc.dstRowPitch = bufferRowPitch;
106106
dc.dstSlicePitch = bufferSlicePitch;
107107
dc.bcsSplit = bcsSplit;
108+
dc.direction = csrSelectionArgs.direction;
108109

109110
MultiDispatchInfo dispatchInfo(dc);
110111
const auto dispatchResult = dispatchBcsOrGpgpuEnqueue<CL_COMMAND_WRITE_BUFFER_RECT>(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite, csr);

opencl/source/command_queue/enqueue_write_image.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueWriteImage(
9898
}
9999
dc.transferAllocation = mapAllocation ? mapAllocation : hostPtrSurf.getAllocation();
100100
dc.bcsSplit = bcsSplit;
101+
dc.direction = csrSelectionArgs.direction;
101102

102103
auto eBuiltInOps = EBuiltInOps::CopyBufferToImage3d;
103104
MultiDispatchInfo dispatchInfo(dc);

0 commit comments

Comments
 (0)