Skip to content

Commit b113223

Browse files
performance: enable staging write for cl buffers
Related-To: NEO-13529. Also add a size threshold for iGPUs on Linux, and disable staging when an imported host pointer could be reused. Signed-off-by: Szymon Morek <[email protected]>
1 parent 35d8e82 commit b113223

File tree

14 files changed

+174
-22
lines changed

14 files changed

+174
-22
lines changed

opencl/source/command_queue/command_queue.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "shared/source/os_interface/os_context.h"
3030
#include "shared/source/os_interface/product_helper.h"
3131
#include "shared/source/utilities/api_intercept.h"
32+
#include "shared/source/utilities/staging_buffer_manager.h"
3233
#include "shared/source/utilities/tag_allocator.h"
3334

3435
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
@@ -548,7 +549,9 @@ WaitStatus CommandQueue::waitUntilComplete(TaskCountType gpgpuTaskCountToWait, R
548549
: getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
549550

550551
WAIT_LEAVE()
551-
552+
if (this->context->getStagingBufferManager()) {
553+
this->context->getStagingBufferManager()->resetDetectedPtrs();
554+
}
552555
return waitStatus;
553556
}
554557

opencl/source/command_queue/command_queue_staging.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@ bool CommandQueue::isValidForStagingTransfer(MemObj *memObj, const void *ptr, bo
173173
switch (memObj->peekClMemObjType()) {
174174
case CL_MEM_OBJECT_IMAGE1D:
175175
case CL_MEM_OBJECT_IMAGE2D:
176-
return stagingBufferManager->isValidForStagingTransfer(this->getDevice(), ptr, hasDependencies);
176+
case CL_MEM_OBJECT_BUFFER:
177+
return stagingBufferManager->isValidForStagingTransfer(this->getDevice(), ptr, memObj->getSize(), hasDependencies);
177178
default:
178179
return false;
179180
}

opencl/source/event/event.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018-2024 Intel Corporation
2+
* Copyright (C) 2018-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -21,6 +21,7 @@
2121
#include "shared/source/memory_manager/internal_allocation_storage.h"
2222
#include "shared/source/utilities/perf_counter.h"
2323
#include "shared/source/utilities/range.h"
24+
#include "shared/source/utilities/staging_buffer_manager.h"
2425
#include "shared/source/utilities/tag_allocator.h"
2526

2627
#include "opencl/extensions/public/cl_ext_private.h"
@@ -553,6 +554,9 @@ void Event::updateExecutionStatus() {
553554
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
554555
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
555556
allocationStorage->cleanAllocationList(this->taskCount, DEFERRED_DEALLOCATION);
557+
if (cmdQueue->getContext().getStagingBufferManager()) {
558+
cmdQueue->getContext().getStagingBufferManager()->resetDetectedPtrs();
559+
}
556560
return;
557561
}
558562

opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,45 @@ HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferCalledThenRe
660660
EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled);
661661
}
662662

663+
// Checks the host-pointer tracking used for staging transfers: a pointer is accepted
// on the first query, rejected when queried again, and accepted anew once the
// event's execution status has been updated.
HWTEST_F(WriteBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilEventCompleted) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);

    // Start from a known value so a failed enqueue never leaves an indeterminate handle behind.
    cl_event event = nullptr;
    auto retVal = mockCommandQueueHw.enqueueWriteBuffer(&buffer,
                                                        CL_FALSE,
                                                        0,
                                                        MemoryConstants::cacheLineSize,
                                                        ptr,
                                                        nullptr,
                                                        0,
                                                        nullptr,
                                                        &event);
    // Fatal check: everything below relies on the event produced by this call.
    ASSERT_EQ(CL_SUCCESS, retVal);
    auto pEvent = castToObjectOrAbort<Event>(event);

    // First query registers the pointer, the second one finds it already registered.
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));
    EXPECT_FALSE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));

    // After the status update the same pointer is expected to qualify for staging again.
    pEvent->updateExecutionStatus();
    EXPECT_TRUE(mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false));

    pEvent->release();
}
689+
690+
// Same pointer queried twice: accepted first, rejected second. After finish() on the
// queue the pointer is expected to be accepted for staging again.
HWTEST_F(WriteBufferStagingBufferTest, whenHostPtrRegisteredThenDontUseStagingUntilFinishCalled) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableCopyWithStagingBuffers.set(1);
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);

    const bool acceptedOnFirstQuery = mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false);
    EXPECT_TRUE(acceptedOnFirstQuery);
    const bool acceptedOnRepeatedQuery = mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false);
    EXPECT_FALSE(acceptedOnRepeatedQuery);

    mockCommandQueueHw.finish();

    const bool acceptedAfterFinish = mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false);
    EXPECT_TRUE(acceptedAfterFinish);
}
701+
663702
HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferCalledWithLargeSizeThenSplitTransfer) {
664703
auto hostPtr = new unsigned char[chunkSize * 4];
665704
MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
@@ -730,4 +769,12 @@ HWTEST_F(WriteBufferStagingBufferTest, whenEnqueueStagingWriteBufferFailedThenPr
730769

731770
EXPECT_EQ(res, CL_INVALID_OPERATION);
732771
EXPECT_EQ(1ul, mockCommandQueueHw.enqueueWriteBufferCounter);
772+
}
773+
774+
// With no debug override set in this test, the queue's answer for a buffer and a plain
// local array must equal what the product helper reports for staging-buffer support.
HWTEST_F(WriteBufferStagingBufferTest, whenIsValidForStagingTransferCalledThenReturnCorrectValue) {
    MockCommandQueueHw<FamilyType> mockCommandQueueHw(context.get(), device.get(), &props);
    const auto &productHelper = device->getProductHelper();
    auto isStagingBuffersEnabled = productHelper.isStagingBuffersEnabled();
    unsigned char ptr[16];

    const auto validForStaging = mockCommandQueueHw.isValidForStagingTransfer(&buffer, ptr, false);
    EXPECT_EQ(isStagingBuffersEnabled, validForStaging);
}

opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,7 @@ HWTEST_F(EnqueueWriteImageTest, whenisValidForStagingTransferCalledThenReturnCor
811811

812812
std::unique_ptr<Image> image(Image1dHelper<>::create(context));
813813
EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransfer(image.get(), ptr, false));
814+
pCmdQ->finish();
814815

815816
image.reset(Image2dHelper<>::create(context));
816817
EXPECT_EQ(isStagingBuffersEnabled, pCmdQ->isValidForStagingTransfer(image.get(), ptr, false));

opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,7 +1176,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe
11761176
DebugManagerStateRestore restorer;
11771177
debugManager.flags.UpdateTaskCountFromWait.set(3);
11781178

1179-
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
1179+
MockContext context(pClDevice);
1180+
CommandQueueHw<FamilyType> commandQueue(&context, pClDevice, 0, false);
11801181
commandQueue.taskCount = 10;
11811182

11821183
auto mockCsr = new MockCsrHw2<FamilyType>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
@@ -1220,7 +1221,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdate
12201221
}
12211222
};
12221223

1223-
CommandQueueHw<FamilyType> commandQueue(nullptr, pClDevice, 0, false);
1224+
MockContext context(pClDevice);
1225+
CommandQueueHw<FamilyType> commandQueue(&context, pClDevice, 0, false);
12241226
commandQueue.taskCount = 10;
12251227

12261228
auto mockCsr = new MockCsrHwDirectSubmission(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());

shared/source/os_interface/linux/os_interface_linux.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ bool initDrmOsInterface(std::unique_ptr<HwDeviceId> &&hwDeviceId, uint32_t rootD
5858
return true;
5959
}
6060

61+
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
62+
if (isIGPU) {
63+
return size < 512 * MemoryConstants::megaByte;
64+
}
65+
return true;
66+
}
67+
6168
uint32_t OSInterface::getAggregatedProcessCount() const {
6269
if (driverModel && driverModel->getDriverModelType() == DriverModelType::drm) {
6370
return driverModel->as<Drm>()->getAggregatedProcessCount();

shared/source/os_interface/os_interface.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class OSInterface : public NonCopyableClass {
117117

118118
MOCKABLE_VIRTUAL bool isDebugAttachAvailable() const;
119119
MOCKABLE_VIRTUAL bool isLockablePointer(bool isLockable) const;
120+
MOCKABLE_VIRTUAL bool isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const;
120121
MOCKABLE_VIRTUAL uint32_t getAggregatedProcessCount() const;
121122

122123
static bool osEnabled64kbPages;

shared/source/os_interface/windows/os_interface_win.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ bool OSInterface::isLockablePointer(bool isLockable) const {
2626
return isLockable;
2727
}
2828

29+
// Windows variant of the staging size check: neither the size nor the device kind is
// inspected, so every transfer is reported as within the threshold.
bool OSInterface::isSizeWithinThresholdForStaging(size_t size, bool isIGPU) const {
    constexpr bool withinThreshold = true;
    return withinThreshold;
}
32+
2933
uint32_t OSInterface::getAggregatedProcessCount() const {
3034
return 0;
3135
}

shared/source/utilities/staging_buffer_manager.cpp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,13 @@
1010
#include "shared/source/command_stream/command_stream_receiver.h"
1111
#include "shared/source/debug_settings/debug_settings_manager.h"
1212
#include "shared/source/device/device.h"
13+
#include "shared/source/execution_environment/root_device_environment.h"
1314
#include "shared/source/helpers/aligned_memory.h"
15+
#include "shared/source/helpers/hw_info.h"
1416
#include "shared/source/memory_manager/unified_memory_manager.h"
17+
#include "shared/source/os_interface/os_interface.h"
1518
#include "shared/source/utilities/heap_allocator.h"
19+
1620
namespace NEO {
1721

1822
StagingBuffer::StagingBuffer(void *baseAddress, size_t size) : baseAddress(baseAddress) {
@@ -285,28 +289,33 @@ void *StagingBufferManager::allocateStagingBuffer(size_t size) {
285289
return hostPtr;
286290
}
287291

288-
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const {
289-
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
290-
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
291-
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
292-
}
292+
bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) {
293293
auto usmDstData = svmAllocsManager->getSVMAlloc(dstPtr);
294294
auto usmSrcData = svmAllocsManager->getSVMAlloc(srcPtr);
295295
bool hostToUsmCopy = usmSrcData == nullptr && usmDstData != nullptr;
296296
bool isUsedByOsContext = false;
297297
if (usmDstData) {
298298
isUsedByOsContext = usmDstData->gpuAllocations.getGraphicsAllocation(device.getRootDeviceIndex())->isUsedByOsContext(osContextId);
299299
}
300-
return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize);
300+
return this->isValidForStaging(device, srcPtr, size, hasDependencies) && hostToUsmCopy && (isUsedByOsContext || size <= chunkSize);
301+
}
302+
303+
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
304+
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
305+
return this->isValidForStaging(device, ptr, size, hasDependencies) && nonUsmPtr;
301306
}
302307

303-
bool StagingBufferManager::isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const {
308+
// Common checks for usm, buffers and images
309+
bool StagingBufferManager::isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies) {
304310
auto stagingCopyEnabled = device.getProductHelper().isStagingBuffersEnabled();
305311
if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) {
306312
stagingCopyEnabled = debugManager.flags.EnableCopyWithStagingBuffers.get();
307313
}
308-
auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr;
309-
return stagingCopyEnabled && !hasDependencies && nonUsmPtr;
314+
auto isIntegrated = device.getRootDeviceEnvironment().getHardwareInfo()->capabilityTable.isIntegratedDevice;
315+
auto osInterface = device.getRootDeviceEnvironment().osInterface.get();
316+
bool sizeWithinThreshold = osInterface ? osInterface->isSizeWithinThresholdForStaging(size, isIntegrated) : true;
317+
auto detectedHostPtr = this->registerHostPtr(ptr);
318+
return stagingCopyEnabled && !hasDependencies && !detectedHostPtr && sizeWithinThreshold;
310319
}
311320

312321
void StagingBufferManager::clearTrackedChunks() {
@@ -325,4 +334,16 @@ void StagingBufferManager::trackChunk(const StagingBufferTracker &tracker) {
325334
trackers.push_back(tracker);
326335
}
327336

337+
bool StagingBufferManager::registerHostPtr(const void *ptr) {
338+
auto lock = std::lock_guard<std::mutex>(mtx);
339+
auto isHostPtrDetected = detectedHostPtrs.find(ptr) != detectedHostPtrs.end();
340+
detectedHostPtrs.insert(ptr);
341+
return isHostPtrDetected;
342+
}
343+
344+
void StagingBufferManager::resetDetectedPtrs() {
345+
auto lock = std::lock_guard<std::mutex>(mtx);
346+
detectedHostPtrs.clear();
347+
}
348+
328349
} // namespace NEO

shared/source/utilities/staging_buffer_manager.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <memory>
1717
#include <mutex>
1818
#include <queue>
19+
#include <set>
1920

2021
namespace NEO {
2122
class SVMAllocsManager;
@@ -78,8 +79,8 @@ class StagingBufferManager {
7879
StagingBufferManager &operator=(StagingBufferManager &&other) noexcept = delete;
7980
StagingBufferManager &operator=(const StagingBufferManager &other) = delete;
8081

81-
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const;
82-
bool isValidForStagingTransfer(const Device &device, const void *ptr, bool hasDependencies) const;
82+
bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId);
83+
bool isValidForStagingTransfer(const Device &device, const void *ptr, size_t size, bool hasDependencies);
8384

8485
StagingTransferStatus performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr);
8586
StagingTransferStatus performImageTransfer(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkTransferImageFunc &chunkTransferImageFunc, CommandStreamReceiver *csr, bool isRead);
@@ -88,6 +89,9 @@ class StagingBufferManager {
8889
std::pair<HeapAllocator *, uint64_t> requestStagingBuffer(size_t &size);
8990
void trackChunk(const StagingBufferTracker &tracker);
9091

92+
bool registerHostPtr(const void *ptr);
93+
void resetDetectedPtrs();
94+
9195
private:
9296
std::pair<HeapAllocator *, uint64_t> getExistingBuffer(size_t &size);
9397
void *allocateStagingBuffer(size_t size);
@@ -99,6 +103,8 @@ class StagingBufferManager {
99103
WaitStatus fetchHead(StagingQueue &stagingQueue, StagingBufferTracker &tracker) const;
100104
WaitStatus drainAndReleaseStagingQueue(StagingQueue &stagingQueue) const;
101105

106+
bool isValidForStaging(const Device &device, const void *ptr, size_t size, bool hasDependencies);
107+
102108
size_t chunkSize = MemoryConstants::pageSize2M;
103109
std::mutex mtx;
104110
std::vector<StagingBuffer> stagingBuffers;
@@ -108,6 +114,8 @@ class StagingBufferManager {
108114
const RootDeviceIndicesContainer rootDeviceIndices;
109115
const std::map<uint32_t, DeviceBitfield> deviceBitfields;
110116
const bool requiresWritable = false;
117+
118+
std::set<const void *> detectedHostPtrs;
111119
};
112120

113121
} // namespace NEO

shared/test/unit_test/os_interface/linux/os_interface_linux_tests.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,15 @@ TEST(OsInterfaceTest, whenOsInterfaceSetupGmmInputArgsThenArgsAreSet) {
9494
EXPECT_EQ(GMM_CLIENT::GMM_OCL_VISTA, passedInputArgs.ClientType);
9595
}
9696

97+
// On Linux a one-gigabyte transfer is within the staging threshold for a non-integrated
// device but exceeds it for an integrated one.
TEST(OsInterfaceTest, GivenLinuxOsInterfaceWhenGetThresholdForStagingCalledThenReturnThresholdForIntegratedDevices) {
    OSInterface osInterface;

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto &rootDeviceEnvironment = *executionEnvironment->rootDeviceEnvironments[0];

    // Ownership of the mock goes to the OS interface.
    std::unique_ptr<DriverModel> driverModel(new DrmMock(rootDeviceEnvironment));
    osInterface.setDriverModel(std::move(driverModel));

    const size_t transferSize = MemoryConstants::gigaByte;
    EXPECT_TRUE(osInterface.isSizeWithinThresholdForStaging(transferSize, false));
    EXPECT_FALSE(osInterface.isSizeWithinThresholdForStaging(transferSize, true));
}
107+
97108
} // namespace NEO

shared/test/unit_test/os_interface/windows/os_interface_win_tests.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,14 @@ TEST_F(OsInterfaceTest, givenEnableFtrTile64OptimizationDebugKeyWhenSetThenPrope
153153
EXPECT_EQ(1u, passedFtrTable.FtrTile64Optimization);
154154
}
155155
}
156+
157+
// On Windows a one-gigabyte transfer is within the staging threshold for both
// non-integrated and integrated devices.
TEST_F(OsInterfaceTest, whenGetThresholdForStagingCalledThenReturnNoThreshold) {
    MockExecutionEnvironment executionEnvironment;
    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
    auto &osInterface = rootDeviceEnvironment.osInterface;

    auto wddm = new WddmMock(rootDeviceEnvironment);
    // The OS interface is expected to be absent before init() and present after it.
    EXPECT_EQ(nullptr, osInterface.get());
    wddm->init();
    EXPECT_NE(nullptr, osInterface.get());

    const size_t transferSize = MemoryConstants::gigaByte;
    EXPECT_TRUE(osInterface->isSizeWithinThresholdForStaging(transferSize, false));
    EXPECT_TRUE(osInterface->isSizeWithinThresholdForStaging(transferSize, true));
}

0 commit comments

Comments
 (0)