Skip to content

Commit 352583b

Browse files
Detect GPU hang in evictUnusedAllocations()
This change introduces checking of the return value of the wait function in the blocking version of evictUnusedAllocations(). Furthermore, it propagates the error to the callers. It also contains ULTs. Related-To: NEO-6681 Signed-off-by: Patryk Wrobel <[email protected]>
1 parent 641851c commit 352583b

11 files changed

+86
-16
lines changed

opencl/test/unit_test/os_interface/linux/drm_buffer_object_tests.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,19 @@ TEST_F(DrmBufferObjectTest, GivenInvalidParamsWhenCallingExecThenEfaultIsReturne
3939
EXPECT_EQ(EFAULT, bo->exec(0, 0, 0, false, osContext.get(), 0, 1, nullptr, 0u, &execObjectsStorage, 0, 0));
4040
}
4141

42+
TEST_F(DrmBufferObjectTest, GivenDetectedGpuHangDuringEvictUnusedAllocationsWhenCallingExecThenGpuHangErrorCodeIsReturned) { // exec() must surface a GPU hang reported by the blocking eviction
43+
mock->ioctl_expected.total = 2;
44+
mock->ioctl_res = -1; // presumably makes the mocked ioctl fail so exec() takes its retry path - confirm against the mock
45+
mock->errnoValue = EFAULT;
46+
47+
bo->callBaseEvictUnusedAllocations = false; // use the fixture's stub instead of BufferObject::evictUnusedAllocations()
48+
49+
drm_i915_gem_exec_object2 execObjectsStorage = {};
50+
const auto result = bo->exec(0, 0, 0, false, osContext.get(), 0, 1, nullptr, 0u, &execObjectsStorage, 0, 0);
51+
52+
EXPECT_EQ(BufferObject::GPU_HANG_DETECTED, result); // the hang sentinel is expected rather than the ioctl error
53+
}
54+
4255
TEST_F(DrmBufferObjectTest, WhenSettingTilingThenCallSucceeds) {
4356
mock->ioctl_expected.total = 1; //set_tiling
4457
auto ret = bo->setTiling(I915_TILING_X, 0);

opencl/test/unit_test/os_interface/linux/drm_residency_handler_prelim_tests.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@ struct MockDrmMemoryOperationsHandlerBind : public DrmMemoryOperationsHandlerBin
3939
bool useBaseEvictUnused = true;
4040
uint32_t evictUnusedCalled = 0;
4141

42-
void evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override {
42+
MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override {
4343
evictUnusedCalled++;
4444
if (useBaseEvictUnused) {
45-
DrmMemoryOperationsHandlerBind::evictUnusedAllocations(waitForCompletion, isLockNeeded);
45+
return DrmMemoryOperationsHandlerBind::evictUnusedAllocations(waitForCompletion, isLockNeeded);
4646
}
47+
48+
return MemoryOperationsStatus::SUCCESS;
4749
}
4850
};
4951

@@ -169,7 +171,6 @@ TEST_F(DrmMemoryOperationsHandlerBindTest, givenObjectAlwaysResidentAndNotUsedWh
169171
}
170172

171173
EXPECT_EQ(mock->context.vmBindCalled, 2u);
172-
173174
operationHandler->evictUnusedAllocations(false, true);
174175

175176
EXPECT_EQ(mock->context.vmBindCalled, 2u);
@@ -239,14 +240,28 @@ HWTEST_F(DrmMemoryOperationsHandlerBindTest, whenEvictUnusedResourcesWithWaitFor
239240
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
240241
csr.latestWaitForCompletionWithTimeoutTaskCount.store(123u);
241242

242-
operationHandler->evictUnusedAllocations(true, true);
243+
const auto status = operationHandler->evictUnusedAllocations(true, true);
244+
EXPECT_EQ(MemoryOperationsStatus::SUCCESS, status);
243245

244246
auto latestWaitTaskCount = csr.latestWaitForCompletionWithTimeoutTaskCount.load();
245247
EXPECT_NE(latestWaitTaskCount, 123u);
246248

247249
memoryManager->freeGraphicsMemory(allocation);
248250
}
249251

252+
HWTEST_F(DrmMemoryOperationsHandlerBindTest, givenGpuHangWhenEvictUnusedResourcesWithWaitForCompletionThenGpuHangIsReturned) { // blocking eviction must report a GPU hang seen while waiting
253+
auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize});
254+
255+
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
256+
csr.callBaseWaitForCompletionWithTimeout = false; // presumably bypasses the real wait in the ULT command stream receiver - confirm
257+
csr.returnWaitForCompletionWithTimeout = WaitStatus::GpuHang; // presumably the value the stubbed wait returns - confirm
258+
259+
const auto status = operationHandler->evictUnusedAllocations(true, true); // waitForCompletion = true, isLockNeeded = true
260+
EXPECT_EQ(MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION, status);
261+
262+
memoryManager->freeGraphicsMemory(allocation); // release the allocation created at the top of the test
263+
}
264+
250265
TEST_F(DrmMemoryOperationsHandlerBindTest, whenRunningOutOfMemoryThenUnusedAllocationsAreUnbound) {
251266
auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{device->getRootDeviceIndex(), MemoryConstants::pageSize});
252267

shared/source/memory_manager/memory_operations_status.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ enum class MemoryOperationsStatus : uint32_t {
1717
OUT_OF_MEMORY,
1818
UNSUPPORTED,
1919
DEVICE_UNINITIALIZED,
20+
GPU_HANG_DETECTED_DURING_OPERATION,
2021
};
2122

2223
} // namespace NEO

shared/source/os_interface/linux/drm_buffer_object.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,17 @@ int BufferObject::exec(uint32_t used, size_t startOffset, unsigned int flags, bo
164164
return err;
165165
}
166166

167-
static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->evictUnusedAllocations(false, true);
167+
evictUnusedAllocations(false, true);
168168
ret = ioctlHelper->execBuffer(drm, &execbuf, completionGpuAddress, completionValue);
169169
}
170170

171171
if (ret != 0) {
172-
static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->evictUnusedAllocations(true, true);
172+
const auto status = evictUnusedAllocations(true, true);
173+
if (status == MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION) {
174+
PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stderr, "Error! GPU hang detected in BufferObject::exec(). Returning %d\n", GPU_HANG_DETECTED);
175+
return GPU_HANG_DETECTED;
176+
}
177+
173178
ret = ioctlHelper->execBuffer(drm, &execbuf, completionGpuAddress, completionValue);
174179
}
175180

@@ -182,6 +187,10 @@ int BufferObject::exec(uint32_t used, size_t startOffset, unsigned int flags, bo
182187
return err;
183188
}
184189

190+
MemoryOperationsStatus BufferObject::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) { // forwards both arguments to the memory operations handler and returns its status unchanged
191+
return static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->evictUnusedAllocations(waitForCompletion, isLockNeeded); // unchecked static_cast: assumes memoryOperationsInterface holds a DrmMemoryOperationsHandler
192+
}
193+
185194
void BufferObject::printBOBindingResult(OsContext *osContext, uint32_t vmHandleId, bool bind, int retVal) {
186195
if (retVal == 0) {
187196
if (bind) {

shared/source/os_interface/linux/drm_buffer_object.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "shared/source/gmm_helper/gmm_helper.h"
1111
#include "shared/source/memory_manager/definitions/engine_limits.h"
12+
#include "shared/source/memory_manager/memory_operations_status.h"
1213
#include "shared/source/os_interface/linux/cache_info.h"
1314
#include "shared/source/utilities/stackvec.h"
1415

@@ -137,7 +138,11 @@ class BufferObject {
137138
return this->bindAddresses;
138139
}
139140

141+
static constexpr int GPU_HANG_DETECTED{-7171}; // sentinel returned by BufferObject::exec() when the blocking evictUnusedAllocations() reports a GPU hang; NOTE(review): value presumably chosen not to collide with ioctl/errno results - confirm
142+
140143
protected:
144+
MOCKABLE_VIRTUAL MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded);
145+
141146
Drm *drm = nullptr;
142147
bool perContextVmsUsed = false;
143148
std::atomic<uint32_t> refCount;

shared/source/os_interface/linux/drm_memory_operations_handler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class DrmMemoryOperationsHandler : public MemoryOperationsHandler {
2323
virtual MemoryOperationsStatus mergeWithResidencyContainer(OsContext *osContext, ResidencyContainer &residencyContainer) = 0;
2424
virtual std::unique_lock<std::mutex> lockHandlerIfUsed() = 0;
2525

26-
virtual void evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) = 0;
26+
virtual MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) = 0;
2727

2828
static std::unique_ptr<DrmMemoryOperationsHandler> create(Drm &drm, uint32_t rootDeviceIndex);
2929

shared/source/os_interface/linux/drm_memory_operations_handler_bind.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ std::unique_lock<std::mutex> DrmMemoryOperationsHandlerBind::lockHandlerIfUsed()
133133
return std::unique_lock<std::mutex>();
134134
}
135135

136-
void DrmMemoryOperationsHandlerBind::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) {
136+
MemoryOperationsStatus DrmMemoryOperationsHandlerBind::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) {
137137
auto memoryManager = static_cast<DrmMemoryManager *>(this->rootDeviceEnvironment.executionEnvironment.memoryManager.get());
138138

139139
std::unique_lock<std::mutex> evictLock(mutex, std::defer_lock);
@@ -143,11 +143,19 @@ void DrmMemoryOperationsHandlerBind::evictUnusedAllocations(bool waitForCompleti
143143

144144
auto allocLock = memoryManager->acquireAllocLock();
145145

146-
this->evictUnusedAllocationsImpl(memoryManager->getSysMemAllocs(), waitForCompletion);
147-
this->evictUnusedAllocationsImpl(memoryManager->getLocalMemAllocs(this->rootDeviceIndex), waitForCompletion);
146+
for (const auto status : {
147+
this->evictUnusedAllocationsImpl(memoryManager->getSysMemAllocs(), waitForCompletion),
148+
this->evictUnusedAllocationsImpl(memoryManager->getLocalMemAllocs(this->rootDeviceIndex), waitForCompletion)}) {
149+
150+
if (status == MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION) {
151+
return MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION;
152+
}
153+
}
154+
155+
return MemoryOperationsStatus::SUCCESS;
148156
}
149157

150-
void DrmMemoryOperationsHandlerBind::evictUnusedAllocationsImpl(std::vector<GraphicsAllocation *> &allocationsForEviction, bool waitForCompletion) {
158+
MemoryOperationsStatus DrmMemoryOperationsHandlerBind::evictUnusedAllocationsImpl(std::vector<GraphicsAllocation *> &allocationsForEviction, bool waitForCompletion) {
151159
const auto &engines = this->rootDeviceEnvironment.executionEnvironment.memoryManager->getRegisteredEngines();
152160
std::vector<GraphicsAllocation *> evictCandidates;
153161

@@ -164,7 +172,10 @@ void DrmMemoryOperationsHandlerBind::evictUnusedAllocationsImpl(std::vector<Grap
164172
}
165173

166174
if (waitForCompletion) {
167-
engine.commandStreamReceiver->waitForCompletionWithTimeout(WaitParams{false, false, 0}, engine.commandStreamReceiver->peekLatestFlushedTaskCount());
175+
const auto waitStatus = engine.commandStreamReceiver->waitForCompletionWithTimeout(WaitParams{false, false, 0}, engine.commandStreamReceiver->peekLatestFlushedTaskCount());
176+
if (waitStatus == WaitStatus::GpuHang) {
177+
return MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION;
178+
}
168179
}
169180

170181
if (allocation->isUsedByOsContext(engine.osContext->getContextId()) &&
@@ -191,6 +202,8 @@ void DrmMemoryOperationsHandlerBind::evictUnusedAllocationsImpl(std::vector<Grap
191202
}
192203
evictCandidates.clear();
193204
}
205+
206+
return MemoryOperationsStatus::SUCCESS;
194207
}
195208

196209
} // namespace NEO

shared/source/os_interface/linux/drm_memory_operations_handler_bind.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@ class DrmMemoryOperationsHandlerBind : public DrmMemoryOperationsHandler {
2525
MemoryOperationsStatus mergeWithResidencyContainer(OsContext *osContext, ResidencyContainer &residencyContainer) override;
2626
std::unique_lock<std::mutex> lockHandlerIfUsed() override;
2727

28-
void evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override;
28+
MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override;
2929

3030
protected:
3131
MOCKABLE_VIRTUAL int evictImpl(OsContext *osContext, GraphicsAllocation &gfxAllocation, DeviceBitfield deviceBitfield);
32-
void evictUnusedAllocationsImpl(std::vector<GraphicsAllocation *> &allocationsForEviction, bool waitForCompletion);
32+
MemoryOperationsStatus evictUnusedAllocationsImpl(std::vector<GraphicsAllocation *> &allocationsForEviction, bool waitForCompletion);
3333

3434
RootDeviceEnvironment &rootDeviceEnvironment;
3535
uint32_t rootDeviceIndex = 0;

shared/source/os_interface/linux/drm_memory_operations_handler_default.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ std::unique_lock<std::mutex> DrmMemoryOperationsHandlerDefault::lockHandlerIfUse
6565
return std::unique_lock<std::mutex>();
6666
}
6767

68-
void DrmMemoryOperationsHandlerDefault::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) {
68+
MemoryOperationsStatus DrmMemoryOperationsHandlerDefault::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) { // default handler does no eviction work here; both arguments are unused
69+
return MemoryOperationsStatus::SUCCESS; // always SUCCESS, so this handler never reports a GPU hang from this call
6970
}
7071

7172
} // namespace NEO

shared/source/os_interface/linux/drm_memory_operations_handler_default.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class DrmMemoryOperationsHandlerDefault : public DrmMemoryOperationsHandler {
2626
MemoryOperationsStatus mergeWithResidencyContainer(OsContext *osContext, ResidencyContainer &residencyContainer) override;
2727
std::unique_lock<std::mutex> lockHandlerIfUsed() override;
2828

29-
void evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override;
29+
MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override;
3030

3131
protected:
3232
std::unordered_set<GraphicsAllocation *> residency;

shared/test/common/os_interface/linux/drm_buffer_object_fixture.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,23 @@ class TestedBufferObject : public BufferObject {
4646
return BufferObject::exec(used, startOffset, flags, requiresCoherency, osContext, vmHandleId, drmContextId, residency, residencyCount, execObjectsStorage, completionGpuAddress, completionValue);
4747
}
4848

49+
MemoryOperationsStatus evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) override { // test hook: real implementation, or a canned status when callBaseEvictUnusedAllocations is false
50+
if (callBaseEvictUnusedAllocations) {
51+
return BufferObject::evictUnusedAllocations(waitForCompletion, isLockNeeded); // default path: delegate to the production implementation
52+
}
53+
54+
if (!waitForCompletion) {
55+
return MemoryOperationsStatus::SUCCESS; // stubbed non-blocking call reports success
56+
}
57+
58+
return MemoryOperationsStatus::GPU_HANG_DETECTED_DURING_OPERATION; // stubbed blocking call simulates a detected GPU hang
59+
}
60+
4961
uint64_t receivedCompletionGpuAddress = 0;
5062
drm_i915_gem_exec_object2 *execObjectPointerFilled = nullptr;
5163
uint32_t receivedCompletionValue = 0;
5264
uint32_t execCalled = 0;
65+
bool callBaseEvictUnusedAllocations{true};
5366
};
5467

5568
template <typename DrmClass>

0 commit comments

Comments
 (0)