Skip to content

Commit da52303

Browse files
SysMan: Diagnostics warm reset fix.
The following modifications were made as part of the fix for warm reset. 1. Release sysman resources before quiescing the GPU. 2. Add additional checks to confirm that the GPU is quiescent before launching the diagnostics tests. 3. Fixed warm reset by adding a wait time that allows the changes to propagate to the entire GPU PCI tree. 4. Reworked the ULTs completely to avoid the usage of mocks. 5. Changed Diagnostics handle creation from per-sub-device to per-device. Related-To: LOCI-3053 Signed-off-by: Vilvaraj, T J Vivek <[email protected]>
1 parent 83e06eb commit da52303

23 files changed

+927
-493
lines changed

level_zero/tools/source/sysman/diagnostics/diagnostics.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2021 Intel Corporation
2+
* Copyright (C) 2021-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -21,17 +21,15 @@ void DiagnosticsHandleContext::releaseDiagnosticsHandles() {
2121
}
2222
handleList.clear();
2323
}
24-
void DiagnosticsHandleContext::createHandle(ze_device_handle_t deviceHandle, const std::string &diagTests) {
25-
Diagnostics *pDiagnostics = new DiagnosticsImp(pOsSysman, diagTests, deviceHandle);
24+
void DiagnosticsHandleContext::createHandle(const std::string &diagTests) {
25+
Diagnostics *pDiagnostics = new DiagnosticsImp(pOsSysman, diagTests);
2626
handleList.push_back(pDiagnostics);
2727
}
2828

29-
void DiagnosticsHandleContext::init(std::vector<ze_device_handle_t> &deviceHandles) {
29+
void DiagnosticsHandleContext::init() {
3030
OsDiagnostics::getSupportedDiagTestsFromFW(pOsSysman, supportedDiagTests);
31-
for (const auto &deviceHandle : deviceHandles) {
32-
for (const std::string &diagTests : supportedDiagTests) {
33-
createHandle(deviceHandle, diagTests);
34-
}
31+
for (const std::string &diagTests : supportedDiagTests) {
32+
createHandle(diagTests);
3533
}
3634
}
3735

level_zero/tools/source/sysman/diagnostics/diagnostics.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,15 @@ struct DiagnosticsHandleContext {
3838
void releaseDiagnosticsHandles();
3939
MOCKABLE_VIRTUAL ~DiagnosticsHandleContext();
4040

41-
MOCKABLE_VIRTUAL void init(std::vector<ze_device_handle_t> &deviceHandles);
41+
MOCKABLE_VIRTUAL void init();
4242

4343
ze_result_t diagnosticsGet(uint32_t *pCount, zes_diag_handle_t *phDiagnostics);
4444
std::vector<std::string> supportedDiagTests = {};
4545
OsSysman *pOsSysman = nullptr;
4646
std::vector<Diagnostics *> handleList = {};
4747

4848
private:
49-
void createHandle(ze_device_handle_t deviceHandle, const std::string &diagTests);
49+
void createHandle(const std::string &diagTests);
5050
};
5151

5252
} // namespace L0

level_zero/tools/source/sysman/diagnostics/diagnostics_imp.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2021 Intel Corporation
2+
* Copyright (C) 2021-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -28,10 +28,8 @@ ze_result_t DiagnosticsImp::diagnosticsRunTests(uint32_t start, uint32_t end, ze
2828
return pOsDiagnostics->osRunDiagTests(start, end, pResult);
2929
}
3030

31-
DiagnosticsImp::DiagnosticsImp(OsSysman *pOsSysman, const std::string &initalizedDiagTest, ze_device_handle_t handle) : deviceHandle(handle) {
32-
ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
33-
Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
34-
pOsDiagnostics = OsDiagnostics::create(pOsSysman, initalizedDiagTest, deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE, deviceProperties.subdeviceId);
31+
DiagnosticsImp::DiagnosticsImp(OsSysman *pOsSysman, const std::string &initalizedDiagTest) {
32+
pOsDiagnostics = OsDiagnostics::create(pOsSysman, initalizedDiagTest);
3533
UNRECOVERABLE_IF(nullptr == pOsDiagnostics);
3634
}
3735

level_zero/tools/source/sysman/diagnostics/diagnostics_imp.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2021 Intel Corporation
2+
* Copyright (C) 2021-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -23,12 +23,9 @@ class DiagnosticsImp : public Diagnostics, NEO::NonCopyableOrMovableClass {
2323
ze_result_t diagnosticsGetTests(uint32_t *pCount, zes_diag_test_t *pTests) override;
2424
ze_result_t diagnosticsRunTests(uint32_t start, uint32_t end, zes_diag_result_t *pResult) override;
2525
DiagnosticsImp() = default;
26-
DiagnosticsImp(OsSysman *pOsSysman, const std::string &initalizedDiagTest, ze_device_handle_t handle);
26+
DiagnosticsImp(OsSysman *pOsSysman, const std::string &initalizedDiagTest);
2727
~DiagnosticsImp() override;
2828
std::unique_ptr<OsDiagnostics> pOsDiagnostics = nullptr;
29-
30-
private:
31-
ze_device_handle_t deviceHandle = nullptr;
3229
};
3330

3431
} // namespace L0

level_zero/tools/source/sysman/diagnostics/linux/CMakeLists.txt

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,6 @@ set(L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
1010
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_imp.h
1111
)
1212

13-
if(NEO_ENABLE_i915_PRELIM_DETECTION)
14-
list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
15-
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper_prelim.cpp
16-
)
17-
else()
18-
list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
19-
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper.cpp
20-
)
21-
endif()
22-
2313
if(UNIX)
2414
target_sources(${L0_STATIC_LIB_NAME}
2515
PRIVATE

level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_helper.cpp

Lines changed: 0 additions & 20 deletions
This file was deleted.

level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_helper_prelim.cpp

Lines changed: 0 additions & 76 deletions
This file was deleted.

level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.cpp

Lines changed: 121 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2021 Intel Corporation
2+
* Copyright (C) 2021-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -11,9 +11,126 @@
1111

1212
#include "level_zero/core/source/device/device_imp.h"
1313

14+
#include <linux/pci_regs.h>
15+
1416
namespace L0 {
1517
const std::string LinuxDiagnosticsImp::deviceDir("device");
1618

19+
//the sysfs node will be at /sys/class/drm/card<n>/invalidate_lmem_mmaps
20+
const std::string LinuxDiagnosticsImp::invalidateLmemFile("invalidate_lmem_mmaps");
21+
// the sysfs node will be at /sys/class/drm/card<n>/quiesce_gpu
22+
const std::string LinuxDiagnosticsImp::quiescentGpuFile("quiesce_gpu");
23+
void OsDiagnostics::getSupportedDiagTestsFromFW(void *pOsSysman, std::vector<std::string> &supportedDiagTests) {
24+
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
25+
if (IGFX_PVC == pLinuxSysmanImp->getProductFamily()) {
26+
FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
27+
if (pFwInterface != nullptr) {
28+
if (ZE_RESULT_SUCCESS == static_cast<FirmwareUtil *>(pFwInterface)->fwDeviceInit()) {
29+
static_cast<FirmwareUtil *>(pFwInterface)->fwSupportedDiagTests(supportedDiagTests);
30+
}
31+
}
32+
}
33+
}
34+
35+
ze_result_t LinuxDiagnosticsImp::gpuProcessCleanup() {
36+
::pid_t myPid = pProcfsAccess->myProcessId();
37+
std::vector<::pid_t> processes;
38+
std::vector<int> myPidFds;
39+
ze_result_t result = pProcfsAccess->listProcesses(processes);
40+
if (ZE_RESULT_SUCCESS != result) {
41+
return result;
42+
}
43+
44+
for (auto &&pid : processes) {
45+
std::vector<int> fds;
46+
pLinuxSysmanImp->getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, fds);
47+
if (pid == myPid) {
48+
// L0 is expected to have this file open.
49+
// Keep list of fds. Close before unbind.
50+
myPidFds = fds;
51+
continue;
52+
}
53+
if (!fds.empty()) {
54+
pProcfsAccess->kill(pid);
55+
}
56+
}
57+
58+
for (auto &&fd : myPidFds) {
59+
// Close open filedescriptors to the device
60+
// before unbinding device.
61+
// From this point forward, there is no
62+
// graceful way to fail the reset call.
63+
// All future ze calls by this process for this
64+
// device will fail.
65+
::close(fd);
66+
}
67+
return ZE_RESULT_SUCCESS;
68+
}
69+
70+
// before running diagnostics, all active workloads need to be closed;
71+
// writing 1 to /sys/class/drm/card<n>/quiesce_gpu will signal KMD
72+
//to close and clear all allocations,
73+
//ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE will be sent till the kworker confirms that
74+
// all allocations are closed and the GPU is wedged.
75+
// GPU will only be unwedged after warm/cold reset
76+
//writing 1 to /sys/class/drm/card<n>/invalidate_lmem_mmaps clears
77+
// (i.e. invalidates) all memory mappings in which LMEMBAR is being referenced.
78+
//Also prevents new ones from being created.
79+
//It will invalidate LMEM memory mappings only when sysfs entry quiesce_gpu is set.
80+
ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion() {
81+
uint32_t count = 0;
82+
const int intVal = 1;
83+
ze_result_t result = ZE_RESULT_ERROR_UNKNOWN;
84+
do {
85+
result = pSysfsAccess->write(quiescentGpuFile, intVal);
86+
if (ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE == result) {
87+
count++;
88+
this->pSleepFunctionSecs(1); // Sleep for 1second every loop, gives enough time for KMD to clear all allocations and wedge the system
89+
auto processResult = gpuProcessCleanup();
90+
if (ZE_RESULT_SUCCESS != processResult) {
91+
return processResult;
92+
}
93+
} else if (ZE_RESULT_SUCCESS == result) {
94+
break;
95+
} else {
96+
return result;
97+
}
98+
} while (count < 10); //limiting to 10 retries as we can endup going into a infinite loop if the cleanup and a process start are out of sync
99+
result = pSysfsAccess->write(invalidateLmemFile, intVal);
100+
if (ZE_RESULT_SUCCESS != result) {
101+
return result;
102+
}
103+
return result;
104+
}
105+
106+
ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) {
107+
pLinuxSysmanImp->diagnosticsReset = true;
108+
pLinuxSysmanImp->releaseDeviceResources();
109+
ze_result_t result = gpuProcessCleanup();
110+
if (ZE_RESULT_SUCCESS != result) {
111+
return result;
112+
}
113+
result = waitForQuiescentCompletion();
114+
if (ZE_RESULT_SUCCESS != result) {
115+
return result;
116+
}
117+
result = pFwInterface->fwRunDiagTests(osDiagType, pResult);
118+
if (ZE_RESULT_SUCCESS != result) {
119+
return result;
120+
}
121+
if (*pResult == ZES_DIAG_RESULT_REBOOT_FOR_REPAIR) {
122+
result = pLinuxSysmanImp->osColdReset();
123+
if (result != ZE_RESULT_SUCCESS) {
124+
return result;
125+
}
126+
}
127+
result = pLinuxSysmanImp->osWarmReset(); // we need to at least do a Warm reset to bring the machine out of wedged state
128+
if (result != ZE_RESULT_SUCCESS) {
129+
return result;
130+
}
131+
return pLinuxSysmanImp->initDevice();
132+
}
133+
17134
void LinuxDiagnosticsImp::osGetDiagProperties(zes_diag_properties_t *pProperties) {
18135
pProperties->onSubdevice = isSubdevice;
19136
pProperties->subdeviceId = subdeviceId;
@@ -30,20 +147,15 @@ ze_result_t LinuxDiagnosticsImp::osRunDiagTests(uint32_t start, uint32_t end, ze
30147
return osRunDiagTestsinFW(pResult);
31148
}
32149

33-
LinuxDiagnosticsImp::LinuxDiagnosticsImp(OsSysman *pOsSysman, const std::string &diagTests, ze_bool_t onSubdevice, uint32_t subdeviceId) : osDiagType(diagTests), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
150+
LinuxDiagnosticsImp::LinuxDiagnosticsImp(OsSysman *pOsSysman, const std::string &diagTests) : osDiagType(diagTests) {
34151
pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
35152
pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
36153
pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
37-
pFsAccess = &pLinuxSysmanImp->getFsAccess();
38154
pProcfsAccess = &pLinuxSysmanImp->getProcfsAccess();
39-
pDevice = pLinuxSysmanImp->getDeviceHandle();
40-
auto device = static_cast<DeviceImp *>(pDevice);
41-
executionEnvironment = device->getNEODevice()->getExecutionEnvironment();
42-
rootDeviceIndex = device->getNEODevice()->getRootDeviceIndex();
43155
}
44156

45-
std::unique_ptr<OsDiagnostics> OsDiagnostics::create(OsSysman *pOsSysman, const std::string &diagTests, ze_bool_t onSubdevice, uint32_t subdeviceId) {
46-
std::unique_ptr<LinuxDiagnosticsImp> pLinuxDiagnosticsImp = std::make_unique<LinuxDiagnosticsImp>(pOsSysman, diagTests, onSubdevice, subdeviceId);
157+
std::unique_ptr<OsDiagnostics> OsDiagnostics::create(OsSysman *pOsSysman, const std::string &diagTests) {
158+
std::unique_ptr<LinuxDiagnosticsImp> pLinuxDiagnosticsImp = std::make_unique<LinuxDiagnosticsImp>(pOsSysman, diagTests);
47159
return pLinuxDiagnosticsImp;
48160
}
49161

level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,20 @@ class LinuxDiagnosticsImp : public OsDiagnostics, NEO::NonCopyableOrMovableClass
2020
ze_result_t osGetDiagTests(uint32_t *pCount, zes_diag_test_t *pTests) override;
2121
ze_result_t osRunDiagTests(uint32_t start, uint32_t end, zes_diag_result_t *pResult) override;
2222
ze_result_t osRunDiagTestsinFW(zes_diag_result_t *pResult);
23-
ze_result_t osWarmReset();
2423
LinuxDiagnosticsImp() = default;
25-
LinuxDiagnosticsImp(OsSysman *pOsSysman, const std::string &diagTests, ze_bool_t onSubdevice, uint32_t subdeviceId);
24+
LinuxDiagnosticsImp(OsSysman *pOsSysman, const std::string &diagTests);
2625
~LinuxDiagnosticsImp() override = default;
2726
std::string osDiagType = "unknown";
28-
ze_result_t osColdReset();
27+
decltype(&L0::SysmanUtils::sleep) pSleepFunctionSecs = L0::SysmanUtils::sleep;
2928

3029
protected:
3130
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
3231
FirmwareUtil *pFwInterface = nullptr;
3332
SysfsAccess *pSysfsAccess = nullptr;
3433
FsAccess *pFsAccess = nullptr;
3534
ProcfsAccess *pProcfsAccess = nullptr;
36-
Device *pDevice = nullptr;
37-
std::string devicePciBdf = "";
38-
NEO::ExecutionEnvironment *executionEnvironment = nullptr;
39-
uint32_t rootDeviceIndex = 0u;
40-
decltype(&NEO::SysCalls::open) openFunction = NEO::SysCalls::open;
41-
decltype(&NEO::SysCalls::close) closeFunction = NEO::SysCalls::close;
42-
decltype(&NEO::SysCalls::pread) preadFunction = NEO::SysCalls::pread;
43-
decltype(&NEO::SysCalls::pwrite) pwriteFunction = NEO::SysCalls::pwrite;
44-
void releaseSysmanDeviceResources();
45-
void releaseDeviceResources();
46-
ze_result_t initDevice();
47-
void reInitSysmanDeviceResources();
35+
ze_result_t gpuProcessCleanup();
36+
ze_result_t waitForQuiescentCompletion();
4837

4938
private:
5039
static const std::string quiescentGpuFile;

0 commit comments

Comments
 (0)