1
1
/*
2
- * Copyright (C) 2021 Intel Corporation
2
+ * Copyright (C) 2021-2022 Intel Corporation
3
3
*
4
4
* SPDX-License-Identifier: MIT
5
5
*
11
11
12
12
#include " level_zero/core/source/device/device_imp.h"
13
13
14
+ #include < linux/pci_regs.h>
15
+
14
16
namespace L0 {
15
17
// Name of the "device" entry used by the diagnostics implementation.
// NOTE(review): presumably resolved relative to the card's sysfs directory - confirm against the users of deviceDir.
const std::string LinuxDiagnosticsImp::deviceDir("device");

// Node names must not carry leading/trailing whitespace, otherwise the sysfs lookup misses the file.
// The sysfs node will be at /sys/class/drm/card<n>/invalidate_lmem_mmaps
const std::string LinuxDiagnosticsImp::invalidateLmemFile("invalidate_lmem_mmaps");
// The sysfs node will be at /sys/class/drm/card<n>/quiesce_gpu
const std::string LinuxDiagnosticsImp::quiescentGpuFile("quiesce_gpu");
23
+ void OsDiagnostics::getSupportedDiagTestsFromFW (void *pOsSysman, std::vector<std::string> &supportedDiagTests) {
24
+ LinuxSysmanImp *pLinuxSysmanImp = static_cast <LinuxSysmanImp *>(pOsSysman);
25
+ if (IGFX_PVC == pLinuxSysmanImp->getProductFamily ()) {
26
+ FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface ();
27
+ if (pFwInterface != nullptr ) {
28
+ if (ZE_RESULT_SUCCESS == static_cast <FirmwareUtil *>(pFwInterface)->fwDeviceInit ()) {
29
+ static_cast <FirmwareUtil *>(pFwInterface)->fwSupportedDiagTests (supportedDiagTests);
30
+ }
31
+ }
32
+ }
33
+ }
34
+
35
+ ze_result_t LinuxDiagnosticsImp::gpuProcessCleanup () {
36
+ ::pid_t myPid = pProcfsAccess->myProcessId ();
37
+ std::vector<::pid_t > processes;
38
+ std::vector<int > myPidFds;
39
+ ze_result_t result = pProcfsAccess->listProcesses (processes);
40
+ if (ZE_RESULT_SUCCESS != result) {
41
+ return result;
42
+ }
43
+
44
+ for (auto &&pid : processes) {
45
+ std::vector<int > fds;
46
+ pLinuxSysmanImp->getPidFdsForOpenDevice (pProcfsAccess, pSysfsAccess, pid, fds);
47
+ if (pid == myPid) {
48
+ // L0 is expected to have this file open.
49
+ // Keep list of fds. Close before unbind.
50
+ myPidFds = fds;
51
+ continue ;
52
+ }
53
+ if (!fds.empty ()) {
54
+ pProcfsAccess->kill (pid);
55
+ }
56
+ }
57
+
58
+ for (auto &&fd : myPidFds) {
59
+ // Close open filedescriptors to the device
60
+ // before unbinding device.
61
+ // From this point forward, there is no
62
+ // graceful way to fail the reset call.
63
+ // All future ze calls by this process for this
64
+ // device will fail.
65
+ ::close (fd);
66
+ }
67
+ return ZE_RESULT_SUCCESS;
68
+ }
69
+
70
+ // before running diagnostics need to close all active workloads
71
+ // writing 1 to /sys/class/drm/card<n>/quiesce_gpu will signal KMD
72
+ // to close and clear all allocations,
73
+ // ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE will be sent till the kworker confirms that
74
+ // all allocations are closed and GPU is be wedged.
75
+ // GPU will only be unwedged after warm/cold reset
76
+ // writing 1 to /sys/class/drm/card<n>/invalidate_lmem_mmaps clears
77
+ // all memory mappings where LMEMBAR is being referenced are invalidated.
78
+ // Also prevents new ones from being created.
79
+ // It will invalidate LMEM memory mappings only when sysfs entry quiesce_gpu is set.
80
+ ze_result_t LinuxDiagnosticsImp::waitForQuiescentCompletion () {
81
+ uint32_t count = 0 ;
82
+ const int intVal = 1 ;
83
+ ze_result_t result = ZE_RESULT_ERROR_UNKNOWN;
84
+ do {
85
+ result = pSysfsAccess->write (quiescentGpuFile, intVal);
86
+ if (ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE == result) {
87
+ count++;
88
+ this ->pSleepFunctionSecs (1 ); // Sleep for 1second every loop, gives enough time for KMD to clear all allocations and wedge the system
89
+ auto processResult = gpuProcessCleanup ();
90
+ if (ZE_RESULT_SUCCESS != processResult) {
91
+ return processResult;
92
+ }
93
+ } else if (ZE_RESULT_SUCCESS == result) {
94
+ break ;
95
+ } else {
96
+ return result;
97
+ }
98
+ } while (count < 10 ); // limiting to 10 retries as we can endup going into a infinite loop if the cleanup and a process start are out of sync
99
+ result = pSysfsAccess->write (invalidateLmemFile, intVal);
100
+ if (ZE_RESULT_SUCCESS != result) {
101
+ return result;
102
+ }
103
+ return result;
104
+ }
105
+
106
+ ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW (zes_diag_result_t *pResult) {
107
+ pLinuxSysmanImp->diagnosticsReset = true ;
108
+ pLinuxSysmanImp->releaseDeviceResources ();
109
+ ze_result_t result = gpuProcessCleanup ();
110
+ if (ZE_RESULT_SUCCESS != result) {
111
+ return result;
112
+ }
113
+ result = waitForQuiescentCompletion ();
114
+ if (ZE_RESULT_SUCCESS != result) {
115
+ return result;
116
+ }
117
+ result = pFwInterface->fwRunDiagTests (osDiagType, pResult);
118
+ if (ZE_RESULT_SUCCESS != result) {
119
+ return result;
120
+ }
121
+ if (*pResult == ZES_DIAG_RESULT_REBOOT_FOR_REPAIR) {
122
+ result = pLinuxSysmanImp->osColdReset ();
123
+ if (result != ZE_RESULT_SUCCESS) {
124
+ return result;
125
+ }
126
+ }
127
+ result = pLinuxSysmanImp->osWarmReset (); // we need to at least do a Warm reset to bring the machine out of wedged state
128
+ if (result != ZE_RESULT_SUCCESS) {
129
+ return result;
130
+ }
131
+ return pLinuxSysmanImp->initDevice ();
132
+ }
133
+
17
134
void LinuxDiagnosticsImp::osGetDiagProperties (zes_diag_properties_t *pProperties) {
18
135
pProperties->onSubdevice = isSubdevice;
19
136
pProperties->subdeviceId = subdeviceId;
@@ -30,20 +147,15 @@ ze_result_t LinuxDiagnosticsImp::osRunDiagTests(uint32_t start, uint32_t end, ze
30
147
return osRunDiagTestsinFW (pResult);
31
148
}
32
149
33
- LinuxDiagnosticsImp::LinuxDiagnosticsImp (OsSysman *pOsSysman, const std::string &diagTests, ze_bool_t onSubdevice, uint32_t subdeviceId ) : osDiagType(diagTests), isSubdevice(onSubdevice), subdeviceId(subdeviceId ) {
150
+ LinuxDiagnosticsImp::LinuxDiagnosticsImp (OsSysman *pOsSysman, const std::string &diagTests) : osDiagType(diagTests) {
34
151
pLinuxSysmanImp = static_cast <LinuxSysmanImp *>(pOsSysman);
35
152
pFwInterface = pLinuxSysmanImp->getFwUtilInterface ();
36
153
pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess ();
37
- pFsAccess = &pLinuxSysmanImp->getFsAccess ();
38
154
pProcfsAccess = &pLinuxSysmanImp->getProcfsAccess ();
39
- pDevice = pLinuxSysmanImp->getDeviceHandle ();
40
- auto device = static_cast <DeviceImp *>(pDevice);
41
- executionEnvironment = device->getNEODevice ()->getExecutionEnvironment ();
42
- rootDeviceIndex = device->getNEODevice ()->getRootDeviceIndex ();
43
155
}
44
156
45
- std::unique_ptr<OsDiagnostics> OsDiagnostics::create (OsSysman *pOsSysman, const std::string &diagTests, ze_bool_t onSubdevice, uint32_t subdeviceId ) {
46
- std::unique_ptr<LinuxDiagnosticsImp> pLinuxDiagnosticsImp = std::make_unique<LinuxDiagnosticsImp>(pOsSysman, diagTests, onSubdevice, subdeviceId );
157
+ std::unique_ptr<OsDiagnostics> OsDiagnostics::create (OsSysman *pOsSysman, const std::string &diagTests) {
158
+ std::unique_ptr<LinuxDiagnosticsImp> pLinuxDiagnosticsImp = std::make_unique<LinuxDiagnosticsImp>(pOsSysman, diagTests);
47
159
return pLinuxDiagnosticsImp;
48
160
}
49
161
0 commit comments