|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
| 2 | +/* Copyright(c) 2022 Intel Corporation. */ |
| 3 | + |
| 4 | +#include <linux/cpu.h> |
| 5 | +#include <linux/delay.h> |
| 6 | +#include <linux/fs.h> |
| 7 | +#include <linux/nmi.h> |
| 8 | +#include <linux/slab.h> |
| 9 | +#include <linux/stop_machine.h> |
| 10 | + |
| 11 | +#include "ifs.h" |
| 12 | + |
| 13 | +/* |
| 14 | + * Note all code and data in this file is protected by |
| 15 | + * ifs_sem. On HT systems all threads on a core will |
| 16 | + * execute together, but only the first thread on the |
| 17 | + * core will update results of the test. |
| 18 | + */ |
| 19 | + |
/*
 * Max retries on the same chunk: the number of consecutive scan attempts
 * that may report the same starting chunk before ifs_test_core() gives up.
 */
#define MAX_IFS_RETRIES 5

/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). Programmed into
 * the delay field of the activation value.
 */
#define IFS_THREAD_WAIT 100000
| 28 | + |
/*
 * Named values of the error_code field of union ifs_status. Each value is
 * also the index of its description in scan_test_status[]. Additional codes
 * used by this file (IFS_SW_TIMEOUT, IFS_SW_PARTIAL_COMPLETION) are not
 * defined here.
 */
enum ifs_status_err_code {
	IFS_NO_ERROR = 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN = 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3,
	IFS_INVALID_CHUNK_RANGE = 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY = 6,
	IFS_UNASSIGNED_ERROR_CODE = 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8,
	IFS_INTERRUPTED_DURING_EXECUTION = 9,
};
| 41 | + |
/*
 * Human readable description for each enum ifs_status_err_code value,
 * indexed by the error code. Printed by message_not_tested().
 */
static const char * const scan_test_status[] = {
	[IFS_NO_ERROR] = "SCAN no error",
	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
	"Core Abort SCAN Response due to power management condition.",
	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
	"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
};
| 56 | + |
| 57 | +static void message_not_tested(struct device *dev, int cpu, union ifs_status status) |
| 58 | +{ |
| 59 | + if (status.error_code < ARRAY_SIZE(scan_test_status)) { |
| 60 | + dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", |
| 61 | + cpumask_pr_args(cpu_smt_mask(cpu)), |
| 62 | + scan_test_status[status.error_code]); |
| 63 | + } else if (status.error_code == IFS_SW_TIMEOUT) { |
| 64 | + dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", |
| 65 | + cpumask_pr_args(cpu_smt_mask(cpu))); |
| 66 | + } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { |
| 67 | + dev_info(dev, "CPU(s) %*pbl: %s\n", |
| 68 | + cpumask_pr_args(cpu_smt_mask(cpu)), |
| 69 | + "Not all scan chunks were executed. Maximum forward progress retries exceeded"); |
| 70 | + } else { |
| 71 | + dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", |
| 72 | + cpumask_pr_args(cpu_smt_mask(cpu)), status.data); |
| 73 | + } |
| 74 | +} |
| 75 | + |
| 76 | +static void message_fail(struct device *dev, int cpu, union ifs_status status) |
| 77 | +{ |
| 78 | + /* |
| 79 | + * control_error is set when the microcode runs into a problem |
| 80 | + * loading the image from the reserved BIOS memory, or it has |
| 81 | + * been corrupted. Reloading the image may fix this issue. |
| 82 | + */ |
| 83 | + if (status.control_error) { |
| 84 | + dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image\n", |
| 85 | + cpumask_pr_args(cpu_smt_mask(cpu))); |
| 86 | + } |
| 87 | + |
| 88 | + /* |
| 89 | + * signature_error is set when the output from the scan chains does not |
| 90 | + * match the expected signature. This might be a transient problem (e.g. |
| 91 | + * due to a bit flip from an alpha particle or neutron). If the problem |
| 92 | + * repeats on a subsequent test, then it indicates an actual problem in |
| 93 | + * the core being tested. |
| 94 | + */ |
| 95 | + if (status.signature_error) { |
| 96 | + dev_err(dev, "CPU(s) %*pbl: test signature incorrect.\n", |
| 97 | + cpumask_pr_args(cpu_smt_mask(cpu))); |
| 98 | + } |
| 99 | +} |
| 100 | + |
| 101 | +static bool can_restart(union ifs_status status) |
| 102 | +{ |
| 103 | + enum ifs_status_err_code err_code = status.error_code; |
| 104 | + |
| 105 | + /* Signature for chunk is bad, or scan test failed */ |
| 106 | + if (status.signature_error || status.control_error) |
| 107 | + return false; |
| 108 | + |
| 109 | + switch (err_code) { |
| 110 | + case IFS_NO_ERROR: |
| 111 | + case IFS_OTHER_THREAD_COULD_NOT_JOIN: |
| 112 | + case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: |
| 113 | + case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: |
| 114 | + case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: |
| 115 | + case IFS_INTERRUPTED_DURING_EXECUTION: |
| 116 | + return true; |
| 117 | + case IFS_INVALID_CHUNK_RANGE: |
| 118 | + case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: |
| 119 | + case IFS_CORE_NOT_CAPABLE_CURRENTLY: |
| 120 | + case IFS_UNASSIGNED_ERROR_CODE: |
| 121 | + break; |
| 122 | + } |
| 123 | + return false; |
| 124 | +} |
| 125 | + |
| 126 | +/* |
| 127 | + * Execute the scan. Called "simultaneously" on all threads of a core |
| 128 | + * at high priority using the stop_cpus mechanism. |
| 129 | + */ |
| 130 | +static int doscan(void *data) |
| 131 | +{ |
| 132 | + int cpu = smp_processor_id(); |
| 133 | + u64 *msrs = data; |
| 134 | + int first; |
| 135 | + |
| 136 | + /* Only the first logical CPU on a core reports result */ |
| 137 | + first = cpumask_first(cpu_smt_mask(cpu)); |
| 138 | + |
| 139 | + /* |
| 140 | + * This WRMSR will wait for other HT threads to also write |
| 141 | + * to this MSR (at most for activate.delay cycles). Then it |
| 142 | + * starts scan of each requested chunk. The core scan happens |
| 143 | + * during the "execution" of the WRMSR. This instruction can |
| 144 | + * take up to 200 milliseconds (in the case where all chunks |
| 145 | + * are processed in a single pass) before it retires. |
| 146 | + */ |
| 147 | + wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]); |
| 148 | + |
| 149 | + if (cpu == first) { |
| 150 | + /* Pass back the result of the scan */ |
| 151 | + rdmsrl(MSR_SCAN_STATUS, msrs[1]); |
| 152 | + } |
| 153 | + |
| 154 | + return 0; |
| 155 | +} |
| 156 | + |
| 157 | +/* |
| 158 | + * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN |
| 159 | + * on all threads of the core to be tested. Loop if necessary to complete |
| 160 | + * run of all chunks. Include some defensive tests to make sure forward |
| 161 | + * progress is made, and that the whole test completes in a reasonable time. |
| 162 | + */ |
| 163 | +static void ifs_test_core(int cpu, struct device *dev) |
| 164 | +{ |
| 165 | + union ifs_scan activate; |
| 166 | + union ifs_status status; |
| 167 | + unsigned long timeout; |
| 168 | + struct ifs_data *ifsd; |
| 169 | + u64 msrvals[2]; |
| 170 | + int retries; |
| 171 | + |
| 172 | + ifsd = ifs_get_data(dev); |
| 173 | + |
| 174 | + activate.rsvd = 0; |
| 175 | + activate.delay = IFS_THREAD_WAIT; |
| 176 | + activate.sigmce = 0; |
| 177 | + activate.start = 0; |
| 178 | + activate.stop = ifsd->valid_chunks - 1; |
| 179 | + |
| 180 | + timeout = jiffies + HZ / 2; |
| 181 | + retries = MAX_IFS_RETRIES; |
| 182 | + |
| 183 | + while (activate.start <= activate.stop) { |
| 184 | + if (time_after(jiffies, timeout)) { |
| 185 | + status.error_code = IFS_SW_TIMEOUT; |
| 186 | + break; |
| 187 | + } |
| 188 | + |
| 189 | + msrvals[0] = activate.data; |
| 190 | + stop_core_cpuslocked(cpu, doscan, msrvals); |
| 191 | + |
| 192 | + status.data = msrvals[1]; |
| 193 | + |
| 194 | + /* Some cases can be retried, give up for others */ |
| 195 | + if (!can_restart(status)) |
| 196 | + break; |
| 197 | + |
| 198 | + if (status.chunk_num == activate.start) { |
| 199 | + /* Check for forward progress */ |
| 200 | + if (--retries == 0) { |
| 201 | + if (status.error_code == IFS_NO_ERROR) |
| 202 | + status.error_code = IFS_SW_PARTIAL_COMPLETION; |
| 203 | + break; |
| 204 | + } |
| 205 | + } else { |
| 206 | + retries = MAX_IFS_RETRIES; |
| 207 | + activate.start = status.chunk_num; |
| 208 | + } |
| 209 | + } |
| 210 | + |
| 211 | + /* Update status for this core */ |
| 212 | + ifsd->scan_details = status.data; |
| 213 | + |
| 214 | + if (status.control_error || status.signature_error) { |
| 215 | + ifsd->status = SCAN_TEST_FAIL; |
| 216 | + message_fail(dev, cpu, status); |
| 217 | + } else if (status.error_code) { |
| 218 | + ifsd->status = SCAN_NOT_TESTED; |
| 219 | + message_not_tested(dev, cpu, status); |
| 220 | + } else { |
| 221 | + ifsd->status = SCAN_TEST_PASS; |
| 222 | + } |
| 223 | +} |
| 224 | + |
| 225 | +/* |
| 226 | + * Initiate per core test. It wakes up work queue threads on the target cpu and |
| 227 | + * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and |
| 228 | + * wait for all sibling threads to finish the scan test. |
| 229 | + */ |
| 230 | +int do_core_test(int cpu, struct device *dev) |
| 231 | +{ |
| 232 | + int ret = 0; |
| 233 | + |
| 234 | + /* Prevent CPUs from being taken offline during the scan test */ |
| 235 | + cpus_read_lock(); |
| 236 | + |
| 237 | + if (!cpu_online(cpu)) { |
| 238 | + dev_info(dev, "cannot test on the offline cpu %d\n", cpu); |
| 239 | + ret = -EINVAL; |
| 240 | + goto out; |
| 241 | + } |
| 242 | + |
| 243 | + ifs_test_core(cpu, dev); |
| 244 | +out: |
| 245 | + cpus_read_unlock(); |
| 246 | + return ret; |
| 247 | +} |
0 commit comments