1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2022 Intel Corporation. */ 3 4 #include <linux/cpu.h> 5 #include <linux/delay.h> 6 #include <linux/fs.h> 7 #include <linux/nmi.h> 8 #include <linux/slab.h> 9 #include <linux/stop_machine.h> 10 11 #include "ifs.h" 12 13 /* 14 * Note all code and data in this file is protected by 15 * ifs_sem. On HT systems all threads on a core will 16 * execute together, but only the first thread on the 17 * core will update results of the test. 18 */ 19 20 #define CREATE_TRACE_POINTS 21 #include <trace/events/intel_ifs.h> 22 23 /* Max retries on the same chunk */ 24 #define MAX_IFS_RETRIES 5 25 26 /* 27 * Number of TSC cycles that a logical CPU will wait for the other 28 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). 29 */ 30 #define IFS_THREAD_WAIT 100000 31 32 enum ifs_status_err_code { 33 IFS_NO_ERROR = 0, 34 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, 35 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, 36 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, 37 IFS_INVALID_CHUNK_RANGE = 4, 38 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, 39 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, 40 IFS_UNASSIGNED_ERROR_CODE = 7, 41 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, 42 IFS_INTERRUPTED_DURING_EXECUTION = 9, 43 IFS_UNASSIGNED_ERROR_CODE_0xA = 0xA, 44 IFS_CORRUPTED_CHUNK = 0xB, 45 }; 46 47 static const char * const scan_test_status[] = { 48 [IFS_NO_ERROR] = "SCAN no error", 49 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.", 50 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.", 51 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = 52 "Core Abort SCAN Response due to power management condition.", 53 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range", 54 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.", 55 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently", 56 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7", 57 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = 58 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently", 59 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start", 60 [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA", 61 [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading", 62 }; 63 64 static void message_not_tested(struct device *dev, int cpu, union ifs_status status) 65 { 66 if (status.error_code < ARRAY_SIZE(scan_test_status)) { 67 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", 68 cpumask_pr_args(cpu_smt_mask(cpu)), 69 scan_test_status[status.error_code]); 70 } else if (status.error_code == IFS_SW_TIMEOUT) { 71 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", 72 cpumask_pr_args(cpu_smt_mask(cpu))); 73 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { 74 dev_info(dev, "CPU(s) %*pbl: %s\n", 75 cpumask_pr_args(cpu_smt_mask(cpu)), 76 "Not all scan chunks were executed. Maximum forward progress retries exceeded"); 77 } else { 78 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", 79 cpumask_pr_args(cpu_smt_mask(cpu)), status.data); 80 } 81 } 82 83 static void message_fail(struct device *dev, int cpu, union ifs_status status) 84 { 85 struct ifs_data *ifsd = ifs_get_data(dev); 86 87 /* 88 * control_error is set when the microcode runs into a problem 89 * loading the image from the reserved BIOS memory, or it has 90 * been corrupted. Reloading the image may fix this issue. 91 */ 92 if (status.control_error) { 93 dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n", 94 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 95 } 96 97 /* 98 * signature_error is set when the output from the scan chains does not 99 * match the expected signature. This might be a transient problem (e.g. 100 * due to a bit flip from an alpha particle or neutron). If the problem 101 * repeats on a subsequent test, then it indicates an actual problem in 102 * the core being tested. 103 */ 104 if (status.signature_error) { 105 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n", 106 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 107 } 108 } 109 110 static bool can_restart(union ifs_status status) 111 { 112 enum ifs_status_err_code err_code = status.error_code; 113 114 /* Signature for chunk is bad, or scan test failed */ 115 if (status.signature_error || status.control_error) 116 return false; 117 118 switch (err_code) { 119 case IFS_NO_ERROR: 120 case IFS_OTHER_THREAD_COULD_NOT_JOIN: 121 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: 122 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: 123 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: 124 case IFS_INTERRUPTED_DURING_EXECUTION: 125 return true; 126 case IFS_INVALID_CHUNK_RANGE: 127 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: 128 case IFS_CORE_NOT_CAPABLE_CURRENTLY: 129 case IFS_UNASSIGNED_ERROR_CODE: 130 case IFS_UNASSIGNED_ERROR_CODE_0xA: 131 case IFS_CORRUPTED_CHUNK: 132 break; 133 } 134 return false; 135 } 136 137 /* 138 * Execute the scan. Called "simultaneously" on all threads of a core 139 * at high priority using the stop_cpus mechanism. 140 */ 141 static int doscan(void *data) 142 { 143 int cpu = smp_processor_id(); 144 u64 *msrs = data; 145 int first; 146 147 /* Only the first logical CPU on a core reports result */ 148 first = cpumask_first(cpu_smt_mask(cpu)); 149 150 /* 151 * This WRMSR will wait for other HT threads to also write 152 * to this MSR (at most for activate.delay cycles). Then it 153 * starts scan of each requested chunk. The core scan happens 154 * during the "execution" of the WRMSR. This instruction can 155 * take up to 200 milliseconds (in the case where all chunks 156 * are processed in a single pass) before it retires. 157 */ 158 wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]); 159 160 if (cpu == first) { 161 /* Pass back the result of the scan */ 162 rdmsrl(MSR_SCAN_STATUS, msrs[1]); 163 } 164 165 return 0; 166 } 167 168 /* 169 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN 170 * on all threads of the core to be tested. Loop if necessary to complete 171 * run of all chunks. Include some defensive tests to make sure forward 172 * progress is made, and that the whole test completes in a reasonable time. 173 */ 174 static void ifs_test_core(int cpu, struct device *dev) 175 { 176 union ifs_scan activate; 177 union ifs_status status; 178 unsigned long timeout; 179 struct ifs_data *ifsd; 180 int to_start, to_stop; 181 int status_chunk; 182 u64 msrvals[2]; 183 int retries; 184 185 ifsd = ifs_get_data(dev); 186 187 activate.gen0.rsvd = 0; 188 activate.delay = IFS_THREAD_WAIT; 189 activate.sigmce = 0; 190 to_start = 0; 191 to_stop = ifsd->valid_chunks - 1; 192 193 if (ifsd->generation) { 194 activate.gen2.start = to_start; 195 activate.gen2.stop = to_stop; 196 } else { 197 activate.gen0.start = to_start; 198 activate.gen0.stop = to_stop; 199 } 200 201 timeout = jiffies + HZ / 2; 202 retries = MAX_IFS_RETRIES; 203 204 while (to_start <= to_stop) { 205 if (time_after(jiffies, timeout)) { 206 status.error_code = IFS_SW_TIMEOUT; 207 break; 208 } 209 210 msrvals[0] = activate.data; 211 stop_core_cpuslocked(cpu, doscan, msrvals); 212 213 status.data = msrvals[1]; 214 215 trace_ifs_status(cpu, to_start, to_stop, status.data); 216 217 /* Some cases can be retried, give up for others */ 218 if (!can_restart(status)) 219 break; 220 221 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num; 222 if (status_chunk == to_start) { 223 /* Check for forward progress */ 224 if (--retries == 0) { 225 if (status.error_code == IFS_NO_ERROR) 226 status.error_code = IFS_SW_PARTIAL_COMPLETION; 227 break; 228 } 229 } else { 230 retries = MAX_IFS_RETRIES; 231 if (ifsd->generation) 232 activate.gen2.start = status_chunk; 233 else 234 activate.gen0.start = status_chunk; 235 to_start = status_chunk; 236 } 237 } 238 239 /* Update status for this core */ 240 ifsd->scan_details = status.data; 241 242 if (status.control_error || status.signature_error) { 243 ifsd->status = SCAN_TEST_FAIL; 244 message_fail(dev, cpu, status); 245 } else if (status.error_code) { 246 ifsd->status = SCAN_NOT_TESTED; 247 message_not_tested(dev, cpu, status); 248 } else { 249 ifsd->status = SCAN_TEST_PASS; 250 } 251 } 252 253 #define SPINUNIT 100 /* 100 nsec */ 254 static atomic_t array_cpus_out; 255 256 /* 257 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus() 258 */ 259 static void wait_for_sibling_cpu(atomic_t *t, long long timeout) 260 { 261 int cpu = smp_processor_id(); 262 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 263 int all_cpus = cpumask_weight(smt_mask); 264 265 atomic_inc(t); 266 while (atomic_read(t) < all_cpus) { 267 if (timeout < SPINUNIT) 268 return; 269 ndelay(SPINUNIT); 270 timeout -= SPINUNIT; 271 touch_nmi_watchdog(); 272 } 273 } 274 275 static int do_array_test(void *data) 276 { 277 union ifs_array *command = data; 278 int cpu = smp_processor_id(); 279 int first; 280 281 /* 282 * Only one logical CPU on a core needs to trigger the Array test via MSR write. 283 */ 284 first = cpumask_first(cpu_smt_mask(cpu)); 285 286 if (cpu == first) { 287 wrmsrl(MSR_ARRAY_BIST, command->data); 288 /* Pass back the result of the test */ 289 rdmsrl(MSR_ARRAY_BIST, command->data); 290 } 291 292 /* Tests complete faster if the sibling is spinning here */ 293 wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC); 294 295 return 0; 296 } 297 298 static void ifs_array_test_core(int cpu, struct device *dev) 299 { 300 union ifs_array command = {}; 301 bool timed_out = false; 302 struct ifs_data *ifsd; 303 unsigned long timeout; 304 305 ifsd = ifs_get_data(dev); 306 307 command.array_bitmask = ~0U; 308 timeout = jiffies + HZ / 2; 309 310 do { 311 if (time_after(jiffies, timeout)) { 312 timed_out = true; 313 break; 314 } 315 atomic_set(&array_cpus_out, 0); 316 stop_core_cpuslocked(cpu, do_array_test, &command); 317 318 if (command.ctrl_result) 319 break; 320 } while (command.array_bitmask); 321 322 ifsd->scan_details = command.data; 323 324 if (command.ctrl_result) 325 ifsd->status = SCAN_TEST_FAIL; 326 else if (timed_out || command.array_bitmask) 327 ifsd->status = SCAN_NOT_TESTED; 328 else 329 ifsd->status = SCAN_TEST_PASS; 330 } 331 332 #define ARRAY_GEN1_TEST_ALL_ARRAYS 0x0ULL 333 #define ARRAY_GEN1_STATUS_FAIL 0x1ULL 334 335 static int do_array_test_gen1(void *status) 336 { 337 int cpu = smp_processor_id(); 338 int first; 339 340 first = cpumask_first(cpu_smt_mask(cpu)); 341 342 if (cpu == first) { 343 wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS); 344 rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status)); 345 } 346 347 return 0; 348 } 349 350 static void ifs_array_test_gen1(int cpu, struct device *dev) 351 { 352 struct ifs_data *ifsd = ifs_get_data(dev); 353 u64 status = 0; 354 355 stop_core_cpuslocked(cpu, do_array_test_gen1, &status); 356 ifsd->scan_details = status; 357 358 if (status & ARRAY_GEN1_STATUS_FAIL) 359 ifsd->status = SCAN_TEST_FAIL; 360 else 361 ifsd->status = SCAN_TEST_PASS; 362 } 363 364 /* 365 * Initiate per core test. It wakes up work queue threads on the target cpu and 366 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and 367 * wait for all sibling threads to finish the scan test. 368 */ 369 int do_core_test(int cpu, struct device *dev) 370 { 371 const struct ifs_test_caps *test = ifs_get_test_caps(dev); 372 struct ifs_data *ifsd = ifs_get_data(dev); 373 int ret = 0; 374 375 /* Prevent CPUs from being taken offline during the scan test */ 376 cpus_read_lock(); 377 378 if (!cpu_online(cpu)) { 379 dev_info(dev, "cannot test on the offline cpu %d\n", cpu); 380 ret = -EINVAL; 381 goto out; 382 } 383 384 switch (test->test_num) { 385 case IFS_TYPE_SAF: 386 if (!ifsd->loaded) 387 ret = -EPERM; 388 else 389 ifs_test_core(cpu, dev); 390 break; 391 case IFS_TYPE_ARRAY_BIST: 392 if (ifsd->array_gen == ARRAY_GEN0) 393 ifs_array_test_core(cpu, dev); 394 else 395 ifs_array_test_gen1(cpu, dev); 396 break; 397 default: 398 ret = -EINVAL; 399 } 400 out: 401 cpus_read_unlock(); 402 return ret; 403 } 404