1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright(c) 2022 Intel Corporation. */ 3 4 #include <linux/cpu.h> 5 #include <linux/delay.h> 6 #include <linux/fs.h> 7 #include <linux/nmi.h> 8 #include <linux/slab.h> 9 #include <linux/stop_machine.h> 10 11 #include "ifs.h" 12 13 /* 14 * Note all code and data in this file is protected by 15 * ifs_sem. On HT systems all threads on a core will 16 * execute together, but only the first thread on the 17 * core will update results of the test. 18 */ 19 20 #define CREATE_TRACE_POINTS 21 #include <trace/events/intel_ifs.h> 22 23 /* Max retries on the same chunk */ 24 #define MAX_IFS_RETRIES 5 25 26 struct run_params { 27 struct ifs_data *ifsd; 28 union ifs_scan *activate; 29 union ifs_status status; 30 }; 31 32 /* 33 * Number of TSC cycles that a logical CPU will wait for the other 34 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). 35 */ 36 #define IFS_THREAD_WAIT 100000 37 38 enum ifs_status_err_code { 39 IFS_NO_ERROR = 0, 40 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1, 41 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2, 42 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3, 43 IFS_INVALID_CHUNK_RANGE = 4, 44 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5, 45 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6, 46 IFS_UNASSIGNED_ERROR_CODE = 7, 47 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8, 48 IFS_INTERRUPTED_DURING_EXECUTION = 9, 49 IFS_UNASSIGNED_ERROR_CODE_0xA = 0xA, 50 IFS_CORRUPTED_CHUNK = 0xB, 51 }; 52 53 static const char * const scan_test_status[] = { 54 [IFS_NO_ERROR] = "SCAN no error", 55 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.", 56 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.", 57 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] = 58 "Core Abort SCAN Response due to power management condition.", 59 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range", 60 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.", 61 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently", 62 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7", 63 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] = 64 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently", 65 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start", 66 [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA", 67 [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading", 68 }; 69 70 static void message_not_tested(struct device *dev, int cpu, union ifs_status status) 71 { 72 struct ifs_data *ifsd = ifs_get_data(dev); 73 74 /* 75 * control_error is set when the microcode runs into a problem 76 * loading the image from the reserved BIOS memory, or it has 77 * been corrupted. Reloading the image may fix this issue. 78 */ 79 if (status.control_error) { 80 dev_warn(dev, "CPU(s) %*pbl: Scan controller error. Batch: %02x version: 0x%x\n", 81 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 82 return; 83 } 84 85 if (status.error_code < ARRAY_SIZE(scan_test_status)) { 86 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n", 87 cpumask_pr_args(cpu_smt_mask(cpu)), 88 scan_test_status[status.error_code]); 89 } else if (status.error_code == IFS_SW_TIMEOUT) { 90 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n", 91 cpumask_pr_args(cpu_smt_mask(cpu))); 92 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) { 93 dev_info(dev, "CPU(s) %*pbl: %s\n", 94 cpumask_pr_args(cpu_smt_mask(cpu)), 95 "Not all scan chunks were executed. Maximum forward progress retries exceeded"); 96 } else { 97 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n", 98 cpumask_pr_args(cpu_smt_mask(cpu)), status.data); 99 } 100 } 101 102 static void message_fail(struct device *dev, int cpu, union ifs_status status) 103 { 104 struct ifs_data *ifsd = ifs_get_data(dev); 105 106 /* 107 * signature_error is set when the output from the scan chains does not 108 * match the expected signature. This might be a transient problem (e.g. 109 * due to a bit flip from an alpha particle or neutron). If the problem 110 * repeats on a subsequent test, then it indicates an actual problem in 111 * the core being tested. 112 */ 113 if (status.signature_error) { 114 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n", 115 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version); 116 } 117 } 118 119 static bool can_restart(union ifs_status status) 120 { 121 enum ifs_status_err_code err_code = status.error_code; 122 123 /* Signature for chunk is bad, or scan test failed */ 124 if (status.signature_error || status.control_error) 125 return false; 126 127 switch (err_code) { 128 case IFS_NO_ERROR: 129 case IFS_OTHER_THREAD_COULD_NOT_JOIN: 130 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS: 131 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN: 132 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT: 133 case IFS_INTERRUPTED_DURING_EXECUTION: 134 return true; 135 case IFS_INVALID_CHUNK_RANGE: 136 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS: 137 case IFS_CORE_NOT_CAPABLE_CURRENTLY: 138 case IFS_UNASSIGNED_ERROR_CODE: 139 case IFS_UNASSIGNED_ERROR_CODE_0xA: 140 case IFS_CORRUPTED_CHUNK: 141 break; 142 } 143 return false; 144 } 145 146 #define SPINUNIT 100 /* 100 nsec */ 147 static atomic_t array_cpus_in; 148 static atomic_t scan_cpus_in; 149 150 /* 151 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus() 152 */ 153 static void wait_for_sibling_cpu(atomic_t *t, long long timeout) 154 { 155 int cpu = smp_processor_id(); 156 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 157 int all_cpus = cpumask_weight(smt_mask); 158 159 atomic_inc(t); 160 while (atomic_read(t) < all_cpus) { 161 if (timeout < SPINUNIT) 162 return; 163 ndelay(SPINUNIT); 164 timeout -= SPINUNIT; 165 touch_nmi_watchdog(); 166 } 167 } 168 169 /* 170 * Execute the scan. Called "simultaneously" on all threads of a core 171 * at high priority using the stop_cpus mechanism. 172 */ 173 static int doscan(void *data) 174 { 175 int cpu = smp_processor_id(), start, stop; 176 struct run_params *params = data; 177 union ifs_status status; 178 struct ifs_data *ifsd; 179 int first; 180 181 ifsd = params->ifsd; 182 183 if (ifsd->generation) { 184 start = params->activate->gen2.start; 185 stop = params->activate->gen2.stop; 186 } else { 187 start = params->activate->gen0.start; 188 stop = params->activate->gen0.stop; 189 } 190 191 /* Only the first logical CPU on a core reports result */ 192 first = cpumask_first(cpu_smt_mask(cpu)); 193 194 wait_for_sibling_cpu(&scan_cpus_in, NSEC_PER_SEC); 195 196 /* 197 * This WRMSR will wait for other HT threads to also write 198 * to this MSR (at most for activate.delay cycles). Then it 199 * starts scan of each requested chunk. The core scan happens 200 * during the "execution" of the WRMSR. This instruction can 201 * take up to 200 milliseconds (in the case where all chunks 202 * are processed in a single pass) before it retires. 203 */ 204 wrmsrl(MSR_ACTIVATE_SCAN, params->activate->data); 205 rdmsrl(MSR_SCAN_STATUS, status.data); 206 207 trace_ifs_status(ifsd->cur_batch, start, stop, status.data); 208 209 /* Pass back the result of the scan */ 210 if (cpu == first) 211 params->status = status; 212 213 return 0; 214 } 215 216 /* 217 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN 218 * on all threads of the core to be tested. Loop if necessary to complete 219 * run of all chunks. Include some defensive tests to make sure forward 220 * progress is made, and that the whole test completes in a reasonable time. 221 */ 222 static void ifs_test_core(int cpu, struct device *dev) 223 { 224 union ifs_status status = {}; 225 union ifs_scan activate; 226 unsigned long timeout; 227 struct ifs_data *ifsd; 228 int to_start, to_stop; 229 int status_chunk; 230 struct run_params params; 231 int retries; 232 233 ifsd = ifs_get_data(dev); 234 235 activate.gen0.rsvd = 0; 236 activate.delay = IFS_THREAD_WAIT; 237 activate.sigmce = 0; 238 to_start = 0; 239 to_stop = ifsd->valid_chunks - 1; 240 241 params.ifsd = ifs_get_data(dev); 242 243 if (ifsd->generation) { 244 activate.gen2.start = to_start; 245 activate.gen2.stop = to_stop; 246 } else { 247 activate.gen0.start = to_start; 248 activate.gen0.stop = to_stop; 249 } 250 251 timeout = jiffies + HZ / 2; 252 retries = MAX_IFS_RETRIES; 253 254 while (to_start <= to_stop) { 255 if (time_after(jiffies, timeout)) { 256 status.error_code = IFS_SW_TIMEOUT; 257 break; 258 } 259 260 params.activate = &activate; 261 atomic_set(&scan_cpus_in, 0); 262 stop_core_cpuslocked(cpu, doscan, ¶ms); 263 264 status = params.status; 265 266 /* Some cases can be retried, give up for others */ 267 if (!can_restart(status)) 268 break; 269 270 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num; 271 if (status_chunk == to_start) { 272 /* Check for forward progress */ 273 if (--retries == 0) { 274 if (status.error_code == IFS_NO_ERROR) 275 status.error_code = IFS_SW_PARTIAL_COMPLETION; 276 break; 277 } 278 } else { 279 retries = MAX_IFS_RETRIES; 280 if (ifsd->generation) 281 activate.gen2.start = status_chunk; 282 else 283 activate.gen0.start = status_chunk; 284 to_start = status_chunk; 285 } 286 } 287 288 /* Update status for this core */ 289 ifsd->scan_details = status.data; 290 291 if (status.signature_error) { 292 ifsd->status = SCAN_TEST_FAIL; 293 message_fail(dev, cpu, status); 294 } else if (status.control_error || status.error_code) { 295 ifsd->status = SCAN_NOT_TESTED; 296 message_not_tested(dev, cpu, status); 297 } else { 298 ifsd->status = SCAN_TEST_PASS; 299 } 300 } 301 302 static int do_array_test(void *data) 303 { 304 union ifs_array *command = data; 305 int cpu = smp_processor_id(); 306 int first; 307 308 wait_for_sibling_cpu(&array_cpus_in, NSEC_PER_SEC); 309 310 /* 311 * Only one logical CPU on a core needs to trigger the Array test via MSR write. 312 */ 313 first = cpumask_first(cpu_smt_mask(cpu)); 314 315 if (cpu == first) { 316 wrmsrl(MSR_ARRAY_BIST, command->data); 317 /* Pass back the result of the test */ 318 rdmsrl(MSR_ARRAY_BIST, command->data); 319 } 320 321 return 0; 322 } 323 324 static void ifs_array_test_core(int cpu, struct device *dev) 325 { 326 union ifs_array command = {}; 327 bool timed_out = false; 328 struct ifs_data *ifsd; 329 unsigned long timeout; 330 331 ifsd = ifs_get_data(dev); 332 333 command.array_bitmask = ~0U; 334 timeout = jiffies + HZ / 2; 335 336 do { 337 if (time_after(jiffies, timeout)) { 338 timed_out = true; 339 break; 340 } 341 atomic_set(&array_cpus_in, 0); 342 stop_core_cpuslocked(cpu, do_array_test, &command); 343 344 if (command.ctrl_result) 345 break; 346 } while (command.array_bitmask); 347 348 ifsd->scan_details = command.data; 349 350 if (command.ctrl_result) 351 ifsd->status = SCAN_TEST_FAIL; 352 else if (timed_out || command.array_bitmask) 353 ifsd->status = SCAN_NOT_TESTED; 354 else 355 ifsd->status = SCAN_TEST_PASS; 356 } 357 358 #define ARRAY_GEN1_TEST_ALL_ARRAYS 0x0ULL 359 #define ARRAY_GEN1_STATUS_FAIL 0x1ULL 360 361 static int do_array_test_gen1(void *status) 362 { 363 int cpu = smp_processor_id(); 364 int first; 365 366 first = cpumask_first(cpu_smt_mask(cpu)); 367 368 if (cpu == first) { 369 wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS); 370 rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status)); 371 } 372 373 return 0; 374 } 375 376 static void ifs_array_test_gen1(int cpu, struct device *dev) 377 { 378 struct ifs_data *ifsd = ifs_get_data(dev); 379 u64 status = 0; 380 381 stop_core_cpuslocked(cpu, do_array_test_gen1, &status); 382 ifsd->scan_details = status; 383 384 if (status & ARRAY_GEN1_STATUS_FAIL) 385 ifsd->status = SCAN_TEST_FAIL; 386 else 387 ifsd->status = SCAN_TEST_PASS; 388 } 389 390 /* 391 * Initiate per core test. It wakes up work queue threads on the target cpu and 392 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and 393 * wait for all sibling threads to finish the scan test. 394 */ 395 int do_core_test(int cpu, struct device *dev) 396 { 397 const struct ifs_test_caps *test = ifs_get_test_caps(dev); 398 struct ifs_data *ifsd = ifs_get_data(dev); 399 int ret = 0; 400 401 /* Prevent CPUs from being taken offline during the scan test */ 402 cpus_read_lock(); 403 404 if (!cpu_online(cpu)) { 405 dev_info(dev, "cannot test on the offline cpu %d\n", cpu); 406 ret = -EINVAL; 407 goto out; 408 } 409 410 switch (test->test_num) { 411 case IFS_TYPE_SAF: 412 if (!ifsd->loaded) 413 ret = -EPERM; 414 else 415 ifs_test_core(cpu, dev); 416 break; 417 case IFS_TYPE_ARRAY_BIST: 418 if (ifsd->array_gen == ARRAY_GEN0) 419 ifs_array_test_core(cpu, dev); 420 else 421 ifs_array_test_gen1(cpu, dev); 422 break; 423 default: 424 ret = -EINVAL; 425 } 426 out: 427 cpus_read_unlock(); 428 return ret; 429 } 430