xref: /linux/drivers/platform/x86/intel/ifs/runtest.c (revision b00f7f4f8e936da55f2e6c7fd96391ef54c145fc)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. */
3 
4 #include <linux/cpu.h>
5 #include <linux/delay.h>
6 #include <linux/fs.h>
7 #include <linux/nmi.h>
8 #include <linux/slab.h>
9 #include <linux/stop_machine.h>
10 
11 #include "ifs.h"
12 
/*
 * Note all code and data in this file are protected by
 * ifs_sem. On HT systems all threads on a core will
 * execute together, but only the first thread on the
 * core will update results of the test.
 */
19 
20 #define CREATE_TRACE_POINTS
21 #include <trace/events/intel_ifs.h>
22 
/*
 * Number of consecutive scan attempts allowed to stop on the same chunk
 * before the test gives up for lack of forward progress.
 */
#define MAX_IFS_RETRIES  5
25 
26 struct run_params {
27 	struct ifs_data *ifsd;
28 	union ifs_scan *activate;
29 	union ifs_status status;
30 };
31 
/*
 * How long, in TSC cycles, one logical CPU waits inside
 * WRMSR(ACTIVATE_SCAN) for the other logical CPU of the same core.
 */
#define IFS_THREAD_WAIT 100000
37 
/*
 * Values of the error_code field of the scan status. Each value is also
 * the index of its human readable description in scan_test_status[].
 */
enum ifs_status_err_code {
	IFS_NO_ERROR				= 0x0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 0x1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 0x2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 0x3,
	IFS_INVALID_CHUNK_RANGE			= 0x4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 0x5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 0x6,
	IFS_UNASSIGNED_ERROR_CODE		= 0x7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 0x8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 0x9,
	IFS_UNASSIGNED_ERROR_CODE_0xA		= 0xA,
	IFS_CORRUPTED_CHUNK			= 0xB,
};
52 
53 static const char * const scan_test_status[] = {
54 	[IFS_NO_ERROR] = "SCAN no error",
55 	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
56 	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
57 	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
58 	"Core Abort SCAN Response due to power management condition.",
59 	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
60 	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
61 	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
62 	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
63 	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
64 	"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
65 	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
66 	[IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA",
67 	[IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading",
68 };
69 
70 static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
71 {
72 	struct ifs_data *ifsd = ifs_get_data(dev);
73 
74 	/*
75 	 * control_error is set when the microcode runs into a problem
76 	 * loading the image from the reserved BIOS memory, or it has
77 	 * been corrupted. Reloading the image may fix this issue.
78 	 */
79 	if (status.control_error) {
80 		dev_warn(dev, "CPU(s) %*pbl: Scan controller error. Batch: %02x version: 0x%x\n",
81 			 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
82 		return;
83 	}
84 
85 	if (status.error_code < ARRAY_SIZE(scan_test_status)) {
86 		dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
87 			 cpumask_pr_args(cpu_smt_mask(cpu)),
88 			 scan_test_status[status.error_code]);
89 	} else if (status.error_code == IFS_SW_TIMEOUT) {
90 		dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
91 			 cpumask_pr_args(cpu_smt_mask(cpu)));
92 	} else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
93 		dev_info(dev, "CPU(s) %*pbl: %s\n",
94 			 cpumask_pr_args(cpu_smt_mask(cpu)),
95 			 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
96 	} else {
97 		dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
98 			 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
99 	}
100 }
101 
102 static void message_fail(struct device *dev, int cpu, union ifs_status status)
103 {
104 	struct ifs_data *ifsd = ifs_get_data(dev);
105 
106 	/*
107 	 * signature_error is set when the output from the scan chains does not
108 	 * match the expected signature. This might be a transient problem (e.g.
109 	 * due to a bit flip from an alpha particle or neutron). If the problem
110 	 * repeats on a subsequent test, then it indicates an actual problem in
111 	 * the core being tested.
112 	 */
113 	if (status.signature_error) {
114 		dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
115 			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
116 	}
117 }
118 
119 static bool can_restart(union ifs_status status)
120 {
121 	enum ifs_status_err_code err_code = status.error_code;
122 
123 	/* Signature for chunk is bad, or scan test failed */
124 	if (status.signature_error || status.control_error)
125 		return false;
126 
127 	switch (err_code) {
128 	case IFS_NO_ERROR:
129 	case IFS_OTHER_THREAD_COULD_NOT_JOIN:
130 	case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
131 	case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
132 	case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
133 	case IFS_INTERRUPTED_DURING_EXECUTION:
134 		return true;
135 	case IFS_INVALID_CHUNK_RANGE:
136 	case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
137 	case IFS_CORE_NOT_CAPABLE_CURRENTLY:
138 	case IFS_UNASSIGNED_ERROR_CODE:
139 	case IFS_UNASSIGNED_ERROR_CODE_0xA:
140 	case IFS_CORRUPTED_CHUNK:
141 		break;
142 	}
143 	return false;
144 }
145 
146 #define SPINUNIT 100 /* 100 nsec */
147 static atomic_t array_cpus_in;
148 static atomic_t scan_cpus_in;
149 
150 /*
151  * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
152  */
153 static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
154 {
155 	int cpu = smp_processor_id();
156 	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
157 	int all_cpus = cpumask_weight(smt_mask);
158 
159 	atomic_inc(t);
160 	while (atomic_read(t) < all_cpus) {
161 		if (timeout < SPINUNIT)
162 			return;
163 		ndelay(SPINUNIT);
164 		timeout -= SPINUNIT;
165 		touch_nmi_watchdog();
166 	}
167 }
168 
169 /*
170  * Execute the scan. Called "simultaneously" on all threads of a core
171  * at high priority using the stop_cpus mechanism.
172  */
173 static int doscan(void *data)
174 {
175 	int cpu = smp_processor_id(), start, stop;
176 	struct run_params *params = data;
177 	union ifs_status status;
178 	struct ifs_data *ifsd;
179 	int first;
180 
181 	ifsd = params->ifsd;
182 
183 	if (ifsd->generation) {
184 		start = params->activate->gen2.start;
185 		stop = params->activate->gen2.stop;
186 	} else {
187 		start = params->activate->gen0.start;
188 		stop = params->activate->gen0.stop;
189 	}
190 
191 	/* Only the first logical CPU on a core reports result */
192 	first = cpumask_first(cpu_smt_mask(cpu));
193 
194 	wait_for_sibling_cpu(&scan_cpus_in, NSEC_PER_SEC);
195 
196 	/*
197 	 * This WRMSR will wait for other HT threads to also write
198 	 * to this MSR (at most for activate.delay cycles). Then it
199 	 * starts scan of each requested chunk. The core scan happens
200 	 * during the "execution" of the WRMSR. This instruction can
201 	 * take up to 200 milliseconds (in the case where all chunks
202 	 * are processed in a single pass) before it retires.
203 	 */
204 	wrmsrl(MSR_ACTIVATE_SCAN, params->activate->data);
205 	rdmsrl(MSR_SCAN_STATUS, status.data);
206 
207 	trace_ifs_status(ifsd->cur_batch, start, stop, status.data);
208 
209 	/* Pass back the result of the scan */
210 	if (cpu == first)
211 		params->status = status;
212 
213 	return 0;
214 }
215 
216 /*
217  * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
218  * on all threads of the core to be tested. Loop if necessary to complete
219  * run of all chunks. Include some defensive tests to make sure forward
220  * progress is made, and that the whole test completes in a reasonable time.
221  */
222 static void ifs_test_core(int cpu, struct device *dev)
223 {
224 	union ifs_status status = {};
225 	union ifs_scan activate;
226 	unsigned long timeout;
227 	struct ifs_data *ifsd;
228 	int to_start, to_stop;
229 	int status_chunk;
230 	struct run_params params;
231 	int retries;
232 
233 	ifsd = ifs_get_data(dev);
234 
235 	activate.gen0.rsvd = 0;
236 	activate.delay = IFS_THREAD_WAIT;
237 	activate.sigmce = 0;
238 	to_start = 0;
239 	to_stop = ifsd->valid_chunks - 1;
240 
241 	params.ifsd = ifs_get_data(dev);
242 
243 	if (ifsd->generation) {
244 		activate.gen2.start = to_start;
245 		activate.gen2.stop = to_stop;
246 	} else {
247 		activate.gen0.start = to_start;
248 		activate.gen0.stop = to_stop;
249 	}
250 
251 	timeout = jiffies + HZ / 2;
252 	retries = MAX_IFS_RETRIES;
253 
254 	while (to_start <= to_stop) {
255 		if (time_after(jiffies, timeout)) {
256 			status.error_code = IFS_SW_TIMEOUT;
257 			break;
258 		}
259 
260 		params.activate = &activate;
261 		atomic_set(&scan_cpus_in, 0);
262 		stop_core_cpuslocked(cpu, doscan, &params);
263 
264 		status = params.status;
265 
266 		/* Some cases can be retried, give up for others */
267 		if (!can_restart(status))
268 			break;
269 
270 		status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
271 		if (status_chunk == to_start) {
272 			/* Check for forward progress */
273 			if (--retries == 0) {
274 				if (status.error_code == IFS_NO_ERROR)
275 					status.error_code = IFS_SW_PARTIAL_COMPLETION;
276 				break;
277 			}
278 		} else {
279 			retries = MAX_IFS_RETRIES;
280 			if (ifsd->generation)
281 				activate.gen2.start = status_chunk;
282 			else
283 				activate.gen0.start = status_chunk;
284 			to_start = status_chunk;
285 		}
286 	}
287 
288 	/* Update status for this core */
289 	ifsd->scan_details = status.data;
290 
291 	if (status.signature_error) {
292 		ifsd->status = SCAN_TEST_FAIL;
293 		message_fail(dev, cpu, status);
294 	} else if (status.control_error || status.error_code) {
295 		ifsd->status = SCAN_NOT_TESTED;
296 		message_not_tested(dev, cpu, status);
297 	} else {
298 		ifsd->status = SCAN_TEST_PASS;
299 	}
300 }
301 
302 static int do_array_test(void *data)
303 {
304 	union ifs_array *command = data;
305 	int cpu = smp_processor_id();
306 	int first;
307 
308 	wait_for_sibling_cpu(&array_cpus_in, NSEC_PER_SEC);
309 
310 	/*
311 	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
312 	 */
313 	first = cpumask_first(cpu_smt_mask(cpu));
314 
315 	if (cpu == first) {
316 		wrmsrl(MSR_ARRAY_BIST, command->data);
317 		/* Pass back the result of the test */
318 		rdmsrl(MSR_ARRAY_BIST, command->data);
319 	}
320 
321 	return 0;
322 }
323 
324 static void ifs_array_test_core(int cpu, struct device *dev)
325 {
326 	union ifs_array command = {};
327 	bool timed_out = false;
328 	struct ifs_data *ifsd;
329 	unsigned long timeout;
330 
331 	ifsd = ifs_get_data(dev);
332 
333 	command.array_bitmask = ~0U;
334 	timeout = jiffies + HZ / 2;
335 
336 	do {
337 		if (time_after(jiffies, timeout)) {
338 			timed_out = true;
339 			break;
340 		}
341 		atomic_set(&array_cpus_in, 0);
342 		stop_core_cpuslocked(cpu, do_array_test, &command);
343 
344 		if (command.ctrl_result)
345 			break;
346 	} while (command.array_bitmask);
347 
348 	ifsd->scan_details = command.data;
349 
350 	if (command.ctrl_result)
351 		ifsd->status = SCAN_TEST_FAIL;
352 	else if (timed_out || command.array_bitmask)
353 		ifsd->status = SCAN_NOT_TESTED;
354 	else
355 		ifsd->status = SCAN_TEST_PASS;
356 }
357 
/* Value written to MSR_ARRAY_TRIGGER by do_array_test_gen1() */
#define ARRAY_GEN1_TEST_ALL_ARRAYS	0x0ULL
/* Bit of MSR_ARRAY_STATUS that ifs_array_test_gen1() treats as a failure */
#define ARRAY_GEN1_STATUS_FAIL		0x1ULL
360 
361 static int do_array_test_gen1(void *status)
362 {
363 	int cpu = smp_processor_id();
364 	int first;
365 
366 	first = cpumask_first(cpu_smt_mask(cpu));
367 
368 	if (cpu == first) {
369 		wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS);
370 		rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status));
371 	}
372 
373 	return 0;
374 }
375 
376 static void ifs_array_test_gen1(int cpu, struct device *dev)
377 {
378 	struct ifs_data *ifsd = ifs_get_data(dev);
379 	u64 status = 0;
380 
381 	stop_core_cpuslocked(cpu, do_array_test_gen1, &status);
382 	ifsd->scan_details = status;
383 
384 	if (status & ARRAY_GEN1_STATUS_FAIL)
385 		ifsd->status = SCAN_TEST_FAIL;
386 	else
387 		ifsd->status = SCAN_TEST_PASS;
388 }
389 
390 /*
391  * Initiate per core test. It wakes up work queue threads on the target cpu and
392  * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
393  * wait for all sibling threads to finish the scan test.
394  */
395 int do_core_test(int cpu, struct device *dev)
396 {
397 	const struct ifs_test_caps *test = ifs_get_test_caps(dev);
398 	struct ifs_data *ifsd = ifs_get_data(dev);
399 	int ret = 0;
400 
401 	/* Prevent CPUs from being taken offline during the scan test */
402 	cpus_read_lock();
403 
404 	if (!cpu_online(cpu)) {
405 		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
406 		ret = -EINVAL;
407 		goto out;
408 	}
409 
410 	switch (test->test_num) {
411 	case IFS_TYPE_SAF:
412 		if (!ifsd->loaded)
413 			ret = -EPERM;
414 		else
415 			ifs_test_core(cpu, dev);
416 		break;
417 	case IFS_TYPE_ARRAY_BIST:
418 		if (ifsd->array_gen == ARRAY_GEN0)
419 			ifs_array_test_core(cpu, dev);
420 		else
421 			ifs_array_test_gen1(cpu, dev);
422 		break;
423 	default:
424 		ret = -EINVAL;
425 	}
426 out:
427 	cpus_read_unlock();
428 	return ret;
429 }
430