xref: /linux/drivers/platform/x86/intel/ifs/runtest.c (revision 223981db9bafb80f558162c148f261e2ff043dbe)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. */
3 
4 #include <linux/cpu.h>
5 #include <linux/delay.h>
6 #include <linux/fs.h>
7 #include <linux/nmi.h>
8 #include <linux/slab.h>
9 #include <linux/stop_machine.h>
10 
11 #include "ifs.h"
12 
13 /*
14  * Note all code and data in this file is protected by
15  * ifs_sem. On HT systems all threads on a core will
16  * execute together, but only the first thread on the
17  * core will update results of the test.
18  */
19 
20 #define CREATE_TRACE_POINTS
21 #include <trace/events/intel_ifs.h>
22 
/*
 * Max retries on the same chunk: ifs_test_core() gives up once this many
 * consecutive scan attempts fail to move past the current start chunk.
 */
#define MAX_IFS_RETRIES  5

/*
 * Number of TSC cycles that a logical CPU will wait for the other
 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN). ifs_test_core()
 * programs this value into the delay field of the activate command.
 */
#define IFS_THREAD_WAIT 100000
31 
/*
 * Values of the error_code field of the scan status. The values are kept
 * explicit because they index scan_test_status[] and are matched against
 * the status read back from the scan status MSR.
 */
enum ifs_status_err_code {
	/* Codes for which can_restart() allows another attempt */
	IFS_NO_ERROR				= 0x0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN		= 0x1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS	= 0x2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN	= 0x3,

	/* Codes that end the test */
	IFS_INVALID_CHUNK_RANGE			= 0x4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS	= 0x5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY		= 0x6,
	IFS_UNASSIGNED_ERROR_CODE		= 0x7,

	/* Retryable again */
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT	= 0x8,
	IFS_INTERRUPTED_DURING_EXECUTION	= 0x9,

	/* Codes that end the test */
	IFS_UNASSIGNED_ERROR_CODE_0xA		= 0xA,
	IFS_CORRUPTED_CHUNK			= 0xB,
};
46 
47 static const char * const scan_test_status[] = {
48 	[IFS_NO_ERROR] = "SCAN no error",
49 	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
50 	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
51 	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
52 	"Core Abort SCAN Response due to power management condition.",
53 	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
54 	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
55 	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
56 	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
57 	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
58 	"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
59 	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
60 	[IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA",
61 	[IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading",
62 };
63 
64 static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
65 {
66 	if (status.error_code < ARRAY_SIZE(scan_test_status)) {
67 		dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
68 			 cpumask_pr_args(cpu_smt_mask(cpu)),
69 			 scan_test_status[status.error_code]);
70 	} else if (status.error_code == IFS_SW_TIMEOUT) {
71 		dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
72 			 cpumask_pr_args(cpu_smt_mask(cpu)));
73 	} else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
74 		dev_info(dev, "CPU(s) %*pbl: %s\n",
75 			 cpumask_pr_args(cpu_smt_mask(cpu)),
76 			 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
77 	} else {
78 		dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
79 			 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
80 	}
81 }
82 
83 static void message_fail(struct device *dev, int cpu, union ifs_status status)
84 {
85 	struct ifs_data *ifsd = ifs_get_data(dev);
86 
87 	/*
88 	 * control_error is set when the microcode runs into a problem
89 	 * loading the image from the reserved BIOS memory, or it has
90 	 * been corrupted. Reloading the image may fix this issue.
91 	 */
92 	if (status.control_error) {
93 		dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
94 			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
95 	}
96 
97 	/*
98 	 * signature_error is set when the output from the scan chains does not
99 	 * match the expected signature. This might be a transient problem (e.g.
100 	 * due to a bit flip from an alpha particle or neutron). If the problem
101 	 * repeats on a subsequent test, then it indicates an actual problem in
102 	 * the core being tested.
103 	 */
104 	if (status.signature_error) {
105 		dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
106 			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
107 	}
108 }
109 
110 static bool can_restart(union ifs_status status)
111 {
112 	enum ifs_status_err_code err_code = status.error_code;
113 
114 	/* Signature for chunk is bad, or scan test failed */
115 	if (status.signature_error || status.control_error)
116 		return false;
117 
118 	switch (err_code) {
119 	case IFS_NO_ERROR:
120 	case IFS_OTHER_THREAD_COULD_NOT_JOIN:
121 	case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
122 	case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
123 	case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
124 	case IFS_INTERRUPTED_DURING_EXECUTION:
125 		return true;
126 	case IFS_INVALID_CHUNK_RANGE:
127 	case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
128 	case IFS_CORE_NOT_CAPABLE_CURRENTLY:
129 	case IFS_UNASSIGNED_ERROR_CODE:
130 	case IFS_UNASSIGNED_ERROR_CODE_0xA:
131 	case IFS_CORRUPTED_CHUNK:
132 		break;
133 	}
134 	return false;
135 }
136 
137 /*
138  * Execute the scan. Called "simultaneously" on all threads of a core
139  * at high priority using the stop_cpus mechanism.
140  */
141 static int doscan(void *data)
142 {
143 	int cpu = smp_processor_id();
144 	u64 *msrs = data;
145 	int first;
146 
147 	/* Only the first logical CPU on a core reports result */
148 	first = cpumask_first(cpu_smt_mask(cpu));
149 
150 	/*
151 	 * This WRMSR will wait for other HT threads to also write
152 	 * to this MSR (at most for activate.delay cycles). Then it
153 	 * starts scan of each requested chunk. The core scan happens
154 	 * during the "execution" of the WRMSR. This instruction can
155 	 * take up to 200 milliseconds (in the case where all chunks
156 	 * are processed in a single pass) before it retires.
157 	 */
158 	wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
159 
160 	if (cpu == first) {
161 		/* Pass back the result of the scan */
162 		rdmsrl(MSR_SCAN_STATUS, msrs[1]);
163 	}
164 
165 	return 0;
166 }
167 
168 /*
169  * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
170  * on all threads of the core to be tested. Loop if necessary to complete
171  * run of all chunks. Include some defensive tests to make sure forward
172  * progress is made, and that the whole test completes in a reasonable time.
173  */
174 static void ifs_test_core(int cpu, struct device *dev)
175 {
176 	union ifs_scan activate;
177 	union ifs_status status;
178 	unsigned long timeout;
179 	struct ifs_data *ifsd;
180 	int to_start, to_stop;
181 	int status_chunk;
182 	u64 msrvals[2];
183 	int retries;
184 
185 	ifsd = ifs_get_data(dev);
186 
187 	activate.gen0.rsvd = 0;
188 	activate.delay = IFS_THREAD_WAIT;
189 	activate.sigmce = 0;
190 	to_start = 0;
191 	to_stop = ifsd->valid_chunks - 1;
192 
193 	if (ifsd->generation) {
194 		activate.gen2.start = to_start;
195 		activate.gen2.stop = to_stop;
196 	} else {
197 		activate.gen0.start = to_start;
198 		activate.gen0.stop = to_stop;
199 	}
200 
201 	timeout = jiffies + HZ / 2;
202 	retries = MAX_IFS_RETRIES;
203 
204 	while (to_start <= to_stop) {
205 		if (time_after(jiffies, timeout)) {
206 			status.error_code = IFS_SW_TIMEOUT;
207 			break;
208 		}
209 
210 		msrvals[0] = activate.data;
211 		stop_core_cpuslocked(cpu, doscan, msrvals);
212 
213 		status.data = msrvals[1];
214 
215 		trace_ifs_status(cpu, to_start, to_stop, status.data);
216 
217 		/* Some cases can be retried, give up for others */
218 		if (!can_restart(status))
219 			break;
220 
221 		status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
222 		if (status_chunk == to_start) {
223 			/* Check for forward progress */
224 			if (--retries == 0) {
225 				if (status.error_code == IFS_NO_ERROR)
226 					status.error_code = IFS_SW_PARTIAL_COMPLETION;
227 				break;
228 			}
229 		} else {
230 			retries = MAX_IFS_RETRIES;
231 			if (ifsd->generation)
232 				activate.gen2.start = status_chunk;
233 			else
234 				activate.gen0.start = status_chunk;
235 			to_start = status_chunk;
236 		}
237 	}
238 
239 	/* Update status for this core */
240 	ifsd->scan_details = status.data;
241 
242 	if (status.control_error || status.signature_error) {
243 		ifsd->status = SCAN_TEST_FAIL;
244 		message_fail(dev, cpu, status);
245 	} else if (status.error_code) {
246 		ifsd->status = SCAN_NOT_TESTED;
247 		message_not_tested(dev, cpu, status);
248 	} else {
249 		ifsd->status = SCAN_TEST_PASS;
250 	}
251 }
252 
253 #define SPINUNIT 100 /* 100 nsec */
254 static atomic_t array_cpus_out;
255 
256 /*
257  * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
258  */
259 static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
260 {
261 	int cpu = smp_processor_id();
262 	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
263 	int all_cpus = cpumask_weight(smt_mask);
264 
265 	atomic_inc(t);
266 	while (atomic_read(t) < all_cpus) {
267 		if (timeout < SPINUNIT)
268 			return;
269 		ndelay(SPINUNIT);
270 		timeout -= SPINUNIT;
271 		touch_nmi_watchdog();
272 	}
273 }
274 
275 static int do_array_test(void *data)
276 {
277 	union ifs_array *command = data;
278 	int cpu = smp_processor_id();
279 	int first;
280 
281 	/*
282 	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
283 	 */
284 	first = cpumask_first(cpu_smt_mask(cpu));
285 
286 	if (cpu == first) {
287 		wrmsrl(MSR_ARRAY_BIST, command->data);
288 		/* Pass back the result of the test */
289 		rdmsrl(MSR_ARRAY_BIST, command->data);
290 	}
291 
292 	/* Tests complete faster if the sibling is spinning here */
293 	wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
294 
295 	return 0;
296 }
297 
298 static void ifs_array_test_core(int cpu, struct device *dev)
299 {
300 	union ifs_array command = {};
301 	bool timed_out = false;
302 	struct ifs_data *ifsd;
303 	unsigned long timeout;
304 
305 	ifsd = ifs_get_data(dev);
306 
307 	command.array_bitmask = ~0U;
308 	timeout = jiffies + HZ / 2;
309 
310 	do {
311 		if (time_after(jiffies, timeout)) {
312 			timed_out = true;
313 			break;
314 		}
315 		atomic_set(&array_cpus_out, 0);
316 		stop_core_cpuslocked(cpu, do_array_test, &command);
317 
318 		if (command.ctrl_result)
319 			break;
320 	} while (command.array_bitmask);
321 
322 	ifsd->scan_details = command.data;
323 
324 	if (command.ctrl_result)
325 		ifsd->status = SCAN_TEST_FAIL;
326 	else if (timed_out || command.array_bitmask)
327 		ifsd->status = SCAN_NOT_TESTED;
328 	else
329 		ifsd->status = SCAN_TEST_PASS;
330 }
331 
332 #define ARRAY_GEN1_TEST_ALL_ARRAYS	0x0ULL
333 #define ARRAY_GEN1_STATUS_FAIL		0x1ULL
334 
335 static int do_array_test_gen1(void *status)
336 {
337 	int cpu = smp_processor_id();
338 	int first;
339 
340 	first = cpumask_first(cpu_smt_mask(cpu));
341 
342 	if (cpu == first) {
343 		wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS);
344 		rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status));
345 	}
346 
347 	return 0;
348 }
349 
350 static void ifs_array_test_gen1(int cpu, struct device *dev)
351 {
352 	struct ifs_data *ifsd = ifs_get_data(dev);
353 	u64 status = 0;
354 
355 	stop_core_cpuslocked(cpu, do_array_test_gen1, &status);
356 	ifsd->scan_details = status;
357 
358 	if (status & ARRAY_GEN1_STATUS_FAIL)
359 		ifsd->status = SCAN_TEST_FAIL;
360 	else
361 		ifsd->status = SCAN_TEST_PASS;
362 }
363 
364 /*
365  * Initiate per core test. It wakes up work queue threads on the target cpu and
366  * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
367  * wait for all sibling threads to finish the scan test.
368  */
369 int do_core_test(int cpu, struct device *dev)
370 {
371 	const struct ifs_test_caps *test = ifs_get_test_caps(dev);
372 	struct ifs_data *ifsd = ifs_get_data(dev);
373 	int ret = 0;
374 
375 	/* Prevent CPUs from being taken offline during the scan test */
376 	cpus_read_lock();
377 
378 	if (!cpu_online(cpu)) {
379 		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
380 		ret = -EINVAL;
381 		goto out;
382 	}
383 
384 	switch (test->test_num) {
385 	case IFS_TYPE_SAF:
386 		if (!ifsd->loaded)
387 			ret = -EPERM;
388 		else
389 			ifs_test_core(cpu, dev);
390 		break;
391 	case IFS_TYPE_ARRAY_BIST:
392 		if (ifsd->array_gen == ARRAY_GEN0)
393 			ifs_array_test_core(cpu, dev);
394 		else
395 			ifs_array_test_gen1(cpu, dev);
396 		break;
397 	default:
398 		ret = -EINVAL;
399 	}
400 out:
401 	cpus_read_unlock();
402 	return ret;
403 }
404