xref: /linux/drivers/platform/x86/intel/ifs/runtest.c (revision 156010ed9c2ac1e9df6c11b1f688cf8a6e0152e6)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. */
3 
4 #include <linux/cpu.h>
5 #include <linux/delay.h>
6 #include <linux/fs.h>
7 #include <linux/nmi.h>
8 #include <linux/slab.h>
9 #include <linux/stop_machine.h>
10 
11 #include "ifs.h"
12 
13 /*
14  * Note all code and data in this file is protected by
15  * ifs_sem. On HT systems all threads on a core will
16  * execute together, but only the first thread on the
17  * core will update results of the test.
18  */
19 
20 #define CREATE_TRACE_POINTS
21 #include <trace/events/intel_ifs.h>
22 
/*
 * Upper bound on consecutive scan attempts that make no forward
 * progress, i.e. that stop again on the same chunk.
 */
#define MAX_IFS_RETRIES 5

/*
 * How long, in TSC cycles, a logical CPU waits inside
 * WRMSR(ACTIVATE_SCAN) for the other logical CPU of its core
 * to show up.
 */
#define IFS_THREAD_WAIT 100000
31 
/*
 * Error codes carried in the error_code field of the scan status.
 * The numeric values double as indices into scan_test_status[], so
 * the two must be kept in sync.
 */
enum ifs_status_err_code {
	IFS_NO_ERROR = 0,
	IFS_OTHER_THREAD_COULD_NOT_JOIN = 1,
	IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2,
	IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3,
	IFS_INVALID_CHUNK_RANGE = 4,
	IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5,
	IFS_CORE_NOT_CAPABLE_CURRENTLY = 6,
	IFS_UNASSIGNED_ERROR_CODE = 7,
	IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8,
	IFS_INTERRUPTED_DURING_EXECUTION = 9,
};
44 
45 static const char * const scan_test_status[] = {
46 	[IFS_NO_ERROR] = "SCAN no error",
47 	[IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
48 	[IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
49 	[IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
50 	"Core Abort SCAN Response due to power management condition.",
51 	[IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
52 	[IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
53 	[IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
54 	[IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
55 	[IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
56 	"Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
57 	[IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
58 };
59 
60 static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
61 {
62 	if (status.error_code < ARRAY_SIZE(scan_test_status)) {
63 		dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
64 			 cpumask_pr_args(cpu_smt_mask(cpu)),
65 			 scan_test_status[status.error_code]);
66 	} else if (status.error_code == IFS_SW_TIMEOUT) {
67 		dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
68 			 cpumask_pr_args(cpu_smt_mask(cpu)));
69 	} else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
70 		dev_info(dev, "CPU(s) %*pbl: %s\n",
71 			 cpumask_pr_args(cpu_smt_mask(cpu)),
72 			 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
73 	} else {
74 		dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
75 			 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
76 	}
77 }
78 
79 static void message_fail(struct device *dev, int cpu, union ifs_status status)
80 {
81 	struct ifs_data *ifsd = ifs_get_data(dev);
82 
83 	/*
84 	 * control_error is set when the microcode runs into a problem
85 	 * loading the image from the reserved BIOS memory, or it has
86 	 * been corrupted. Reloading the image may fix this issue.
87 	 */
88 	if (status.control_error) {
89 		dev_err(dev, "CPU(s) %*pbl: could not execute from loaded scan image. Batch: %02x version: 0x%x\n",
90 			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
91 	}
92 
93 	/*
94 	 * signature_error is set when the output from the scan chains does not
95 	 * match the expected signature. This might be a transient problem (e.g.
96 	 * due to a bit flip from an alpha particle or neutron). If the problem
97 	 * repeats on a subsequent test, then it indicates an actual problem in
98 	 * the core being tested.
99 	 */
100 	if (status.signature_error) {
101 		dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
102 			cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
103 	}
104 }
105 
106 static bool can_restart(union ifs_status status)
107 {
108 	enum ifs_status_err_code err_code = status.error_code;
109 
110 	/* Signature for chunk is bad, or scan test failed */
111 	if (status.signature_error || status.control_error)
112 		return false;
113 
114 	switch (err_code) {
115 	case IFS_NO_ERROR:
116 	case IFS_OTHER_THREAD_COULD_NOT_JOIN:
117 	case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
118 	case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
119 	case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
120 	case IFS_INTERRUPTED_DURING_EXECUTION:
121 		return true;
122 	case IFS_INVALID_CHUNK_RANGE:
123 	case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
124 	case IFS_CORE_NOT_CAPABLE_CURRENTLY:
125 	case IFS_UNASSIGNED_ERROR_CODE:
126 		break;
127 	}
128 	return false;
129 }
130 
131 /*
132  * Execute the scan. Called "simultaneously" on all threads of a core
133  * at high priority using the stop_cpus mechanism.
134  */
135 static int doscan(void *data)
136 {
137 	int cpu = smp_processor_id();
138 	u64 *msrs = data;
139 	int first;
140 
141 	/* Only the first logical CPU on a core reports result */
142 	first = cpumask_first(cpu_smt_mask(cpu));
143 
144 	/*
145 	 * This WRMSR will wait for other HT threads to also write
146 	 * to this MSR (at most for activate.delay cycles). Then it
147 	 * starts scan of each requested chunk. The core scan happens
148 	 * during the "execution" of the WRMSR. This instruction can
149 	 * take up to 200 milliseconds (in the case where all chunks
150 	 * are processed in a single pass) before it retires.
151 	 */
152 	wrmsrl(MSR_ACTIVATE_SCAN, msrs[0]);
153 
154 	if (cpu == first) {
155 		/* Pass back the result of the scan */
156 		rdmsrl(MSR_SCAN_STATUS, msrs[1]);
157 	}
158 
159 	return 0;
160 }
161 
162 /*
163  * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
164  * on all threads of the core to be tested. Loop if necessary to complete
165  * run of all chunks. Include some defensive tests to make sure forward
166  * progress is made, and that the whole test completes in a reasonable time.
167  */
168 static void ifs_test_core(int cpu, struct device *dev)
169 {
170 	union ifs_scan activate;
171 	union ifs_status status;
172 	unsigned long timeout;
173 	struct ifs_data *ifsd;
174 	u64 msrvals[2];
175 	int retries;
176 
177 	ifsd = ifs_get_data(dev);
178 
179 	activate.rsvd = 0;
180 	activate.delay = IFS_THREAD_WAIT;
181 	activate.sigmce = 0;
182 	activate.start = 0;
183 	activate.stop = ifsd->valid_chunks - 1;
184 
185 	timeout = jiffies + HZ / 2;
186 	retries = MAX_IFS_RETRIES;
187 
188 	while (activate.start <= activate.stop) {
189 		if (time_after(jiffies, timeout)) {
190 			status.error_code = IFS_SW_TIMEOUT;
191 			break;
192 		}
193 
194 		msrvals[0] = activate.data;
195 		stop_core_cpuslocked(cpu, doscan, msrvals);
196 
197 		status.data = msrvals[1];
198 
199 		trace_ifs_status(cpu, activate, status);
200 
201 		/* Some cases can be retried, give up for others */
202 		if (!can_restart(status))
203 			break;
204 
205 		if (status.chunk_num == activate.start) {
206 			/* Check for forward progress */
207 			if (--retries == 0) {
208 				if (status.error_code == IFS_NO_ERROR)
209 					status.error_code = IFS_SW_PARTIAL_COMPLETION;
210 				break;
211 			}
212 		} else {
213 			retries = MAX_IFS_RETRIES;
214 			activate.start = status.chunk_num;
215 		}
216 	}
217 
218 	/* Update status for this core */
219 	ifsd->scan_details = status.data;
220 
221 	if (status.control_error || status.signature_error) {
222 		ifsd->status = SCAN_TEST_FAIL;
223 		message_fail(dev, cpu, status);
224 	} else if (status.error_code) {
225 		ifsd->status = SCAN_NOT_TESTED;
226 		message_not_tested(dev, cpu, status);
227 	} else {
228 		ifsd->status = SCAN_TEST_PASS;
229 	}
230 }
231 
232 /*
233  * Initiate per core test. It wakes up work queue threads on the target cpu and
234  * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
235  * wait for all sibling threads to finish the scan test.
236  */
237 int do_core_test(int cpu, struct device *dev)
238 {
239 	int ret = 0;
240 
241 	/* Prevent CPUs from being taken offline during the scan test */
242 	cpus_read_lock();
243 
244 	if (!cpu_online(cpu)) {
245 		dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
246 		ret = -EINVAL;
247 		goto out;
248 	}
249 
250 	ifs_test_core(cpu, dev);
251 out:
252 	cpus_read_unlock();
253 	return ret;
254 }
255