xref: /linux/drivers/firmware/efi/cper.c (revision b85d45947951d23cb22d90caecf4c1eb81342c96)
1 /*
2  * UEFI Common Platform Error Record (CPER) support
3  *
4  * Copyright (C) 2010, Intel Corp.
5  *	Author: Huang Ying <ying.huang@intel.com>
6  *
 * CPER is the format used by various tables, such as ERST, BERT and
 * HEST, to describe platform hardware errors.
9  *
10  * For more information about CPER, please refer to Appendix N of UEFI
11  * Specification version 2.4.
12  *
13  * This program is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU General Public License version
15  * 2 as published by the Free Software Foundation.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25  */
26 
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/time.h>
30 #include <linux/cper.h>
31 #include <linux/dmi.h>
32 #include <linux/acpi.h>
33 #include <linux/pci.h>
34 #include <linux/aer.h>
35 
/* Extra indentation appended to the caller's prefix for nested output. */
#define INDENT_SP	" "

/*
 * Scratch buffer into which the memory-error location helpers format their
 * text before it is printed or appended to a trace sequence.  It is a single
 * file-scope buffer shared by every caller in this file; no locking around
 * its use is visible here.
 */
static char rcd_decode_str[CPER_REC_LEN];
39 
40 /*
41  * CPER record ID need to be unique even after reboot, because record
42  * ID is used as index for ERST storage, while CPER records from
43  * multiple boot may co-exist in ERST.
44  */
45 u64 cper_next_record_id(void)
46 {
47 	static atomic64_t seq;
48 
49 	if (!atomic64_read(&seq))
50 		atomic64_set(&seq, ((u64)get_seconds()) << 32);
51 
52 	return atomic64_inc_return(&seq);
53 }
54 EXPORT_SYMBOL_GPL(cper_next_record_id);
55 
/* Human-readable names for CPER severities, indexed by severity value. */
static const char * const severity_strs[] = {
	"recoverable",
	"fatal",
	"corrected",
	"info",
};

/*
 * cper_severity_str - map a severity value to its name
 * @severity: severity value used as an index into severity_strs
 *
 * Returns the matching name, or "unknown" when @severity lies outside
 * the table.
 */
const char *cper_severity_str(unsigned int severity)
{
	if (severity >= ARRAY_SIZE(severity_strs))
		return "unknown";

	return severity_strs[severity];
}
EXPORT_SYMBOL_GPL(cper_severity_str);
69 
/*
 * cper_print_bits - print strings for set bits
 * @pfx: prefix for each line, including log level and prefix string
 * @bits: bit mask
 * @strs: string array, indexed by bit position; NULL entries are skipped
 * @strs_size: number of entries in @strs
 *
 * For each set bit in @bits, print the corresponding string in @strs,
 * separated by ", ".  If the output would grow longer than 80 characters,
 * it is split across multiple lines, with @pfx printed at the beginning of
 * each line.  Output that does not fit the internal line buffer is
 * truncated.
 */
void cper_print_bits(const char *pfx, unsigned int bits,
		     const char * const strs[], unsigned int strs_size)
{
	unsigned int i;
	int len = 0;
	const char *str;
	char buf[84];

	/*
	 * @bits has no positions beyond its own width; clamping also keeps
	 * the shift below well defined for oversized tables.
	 */
	if (strs_size > 8 * sizeof(bits))
		strs_size = 8 * sizeof(bits);

	for (i = 0; i < strs_size; i++) {
		if (!(bits & (1U << i)))
			continue;
		str = strs[i];
		if (!str)
			continue;
		/* Flush the current line before it grows past 80 columns. */
		if (len && len + strlen(str) + 2 > 80) {
			printk("%s\n", buf);
			len = 0;
		}
		/*
		 * scnprintf() returns what was actually stored, so len can
		 * never run past buf and sizeof(buf) - len cannot underflow,
		 * even for a long @pfx or string.
		 */
		if (!len)
			len = scnprintf(buf, sizeof(buf), "%s%s", pfx, str);
		else
			len += scnprintf(buf + len, sizeof(buf) - len,
					 ", %s", str);
	}
	if (len)
		printk("%s\n", buf);
}
106 
/* Names for the processor type field, indexed by its value. */
static const char * const proc_type_strs[] = {
	"IA32/X64",
	"IA64",
};

/* Names for the processor ISA field, indexed by its value. */
static const char * const proc_isa_strs[] = {
	"IA32",
	"IA64",
	"X64",
};

/* Names for the processor error type bits, indexed by bit position. */
static const char * const proc_error_type_strs[] = {
	"cache error",
	"TLB error",
	"bus error",
	"micro-architectural error",
};

/* Names for the processor operation field, indexed by its value. */
static const char * const proc_op_strs[] = {
	"unknown or generic",
	"data read",
	"data write",
	"instruction execution",
};

/* Names for the processor flag bits, indexed by bit position. */
static const char * const proc_flag_strs[] = {
	"restartable",
	"precise IP",
	"overflow",
	"corrected",
};
138 
139 static void cper_print_proc_generic(const char *pfx,
140 				    const struct cper_sec_proc_generic *proc)
141 {
142 	if (proc->validation_bits & CPER_PROC_VALID_TYPE)
143 		printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type,
144 		       proc->proc_type < ARRAY_SIZE(proc_type_strs) ?
145 		       proc_type_strs[proc->proc_type] : "unknown");
146 	if (proc->validation_bits & CPER_PROC_VALID_ISA)
147 		printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa,
148 		       proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ?
149 		       proc_isa_strs[proc->proc_isa] : "unknown");
150 	if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) {
151 		printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type);
152 		cper_print_bits(pfx, proc->proc_error_type,
153 				proc_error_type_strs,
154 				ARRAY_SIZE(proc_error_type_strs));
155 	}
156 	if (proc->validation_bits & CPER_PROC_VALID_OPERATION)
157 		printk("%s""operation: %d, %s\n", pfx, proc->operation,
158 		       proc->operation < ARRAY_SIZE(proc_op_strs) ?
159 		       proc_op_strs[proc->operation] : "unknown");
160 	if (proc->validation_bits & CPER_PROC_VALID_FLAGS) {
161 		printk("%s""flags: 0x%02x\n", pfx, proc->flags);
162 		cper_print_bits(pfx, proc->flags, proc_flag_strs,
163 				ARRAY_SIZE(proc_flag_strs));
164 	}
165 	if (proc->validation_bits & CPER_PROC_VALID_LEVEL)
166 		printk("%s""level: %d\n", pfx, proc->level);
167 	if (proc->validation_bits & CPER_PROC_VALID_VERSION)
168 		printk("%s""version_info: 0x%016llx\n", pfx, proc->cpu_version);
169 	if (proc->validation_bits & CPER_PROC_VALID_ID)
170 		printk("%s""processor_id: 0x%016llx\n", pfx, proc->proc_id);
171 	if (proc->validation_bits & CPER_PROC_VALID_TARGET_ADDRESS)
172 		printk("%s""target_address: 0x%016llx\n",
173 		       pfx, proc->target_addr);
174 	if (proc->validation_bits & CPER_PROC_VALID_REQUESTOR_ID)
175 		printk("%s""requestor_id: 0x%016llx\n",
176 		       pfx, proc->requestor_id);
177 	if (proc->validation_bits & CPER_PROC_VALID_RESPONDER_ID)
178 		printk("%s""responder_id: 0x%016llx\n",
179 		       pfx, proc->responder_id);
180 	if (proc->validation_bits & CPER_PROC_VALID_IP)
181 		printk("%s""IP: 0x%016llx\n", pfx, proc->ip);
182 }
183 
/* Names for the memory error type field, indexed by its value. */
static const char * const mem_err_type_strs[] = {
	"unknown",
	"no error",
	"single-bit ECC",
	"multi-bit ECC",
	"single-symbol chipkill ECC",
	"multi-symbol chipkill ECC",
	"master abort",
	"target abort",
	"parity error",
	"watchdog timeout",
	"invalid address",
	"mirror Broken",
	"memory sparing",
	"scrub corrected error",
	"scrub uncorrected error",
	"physical memory map-out event",
};

/*
 * cper_mem_err_type_str - map a memory error type to its name
 * @etype: error type value used as an index into mem_err_type_strs
 *
 * Returns the matching name, or "unknown" when @etype lies outside the
 * table.
 */
const char *cper_mem_err_type_str(unsigned int etype)
{
	if (etype >= ARRAY_SIZE(mem_err_type_strs))
		return "unknown";

	return mem_err_type_strs[etype];
}
EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
209 
210 static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
211 {
212 	u32 len, n;
213 
214 	if (!msg)
215 		return 0;
216 
217 	n = 0;
218 	len = CPER_REC_LEN - 1;
219 	if (mem->validation_bits & CPER_MEM_VALID_NODE)
220 		n += scnprintf(msg + n, len - n, "node: %d ", mem->node);
221 	if (mem->validation_bits & CPER_MEM_VALID_CARD)
222 		n += scnprintf(msg + n, len - n, "card: %d ", mem->card);
223 	if (mem->validation_bits & CPER_MEM_VALID_MODULE)
224 		n += scnprintf(msg + n, len - n, "module: %d ", mem->module);
225 	if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
226 		n += scnprintf(msg + n, len - n, "rank: %d ", mem->rank);
227 	if (mem->validation_bits & CPER_MEM_VALID_BANK)
228 		n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank);
229 	if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
230 		n += scnprintf(msg + n, len - n, "device: %d ", mem->device);
231 	if (mem->validation_bits & CPER_MEM_VALID_ROW)
232 		n += scnprintf(msg + n, len - n, "row: %d ", mem->row);
233 	if (mem->validation_bits & CPER_MEM_VALID_COLUMN)
234 		n += scnprintf(msg + n, len - n, "column: %d ", mem->column);
235 	if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION)
236 		n += scnprintf(msg + n, len - n, "bit_position: %d ",
237 			       mem->bit_pos);
238 	if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
239 		n += scnprintf(msg + n, len - n, "requestor_id: 0x%016llx ",
240 			       mem->requestor_id);
241 	if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
242 		n += scnprintf(msg + n, len - n, "responder_id: 0x%016llx ",
243 			       mem->responder_id);
244 	if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID)
245 		scnprintf(msg + n, len - n, "target_id: 0x%016llx ",
246 			  mem->target_id);
247 
248 	msg[n] = '\0';
249 	return n;
250 }
251 
252 static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
253 {
254 	u32 len, n;
255 	const char *bank = NULL, *device = NULL;
256 
257 	if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE))
258 		return 0;
259 
260 	n = 0;
261 	len = CPER_REC_LEN - 1;
262 	dmi_memdev_name(mem->mem_dev_handle, &bank, &device);
263 	if (bank && device)
264 		n = snprintf(msg, len, "DIMM location: %s %s ", bank, device);
265 	else
266 		n = snprintf(msg, len,
267 			     "DIMM location: not present. DMI handle: 0x%.4x ",
268 			     mem->mem_dev_handle);
269 
270 	msg[n] = '\0';
271 	return n;
272 }
273 
274 void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
275 		       struct cper_mem_err_compact *cmem)
276 {
277 	cmem->validation_bits = mem->validation_bits;
278 	cmem->node = mem->node;
279 	cmem->card = mem->card;
280 	cmem->module = mem->module;
281 	cmem->bank = mem->bank;
282 	cmem->device = mem->device;
283 	cmem->row = mem->row;
284 	cmem->column = mem->column;
285 	cmem->bit_pos = mem->bit_pos;
286 	cmem->requestor_id = mem->requestor_id;
287 	cmem->responder_id = mem->responder_id;
288 	cmem->target_id = mem->target_id;
289 	cmem->rank = mem->rank;
290 	cmem->mem_array_handle = mem->mem_array_handle;
291 	cmem->mem_dev_handle = mem->mem_dev_handle;
292 }
293 
294 const char *cper_mem_err_unpack(struct trace_seq *p,
295 				struct cper_mem_err_compact *cmem)
296 {
297 	const char *ret = trace_seq_buffer_ptr(p);
298 
299 	if (cper_mem_err_location(cmem, rcd_decode_str))
300 		trace_seq_printf(p, "%s", rcd_decode_str);
301 	if (cper_dimm_err_location(cmem, rcd_decode_str))
302 		trace_seq_printf(p, "%s", rcd_decode_str);
303 	trace_seq_putc(p, '\0');
304 
305 	return ret;
306 }
307 
308 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem,
309 	int len)
310 {
311 	struct cper_mem_err_compact cmem;
312 
313 	/* Don't trust UEFI 2.1/2.2 structure with bad validation bits */
314 	if (len == sizeof(struct cper_sec_mem_err_old) &&
315 	    (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) {
316 		pr_err(FW_WARN "valid bits set for fields beyond structure\n");
317 		return;
318 	}
319 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
320 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
321 	if (mem->validation_bits & CPER_MEM_VALID_PA)
322 		printk("%s""physical_address: 0x%016llx\n",
323 		       pfx, mem->physical_addr);
324 	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
325 		printk("%s""physical_address_mask: 0x%016llx\n",
326 		       pfx, mem->physical_addr_mask);
327 	cper_mem_err_pack(mem, &cmem);
328 	if (cper_mem_err_location(&cmem, rcd_decode_str))
329 		printk("%s%s\n", pfx, rcd_decode_str);
330 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
331 		u8 etype = mem->error_type;
332 		printk("%s""error_type: %d, %s\n", pfx, etype,
333 		       cper_mem_err_type_str(etype));
334 	}
335 	if (cper_dimm_err_location(&cmem, rcd_decode_str))
336 		printk("%s%s\n", pfx, rcd_decode_str);
337 }
338 
/*
 * Names for the PCIe port type field, indexed by its value.  Values 2 and 3
 * have no specific name here and are reported as "unknown".
 */
static const char * const pcie_port_type_strs[] = {
	"PCIe end point",
	"legacy PCI end point",
	"unknown",
	"unknown",
	"root port",
	"upstream switch port",
	"downstream switch port",
	"PCIe to PCI/PCI-X bridge",
	"PCI/PCI-X to PCIe bridge",
	"root complex integrated endpoint device",
	"root complex event collector",
};
352 
353 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
354 			    const struct acpi_hest_generic_data *gdata)
355 {
356 	if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
357 		printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
358 		       pcie->port_type < ARRAY_SIZE(pcie_port_type_strs) ?
359 		       pcie_port_type_strs[pcie->port_type] : "unknown");
360 	if (pcie->validation_bits & CPER_PCIE_VALID_VERSION)
361 		printk("%s""version: %d.%d\n", pfx,
362 		       pcie->version.major, pcie->version.minor);
363 	if (pcie->validation_bits & CPER_PCIE_VALID_COMMAND_STATUS)
364 		printk("%s""command: 0x%04x, status: 0x%04x\n", pfx,
365 		       pcie->command, pcie->status);
366 	if (pcie->validation_bits & CPER_PCIE_VALID_DEVICE_ID) {
367 		const __u8 *p;
368 		printk("%s""device_id: %04x:%02x:%02x.%x\n", pfx,
369 		       pcie->device_id.segment, pcie->device_id.bus,
370 		       pcie->device_id.device, pcie->device_id.function);
371 		printk("%s""slot: %d\n", pfx,
372 		       pcie->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
373 		printk("%s""secondary_bus: 0x%02x\n", pfx,
374 		       pcie->device_id.secondary_bus);
375 		printk("%s""vendor_id: 0x%04x, device_id: 0x%04x\n", pfx,
376 		       pcie->device_id.vendor_id, pcie->device_id.device_id);
377 		p = pcie->device_id.class_code;
378 		printk("%s""class_code: %02x%02x%02x\n", pfx, p[0], p[1], p[2]);
379 	}
380 	if (pcie->validation_bits & CPER_PCIE_VALID_SERIAL_NUMBER)
381 		printk("%s""serial number: 0x%04x, 0x%04x\n", pfx,
382 		       pcie->serial_number.lower, pcie->serial_number.upper);
383 	if (pcie->validation_bits & CPER_PCIE_VALID_BRIDGE_CONTROL_STATUS)
384 		printk(
385 	"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
386 	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
387 }
388 
389 static void cper_estatus_print_section(
390 	const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
391 {
392 	uuid_le *sec_type = (uuid_le *)gdata->section_type;
393 	__u16 severity;
394 	char newpfx[64];
395 
396 	severity = gdata->error_severity;
397 	printk("%s""Error %d, type: %s\n", pfx, sec_no,
398 	       cper_severity_str(severity));
399 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
400 		printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id);
401 	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
402 		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
403 
404 	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
405 	if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
406 		struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
407 		printk("%s""section_type: general processor error\n", newpfx);
408 		if (gdata->error_data_length >= sizeof(*proc_err))
409 			cper_print_proc_generic(newpfx, proc_err);
410 		else
411 			goto err_section_too_small;
412 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
413 		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
414 		printk("%s""section_type: memory error\n", newpfx);
415 		if (gdata->error_data_length >=
416 		    sizeof(struct cper_sec_mem_err_old))
417 			cper_print_mem(newpfx, mem_err,
418 				       gdata->error_data_length);
419 		else
420 			goto err_section_too_small;
421 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
422 		struct cper_sec_pcie *pcie = (void *)(gdata + 1);
423 		printk("%s""section_type: PCIe error\n", newpfx);
424 		if (gdata->error_data_length >= sizeof(*pcie))
425 			cper_print_pcie(newpfx, pcie, gdata);
426 		else
427 			goto err_section_too_small;
428 	} else
429 		printk("%s""section type: unknown, %pUl\n", newpfx, sec_type);
430 
431 	return;
432 
433 err_section_too_small:
434 	pr_err(FW_WARN "error section length is too small\n");
435 }
436 
437 void cper_estatus_print(const char *pfx,
438 			const struct acpi_hest_generic_status *estatus)
439 {
440 	struct acpi_hest_generic_data *gdata;
441 	unsigned int data_len, gedata_len;
442 	int sec_no = 0;
443 	char newpfx[64];
444 	__u16 severity;
445 
446 	severity = estatus->error_severity;
447 	if (severity == CPER_SEV_CORRECTED)
448 		printk("%s%s\n", pfx,
449 		       "It has been corrected by h/w "
450 		       "and requires no further action");
451 	printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
452 	data_len = estatus->data_length;
453 	gdata = (struct acpi_hest_generic_data *)(estatus + 1);
454 	snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
455 	while (data_len >= sizeof(*gdata)) {
456 		gedata_len = gdata->error_data_length;
457 		cper_estatus_print_section(newpfx, gdata, sec_no);
458 		data_len -= gedata_len + sizeof(*gdata);
459 		gdata = (void *)(gdata + 1) + gedata_len;
460 		sec_no++;
461 	}
462 }
463 EXPORT_SYMBOL_GPL(cper_estatus_print);
464 
465 int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus)
466 {
467 	if (estatus->data_length &&
468 	    estatus->data_length < sizeof(struct acpi_hest_generic_data))
469 		return -EINVAL;
470 	if (estatus->raw_data_length &&
471 	    estatus->raw_data_offset < sizeof(*estatus) + estatus->data_length)
472 		return -EINVAL;
473 
474 	return 0;
475 }
476 EXPORT_SYMBOL_GPL(cper_estatus_check_header);
477 
478 int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
479 {
480 	struct acpi_hest_generic_data *gdata;
481 	unsigned int data_len, gedata_len;
482 	int rc;
483 
484 	rc = cper_estatus_check_header(estatus);
485 	if (rc)
486 		return rc;
487 	data_len = estatus->data_length;
488 	gdata = (struct acpi_hest_generic_data *)(estatus + 1);
489 	while (data_len >= sizeof(*gdata)) {
490 		gedata_len = gdata->error_data_length;
491 		if (gedata_len > data_len - sizeof(*gdata))
492 			return -EINVAL;
493 		data_len -= gedata_len + sizeof(*gdata);
494 		gdata = (void *)(gdata + 1) + gedata_len;
495 	}
496 	if (data_len)
497 		return -EINVAL;
498 
499 	return 0;
500 }
501 EXPORT_SYMBOL_GPL(cper_estatus_check);
502