xref: /linux/drivers/edac/mce_amd.c (revision 7a5f1cd22d47f8ca4b760b6334378ae42c1bd24b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 #include <asm/msr.h>
7 
8 #include "mce_amd.h"
9 
/* Per-family decoder callbacks; filled in at boot by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/* Mask applied to extract the extended error code; widened per family / SMCA. */
static u8 xec_mask	 = 0xf;

/* Optional DRAM ECC decoder hook, registered via amd_register_ecc_decoder(). */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
15 
/*
 * Register a DRAM ECC decoder callback. Only one decoder is supported at
 * a time; a subsequent call silently replaces the previous one.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
21 
22 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
23 {
24 	if (decode_dram_ecc) {
25 		WARN_ON(decode_dram_ecc != f);
26 
27 		decode_dram_ecc = NULL;
28 	}
29 }
30 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
31 
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 *
 * Each table is indexed by the corresponding bitfield extracted from the
 * low 16 bits of MCi_STATUS (via the TT()/LL()/R4()/PP()/TO()/II()/UU()
 * accessors and their *_MSG() wrappers in mce_amd.h).
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor (exported for use by other decoders) */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
60 
/*
 * Fam15h MC1 extended error descriptions, consumed by f15h_mc1_mce().
 * The sparse xec space is remapped onto this dense table there:
 * xec 0x0-0xa index directly, 0xd maps to [0xb], 0x10 to [0xc], and
 * 0x11-0x15 map to [0xd]-[0x11].
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
81 
/*
 * Fam15h MC2 extended error descriptions, consumed by f15h_mc2_mce():
 * xec 0x4-0xc map to [0]-[8], xec 0x10-0x14 map to [9]-[13] (xec - 0x7).
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
98 
/*
 * MC4 (northbridge) extended error descriptions, consumed by
 * decode_mc4_mce(): xec 0x0-0xe index directly (0x0 and 0x8 are the DRAM
 * ECC entries that also trigger the registered DRAM decoder), and
 * xec 0x1c-0x1f index with an offset of 13 ([15]-[18]).
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
120 
/*
 * MC5 extended error descriptions, indexed directly by xec (0x0-0xd) in
 * decode_mc5_mce().
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
137 
/*
 * MC6 extended error descriptions, indexed directly by xec (0x0-0x5) in
 * decode_mc6_mce().
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
146 
147 static bool f12h_mc0_mce(u16 ec, u8 xec)
148 {
149 	bool ret = false;
150 
151 	if (MEM_ERROR(ec)) {
152 		u8 ll = LL(ec);
153 		ret = true;
154 
155 		if (ll == LL_L2)
156 			pr_cont("during L1 linefill from L2.\n");
157 		else if (ll == LL_L1)
158 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
159 		else
160 			ret = false;
161 	}
162 	return ret;
163 }
164 
165 static bool f10h_mc0_mce(u16 ec, u8 xec)
166 {
167 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
168 		pr_cont("during data scrub.\n");
169 		return true;
170 	}
171 	return f12h_mc0_mce(ec, xec);
172 }
173 
174 static bool k8_mc0_mce(u16 ec, u8 xec)
175 {
176 	if (BUS_ERROR(ec)) {
177 		pr_cont("during system linefill.\n");
178 		return true;
179 	}
180 
181 	return f10h_mc0_mce(ec, xec);
182 }
183 
/*
 * MC0 decoder for the "cat" families (assigned to Fam14h and Fam16h in
 * mce_amd_init()). Returns true if the error signature was recognized
 * and a description printed.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4	 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only data-transaction, L1-level signatures are valid here. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus errors must target mem or IO at the LG cache level. */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
235 
/*
 * MC0 decoder for Fam15h. Handles memory, bus and internal error
 * classes; returns true if a description was printed.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Memory errors are distinguished purely by the extended code. */
		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		/* Internal errors with xec above 0x1f are not recognized. */
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
288 
289 static void decode_mc0_mce(struct mce *m)
290 {
291 	u16 ec = EC(m->status);
292 	u8 xec = XEC(m->status, xec_mask);
293 
294 	pr_emerg(HW_ERR "MC0 Error: ");
295 
296 	/* TLB error signatures are the same across families */
297 	if (TLB_ERROR(ec)) {
298 		if (TT(ec) == TT_DATA) {
299 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
300 				((xec == 2) ? "locked miss"
301 					    : (xec ? "multimatch" : "parity")));
302 			return;
303 		}
304 	} else if (fam_ops.mc0_mce(ec, xec))
305 		;
306 	else
307 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
308 }
309 
310 static bool k8_mc1_mce(u16 ec, u8 xec)
311 {
312 	u8 ll	 = LL(ec);
313 	bool ret = true;
314 
315 	if (!MEM_ERROR(ec))
316 		return false;
317 
318 	if (ll == 0x2)
319 		pr_cont("during a linefill from L2.\n");
320 	else if (ll == 0x1) {
321 		switch (R4(ec)) {
322 		case R4_IRD:
323 			pr_cont("Parity error during data load.\n");
324 			break;
325 
326 		case R4_EVICT:
327 			pr_cont("Copyback Parity/Victim error.\n");
328 			break;
329 
330 		case R4_SNOOP:
331 			pr_cont("Tag Snoop error.\n");
332 			break;
333 
334 		default:
335 			ret = false;
336 			break;
337 		}
338 	} else
339 		ret = false;
340 
341 	return ret;
342 }
343 
344 static bool cat_mc1_mce(u16 ec, u8 xec)
345 {
346 	u8 r4    = R4(ec);
347 	bool ret = true;
348 
349 	if (!MEM_ERROR(ec))
350 		return false;
351 
352 	if (TT(ec) != TT_INSTR)
353 		return false;
354 
355 	if (r4 == R4_IRD)
356 		pr_cont("Data/tag array parity error for a tag hit.\n");
357 	else if (r4 == R4_SNOOP)
358 		pr_cont("Tag error during snoop/victimization.\n");
359 	else if (xec == 0x0)
360 		pr_cont("Tag parity error from victim castout.\n");
361 	else if (xec == 0x2)
362 		pr_cont("Microcode patch RAM parity error.\n");
363 	else
364 		ret = false;
365 
366 	return ret;
367 }
368 
/*
 * MC1 decoder for Fam15h. Maps the sparse xec space onto the dense
 * f15h_mc1_mce_desc[] table; returns true if a description was printed.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		/* Direct 1:1 mapping into the table. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb/0xc are unused, so 0xd lands on index 0xb. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		/* Skipping the 0xb/0xc/0xe/0xf holes: 0x10 -> index 0xc. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		/* 0x11-0x15 -> indices 0xd-0x11 (the decoder sub-unit names). */
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
398 
399 static void decode_mc1_mce(struct mce *m)
400 {
401 	u16 ec = EC(m->status);
402 	u8 xec = XEC(m->status, xec_mask);
403 
404 	pr_emerg(HW_ERR "MC1 Error: ");
405 
406 	if (TLB_ERROR(ec))
407 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
408 			(xec ? "multimatch" : "parity error"));
409 	else if (BUS_ERROR(ec)) {
410 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
411 
412 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
413 	} else if (INT_ERROR(ec)) {
414 		if (xec <= 0x3f)
415 			pr_cont("Hardware Assert.\n");
416 		else
417 			goto wrong_mc1_mce;
418 	} else if (fam_ops.mc1_mce(ec, xec))
419 		;
420 	else
421 		goto wrong_mc1_mce;
422 
423 	return;
424 
425 wrong_mc1_mce:
426 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
427 }
428 
/*
 * MC2 decoder for K8-class families (Fam0fh-Fam14h, see mce_amd_init()).
 * Returns true if a description was printed.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		/* xec 0 is further distinguished by the error-code class. */
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			/* High r4 values are copyback; low ones are L2 data access. */
			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
464 
/*
 * MC2 decoder for Fam15h. Returns true if a description was printed.
 *
 * NOTE(review): in the BUS_ERROR() branch the message is printed even
 * when xec > 2 forces a false return, and an error code matching none of
 * the four classes falls through with ret still true without printing
 * anything -- verify both are intended before changing.
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			/* Direct mapping: xec - 0x4 -> table index 0-8. */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			/* Skipping the 0xd-0xf hole: xec - 0x7 -> index 9-13. */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}
503 
504 static bool f16h_mc2_mce(u16 ec, u8 xec)
505 {
506 	u8 r4 = R4(ec);
507 
508 	if (!MEM_ERROR(ec))
509 		return false;
510 
511 	switch (xec) {
512 	case 0x04 ... 0x05:
513 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
514 		break;
515 
516 	case 0x09 ... 0x0b:
517 	case 0x0d ... 0x0f:
518 		pr_cont("ECC error in L2 tag (%s).\n",
519 			((r4 == R4_GEN)   ? "BankReq" :
520 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
521 		break;
522 
523 	case 0x10 ... 0x19:
524 	case 0x1b:
525 		pr_cont("ECC error in L2 data array (%s).\n",
526 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
527 			((r4 == R4_GEN)   ? "Attr" :
528 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
529 		break;
530 
531 	case 0x1c ... 0x1d:
532 	case 0x1f:
533 		pr_cont("Parity error in L2 attribute bits (%s).\n",
534 			((r4 == R4_RD)  ? "Hit"  :
535 			((r4 == R4_GEN) ? "Attr" : "Fill")));
536 		break;
537 
538 	default:
539 		return false;
540 	}
541 
542 	return true;
543 }
544 
/* Decode an MC2 bank error via the family-specific mc2 handler. */
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops.mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}
555 
556 static void decode_mc3_mce(struct mce *m)
557 {
558 	u16 ec = EC(m->status);
559 	u8 xec = XEC(m->status, xec_mask);
560 
561 	if (boot_cpu_data.x86 >= 0x14) {
562 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
563 			 " please report on LKML.\n");
564 		return;
565 	}
566 
567 	pr_emerg(HW_ERR "MC3 Error");
568 
569 	if (xec == 0x0) {
570 		u8 r4 = R4(ec);
571 
572 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
573 			goto wrong_mc3_mce;
574 
575 		pr_cont(" during %s.\n", R4_MSG(ec));
576 	} else
577 		goto wrong_mc3_mce;
578 
579 	return;
580 
581  wrong_mc3_mce:
582 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
583 }
584 
/*
 * Decode an MC4 (northbridge) bank error. DRAM ECC signatures are
 * additionally forwarded to the registered DRAM decoder, if any.
 */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_amd_node_id(m->extcpu);
	u16 ec = EC(m->status);
	/* MC4 uses a fixed 5-bit extended error code regardless of family. */
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* xec 0x1c-0x1f map to mc4_mce_desc[15]-[18]. */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
642 
/*
 * Decode an MC5 bank error. Not supported on Fam0fh/Fam11h; internal
 * errors are reported as hardware asserts, everything else indexes
 * mc5_mce_desc[].
 */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	/* Entries 0x0 and 0xc are printed bare; the rest are parity errors. */
	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
674 
675 static void decode_mc6_mce(struct mce *m)
676 {
677 	u8 xec = XEC(m->status, xec_mask);
678 
679 	pr_emerg(HW_ERR "MC6 Error: ");
680 
681 	if (xec > 0x5)
682 		goto wrong_mc6_mce;
683 
684 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
685 	return;
686 
687  wrong_mc6_mce:
688 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
689 }
690 
/*
 * Human-readable names for the SMCA bank types, indexed by
 * enum smca_bank_types and returned by smca_get_long_name().
 */
static const char * const smca_long_names[] = {
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Station",
	[SMCA_DACC_BE]			= "DACC Back-end Unit",
	[SMCA_DACC_FE]			= "DACC Front-end Unit",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_EDDR5CMN]			= "eDDR5 CMN Unit",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPART]			= "MPART Unit",
	[SMCA_MPASP ... SMCA_MPASP_V2]	= "MPASP Unit",
	[SMCA_MPDACC]			= "MPDACC Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_MPM]			= "MPM Unit",
	[SMCA_MPRAS]			= "MPRAS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_PCIE_PL]			= "PCIe Link Unit",
	[SMCA_PIE]			= "Power, Interrupts, etc.",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_SSBDCI]			= "Die to Die Interconnect Unit",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_USB]			= "USB Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
};
733 
734 static const char *smca_get_long_name(enum smca_bank_types t)
735 {
736 	if (t >= N_SMCA_BANK_TYPES)
737 		return NULL;
738 
739 	return smca_long_names[t];
740 }
741 
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	/* Unknown bank type: nothing useful we can print. */
	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* xec 0 on a UMC bank is handed to the registered DRAM ECC decoder. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}
762 
/*
 * Print the architecturally-defined fields of the low 16 bits of
 * MCi_STATUS: cache level, transaction or mem/io type, memory transaction
 * and (for bus errors) participation/timeout, as applicable.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
786 
787 static const char *decode_error_status(struct mce *m)
788 {
789 	if (m->status & MCI_STATUS_UC) {
790 		if (m->status & MCI_STATUS_PCC)
791 			return "System Fatal error.";
792 		if (m->mcgstatus & MCG_STATUS_RIPV)
793 			return "Uncorrected, software restartable error.";
794 		return "Uncorrected, software containable error.";
795 	}
796 
797 	if (m->status & MCI_STATUS_DEFERRED)
798 		return "Deferred error, no action required.";
799 
800 	return "Corrected error, no action required.";
801 }
802 
/*
 * Notifier callback: pretty-print one machine check record.
 *
 * Dumps the decoded status bits, then the address/PPIN/syndrome/FRU text
 * where valid, and finally dispatches to the SMCA decoder or the legacy
 * per-bank decoders. Returns NOTIFY_DONE if the event was already
 * consumed by the CEC, NOTIFY_OK otherwise (after marking the record
 * MCE_HANDLED_EDAC).
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct mce_hw_err *err = to_mce_hw_err(m);
	unsigned int fam = x86_family(m->cpuid);
	u32 mca_config_lo = 0, dummy;
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		/* TCC is only meaningful when the bank is in MCAX mode. */
		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);

		if (mca_config_lo & MCI_CONFIG_MCAX)
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV) {
			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
			/* FRU text is the two extended syndromes, 16 raw bytes. */
			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
				char frutext[17];

				frutext[16] = '\0';
				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);

				pr_emerg(HW_ERR "FRU Text: %s", frutext);
			}
		}

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
929 
/* Hook the decoder into the MCE notification chain at EDAC priority. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
934 
935 static int __init mce_amd_init(void)
936 {
937 	struct cpuinfo_x86 *c = &boot_cpu_data;
938 
939 	if (c->x86_vendor != X86_VENDOR_AMD &&
940 	    c->x86_vendor != X86_VENDOR_HYGON)
941 		return -ENODEV;
942 
943 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
944 		return -ENODEV;
945 
946 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
947 		xec_mask = 0x3f;
948 		goto out;
949 	}
950 
951 	switch (c->x86) {
952 	case 0xf:
953 		fam_ops.mc0_mce = k8_mc0_mce;
954 		fam_ops.mc1_mce = k8_mc1_mce;
955 		fam_ops.mc2_mce = k8_mc2_mce;
956 		break;
957 
958 	case 0x10:
959 		fam_ops.mc0_mce = f10h_mc0_mce;
960 		fam_ops.mc1_mce = k8_mc1_mce;
961 		fam_ops.mc2_mce = k8_mc2_mce;
962 		break;
963 
964 	case 0x11:
965 		fam_ops.mc0_mce = k8_mc0_mce;
966 		fam_ops.mc1_mce = k8_mc1_mce;
967 		fam_ops.mc2_mce = k8_mc2_mce;
968 		break;
969 
970 	case 0x12:
971 		fam_ops.mc0_mce = f12h_mc0_mce;
972 		fam_ops.mc1_mce = k8_mc1_mce;
973 		fam_ops.mc2_mce = k8_mc2_mce;
974 		break;
975 
976 	case 0x14:
977 		fam_ops.mc0_mce = cat_mc0_mce;
978 		fam_ops.mc1_mce = cat_mc1_mce;
979 		fam_ops.mc2_mce = k8_mc2_mce;
980 		break;
981 
982 	case 0x15:
983 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
984 
985 		fam_ops.mc0_mce = f15h_mc0_mce;
986 		fam_ops.mc1_mce = f15h_mc1_mce;
987 		fam_ops.mc2_mce = f15h_mc2_mce;
988 		break;
989 
990 	case 0x16:
991 		xec_mask = 0x1f;
992 		fam_ops.mc0_mce = cat_mc0_mce;
993 		fam_ops.mc1_mce = cat_mc1_mce;
994 		fam_ops.mc2_mce = f16h_mc2_mce;
995 		break;
996 
997 	case 0x17:
998 	case 0x18:
999 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1000 		return -EINVAL;
1001 
1002 	default:
1003 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1004 		return -EINVAL;
1005 	}
1006 
1007 out:
1008 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1009 
1010 	mce_register_decode_chain(&amd_mce_dec_nb);
1011 
1012 	return 0;
1013 }
1014 early_initcall(mce_amd_init);
1015 
#ifdef MODULE
/* Module unload: detach the decoder from the MCE notification chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1027