xref: /linux/drivers/edac/mce_amd.c (revision 7f71507851fc7764b36a3221839607d3a45c2025)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
/* Per-family MC0/MC1/MC2 decode callbacks; filled in by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/*
 * Mask handed to XEC() when extracting the extended error code from the
 * bank status. mce_amd_init() widens it to 0x1f or 0x3f on some families.
 */
static u8 xec_mask	 = 0xf;

/* Optional DRAM ECC decoder hook; stays NULL until one is registered. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
15 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
16 {
17 	decode_dram_ecc = f;
18 }
19 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20 
21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	if (decode_dram_ecc) {
24 		WARN_ON(decode_dram_ecc != f);
25 
26 		decode_dram_ecc = NULL;
27 	}
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30 
31 /*
 * String representations of the different MCA-reported error types; see F3x48
 * or MSR0000_0411.
34  */
35 
/* Transaction type names, one per encoding value. */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
38 
/* Cache level names, one per encoding value. */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
41 
/* Memory transaction type names, one per encoding value. */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
46 
/* Participating processor names, one per encoding value. */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
49 EXPORT_SYMBOL_GPL(pp_msgs);
50 
/* Request timeout state names, one per encoding value. */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
53 
/* Memory-or-I/O names, one per encoding value. */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
56 
/* Internal error type names, one per encoding value. */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
59 
/*
 * Family 15h MC1 error descriptions.
 *
 * The table is stored densely: f15h_mc1_mce() subtracts a per-range offset
 * from the extended error code (xec) to skip the encodings it does not
 * handle.
 */
static const char * const f15h_mc1_mce_desc[] = {
	/* xec 0x0 ... 0xa: index == xec */
	[0]  = "UC during a demand linefill from L2",
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",

	/* xec 0xd: index == xec - 2 */
	[11] = "PFB valid bit parity error",

	/* xec 0x10 ... 0x15: index == xec - 4 */
	[12] = "Microcode Patch Buffer",
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",
};
80 
/*
 * Family 15h MC2 memory-error descriptions.
 *
 * Stored densely: f15h_mc2_mce() rebases the extended error code (xec)
 * before indexing.
 */
static const char * const f15h_mc2_mce_desc[] = {
	/* xec 0x4 ... 0xc: index == xec - 0x4 */
	[0]  = "Fill ECC error on data fills",
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",

	/* xec 0x10 ... 0x14: index == xec - 0x7 */
	[9]  = "L2 Tag ECC error",
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",
};
97 
/*
 * MC4 (northbridge) error descriptions.
 *
 * decode_mc4_mce() indexes the first fifteen entries directly with the
 * extended error code (xec) and the last four with xec - 13.
 */
static const char * const mc4_mce_desc[] = {
	/* xec 0x0 ... 0xe: index == xec */
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",

	/* xec 0x1c ... 0x1f: index == xec - 13 */
	[15] = "L3 data cache ECC error",
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",
};
119 
/* MC5 error descriptions, indexed directly by the extended error code. */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
136 
/* MC6 error descriptions, indexed directly by the extended error code. */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
145 
146 static bool f12h_mc0_mce(u16 ec, u8 xec)
147 {
148 	bool ret = false;
149 
150 	if (MEM_ERROR(ec)) {
151 		u8 ll = LL(ec);
152 		ret = true;
153 
154 		if (ll == LL_L2)
155 			pr_cont("during L1 linefill from L2.\n");
156 		else if (ll == LL_L1)
157 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
158 		else
159 			ret = false;
160 	}
161 	return ret;
162 }
163 
164 static bool f10h_mc0_mce(u16 ec, u8 xec)
165 {
166 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
167 		pr_cont("during data scrub.\n");
168 		return true;
169 	}
170 	return f12h_mc0_mce(ec, xec);
171 }
172 
173 static bool k8_mc0_mce(u16 ec, u8 xec)
174 {
175 	if (BUS_ERROR(ec)) {
176 		pr_cont("during system linefill.\n");
177 		return true;
178 	}
179 
180 	return f10h_mc0_mce(ec, xec);
181 }
182 
183 static bool cat_mc0_mce(u16 ec, u8 xec)
184 {
185 	u8 r4	 = R4(ec);
186 	bool ret = true;
187 
188 	if (MEM_ERROR(ec)) {
189 
190 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
191 			return false;
192 
193 		switch (r4) {
194 		case R4_DRD:
195 		case R4_DWR:
196 			pr_cont("Data/Tag parity error due to %s.\n",
197 				(r4 == R4_DRD ? "load/hw prf" : "store"));
198 			break;
199 		case R4_EVICT:
200 			pr_cont("Copyback parity error on a tag miss.\n");
201 			break;
202 		case R4_SNOOP:
203 			pr_cont("Tag parity error during snoop.\n");
204 			break;
205 		default:
206 			ret = false;
207 		}
208 	} else if (BUS_ERROR(ec)) {
209 
210 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
211 			return false;
212 
213 		pr_cont("System read data error on a ");
214 
215 		switch (r4) {
216 		case R4_RD:
217 			pr_cont("TLB reload.\n");
218 			break;
219 		case R4_DWR:
220 			pr_cont("store.\n");
221 			break;
222 		case R4_DRD:
223 			pr_cont("load.\n");
224 			break;
225 		default:
226 			ret = false;
227 		}
228 	} else {
229 		ret = false;
230 	}
231 
232 	return ret;
233 }
234 
235 static bool f15h_mc0_mce(u16 ec, u8 xec)
236 {
237 	bool ret = true;
238 
239 	if (MEM_ERROR(ec)) {
240 
241 		switch (xec) {
242 		case 0x0:
243 			pr_cont("Data Array access error.\n");
244 			break;
245 
246 		case 0x1:
247 			pr_cont("UC error during a linefill from L2/NB.\n");
248 			break;
249 
250 		case 0x2:
251 		case 0x11:
252 			pr_cont("STQ access error.\n");
253 			break;
254 
255 		case 0x3:
256 			pr_cont("SCB access error.\n");
257 			break;
258 
259 		case 0x10:
260 			pr_cont("Tag error.\n");
261 			break;
262 
263 		case 0x12:
264 			pr_cont("LDQ access error.\n");
265 			break;
266 
267 		default:
268 			ret = false;
269 		}
270 	} else if (BUS_ERROR(ec)) {
271 
272 		if (!xec)
273 			pr_cont("System Read Data Error.\n");
274 		else
275 			pr_cont(" Internal error condition type %d.\n", xec);
276 	} else if (INT_ERROR(ec)) {
277 		if (xec <= 0x1f)
278 			pr_cont("Hardware Assert.\n");
279 		else
280 			ret = false;
281 
282 	} else
283 		ret = false;
284 
285 	return ret;
286 }
287 
288 static void decode_mc0_mce(struct mce *m)
289 {
290 	u16 ec = EC(m->status);
291 	u8 xec = XEC(m->status, xec_mask);
292 
293 	pr_emerg(HW_ERR "MC0 Error: ");
294 
295 	/* TLB error signatures are the same across families */
296 	if (TLB_ERROR(ec)) {
297 		if (TT(ec) == TT_DATA) {
298 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
299 				((xec == 2) ? "locked miss"
300 					    : (xec ? "multimatch" : "parity")));
301 			return;
302 		}
303 	} else if (fam_ops.mc0_mce(ec, xec))
304 		;
305 	else
306 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
307 }
308 
309 static bool k8_mc1_mce(u16 ec, u8 xec)
310 {
311 	u8 ll	 = LL(ec);
312 	bool ret = true;
313 
314 	if (!MEM_ERROR(ec))
315 		return false;
316 
317 	if (ll == 0x2)
318 		pr_cont("during a linefill from L2.\n");
319 	else if (ll == 0x1) {
320 		switch (R4(ec)) {
321 		case R4_IRD:
322 			pr_cont("Parity error during data load.\n");
323 			break;
324 
325 		case R4_EVICT:
326 			pr_cont("Copyback Parity/Victim error.\n");
327 			break;
328 
329 		case R4_SNOOP:
330 			pr_cont("Tag Snoop error.\n");
331 			break;
332 
333 		default:
334 			ret = false;
335 			break;
336 		}
337 	} else
338 		ret = false;
339 
340 	return ret;
341 }
342 
343 static bool cat_mc1_mce(u16 ec, u8 xec)
344 {
345 	u8 r4    = R4(ec);
346 	bool ret = true;
347 
348 	if (!MEM_ERROR(ec))
349 		return false;
350 
351 	if (TT(ec) != TT_INSTR)
352 		return false;
353 
354 	if (r4 == R4_IRD)
355 		pr_cont("Data/tag array parity error for a tag hit.\n");
356 	else if (r4 == R4_SNOOP)
357 		pr_cont("Tag error during snoop/victimization.\n");
358 	else if (xec == 0x0)
359 		pr_cont("Tag parity error from victim castout.\n");
360 	else if (xec == 0x2)
361 		pr_cont("Microcode patch RAM parity error.\n");
362 	else
363 		ret = false;
364 
365 	return ret;
366 }
367 
368 static bool f15h_mc1_mce(u16 ec, u8 xec)
369 {
370 	bool ret = true;
371 
372 	if (!MEM_ERROR(ec))
373 		return false;
374 
375 	switch (xec) {
376 	case 0x0 ... 0xa:
377 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
378 		break;
379 
380 	case 0xd:
381 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
382 		break;
383 
384 	case 0x10:
385 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
386 		break;
387 
388 	case 0x11 ... 0x15:
389 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
390 		break;
391 
392 	default:
393 		ret = false;
394 	}
395 	return ret;
396 }
397 
398 static void decode_mc1_mce(struct mce *m)
399 {
400 	u16 ec = EC(m->status);
401 	u8 xec = XEC(m->status, xec_mask);
402 
403 	pr_emerg(HW_ERR "MC1 Error: ");
404 
405 	if (TLB_ERROR(ec))
406 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
407 			(xec ? "multimatch" : "parity error"));
408 	else if (BUS_ERROR(ec)) {
409 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
410 
411 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
412 	} else if (INT_ERROR(ec)) {
413 		if (xec <= 0x3f)
414 			pr_cont("Hardware Assert.\n");
415 		else
416 			goto wrong_mc1_mce;
417 	} else if (fam_ops.mc1_mce(ec, xec))
418 		;
419 	else
420 		goto wrong_mc1_mce;
421 
422 	return;
423 
424 wrong_mc1_mce:
425 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
426 }
427 
428 static bool k8_mc2_mce(u16 ec, u8 xec)
429 {
430 	bool ret = true;
431 
432 	if (xec == 0x1)
433 		pr_cont(" in the write data buffers.\n");
434 	else if (xec == 0x3)
435 		pr_cont(" in the victim data buffers.\n");
436 	else if (xec == 0x2 && MEM_ERROR(ec))
437 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
438 	else if (xec == 0x0) {
439 		if (TLB_ERROR(ec))
440 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
441 				TT_MSG(ec));
442 		else if (BUS_ERROR(ec))
443 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
444 				R4_MSG(ec), PP_MSG(ec));
445 		else if (MEM_ERROR(ec)) {
446 			u8 r4 = R4(ec);
447 
448 			if (r4 >= 0x7)
449 				pr_cont(": %s error during data copyback.\n",
450 					R4_MSG(ec));
451 			else if (r4 <= 0x1)
452 				pr_cont(": %s parity/ECC error during data "
453 					"access from L2.\n", R4_MSG(ec));
454 			else
455 				ret = false;
456 		} else
457 			ret = false;
458 	} else
459 		ret = false;
460 
461 	return ret;
462 }
463 
464 static bool f15h_mc2_mce(u16 ec, u8 xec)
465 {
466 	bool ret = true;
467 
468 	if (TLB_ERROR(ec)) {
469 		if (xec == 0x0)
470 			pr_cont("Data parity TLB read error.\n");
471 		else if (xec == 0x1)
472 			pr_cont("Poison data provided for TLB fill.\n");
473 		else
474 			ret = false;
475 	} else if (BUS_ERROR(ec)) {
476 		if (xec > 2)
477 			ret = false;
478 
479 		pr_cont("Error during attempted NB data read.\n");
480 	} else if (MEM_ERROR(ec)) {
481 		switch (xec) {
482 		case 0x4 ... 0xc:
483 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
484 			break;
485 
486 		case 0x10 ... 0x14:
487 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
488 			break;
489 
490 		default:
491 			ret = false;
492 		}
493 	} else if (INT_ERROR(ec)) {
494 		if (xec <= 0x3f)
495 			pr_cont("Hardware Assert.\n");
496 		else
497 			ret = false;
498 	}
499 
500 	return ret;
501 }
502 
503 static bool f16h_mc2_mce(u16 ec, u8 xec)
504 {
505 	u8 r4 = R4(ec);
506 
507 	if (!MEM_ERROR(ec))
508 		return false;
509 
510 	switch (xec) {
511 	case 0x04 ... 0x05:
512 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
513 		break;
514 
515 	case 0x09 ... 0x0b:
516 	case 0x0d ... 0x0f:
517 		pr_cont("ECC error in L2 tag (%s).\n",
518 			((r4 == R4_GEN)   ? "BankReq" :
519 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
520 		break;
521 
522 	case 0x10 ... 0x19:
523 	case 0x1b:
524 		pr_cont("ECC error in L2 data array (%s).\n",
525 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
526 			((r4 == R4_GEN)   ? "Attr" :
527 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
528 		break;
529 
530 	case 0x1c ... 0x1d:
531 	case 0x1f:
532 		pr_cont("Parity error in L2 attribute bits (%s).\n",
533 			((r4 == R4_RD)  ? "Hit"  :
534 			((r4 == R4_GEN) ? "Attr" : "Fill")));
535 		break;
536 
537 	default:
538 		return false;
539 	}
540 
541 	return true;
542 }
543 
544 static void decode_mc2_mce(struct mce *m)
545 {
546 	u16 ec = EC(m->status);
547 	u8 xec = XEC(m->status, xec_mask);
548 
549 	pr_emerg(HW_ERR "MC2 Error: ");
550 
551 	if (!fam_ops.mc2_mce(ec, xec))
552 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
553 }
554 
555 static void decode_mc3_mce(struct mce *m)
556 {
557 	u16 ec = EC(m->status);
558 	u8 xec = XEC(m->status, xec_mask);
559 
560 	if (boot_cpu_data.x86 >= 0x14) {
561 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
562 			 " please report on LKML.\n");
563 		return;
564 	}
565 
566 	pr_emerg(HW_ERR "MC3 Error");
567 
568 	if (xec == 0x0) {
569 		u8 r4 = R4(ec);
570 
571 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
572 			goto wrong_mc3_mce;
573 
574 		pr_cont(" during %s.\n", R4_MSG(ec));
575 	} else
576 		goto wrong_mc3_mce;
577 
578 	return;
579 
580  wrong_mc3_mce:
581 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
582 }
583 
584 static void decode_mc4_mce(struct mce *m)
585 {
586 	unsigned int fam = x86_family(m->cpuid);
587 	int node_id = topology_amd_node_id(m->extcpu);
588 	u16 ec = EC(m->status);
589 	u8 xec = XEC(m->status, 0x1f);
590 	u8 offset = 0;
591 
592 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
593 
594 	switch (xec) {
595 	case 0x0 ... 0xe:
596 
597 		/* special handling for DRAM ECCs */
598 		if (xec == 0x0 || xec == 0x8) {
599 			/* no ECCs on F11h */
600 			if (fam == 0x11)
601 				goto wrong_mc4_mce;
602 
603 			pr_cont("%s.\n", mc4_mce_desc[xec]);
604 
605 			if (decode_dram_ecc)
606 				decode_dram_ecc(node_id, m);
607 			return;
608 		}
609 		break;
610 
611 	case 0xf:
612 		if (TLB_ERROR(ec))
613 			pr_cont("GART Table Walk data error.\n");
614 		else if (BUS_ERROR(ec))
615 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
616 		else
617 			goto wrong_mc4_mce;
618 		return;
619 
620 	case 0x19:
621 		if (fam == 0x15 || fam == 0x16)
622 			pr_cont("Compute Unit Data Error.\n");
623 		else
624 			goto wrong_mc4_mce;
625 		return;
626 
627 	case 0x1c ... 0x1f:
628 		offset = 13;
629 		break;
630 
631 	default:
632 		goto wrong_mc4_mce;
633 	}
634 
635 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
636 	return;
637 
638  wrong_mc4_mce:
639 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
640 }
641 
642 static void decode_mc5_mce(struct mce *m)
643 {
644 	unsigned int fam = x86_family(m->cpuid);
645 	u16 ec = EC(m->status);
646 	u8 xec = XEC(m->status, xec_mask);
647 
648 	if (fam == 0xf || fam == 0x11)
649 		goto wrong_mc5_mce;
650 
651 	pr_emerg(HW_ERR "MC5 Error: ");
652 
653 	if (INT_ERROR(ec)) {
654 		if (xec <= 0x1f) {
655 			pr_cont("Hardware Assert.\n");
656 			return;
657 		} else
658 			goto wrong_mc5_mce;
659 	}
660 
661 	if (xec == 0x0 || xec == 0xc)
662 		pr_cont("%s.\n", mc5_mce_desc[xec]);
663 	else if (xec <= 0xd)
664 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
665 	else
666 		goto wrong_mc5_mce;
667 
668 	return;
669 
670  wrong_mc5_mce:
671 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
672 }
673 
674 static void decode_mc6_mce(struct mce *m)
675 {
676 	u8 xec = XEC(m->status, xec_mask);
677 
678 	pr_emerg(HW_ERR "MC6 Error: ");
679 
680 	if (xec > 0x5)
681 		goto wrong_mc6_mce;
682 
683 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
684 	return;
685 
686  wrong_mc6_mce:
687 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
688 }
689 
/*
 * Human-readable names of the SMCA bank types, indexed by
 * enum smca_bank_types. Range designators give several bank type values
 * the same name. Look entries up through smca_get_long_name(), which
 * bounds-checks the index.
 */
static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
	[SMCA_PIE]			= "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_USB]			= "USB Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
};
722 
723 static const char *smca_get_long_name(enum smca_bank_types t)
724 {
725 	if (t >= N_SMCA_BANK_TYPES)
726 		return NULL;
727 
728 	return smca_long_names[t];
729 }
730 
731 /* Decode errors according to Scalable MCA specification */
732 static void decode_smca_error(struct mce *m)
733 {
734 	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
735 	u8 xec = XEC(m->status, xec_mask);
736 
737 	if (bank_type >= N_SMCA_BANK_TYPES)
738 		return;
739 
740 	if (bank_type == SMCA_RESERVED) {
741 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
742 		return;
743 	}
744 
745 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);
746 
747 	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
748 	    xec == 0 && decode_dram_ecc)
749 		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
750 }
751 
752 static inline void amd_decode_err_code(u16 ec)
753 {
754 	if (INT_ERROR(ec)) {
755 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
756 		return;
757 	}
758 
759 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
760 
761 	if (BUS_ERROR(ec))
762 		pr_cont(", mem/io: %s", II_MSG(ec));
763 	else
764 		pr_cont(", tx: %s", TT_MSG(ec));
765 
766 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
767 		pr_cont(", mem-tx: %s", R4_MSG(ec));
768 
769 		if (BUS_ERROR(ec))
770 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
771 	}
772 
773 	pr_cont("\n");
774 }
775 
776 static const char *decode_error_status(struct mce *m)
777 {
778 	if (m->status & MCI_STATUS_UC) {
779 		if (m->status & MCI_STATUS_PCC)
780 			return "System Fatal error.";
781 		if (m->mcgstatus & MCG_STATUS_RIPV)
782 			return "Uncorrected, software restartable error.";
783 		return "Uncorrected, software containable error.";
784 	}
785 
786 	if (m->status & MCI_STATUS_DEFERRED)
787 		return "Deferred error, no action required.";
788 
789 	return "Corrected error, no action required.";
790 }
791 
792 static int
793 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
794 {
795 	struct mce *m = (struct mce *)data;
796 	struct mce_hw_err *err = to_mce_hw_err(m);
797 	unsigned int fam = x86_family(m->cpuid);
798 	u32 mca_config_lo = 0, dummy;
799 	int ecc;
800 
801 	if (m->kflags & MCE_HANDLED_CEC)
802 		return NOTIFY_DONE;
803 
804 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
805 
806 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
807 		m->extcpu,
808 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
809 		m->bank,
810 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
811 		((m->status & MCI_STATUS_UC)	? "UE"	  :
812 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
813 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
814 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
815 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
816 
817 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
818 		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
819 
820 		if (mca_config_lo & MCI_CONFIG_MCAX)
821 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
822 
823 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
824 	}
825 
826 	/* do the two bits[14:13] together */
827 	ecc = (m->status >> 45) & 0x3;
828 	if (ecc)
829 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
830 
831 	if (fam >= 0x15) {
832 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
833 
834 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
835 		if (fam != 0x15 || m->bank != 4)
836 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
837 	}
838 
839 	if (fam >= 0x17)
840 		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
841 
842 	pr_cont("]: 0x%016llx\n", m->status);
843 
844 	if (m->status & MCI_STATUS_ADDRV)
845 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
846 
847 	if (m->ppin)
848 		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
849 
850 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
851 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
852 
853 		if (m->status & MCI_STATUS_SYNDV) {
854 			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
855 			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
856 				char frutext[17];
857 
858 				frutext[16] = '\0';
859 				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
860 				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
861 
862 				pr_emerg(HW_ERR "FRU Text: %s", frutext);
863 			}
864 		}
865 
866 		pr_cont("\n");
867 
868 		decode_smca_error(m);
869 		goto err_code;
870 	}
871 
872 	if (m->tsc)
873 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
874 
875 	/* Doesn't matter which member to test. */
876 	if (!fam_ops.mc0_mce)
877 		goto err_code;
878 
879 	switch (m->bank) {
880 	case 0:
881 		decode_mc0_mce(m);
882 		break;
883 
884 	case 1:
885 		decode_mc1_mce(m);
886 		break;
887 
888 	case 2:
889 		decode_mc2_mce(m);
890 		break;
891 
892 	case 3:
893 		decode_mc3_mce(m);
894 		break;
895 
896 	case 4:
897 		decode_mc4_mce(m);
898 		break;
899 
900 	case 5:
901 		decode_mc5_mce(m);
902 		break;
903 
904 	case 6:
905 		decode_mc6_mce(m);
906 		break;
907 
908 	default:
909 		break;
910 	}
911 
912  err_code:
913 	amd_decode_err_code(m->status & 0xffff);
914 
915 	m->kflags |= MCE_HANDLED_EDAC;
916 	return NOTIFY_OK;
917 }
918 
919 static struct notifier_block amd_mce_dec_nb = {
920 	.notifier_call	= amd_decode_mce,
921 	.priority	= MCE_PRIO_EDAC,
922 };
923 
924 static int __init mce_amd_init(void)
925 {
926 	struct cpuinfo_x86 *c = &boot_cpu_data;
927 
928 	if (c->x86_vendor != X86_VENDOR_AMD &&
929 	    c->x86_vendor != X86_VENDOR_HYGON)
930 		return -ENODEV;
931 
932 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
933 		return -ENODEV;
934 
935 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
936 		xec_mask = 0x3f;
937 		goto out;
938 	}
939 
940 	switch (c->x86) {
941 	case 0xf:
942 		fam_ops.mc0_mce = k8_mc0_mce;
943 		fam_ops.mc1_mce = k8_mc1_mce;
944 		fam_ops.mc2_mce = k8_mc2_mce;
945 		break;
946 
947 	case 0x10:
948 		fam_ops.mc0_mce = f10h_mc0_mce;
949 		fam_ops.mc1_mce = k8_mc1_mce;
950 		fam_ops.mc2_mce = k8_mc2_mce;
951 		break;
952 
953 	case 0x11:
954 		fam_ops.mc0_mce = k8_mc0_mce;
955 		fam_ops.mc1_mce = k8_mc1_mce;
956 		fam_ops.mc2_mce = k8_mc2_mce;
957 		break;
958 
959 	case 0x12:
960 		fam_ops.mc0_mce = f12h_mc0_mce;
961 		fam_ops.mc1_mce = k8_mc1_mce;
962 		fam_ops.mc2_mce = k8_mc2_mce;
963 		break;
964 
965 	case 0x14:
966 		fam_ops.mc0_mce = cat_mc0_mce;
967 		fam_ops.mc1_mce = cat_mc1_mce;
968 		fam_ops.mc2_mce = k8_mc2_mce;
969 		break;
970 
971 	case 0x15:
972 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
973 
974 		fam_ops.mc0_mce = f15h_mc0_mce;
975 		fam_ops.mc1_mce = f15h_mc1_mce;
976 		fam_ops.mc2_mce = f15h_mc2_mce;
977 		break;
978 
979 	case 0x16:
980 		xec_mask = 0x1f;
981 		fam_ops.mc0_mce = cat_mc0_mce;
982 		fam_ops.mc1_mce = cat_mc1_mce;
983 		fam_ops.mc2_mce = f16h_mc2_mce;
984 		break;
985 
986 	case 0x17:
987 	case 0x18:
988 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
989 		return -EINVAL;
990 
991 	default:
992 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
993 		return -EINVAL;
994 	}
995 
996 out:
997 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
998 
999 	mce_register_decode_chain(&amd_mce_dec_nb);
1000 
1001 	return 0;
1002 }
1003 early_initcall(mce_amd_init);
1004 
#ifdef MODULE
/* Module unload: take the decoder notifier off the MCE decode chain again. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
#endif
1016