xref: /linux/drivers/edac/mce_amd.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 #include <linux/module.h>
2 #include <linux/slab.h>
3 
4 #include "mce_amd.h"
5 
6 static struct amd_decoder_ops *fam_ops;
7 
8 static u8 xec_mask	 = 0xf;
9 
10 static bool report_gart_errors;
11 static void (*nb_bus_decoder)(int node_id, struct mce *m);
12 
13 void amd_report_gart_errors(bool v)
14 {
15 	report_gart_errors = v;
16 }
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18 
19 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
20 {
21 	nb_bus_decoder = f;
22 }
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24 
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
26 {
27 	if (nb_bus_decoder) {
28 		WARN_ON(nb_bus_decoder != f);
29 
30 		nb_bus_decoder = NULL;
31 	}
32 }
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34 
/*
 * String representations of the individual fields of an MCA error code,
 * see F3x48 or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
50 
/* participating processor (non-static: exported for use outside this file) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
54 
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
63 
/*
 * Family 15h MC1 error descriptions. The table is packed: f15h_mc1_mce()
 * looks up xec 0x0-0xa directly, xec 0xd at index xec - 2 and xec 0x10-0x15
 * at index xec - 4.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0x00] = "UC during a demand linefill from L2",
	[0x01] = "Parity error during data load from IC",
	[0x02] = "Parity error for IC valid bit",
	[0x03] = "Main tag parity error",
	[0x04] = "Parity error in prediction queue",
	[0x05] = "PFB data/address parity error",
	[0x06] = "Parity error in the branch status reg",
	[0x07] = "PFB promotion address error",
	[0x08] = "Tag error during probe/victimization",
	[0x09] = "Parity error for IC probe tag valid bit",
	[0x0a] = "PFB non-cacheable bit parity error",
	[0x0b] = "PFB valid bit parity error",		/* xec = 0xd */
	[0x0c] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[0x0d] = "uop queue",				/* xec = 0x11 */
	[0x0e] = "insn buffer",				/* xec = 0x12 */
	[0x0f] = "predecode buffer",			/* xec = 0x13 */
	[0x10] = "fetch address FIFO",			/* xec = 0x14 */
	[0x11] = "dispatch uop queue",			/* xec = 0x15 */
};
84 
/*
 * Family 15h MC2 memory-error descriptions. f15h_mc2_mce() looks up
 * xec 0x4-0xc at index xec - 0x4 and xec 0x10-0x14 at index xec - 0x7.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0x0] = "Fill ECC error on data fills",		/* xec = 0x4 */
	[0x1] = "Fill parity error on insn fills",
	[0x2] = "Prefetcher request FIFO parity error",
	[0x3] = "PRQ address parity error",
	[0x4] = "PRQ data parity error",
	[0x5] = "WCC Tag ECC error",
	[0x6] = "WCC Data ECC error",
	[0x7] = "WCB Data parity error",
	[0x8] = "VB Data ECC or parity error",		/* xec = 0xc */
	[0x9] = "L2 Tag ECC error",			/* xec = 0x10 */
	[0xa] = "Hard L2 Tag ECC error",
	[0xb] = "Multiple hits on L2 tag",
	[0xc] = "XAB parity error",
	[0xd] = "PRB address parity error",		/* xec = 0x14 */
};
101 
/*
 * MC4 error descriptions. decode_mc4_mce() looks up xec 0x0-0xe directly
 * and xec 0x1c-0x1f at index xec - 13.
 */
static const char * const mc4_mce_desc[] = {
	[0x00] = "DRAM ECC error detected on the NB",
	[0x01] = "CRC error detected on HT link",
	[0x02] = "Link-defined sync error packets detected on HT link",
	[0x03] = "HT Master abort",
	[0x04] = "HT Target abort",
	[0x05] = "Invalid GART PTE entry during GART table walk",
	[0x06] = "Unsupported atomic RMW received from an IO link",
	[0x07] = "Watchdog timeout due to lack of progress",
	[0x08] = "DRAM ECC error detected on the NB",
	[0x09] = "SVM DMA Exclusion Vector error",
	[0x0a] = "HT data error detected on link",
	[0x0b] = "Protocol error (link, L3, probe filter)",
	[0x0c] = "NB internal arrays parity error",
	[0x0d] = "DRAM addr/ctl signals parity error",
	[0x0e] = "IO link transmission error",
	[0x0f] = "L3 data cache ECC error",		/* xec = 0x1c */
	[0x10] = "L3 cache tag error",			/* xec = 0x1d */
	[0x11] = "L3 LRU parity bits error",		/* xec = 0x1e */
	[0x12] = "ECC Error in the Probe Filter directory", /* xec = 0x1f */
};
123 
/* MC5 error descriptions, indexed directly by xec (0x0-0xd). */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
140 
/* MC6 error descriptions, indexed directly by xec (0x0-0x5). */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
149 
/* Scalable MCA error strings */

/*
 * SMCA_LS bank descriptions, indexed directly by xec. Index 0x4 is a
 * reserved placeholder; decode_f17h_core_errors() rejects that xec before
 * indexing the table.
 */
static const char * const f17h_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"",						/* reserved */
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",	/* typo fixed: was "comsumption" */
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};
174 
/* SMCA_IF bank descriptions, indexed directly by xec. */
static const char * const f17h_if_mce_desc[] = {
	[0x0] = "microtag probe port parity error",
	[0x1] = "IC microtag or full tag multi-hit error",
	[0x2] = "IC full tag parity",
	[0x3] = "IC data array parity",
	[0x4] = "Decoupling queue phys addr parity error",
	[0x5] = "L0 ITLB parity error",
	[0x6] = "L1 ITLB parity error",
	[0x7] = "L2 ITLB parity error",
	[0x8] = "BPQ snoop parity on Thread 0",
	[0x9] = "BPQ snoop parity on Thread 1",
	[0xa] = "L1 BTB multi-match error",
	[0xb] = "L2 BTB multi-match error",
};
189 
/* SMCA_L2_CACHE bank descriptions, indexed directly by xec. */
static const char * const f17h_l2_mce_desc[] = {
	[0x0] = "L2M tag multi-way-hit error",
	[0x1] = "L2M tag ECC error",
	[0x2] = "L2M data ECC error",
	[0x3] = "HW assert",
};
196 
/* SMCA_DE bank descriptions, indexed directly by xec. */
static const char * const f17h_de_mce_desc[] = {
	[0x0] = "uop cache tag parity error",
	[0x1] = "uop cache data parity error",
	[0x2] = "Insn buffer parity error",
	[0x3] = "Insn dispatch queue parity error",
	[0x4] = "Fetch address FIFO parity",
	[0x5] = "Patch RAM data parity",
	[0x6] = "Patch RAM sequencer parity",
	[0x7] = "uop buffer parity",
};
207 
/* SMCA_EX bank descriptions, indexed directly by xec. */
static const char * const f17h_ex_mce_desc[] = {
	[0x0] = "Watchdog timeout error",
	[0x1] = "Phy register file parity",
	[0x2] = "Flag register file parity",
	[0x3] = "Immediate displacement register file parity",
	[0x4] = "Address generator payload parity",
	[0x5] = "EX payload parity",
	[0x6] = "Checkpoint queue parity",
	[0x7] = "Retire dispatch queue parity",
};
218 
/* SMCA_FP bank descriptions, indexed directly by xec. */
static const char * const f17h_fp_mce_desc[] = {
	[0x0] = "Physical register file parity",
	[0x1] = "Freelist parity error",
	[0x2] = "Schedule queue parity",
	[0x3] = "NSQ parity error",
	[0x4] = "Retire queue parity",
	[0x5] = "Status register file parity",
};
227 
/* SMCA_L3_CACHE bank descriptions, indexed directly by xec. */
static const char * const f17h_l3_mce_desc[] = {
	[0x0] = "Shadow tag macro ECC error",
	[0x1] = "Shadow tag macro multi-way-hit error",
	[0x2] = "L3M tag ECC error",
	[0x3] = "L3M tag multi-way-hit error",
	[0x4] = "L3M data ECC error",
	[0x5] = "XI parity, L3 fill done channel error",
	[0x6] = "L3 victim queue parity",
	[0x7] = "L3 HW assert",
};
238 
/* SMCA_CS (Data Fabric) bank descriptions, indexed directly by xec. */
static const char * const f17h_cs_mce_desc[] = {
	[0x0] = "Illegal request from transport layer",
	[0x1] = "Address violation",
	[0x2] = "Security violation",
	[0x3] = "Illegal response from transport layer",
	[0x4] = "Unexpected response",
	[0x5] = "Parity error on incoming request or probe response data",
	[0x6] = "Parity error on incoming read response data",
	[0x7] = "Atomic request parity",
	[0x8] = "ECC error on probe filter access",
};
250 
/* SMCA_PIE (Data Fabric) bank descriptions, indexed directly by xec. */
static const char * const f17h_pie_mce_desc[] = {
	[0x0] = "HW assert",
	[0x1] = "Internal PIE register security violation",
	[0x2] = "Error on GMI link",
	[0x3] = "Poison data written to internal PIE register",
};
257 
/* SMCA_UMC bank descriptions, indexed directly by xec. */
static const char * const f17h_umc_mce_desc[] = {
	[0x0] = "DRAM ECC error",
	[0x1] = "Data poison error on DRAM",
	[0x2] = "SDP parity error",
	[0x3] = "Advanced peripheral bus error",
	[0x4] = "Command/address parity error",
	[0x5] = "Write data CRC error",
};
266 
/* SMCA_PB bank descriptions: a single entry for xec 0. */
static const char * const f17h_pb_mce_desc[] = {
	[0x0] = "Parameter Block RAM ECC error",
};

/* SMCA_PSP bank descriptions: a single entry for xec 0. */
static const char * const f17h_psp_mce_desc[] = {
	[0x0] = "PSP RAM ECC or parity error",
};

/* SMCA_SMU bank descriptions: a single entry for xec 0. */
static const char * const f17h_smu_mce_desc[] = {
	[0x0] = "SMU RAM ECC or parity error",
};
278 
279 static bool f12h_mc0_mce(u16 ec, u8 xec)
280 {
281 	bool ret = false;
282 
283 	if (MEM_ERROR(ec)) {
284 		u8 ll = LL(ec);
285 		ret = true;
286 
287 		if (ll == LL_L2)
288 			pr_cont("during L1 linefill from L2.\n");
289 		else if (ll == LL_L1)
290 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
291 		else
292 			ret = false;
293 	}
294 	return ret;
295 }
296 
297 static bool f10h_mc0_mce(u16 ec, u8 xec)
298 {
299 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
300 		pr_cont("during data scrub.\n");
301 		return true;
302 	}
303 	return f12h_mc0_mce(ec, xec);
304 }
305 
306 static bool k8_mc0_mce(u16 ec, u8 xec)
307 {
308 	if (BUS_ERROR(ec)) {
309 		pr_cont("during system linefill.\n");
310 		return true;
311 	}
312 
313 	return f10h_mc0_mce(ec, xec);
314 }
315 
316 static bool cat_mc0_mce(u16 ec, u8 xec)
317 {
318 	u8 r4	 = R4(ec);
319 	bool ret = true;
320 
321 	if (MEM_ERROR(ec)) {
322 
323 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
324 			return false;
325 
326 		switch (r4) {
327 		case R4_DRD:
328 		case R4_DWR:
329 			pr_cont("Data/Tag parity error due to %s.\n",
330 				(r4 == R4_DRD ? "load/hw prf" : "store"));
331 			break;
332 		case R4_EVICT:
333 			pr_cont("Copyback parity error on a tag miss.\n");
334 			break;
335 		case R4_SNOOP:
336 			pr_cont("Tag parity error during snoop.\n");
337 			break;
338 		default:
339 			ret = false;
340 		}
341 	} else if (BUS_ERROR(ec)) {
342 
343 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
344 			return false;
345 
346 		pr_cont("System read data error on a ");
347 
348 		switch (r4) {
349 		case R4_RD:
350 			pr_cont("TLB reload.\n");
351 			break;
352 		case R4_DWR:
353 			pr_cont("store.\n");
354 			break;
355 		case R4_DRD:
356 			pr_cont("load.\n");
357 			break;
358 		default:
359 			ret = false;
360 		}
361 	} else {
362 		ret = false;
363 	}
364 
365 	return ret;
366 }
367 
368 static bool f15h_mc0_mce(u16 ec, u8 xec)
369 {
370 	bool ret = true;
371 
372 	if (MEM_ERROR(ec)) {
373 
374 		switch (xec) {
375 		case 0x0:
376 			pr_cont("Data Array access error.\n");
377 			break;
378 
379 		case 0x1:
380 			pr_cont("UC error during a linefill from L2/NB.\n");
381 			break;
382 
383 		case 0x2:
384 		case 0x11:
385 			pr_cont("STQ access error.\n");
386 			break;
387 
388 		case 0x3:
389 			pr_cont("SCB access error.\n");
390 			break;
391 
392 		case 0x10:
393 			pr_cont("Tag error.\n");
394 			break;
395 
396 		case 0x12:
397 			pr_cont("LDQ access error.\n");
398 			break;
399 
400 		default:
401 			ret = false;
402 		}
403 	} else if (BUS_ERROR(ec)) {
404 
405 		if (!xec)
406 			pr_cont("System Read Data Error.\n");
407 		else
408 			pr_cont(" Internal error condition type %d.\n", xec);
409 	} else if (INT_ERROR(ec)) {
410 		if (xec <= 0x1f)
411 			pr_cont("Hardware Assert.\n");
412 		else
413 			ret = false;
414 
415 	} else
416 		ret = false;
417 
418 	return ret;
419 }
420 
421 static void decode_mc0_mce(struct mce *m)
422 {
423 	u16 ec = EC(m->status);
424 	u8 xec = XEC(m->status, xec_mask);
425 
426 	pr_emerg(HW_ERR "MC0 Error: ");
427 
428 	/* TLB error signatures are the same across families */
429 	if (TLB_ERROR(ec)) {
430 		if (TT(ec) == TT_DATA) {
431 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
432 				((xec == 2) ? "locked miss"
433 					    : (xec ? "multimatch" : "parity")));
434 			return;
435 		}
436 	} else if (fam_ops->mc0_mce(ec, xec))
437 		;
438 	else
439 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
440 }
441 
442 static bool k8_mc1_mce(u16 ec, u8 xec)
443 {
444 	u8 ll	 = LL(ec);
445 	bool ret = true;
446 
447 	if (!MEM_ERROR(ec))
448 		return false;
449 
450 	if (ll == 0x2)
451 		pr_cont("during a linefill from L2.\n");
452 	else if (ll == 0x1) {
453 		switch (R4(ec)) {
454 		case R4_IRD:
455 			pr_cont("Parity error during data load.\n");
456 			break;
457 
458 		case R4_EVICT:
459 			pr_cont("Copyback Parity/Victim error.\n");
460 			break;
461 
462 		case R4_SNOOP:
463 			pr_cont("Tag Snoop error.\n");
464 			break;
465 
466 		default:
467 			ret = false;
468 			break;
469 		}
470 	} else
471 		ret = false;
472 
473 	return ret;
474 }
475 
476 static bool cat_mc1_mce(u16 ec, u8 xec)
477 {
478 	u8 r4    = R4(ec);
479 	bool ret = true;
480 
481 	if (!MEM_ERROR(ec))
482 		return false;
483 
484 	if (TT(ec) != TT_INSTR)
485 		return false;
486 
487 	if (r4 == R4_IRD)
488 		pr_cont("Data/tag array parity error for a tag hit.\n");
489 	else if (r4 == R4_SNOOP)
490 		pr_cont("Tag error during snoop/victimization.\n");
491 	else if (xec == 0x0)
492 		pr_cont("Tag parity error from victim castout.\n");
493 	else if (xec == 0x2)
494 		pr_cont("Microcode patch RAM parity error.\n");
495 	else
496 		ret = false;
497 
498 	return ret;
499 }
500 
501 static bool f15h_mc1_mce(u16 ec, u8 xec)
502 {
503 	bool ret = true;
504 
505 	if (!MEM_ERROR(ec))
506 		return false;
507 
508 	switch (xec) {
509 	case 0x0 ... 0xa:
510 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
511 		break;
512 
513 	case 0xd:
514 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
515 		break;
516 
517 	case 0x10:
518 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
519 		break;
520 
521 	case 0x11 ... 0x15:
522 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
523 		break;
524 
525 	default:
526 		ret = false;
527 	}
528 	return ret;
529 }
530 
531 static void decode_mc1_mce(struct mce *m)
532 {
533 	u16 ec = EC(m->status);
534 	u8 xec = XEC(m->status, xec_mask);
535 
536 	pr_emerg(HW_ERR "MC1 Error: ");
537 
538 	if (TLB_ERROR(ec))
539 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
540 			(xec ? "multimatch" : "parity error"));
541 	else if (BUS_ERROR(ec)) {
542 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
543 
544 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
545 	} else if (INT_ERROR(ec)) {
546 		if (xec <= 0x3f)
547 			pr_cont("Hardware Assert.\n");
548 		else
549 			goto wrong_mc1_mce;
550 	} else if (fam_ops->mc1_mce(ec, xec))
551 		;
552 	else
553 		goto wrong_mc1_mce;
554 
555 	return;
556 
557 wrong_mc1_mce:
558 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
559 }
560 
561 static bool k8_mc2_mce(u16 ec, u8 xec)
562 {
563 	bool ret = true;
564 
565 	if (xec == 0x1)
566 		pr_cont(" in the write data buffers.\n");
567 	else if (xec == 0x3)
568 		pr_cont(" in the victim data buffers.\n");
569 	else if (xec == 0x2 && MEM_ERROR(ec))
570 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
571 	else if (xec == 0x0) {
572 		if (TLB_ERROR(ec))
573 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
574 				TT_MSG(ec));
575 		else if (BUS_ERROR(ec))
576 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
577 				R4_MSG(ec), PP_MSG(ec));
578 		else if (MEM_ERROR(ec)) {
579 			u8 r4 = R4(ec);
580 
581 			if (r4 >= 0x7)
582 				pr_cont(": %s error during data copyback.\n",
583 					R4_MSG(ec));
584 			else if (r4 <= 0x1)
585 				pr_cont(": %s parity/ECC error during data "
586 					"access from L2.\n", R4_MSG(ec));
587 			else
588 				ret = false;
589 		} else
590 			ret = false;
591 	} else
592 		ret = false;
593 
594 	return ret;
595 }
596 
597 static bool f15h_mc2_mce(u16 ec, u8 xec)
598 {
599 	bool ret = true;
600 
601 	if (TLB_ERROR(ec)) {
602 		if (xec == 0x0)
603 			pr_cont("Data parity TLB read error.\n");
604 		else if (xec == 0x1)
605 			pr_cont("Poison data provided for TLB fill.\n");
606 		else
607 			ret = false;
608 	} else if (BUS_ERROR(ec)) {
609 		if (xec > 2)
610 			ret = false;
611 
612 		pr_cont("Error during attempted NB data read.\n");
613 	} else if (MEM_ERROR(ec)) {
614 		switch (xec) {
615 		case 0x4 ... 0xc:
616 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
617 			break;
618 
619 		case 0x10 ... 0x14:
620 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
621 			break;
622 
623 		default:
624 			ret = false;
625 		}
626 	} else if (INT_ERROR(ec)) {
627 		if (xec <= 0x3f)
628 			pr_cont("Hardware Assert.\n");
629 		else
630 			ret = false;
631 	}
632 
633 	return ret;
634 }
635 
636 static bool f16h_mc2_mce(u16 ec, u8 xec)
637 {
638 	u8 r4 = R4(ec);
639 
640 	if (!MEM_ERROR(ec))
641 		return false;
642 
643 	switch (xec) {
644 	case 0x04 ... 0x05:
645 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
646 		break;
647 
648 	case 0x09 ... 0x0b:
649 	case 0x0d ... 0x0f:
650 		pr_cont("ECC error in L2 tag (%s).\n",
651 			((r4 == R4_GEN)   ? "BankReq" :
652 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
653 		break;
654 
655 	case 0x10 ... 0x19:
656 	case 0x1b:
657 		pr_cont("ECC error in L2 data array (%s).\n",
658 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
659 			((r4 == R4_GEN)   ? "Attr" :
660 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
661 		break;
662 
663 	case 0x1c ... 0x1d:
664 	case 0x1f:
665 		pr_cont("Parity error in L2 attribute bits (%s).\n",
666 			((r4 == R4_RD)  ? "Hit"  :
667 			((r4 == R4_GEN) ? "Attr" : "Fill")));
668 		break;
669 
670 	default:
671 		return false;
672 	}
673 
674 	return true;
675 }
676 
677 static void decode_mc2_mce(struct mce *m)
678 {
679 	u16 ec = EC(m->status);
680 	u8 xec = XEC(m->status, xec_mask);
681 
682 	pr_emerg(HW_ERR "MC2 Error: ");
683 
684 	if (!fam_ops->mc2_mce(ec, xec))
685 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
686 }
687 
688 static void decode_mc3_mce(struct mce *m)
689 {
690 	u16 ec = EC(m->status);
691 	u8 xec = XEC(m->status, xec_mask);
692 
693 	if (boot_cpu_data.x86 >= 0x14) {
694 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
695 			 " please report on LKML.\n");
696 		return;
697 	}
698 
699 	pr_emerg(HW_ERR "MC3 Error");
700 
701 	if (xec == 0x0) {
702 		u8 r4 = R4(ec);
703 
704 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
705 			goto wrong_mc3_mce;
706 
707 		pr_cont(" during %s.\n", R4_MSG(ec));
708 	} else
709 		goto wrong_mc3_mce;
710 
711 	return;
712 
713  wrong_mc3_mce:
714 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
715 }
716 
717 static void decode_mc4_mce(struct mce *m)
718 {
719 	struct cpuinfo_x86 *c = &boot_cpu_data;
720 	int node_id = amd_get_nb_id(m->extcpu);
721 	u16 ec = EC(m->status);
722 	u8 xec = XEC(m->status, 0x1f);
723 	u8 offset = 0;
724 
725 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
726 
727 	switch (xec) {
728 	case 0x0 ... 0xe:
729 
730 		/* special handling for DRAM ECCs */
731 		if (xec == 0x0 || xec == 0x8) {
732 			/* no ECCs on F11h */
733 			if (c->x86 == 0x11)
734 				goto wrong_mc4_mce;
735 
736 			pr_cont("%s.\n", mc4_mce_desc[xec]);
737 
738 			if (nb_bus_decoder)
739 				nb_bus_decoder(node_id, m);
740 			return;
741 		}
742 		break;
743 
744 	case 0xf:
745 		if (TLB_ERROR(ec))
746 			pr_cont("GART Table Walk data error.\n");
747 		else if (BUS_ERROR(ec))
748 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
749 		else
750 			goto wrong_mc4_mce;
751 		return;
752 
753 	case 0x19:
754 		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
755 			pr_cont("Compute Unit Data Error.\n");
756 		else
757 			goto wrong_mc4_mce;
758 		return;
759 
760 	case 0x1c ... 0x1f:
761 		offset = 13;
762 		break;
763 
764 	default:
765 		goto wrong_mc4_mce;
766 	}
767 
768 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
769 	return;
770 
771  wrong_mc4_mce:
772 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
773 }
774 
775 static void decode_mc5_mce(struct mce *m)
776 {
777 	struct cpuinfo_x86 *c = &boot_cpu_data;
778 	u16 ec = EC(m->status);
779 	u8 xec = XEC(m->status, xec_mask);
780 
781 	if (c->x86 == 0xf || c->x86 == 0x11)
782 		goto wrong_mc5_mce;
783 
784 	pr_emerg(HW_ERR "MC5 Error: ");
785 
786 	if (INT_ERROR(ec)) {
787 		if (xec <= 0x1f) {
788 			pr_cont("Hardware Assert.\n");
789 			return;
790 		} else
791 			goto wrong_mc5_mce;
792 	}
793 
794 	if (xec == 0x0 || xec == 0xc)
795 		pr_cont("%s.\n", mc5_mce_desc[xec]);
796 	else if (xec <= 0xd)
797 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
798 	else
799 		goto wrong_mc5_mce;
800 
801 	return;
802 
803  wrong_mc5_mce:
804 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
805 }
806 
807 static void decode_mc6_mce(struct mce *m)
808 {
809 	u8 xec = XEC(m->status, xec_mask);
810 
811 	pr_emerg(HW_ERR "MC6 Error: ");
812 
813 	if (xec > 0x5)
814 		goto wrong_mc6_mce;
815 
816 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
817 	return;
818 
819  wrong_mc6_mce:
820 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
821 }
822 
823 static void decode_f17h_core_errors(const char *ip_name, u8 xec,
824 				   unsigned int mca_type)
825 {
826 	const char * const *error_desc_array;
827 	size_t len;
828 
829 	pr_emerg(HW_ERR "%s Error: ", ip_name);
830 
831 	switch (mca_type) {
832 	case SMCA_LS:
833 		error_desc_array = f17h_ls_mce_desc;
834 		len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
835 
836 		if (xec == 0x4) {
837 			pr_cont("Unrecognized LS MCA error code.\n");
838 			return;
839 		}
840 		break;
841 
842 	case SMCA_IF:
843 		error_desc_array = f17h_if_mce_desc;
844 		len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
845 		break;
846 
847 	case SMCA_L2_CACHE:
848 		error_desc_array = f17h_l2_mce_desc;
849 		len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
850 		break;
851 
852 	case SMCA_DE:
853 		error_desc_array = f17h_de_mce_desc;
854 		len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
855 		break;
856 
857 	case SMCA_EX:
858 		error_desc_array = f17h_ex_mce_desc;
859 		len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
860 		break;
861 
862 	case SMCA_FP:
863 		error_desc_array = f17h_fp_mce_desc;
864 		len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
865 		break;
866 
867 	case SMCA_L3_CACHE:
868 		error_desc_array = f17h_l3_mce_desc;
869 		len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
870 		break;
871 
872 	default:
873 		pr_cont("Corrupted MCA core error info.\n");
874 		return;
875 	}
876 
877 	if (xec > len) {
878 		pr_cont("Unrecognized %s MCA bank error code.\n",
879 			 amd_core_mcablock_names[mca_type]);
880 		return;
881 	}
882 
883 	pr_cont("%s.\n", error_desc_array[xec]);
884 }
885 
886 static void decode_df_errors(u8 xec, unsigned int mca_type)
887 {
888 	const char * const *error_desc_array;
889 	size_t len;
890 
891 	pr_emerg(HW_ERR "Data Fabric Error: ");
892 
893 	switch (mca_type) {
894 	case  SMCA_CS:
895 		error_desc_array = f17h_cs_mce_desc;
896 		len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
897 		break;
898 
899 	case SMCA_PIE:
900 		error_desc_array = f17h_pie_mce_desc;
901 		len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
902 		break;
903 
904 	default:
905 		pr_cont("Corrupted MCA Data Fabric info.\n");
906 		return;
907 	}
908 
909 	if (xec > len) {
910 		pr_cont("Unrecognized %s MCA bank error code.\n",
911 			 amd_df_mcablock_names[mca_type]);
912 		return;
913 	}
914 
915 	pr_cont("%s.\n", error_desc_array[xec]);
916 }
917 
918 /* Decode errors according to Scalable MCA specification */
919 static void decode_smca_errors(struct mce *m)
920 {
921 	u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
922 	unsigned int hwid, mca_type, i;
923 	u8 xec = XEC(m->status, xec_mask);
924 	const char * const *error_desc_array;
925 	const char *ip_name;
926 	u32 low, high;
927 	size_t len;
928 
929 	if (rdmsr_safe(addr, &low, &high)) {
930 		pr_emerg("Invalid IP block specified, error information is unreliable.\n");
931 		return;
932 	}
933 
934 	hwid = high & MCI_IPID_HWID;
935 	mca_type = (high & MCI_IPID_MCATYPE) >> 16;
936 
937 	pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
938 
939 	/*
940 	 * Based on hwid and mca_type values, decode errors from respective IPs.
941 	 * Note: mca_type values make sense only in the context of an hwid.
942 	 */
943 	for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
944 		if (amd_hwids[i].hwid == hwid)
945 			break;
946 
947 	switch (i) {
948 	case SMCA_F17H_CORE:
949 		ip_name = (mca_type == SMCA_L3_CACHE) ?
950 			  "L3 Cache" : "F17h Core";
951 		return decode_f17h_core_errors(ip_name, xec, mca_type);
952 		break;
953 
954 	case SMCA_DF:
955 		return decode_df_errors(xec, mca_type);
956 		break;
957 
958 	case SMCA_UMC:
959 		error_desc_array = f17h_umc_mce_desc;
960 		len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
961 		break;
962 
963 	case SMCA_PB:
964 		error_desc_array = f17h_pb_mce_desc;
965 		len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
966 		break;
967 
968 	case SMCA_PSP:
969 		error_desc_array = f17h_psp_mce_desc;
970 		len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
971 		break;
972 
973 	case SMCA_SMU:
974 		error_desc_array = f17h_smu_mce_desc;
975 		len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
976 		break;
977 
978 	default:
979 		pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
980 		return;
981 	}
982 
983 	ip_name = amd_hwids[i].name;
984 	pr_emerg(HW_ERR "%s Error: ", ip_name);
985 
986 	if (xec > len) {
987 		pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
988 		return;
989 	}
990 
991 	pr_cont("%s.\n", error_desc_array[xec]);
992 }
993 
994 static inline void amd_decode_err_code(u16 ec)
995 {
996 	if (INT_ERROR(ec)) {
997 		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
998 		return;
999 	}
1000 
1001 	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
1002 
1003 	if (BUS_ERROR(ec))
1004 		pr_cont(", mem/io: %s", II_MSG(ec));
1005 	else
1006 		pr_cont(", tx: %s", TT_MSG(ec));
1007 
1008 	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
1009 		pr_cont(", mem-tx: %s", R4_MSG(ec));
1010 
1011 		if (BUS_ERROR(ec))
1012 			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1013 	}
1014 
1015 	pr_cont("\n");
1016 }
1017 
1018 /*
1019  * Filter out unwanted MCE signatures here.
1020  */
1021 static bool amd_filter_mce(struct mce *m)
1022 {
1023 	u8 xec = (m->status >> 16) & 0x1f;
1024 
1025 	/*
1026 	 * NB GART TLB error reporting is disabled by default.
1027 	 */
1028 	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
1029 		return true;
1030 
1031 	return false;
1032 }
1033 
1034 static const char *decode_error_status(struct mce *m)
1035 {
1036 	if (m->status & MCI_STATUS_UC) {
1037 		if (m->status & MCI_STATUS_PCC)
1038 			return "System Fatal error.";
1039 		if (m->mcgstatus & MCG_STATUS_RIPV)
1040 			return "Uncorrected, software restartable error.";
1041 		return "Uncorrected, software containable error.";
1042 	}
1043 
1044 	if (m->status & MCI_STATUS_DEFERRED)
1045 		return "Deferred error.";
1046 
1047 	return "Corrected error, no action required.";
1048 }
1049 
1050 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1051 {
1052 	struct mce *m = (struct mce *)data;
1053 	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
1054 	int ecc;
1055 
1056 	if (amd_filter_mce(m))
1057 		return NOTIFY_STOP;
1058 
1059 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1060 
1061 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1062 		m->extcpu,
1063 		c->x86, c->x86_model, c->x86_mask,
1064 		m->bank,
1065 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
1066 		((m->status & MCI_STATUS_UC)	? "UE"	  :
1067 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1068 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
1069 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
1070 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));
1071 
1072 	if (c->x86 >= 0x15)
1073 		pr_cont("|%s|%s",
1074 			((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
1075 			((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
1076 
1077 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1078 		u32 low, high;
1079 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1080 
1081 		if (!rdmsr_safe(addr, &low, &high) &&
1082 		    (low & MCI_CONFIG_MCAX))
1083 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1084 	}
1085 
1086 	/* do the two bits[14:13] together */
1087 	ecc = (m->status >> 45) & 0x3;
1088 	if (ecc)
1089 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1090 
1091 	pr_cont("]: 0x%016llx\n", m->status);
1092 
1093 	if (m->status & MCI_STATUS_ADDRV)
1094 		pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
1095 
1096 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1097 		decode_smca_errors(m);
1098 		goto err_code;
1099 	}
1100 
1101 	if (!fam_ops)
1102 		goto err_code;
1103 
1104 	switch (m->bank) {
1105 	case 0:
1106 		decode_mc0_mce(m);
1107 		break;
1108 
1109 	case 1:
1110 		decode_mc1_mce(m);
1111 		break;
1112 
1113 	case 2:
1114 		decode_mc2_mce(m);
1115 		break;
1116 
1117 	case 3:
1118 		decode_mc3_mce(m);
1119 		break;
1120 
1121 	case 4:
1122 		decode_mc4_mce(m);
1123 		break;
1124 
1125 	case 5:
1126 		decode_mc5_mce(m);
1127 		break;
1128 
1129 	case 6:
1130 		decode_mc6_mce(m);
1131 		break;
1132 
1133 	default:
1134 		break;
1135 	}
1136 
1137  err_code:
1138 	amd_decode_err_code(m->status & 0xffff);
1139 
1140 	return NOTIFY_STOP;
1141 }
1142 EXPORT_SYMBOL_GPL(amd_decode_mce);
1143 
1144 static struct notifier_block amd_mce_dec_nb = {
1145 	.notifier_call	= amd_decode_mce,
1146 };
1147 
1148 static int __init mce_amd_init(void)
1149 {
1150 	struct cpuinfo_x86 *c = &boot_cpu_data;
1151 
1152 	if (c->x86_vendor != X86_VENDOR_AMD)
1153 		return -ENODEV;
1154 
1155 	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1156 	if (!fam_ops)
1157 		return -ENOMEM;
1158 
1159 	switch (c->x86) {
1160 	case 0xf:
1161 		fam_ops->mc0_mce = k8_mc0_mce;
1162 		fam_ops->mc1_mce = k8_mc1_mce;
1163 		fam_ops->mc2_mce = k8_mc2_mce;
1164 		break;
1165 
1166 	case 0x10:
1167 		fam_ops->mc0_mce = f10h_mc0_mce;
1168 		fam_ops->mc1_mce = k8_mc1_mce;
1169 		fam_ops->mc2_mce = k8_mc2_mce;
1170 		break;
1171 
1172 	case 0x11:
1173 		fam_ops->mc0_mce = k8_mc0_mce;
1174 		fam_ops->mc1_mce = k8_mc1_mce;
1175 		fam_ops->mc2_mce = k8_mc2_mce;
1176 		break;
1177 
1178 	case 0x12:
1179 		fam_ops->mc0_mce = f12h_mc0_mce;
1180 		fam_ops->mc1_mce = k8_mc1_mce;
1181 		fam_ops->mc2_mce = k8_mc2_mce;
1182 		break;
1183 
1184 	case 0x14:
1185 		fam_ops->mc0_mce = cat_mc0_mce;
1186 		fam_ops->mc1_mce = cat_mc1_mce;
1187 		fam_ops->mc2_mce = k8_mc2_mce;
1188 		break;
1189 
1190 	case 0x15:
1191 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1192 
1193 		fam_ops->mc0_mce = f15h_mc0_mce;
1194 		fam_ops->mc1_mce = f15h_mc1_mce;
1195 		fam_ops->mc2_mce = f15h_mc2_mce;
1196 		break;
1197 
1198 	case 0x16:
1199 		xec_mask = 0x1f;
1200 		fam_ops->mc0_mce = cat_mc0_mce;
1201 		fam_ops->mc1_mce = cat_mc1_mce;
1202 		fam_ops->mc2_mce = f16h_mc2_mce;
1203 		break;
1204 
1205 	case 0x17:
1206 		xec_mask = 0x3f;
1207 		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1208 			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1209 			goto err_out;
1210 		}
1211 		break;
1212 
1213 	default:
1214 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1215 		goto err_out;
1216 	}
1217 
1218 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1219 
1220 	mce_register_decode_chain(&amd_mce_dec_nb);
1221 
1222 	return 0;
1223 
1224 err_out:
1225 	kfree(fam_ops);
1226 	fam_ops = NULL;
1227 	return -EINVAL;
1228 }
1229 early_initcall(mce_amd_init);
1230 
#ifdef MODULE
/*
 * Module teardown: leave the MCE decode chain first so the decoder can no
 * longer be invoked through it, then release the per-family callback table.
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
#endif
1243