1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/module.h> 3 #include <linux/slab.h> 4 5 #include <asm/cpu.h> 6 7 #include "mce_amd.h" 8 9 static struct amd_decoder_ops fam_ops; 10 11 static u8 xec_mask = 0xf; 12 13 static void (*decode_dram_ecc)(int node_id, struct mce *m); 14 15 void amd_register_ecc_decoder(void (*f)(int, struct mce *)) 16 { 17 decode_dram_ecc = f; 18 } 19 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 20 21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) 22 { 23 if (decode_dram_ecc) { 24 WARN_ON(decode_dram_ecc != f); 25 26 decode_dram_ecc = NULL; 27 } 28 } 29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); 30 31 /* 32 * string representation for the different MCA reported error types, see F3x48 33 * or MSR0000_0411. 34 */ 35 36 /* transaction type */ 37 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; 38 39 /* cache level */ 40 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; 41 42 /* memory transaction type */ 43 static const char * const rrrr_msgs[] = { 44 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" 45 }; 46 47 /* participating processor */ 48 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; 49 EXPORT_SYMBOL_GPL(pp_msgs); 50 51 /* request timeout */ 52 static const char * const to_msgs[] = { "no timeout", "timed out" }; 53 54 /* memory or i/o */ 55 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; 56 57 /* internal error type */ 58 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; 59 60 static const char * const f15h_mc1_mce_desc[] = { 61 "UC during a demand linefill from L2", 62 "Parity error during data load from IC", 63 "Parity error for IC valid bit", 64 "Main tag parity error", 65 "Parity error in prediction queue", 66 "PFB data/address parity error", 67 "Parity error in the branch status reg", 68 "PFB promotion address error", 69 "Tag error during probe/victimization", 70 "Parity error for IC probe tag valid bit", 71 "PFB non-cacheable bit parity error", 72 "PFB valid bit parity error", /* xec = 0xd */ 73 "Microcode Patch Buffer", /* xec = 010 */ 74 "uop queue", 75 "insn buffer", 76 "predecode buffer", 77 "fetch address FIFO", 78 "dispatch uop queue" 79 }; 80 81 static const char * const f15h_mc2_mce_desc[] = { 82 "Fill ECC error on data fills", /* xec = 0x4 */ 83 "Fill parity error on insn fills", 84 "Prefetcher request FIFO parity error", 85 "PRQ address parity error", 86 "PRQ data parity error", 87 "WCC Tag ECC error", 88 "WCC Data ECC error", 89 "WCB Data parity error", 90 "VB Data ECC or parity error", 91 "L2 Tag ECC error", /* xec = 0x10 */ 92 "Hard L2 Tag ECC error", 93 "Multiple hits on L2 tag", 94 "XAB parity error", 95 "PRB address parity error" 96 }; 97 98 static const char * const mc4_mce_desc[] = { 99 "DRAM ECC error detected on the NB", 100 "CRC error detected on HT link", 101 "Link-defined sync error packets detected on HT link", 102 "HT Master abort", 103 "HT Target abort", 104 "Invalid GART PTE entry during GART table walk", 105 "Unsupported atomic RMW received from an IO link", 106 "Watchdog timeout due to lack of progress", 107 "DRAM ECC error detected on the NB", 108 "SVM DMA Exclusion Vector error", 109 "HT data error detected on link", 110 "Protocol error (link, L3, probe filter)", 111 "NB internal arrays parity error", 112 "DRAM addr/ctl signals parity error", 113 "IO link transmission error", 114 "L3 data cache ECC error", /* xec = 0x1c */ 115 "L3 cache tag error", 116 "L3 LRU parity bits error", 117 "ECC Error in the Probe Filter directory" 118 }; 119 120 static const char * const mc5_mce_desc[] = { 121 "CPU Watchdog timer expire", 122 "Wakeup array dest tag", 123 "AG payload array", 124 "EX payload array", 125 "IDRF array", 126 "Retire dispatch queue", 127 "Mapper checkpoint array", 128 "Physical register file EX0 port", 129 "Physical register file EX1 port", 130 "Physical register file AG0 port", 131 "Physical register file AG1 port", 132 "Flag register file", 133 "DE error occurred", 134 "Retire status queue" 135 }; 136 137 static const char * const mc6_mce_desc[] = { 138 "Hardware Assertion", 139 "Free List", 140 "Physical Register File", 141 "Retire Queue", 142 "Scheduler table", 143 "Status Register File", 144 }; 145 146 static bool f12h_mc0_mce(u16 ec, u8 xec) 147 { 148 bool ret = false; 149 150 if (MEM_ERROR(ec)) { 151 u8 ll = LL(ec); 152 ret = true; 153 154 if (ll == LL_L2) 155 pr_cont("during L1 linefill from L2.\n"); 156 else if (ll == LL_L1) 157 pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); 158 else 159 ret = false; 160 } 161 return ret; 162 } 163 164 static bool f10h_mc0_mce(u16 ec, u8 xec) 165 { 166 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { 167 pr_cont("during data scrub.\n"); 168 return true; 169 } 170 return f12h_mc0_mce(ec, xec); 171 } 172 173 static bool k8_mc0_mce(u16 ec, u8 xec) 174 { 175 if (BUS_ERROR(ec)) { 176 pr_cont("during system linefill.\n"); 177 return true; 178 } 179 180 return f10h_mc0_mce(ec, xec); 181 } 182 183 static bool cat_mc0_mce(u16 ec, u8 xec) 184 { 185 u8 r4 = R4(ec); 186 bool ret = true; 187 188 if (MEM_ERROR(ec)) { 189 190 if (TT(ec) != TT_DATA || LL(ec) != LL_L1) 191 return false; 192 193 switch (r4) { 194 case R4_DRD: 195 case R4_DWR: 196 pr_cont("Data/Tag parity error due to %s.\n", 197 (r4 == R4_DRD ? "load/hw prf" : "store")); 198 break; 199 case R4_EVICT: 200 pr_cont("Copyback parity error on a tag miss.\n"); 201 break; 202 case R4_SNOOP: 203 pr_cont("Tag parity error during snoop.\n"); 204 break; 205 default: 206 ret = false; 207 } 208 } else if (BUS_ERROR(ec)) { 209 210 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) 211 return false; 212 213 pr_cont("System read data error on a "); 214 215 switch (r4) { 216 case R4_RD: 217 pr_cont("TLB reload.\n"); 218 break; 219 case R4_DWR: 220 pr_cont("store.\n"); 221 break; 222 case R4_DRD: 223 pr_cont("load.\n"); 224 break; 225 default: 226 ret = false; 227 } 228 } else { 229 ret = false; 230 } 231 232 return ret; 233 } 234 235 static bool f15h_mc0_mce(u16 ec, u8 xec) 236 { 237 bool ret = true; 238 239 if (MEM_ERROR(ec)) { 240 241 switch (xec) { 242 case 0x0: 243 pr_cont("Data Array access error.\n"); 244 break; 245 246 case 0x1: 247 pr_cont("UC error during a linefill from L2/NB.\n"); 248 break; 249 250 case 0x2: 251 case 0x11: 252 pr_cont("STQ access error.\n"); 253 break; 254 255 case 0x3: 256 pr_cont("SCB access error.\n"); 257 break; 258 259 case 0x10: 260 pr_cont("Tag error.\n"); 261 break; 262 263 case 0x12: 264 pr_cont("LDQ access error.\n"); 265 break; 266 267 default: 268 ret = false; 269 } 270 } else if (BUS_ERROR(ec)) { 271 272 if (!xec) 273 pr_cont("System Read Data Error.\n"); 274 else 275 pr_cont(" Internal error condition type %d.\n", xec); 276 } else if (INT_ERROR(ec)) { 277 if (xec <= 0x1f) 278 pr_cont("Hardware Assert.\n"); 279 else 280 ret = false; 281 282 } else 283 ret = false; 284 285 return ret; 286 } 287 288 static void decode_mc0_mce(struct mce *m) 289 { 290 u16 ec = EC(m->status); 291 u8 xec = XEC(m->status, xec_mask); 292 293 pr_emerg(HW_ERR "MC0 Error: "); 294 295 /* TLB error signatures are the same across families */ 296 if (TLB_ERROR(ec)) { 297 if (TT(ec) == TT_DATA) { 298 pr_cont("%s TLB %s.\n", LL_MSG(ec), 299 ((xec == 2) ? "locked miss" 300 : (xec ? "multimatch" : "parity"))); 301 return; 302 } 303 } else if (fam_ops.mc0_mce(ec, xec)) 304 ; 305 else 306 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); 307 } 308 309 static bool k8_mc1_mce(u16 ec, u8 xec) 310 { 311 u8 ll = LL(ec); 312 bool ret = true; 313 314 if (!MEM_ERROR(ec)) 315 return false; 316 317 if (ll == 0x2) 318 pr_cont("during a linefill from L2.\n"); 319 else if (ll == 0x1) { 320 switch (R4(ec)) { 321 case R4_IRD: 322 pr_cont("Parity error during data load.\n"); 323 break; 324 325 case R4_EVICT: 326 pr_cont("Copyback Parity/Victim error.\n"); 327 break; 328 329 case R4_SNOOP: 330 pr_cont("Tag Snoop error.\n"); 331 break; 332 333 default: 334 ret = false; 335 break; 336 } 337 } else 338 ret = false; 339 340 return ret; 341 } 342 343 static bool cat_mc1_mce(u16 ec, u8 xec) 344 { 345 u8 r4 = R4(ec); 346 bool ret = true; 347 348 if (!MEM_ERROR(ec)) 349 return false; 350 351 if (TT(ec) != TT_INSTR) 352 return false; 353 354 if (r4 == R4_IRD) 355 pr_cont("Data/tag array parity error for a tag hit.\n"); 356 else if (r4 == R4_SNOOP) 357 pr_cont("Tag error during snoop/victimization.\n"); 358 else if (xec == 0x0) 359 pr_cont("Tag parity error from victim castout.\n"); 360 else if (xec == 0x2) 361 pr_cont("Microcode patch RAM parity error.\n"); 362 else 363 ret = false; 364 365 return ret; 366 } 367 368 static bool f15h_mc1_mce(u16 ec, u8 xec) 369 { 370 bool ret = true; 371 372 if (!MEM_ERROR(ec)) 373 return false; 374 375 switch (xec) { 376 case 0x0 ... 0xa: 377 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); 378 break; 379 380 case 0xd: 381 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); 382 break; 383 384 case 0x10: 385 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); 386 break; 387 388 case 0x11 ... 0x15: 389 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); 390 break; 391 392 default: 393 ret = false; 394 } 395 return ret; 396 } 397 398 static void decode_mc1_mce(struct mce *m) 399 { 400 u16 ec = EC(m->status); 401 u8 xec = XEC(m->status, xec_mask); 402 403 pr_emerg(HW_ERR "MC1 Error: "); 404 405 if (TLB_ERROR(ec)) 406 pr_cont("%s TLB %s.\n", LL_MSG(ec), 407 (xec ? "multimatch" : "parity error")); 408 else if (BUS_ERROR(ec)) { 409 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); 410 411 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); 412 } else if (INT_ERROR(ec)) { 413 if (xec <= 0x3f) 414 pr_cont("Hardware Assert.\n"); 415 else 416 goto wrong_mc1_mce; 417 } else if (fam_ops.mc1_mce(ec, xec)) 418 ; 419 else 420 goto wrong_mc1_mce; 421 422 return; 423 424 wrong_mc1_mce: 425 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); 426 } 427 428 static bool k8_mc2_mce(u16 ec, u8 xec) 429 { 430 bool ret = true; 431 432 if (xec == 0x1) 433 pr_cont(" in the write data buffers.\n"); 434 else if (xec == 0x3) 435 pr_cont(" in the victim data buffers.\n"); 436 else if (xec == 0x2 && MEM_ERROR(ec)) 437 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); 438 else if (xec == 0x0) { 439 if (TLB_ERROR(ec)) 440 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n", 441 TT_MSG(ec)); 442 else if (BUS_ERROR(ec)) 443 pr_cont(": %s/ECC error in data read from NB: %s.\n", 444 R4_MSG(ec), PP_MSG(ec)); 445 else if (MEM_ERROR(ec)) { 446 u8 r4 = R4(ec); 447 448 if (r4 >= 0x7) 449 pr_cont(": %s error during data copyback.\n", 450 R4_MSG(ec)); 451 else if (r4 <= 0x1) 452 pr_cont(": %s parity/ECC error during data " 453 "access from L2.\n", R4_MSG(ec)); 454 else 455 ret = false; 456 } else 457 ret = false; 458 } else 459 ret = false; 460 461 return ret; 462 } 463 464 static bool f15h_mc2_mce(u16 ec, u8 xec) 465 { 466 bool ret = true; 467 468 if (TLB_ERROR(ec)) { 469 if (xec == 0x0) 470 pr_cont("Data parity TLB read error.\n"); 471 else if (xec == 0x1) 472 pr_cont("Poison data provided for TLB fill.\n"); 473 else 474 ret = false; 475 } else if (BUS_ERROR(ec)) { 476 if (xec > 2) 477 ret = false; 478 479 pr_cont("Error during attempted NB data read.\n"); 480 } else if (MEM_ERROR(ec)) { 481 switch (xec) { 482 case 0x4 ... 0xc: 483 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); 484 break; 485 486 case 0x10 ... 0x14: 487 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); 488 break; 489 490 default: 491 ret = false; 492 } 493 } else if (INT_ERROR(ec)) { 494 if (xec <= 0x3f) 495 pr_cont("Hardware Assert.\n"); 496 else 497 ret = false; 498 } 499 500 return ret; 501 } 502 503 static bool f16h_mc2_mce(u16 ec, u8 xec) 504 { 505 u8 r4 = R4(ec); 506 507 if (!MEM_ERROR(ec)) 508 return false; 509 510 switch (xec) { 511 case 0x04 ... 0x05: 512 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); 513 break; 514 515 case 0x09 ... 0x0b: 516 case 0x0d ... 0x0f: 517 pr_cont("ECC error in L2 tag (%s).\n", 518 ((r4 == R4_GEN) ? "BankReq" : 519 ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); 520 break; 521 522 case 0x10 ... 0x19: 523 case 0x1b: 524 pr_cont("ECC error in L2 data array (%s).\n", 525 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : 526 ((r4 == R4_GEN) ? "Attr" : 527 ((r4 == R4_EVICT) ? "Vict" : "Fill")))); 528 break; 529 530 case 0x1c ... 0x1d: 531 case 0x1f: 532 pr_cont("Parity error in L2 attribute bits (%s).\n", 533 ((r4 == R4_RD) ? "Hit" : 534 ((r4 == R4_GEN) ? "Attr" : "Fill"))); 535 break; 536 537 default: 538 return false; 539 } 540 541 return true; 542 } 543 544 static void decode_mc2_mce(struct mce *m) 545 { 546 u16 ec = EC(m->status); 547 u8 xec = XEC(m->status, xec_mask); 548 549 pr_emerg(HW_ERR "MC2 Error: "); 550 551 if (!fam_ops.mc2_mce(ec, xec)) 552 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); 553 } 554 555 static void decode_mc3_mce(struct mce *m) 556 { 557 u16 ec = EC(m->status); 558 u8 xec = XEC(m->status, xec_mask); 559 560 if (boot_cpu_data.x86 >= 0x14) { 561 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," 562 " please report on LKML.\n"); 563 return; 564 } 565 566 pr_emerg(HW_ERR "MC3 Error"); 567 568 if (xec == 0x0) { 569 u8 r4 = R4(ec); 570 571 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) 572 goto wrong_mc3_mce; 573 574 pr_cont(" during %s.\n", R4_MSG(ec)); 575 } else 576 goto wrong_mc3_mce; 577 578 return; 579 580 wrong_mc3_mce: 581 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); 582 } 583 584 static void decode_mc4_mce(struct mce *m) 585 { 586 unsigned int fam = x86_family(m->cpuid); 587 int node_id = topology_amd_node_id(m->extcpu); 588 u16 ec = EC(m->status); 589 u8 xec = XEC(m->status, 0x1f); 590 u8 offset = 0; 591 592 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); 593 594 switch (xec) { 595 case 0x0 ... 0xe: 596 597 /* special handling for DRAM ECCs */ 598 if (xec == 0x0 || xec == 0x8) { 599 /* no ECCs on F11h */ 600 if (fam == 0x11) 601 goto wrong_mc4_mce; 602 603 pr_cont("%s.\n", mc4_mce_desc[xec]); 604 605 if (decode_dram_ecc) 606 decode_dram_ecc(node_id, m); 607 return; 608 } 609 break; 610 611 case 0xf: 612 if (TLB_ERROR(ec)) 613 pr_cont("GART Table Walk data error.\n"); 614 else if (BUS_ERROR(ec)) 615 pr_cont("DMA Exclusion Vector Table Walk error.\n"); 616 else 617 goto wrong_mc4_mce; 618 return; 619 620 case 0x19: 621 if (fam == 0x15 || fam == 0x16) 622 pr_cont("Compute Unit Data Error.\n"); 623 else 624 goto wrong_mc4_mce; 625 return; 626 627 case 0x1c ... 0x1f: 628 offset = 13; 629 break; 630 631 default: 632 goto wrong_mc4_mce; 633 } 634 635 pr_cont("%s.\n", mc4_mce_desc[xec - offset]); 636 return; 637 638 wrong_mc4_mce: 639 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); 640 } 641 642 static void decode_mc5_mce(struct mce *m) 643 { 644 unsigned int fam = x86_family(m->cpuid); 645 u16 ec = EC(m->status); 646 u8 xec = XEC(m->status, xec_mask); 647 648 if (fam == 0xf || fam == 0x11) 649 goto wrong_mc5_mce; 650 651 pr_emerg(HW_ERR "MC5 Error: "); 652 653 if (INT_ERROR(ec)) { 654 if (xec <= 0x1f) { 655 pr_cont("Hardware Assert.\n"); 656 return; 657 } else 658 goto wrong_mc5_mce; 659 } 660 661 if (xec == 0x0 || xec == 0xc) 662 pr_cont("%s.\n", mc5_mce_desc[xec]); 663 else if (xec <= 0xd) 664 pr_cont("%s parity error.\n", mc5_mce_desc[xec]); 665 else 666 goto wrong_mc5_mce; 667 668 return; 669 670 wrong_mc5_mce: 671 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); 672 } 673 674 static void decode_mc6_mce(struct mce *m) 675 { 676 u8 xec = XEC(m->status, xec_mask); 677 678 pr_emerg(HW_ERR "MC6 Error: "); 679 680 if (xec > 0x5) 681 goto wrong_mc6_mce; 682 683 pr_cont("%s parity error.\n", mc6_mce_desc[xec]); 684 return; 685 686 wrong_mc6_mce: 687 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); 688 } 689 690 static const char * const smca_long_names[] = { 691 [SMCA_LS ... SMCA_LS_V2] = "Load Store Unit", 692 [SMCA_IF] = "Instruction Fetch Unit", 693 [SMCA_L2_CACHE] = "L2 Cache", 694 [SMCA_DE] = "Decode Unit", 695 [SMCA_RESERVED] = "Reserved", 696 [SMCA_EX] = "Execution Unit", 697 [SMCA_FP] = "Floating Point Unit", 698 [SMCA_L3_CACHE] = "L3 Cache", 699 [SMCA_CS ... SMCA_CS_V2] = "Coherent Slave", 700 [SMCA_PIE] = "Power, Interrupts, etc.", 701 702 /* UMC v2 is separate because both of them can exist in a single system. */ 703 [SMCA_UMC] = "Unified Memory Controller", 704 [SMCA_UMC_V2] = "Unified Memory Controller v2", 705 [SMCA_PB] = "Parameter Block", 706 [SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor", 707 [SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit", 708 [SMCA_MP5] = "Microprocessor 5 Unit", 709 [SMCA_MPDMA] = "MPDMA Unit", 710 [SMCA_NBIO] = "Northbridge IO Unit", 711 [SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit", 712 [SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit", 713 [SMCA_NBIF] = "NBIF Unit", 714 [SMCA_SHUB] = "System Hub Unit", 715 [SMCA_SATA] = "SATA Unit", 716 [SMCA_USB] = "USB Unit", 717 [SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit", 718 [SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit", 719 [SMCA_WAFL_PHY] = "WAFL PHY Unit", 720 [SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit", 721 }; 722 723 static const char *smca_get_long_name(enum smca_bank_types t) 724 { 725 if (t >= N_SMCA_BANK_TYPES) 726 return NULL; 727 728 return smca_long_names[t]; 729 } 730 731 /* Decode errors according to Scalable MCA specification */ 732 static void decode_smca_error(struct mce *m) 733 { 734 enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); 735 u8 xec = XEC(m->status, xec_mask); 736 737 if (bank_type >= N_SMCA_BANK_TYPES) 738 return; 739 740 if (bank_type == SMCA_RESERVED) { 741 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); 742 return; 743 } 744 745 pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec); 746 747 if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && 748 xec == 0 && decode_dram_ecc) 749 decode_dram_ecc(topology_amd_node_id(m->extcpu), m); 750 } 751 752 static inline void amd_decode_err_code(u16 ec) 753 { 754 if (INT_ERROR(ec)) { 755 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); 756 return; 757 } 758 759 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); 760 761 if (BUS_ERROR(ec)) 762 pr_cont(", mem/io: %s", II_MSG(ec)); 763 else 764 pr_cont(", tx: %s", TT_MSG(ec)); 765 766 if (MEM_ERROR(ec) || BUS_ERROR(ec)) { 767 pr_cont(", mem-tx: %s", R4_MSG(ec)); 768 769 if (BUS_ERROR(ec)) 770 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); 771 } 772 773 pr_cont("\n"); 774 } 775 776 static const char *decode_error_status(struct mce *m) 777 { 778 if (m->status & MCI_STATUS_UC) { 779 if (m->status & MCI_STATUS_PCC) 780 return "System Fatal error."; 781 if (m->mcgstatus & MCG_STATUS_RIPV) 782 return "Uncorrected, software restartable error."; 783 return "Uncorrected, software containable error."; 784 } 785 786 if (m->status & MCI_STATUS_DEFERRED) 787 return "Deferred error, no action required."; 788 789 return "Corrected error, no action required."; 790 } 791 792 static int 793 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) 794 { 795 struct mce *m = (struct mce *)data; 796 struct mce_hw_err *err = to_mce_hw_err(m); 797 unsigned int fam = x86_family(m->cpuid); 798 u32 mca_config_lo = 0, dummy; 799 int ecc; 800 801 if (m->kflags & MCE_HANDLED_CEC) 802 return NOTIFY_DONE; 803 804 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 805 806 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", 807 m->extcpu, 808 fam, x86_model(m->cpuid), x86_stepping(m->cpuid), 809 m->bank, 810 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), 811 ((m->status & MCI_STATUS_UC) ? "UE" : 812 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), 813 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), 814 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), 815 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); 816 817 if (boot_cpu_has(X86_FEATURE_SMCA)) { 818 rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy); 819 820 if (mca_config_lo & MCI_CONFIG_MCAX) 821 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); 822 823 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); 824 } 825 826 /* do the two bits[14:13] together */ 827 ecc = (m->status >> 45) & 0x3; 828 if (ecc) 829 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); 830 831 if (fam >= 0x15) { 832 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); 833 834 /* F15h, bank4, bit 43 is part of McaStatSubCache. */ 835 if (fam != 0x15 || m->bank != 4) 836 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); 837 } 838 839 if (fam >= 0x17) 840 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); 841 842 pr_cont("]: 0x%016llx\n", m->status); 843 844 if (m->status & MCI_STATUS_ADDRV) 845 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); 846 847 if (m->ppin) 848 pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin); 849 850 if (boot_cpu_has(X86_FEATURE_SMCA)) { 851 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); 852 853 if (m->status & MCI_STATUS_SYNDV) { 854 pr_cont(", Syndrome: 0x%016llx\n", m->synd); 855 if (mca_config_lo & MCI_CONFIG_FRUTEXT) { 856 char frutext[17]; 857 858 frutext[16] = '\0'; 859 memcpy(&frutext[0], &err->vendor.amd.synd1, 8); 860 memcpy(&frutext[8], &err->vendor.amd.synd2, 8); 861 862 pr_emerg(HW_ERR "FRU Text: %s", frutext); 863 } 864 } 865 866 pr_cont("\n"); 867 868 decode_smca_error(m); 869 goto err_code; 870 } 871 872 if (m->tsc) 873 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); 874 875 /* Doesn't matter which member to test. */ 876 if (!fam_ops.mc0_mce) 877 goto err_code; 878 879 switch (m->bank) { 880 case 0: 881 decode_mc0_mce(m); 882 break; 883 884 case 1: 885 decode_mc1_mce(m); 886 break; 887 888 case 2: 889 decode_mc2_mce(m); 890 break; 891 892 case 3: 893 decode_mc3_mce(m); 894 break; 895 896 case 4: 897 decode_mc4_mce(m); 898 break; 899 900 case 5: 901 decode_mc5_mce(m); 902 break; 903 904 case 6: 905 decode_mc6_mce(m); 906 break; 907 908 default: 909 break; 910 } 911 912 err_code: 913 amd_decode_err_code(m->status & 0xffff); 914 915 m->kflags |= MCE_HANDLED_EDAC; 916 return NOTIFY_OK; 917 } 918 919 static struct notifier_block amd_mce_dec_nb = { 920 .notifier_call = amd_decode_mce, 921 .priority = MCE_PRIO_EDAC, 922 }; 923 924 static int __init mce_amd_init(void) 925 { 926 struct cpuinfo_x86 *c = &boot_cpu_data; 927 928 if (c->x86_vendor != X86_VENDOR_AMD && 929 c->x86_vendor != X86_VENDOR_HYGON) 930 return -ENODEV; 931 932 if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) 933 return -ENODEV; 934 935 if (boot_cpu_has(X86_FEATURE_SMCA)) { 936 xec_mask = 0x3f; 937 goto out; 938 } 939 940 switch (c->x86) { 941 case 0xf: 942 fam_ops.mc0_mce = k8_mc0_mce; 943 fam_ops.mc1_mce = k8_mc1_mce; 944 fam_ops.mc2_mce = k8_mc2_mce; 945 break; 946 947 case 0x10: 948 fam_ops.mc0_mce = f10h_mc0_mce; 949 fam_ops.mc1_mce = k8_mc1_mce; 950 fam_ops.mc2_mce = k8_mc2_mce; 951 break; 952 953 case 0x11: 954 fam_ops.mc0_mce = k8_mc0_mce; 955 fam_ops.mc1_mce = k8_mc1_mce; 956 fam_ops.mc2_mce = k8_mc2_mce; 957 break; 958 959 case 0x12: 960 fam_ops.mc0_mce = f12h_mc0_mce; 961 fam_ops.mc1_mce = k8_mc1_mce; 962 fam_ops.mc2_mce = k8_mc2_mce; 963 break; 964 965 case 0x14: 966 fam_ops.mc0_mce = cat_mc0_mce; 967 fam_ops.mc1_mce = cat_mc1_mce; 968 fam_ops.mc2_mce = k8_mc2_mce; 969 break; 970 971 case 0x15: 972 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f; 973 974 fam_ops.mc0_mce = f15h_mc0_mce; 975 fam_ops.mc1_mce = f15h_mc1_mce; 976 fam_ops.mc2_mce = f15h_mc2_mce; 977 break; 978 979 case 0x16: 980 xec_mask = 0x1f; 981 fam_ops.mc0_mce = cat_mc0_mce; 982 fam_ops.mc1_mce = cat_mc1_mce; 983 fam_ops.mc2_mce = f16h_mc2_mce; 984 break; 985 986 case 0x17: 987 case 0x18: 988 pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); 989 return -EINVAL; 990 991 default: 992 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); 993 return -EINVAL; 994 } 995 996 out: 997 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 998 999 mce_register_decode_chain(&amd_mce_dec_nb); 1000 1001 return 0; 1002 } 1003 early_initcall(mce_amd_init); 1004 1005 #ifdef MODULE 1006 static void __exit mce_amd_exit(void) 1007 { 1008 mce_unregister_decode_chain(&amd_mce_dec_nb); 1009 } 1010 1011 MODULE_DESCRIPTION("AMD MCE decoder"); 1012 MODULE_ALIAS("edac-mce-amd"); 1013 MODULE_LICENSE("GPL"); 1014 module_exit(mce_amd_exit); 1015 #endif 1016