// SPDX-License-Identifier: GPL-2.0-only
/*
 * Decode AMD/Hygon machine-check errors into human-readable text.
 *
 * Pre-SMCA families (K8 ... Fam16h) are decoded via per-family, per-bank
 * callbacks selected in mce_amd_init() (fam_ops).  Scalable MCA (SMCA)
 * parts are decoded by bank type in decode_smca_error().
 */
#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>
#include <asm/msr.h>

#include "mce_amd.h"

/* Per-family MC0/MC1/MC2 decoder callbacks, chosen once at init time. */
static struct amd_decoder_ops fam_ops;

/* Mask applied to the extended error code; widened for some families/SMCA. */
static u8 xec_mask = 0xf;

/* Optional hook (registered by an EDAC driver) for DRAM ECC error decoding. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);

/* Register @f as the DRAM ECC decoder callback. */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

/* Unregister the DRAM ECC decoder; warns if @f is not the registered one. */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

/*
 * Fam15h MC1 extended error code descriptions; indexed by xec with offsets
 * applied in f15h_mc1_mce() for the non-contiguous xec ranges.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

/* Fam15h MC2 descriptions; indexed with offsets, see f15h_mc2_mce(). */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

/* MC4 (northbridge) descriptions; indexed with offsets, see decode_mc4_mce(). */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

/* MC5 descriptions, indexed directly by xec (see decode_mc5_mce()). */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

/* MC6 descriptions, indexed directly by xec (see decode_mc6_mce()). */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/*
 * All of the following fam-specific bank decoders share one contract:
 * they pr_cont() a description continuing the "MCn Error: " line already
 * started by the caller, and return true if the (ec, xec) combination was
 * recognized.  On false the caller prints a "Corrupted ... MCE info?" note.
 */

/* MC0 decoder for Fam12h (also the tail of the F10h/K8 decode chain). */
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

/* MC0 decoder for Fam10h: adds the data-scrub case, then defers to F12h. */
static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

/* MC0 decoder for K8/Fam11h: adds the system-linefill case, defers to F10h. */
static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

/* MC0 decoder for the "cat" families (Fam14h/Fam16h, see mce_amd_init()). */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

/* MC0 decoder for Fam15h; decodes by xec rather than by the r4 field. */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

/* Decode a bank-0 MCE: common TLB signature first, then the family hook. */
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
		/* Non-data TLB error: leave the line unfinished, no complaint. */
	} else if (fam_ops.mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

/* MC1 decoder for K8 through Fam14h (memory errors only). */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

/* MC1 decoder for the "cat" families (Fam14h/Fam16h): instruction errors. */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

/* MC1 decoder for Fam15h; xec indexes f15h_mc1_mce_desc[] with offsets. */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb..0xc are unused; 0xd maps to array index 0xb. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

/* Decode a bank-1 MCE: shared TLB/bus/internal cases, then the family hook. */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/* On K8 (Fam0xf), status bit 58 distinguishes the source. */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops.mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

 wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

/* MC2 decoder for K8 through Fam14h. */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

/* MC2 decoder for Fam15h; xec indexes f15h_mc2_mce_desc[] with offsets. */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

/* MC2 decoder for Fam16h (memory errors only, decoded by xec ranges). */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

/* Decode a bank-2 MCE via the family hook only (no shared cases). */
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops.mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

/* Decode a bank-3 MCE; only valid on families below 0x14. */
static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

/*
 * Decode a bank-4 (northbridge) MCE.  Uses a fixed 0x1f xec mask (not the
 * per-family xec_mask) and forwards DRAM ECC errors to the registered
 * decode_dram_ecc() hook.
 */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_amd_node_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* xec 0x1a/0x1b unused: 0x1c..0x1f map to array index 15..18. */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

/* Decode a bank-5 MCE; not present on Fam0xf/Fam11h. */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	/* xec 0x0 and 0xc descriptions are complete sentences on their own. */
	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

/* Decode a bank-6 MCE; all recognized errors are parity errors. */
static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

/* Long, human-readable names for the SMCA bank types. */
static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
	[SMCA_PIE]			= "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_USB]			= "USB Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
};

/* Return the long name for bank type @t, or NULL if out of range. */
static const char *smca_get_long_name(enum smca_bank_types t)
{
	if (t >= N_SMCA_BANK_TYPES)
		return NULL;

	return smca_long_names[t];
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* xec == 0 on a UMC bank is a DRAM ECC error: hand off to the hook. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}

/* Print the generic error-code fields (ll/tt/ii/r4/pp/to or uu). */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/* Classify the error's severity/recoverability from the status bits. */
static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

/*
 * MCE decode-chain notifier callback: print a full human-readable report
 * for @data (a struct mce) and dispatch to the per-bank/SMCA decoders.
 * Marks the record MCE_HANDLED_EDAC; skips records already handled by CEC.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct mce_hw_err *err = to_mce_hw_err(m);
	unsigned int fam = x86_family(m->cpuid);
	u32 mca_config_lo = 0, dummy;
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);

		/* TCC is only meaningful when the bank is in MCAX mode. */
		if (mca_config_lo & MCI_CONFIG_MCAX)
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV) {
			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
				char frutext[17];

				/* FRU text: synd1/synd2 hold 8 ASCII bytes each. */
				frutext[16] = '\0';
				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);

				pr_emerg(HW_ERR "FRU Text: %s", frutext);
			}
		}

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

/*
 * Select per-family decoder callbacks and the xec mask, then register on
 * the MCE decode chain.  Bails out on non-AMD/Hygon CPUs, under a
 * hypervisor, and on non-SMCA Fam17h+/unknown families.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		/* SMCA decoding needs no fam_ops, only the wider xec mask. */
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

 out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif