// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/slab.h>

#include <asm/cpu.h>
#include <asm/msr.h>

#include "mce_amd.h"

/* Per-family decode callbacks, selected once at init in mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/* Mask used to extract the extended error code; widened for F15h/F16h/SMCA. */
static u8 xec_mask = 0xf;

/* Optional EDAC hook that maps a DRAM ECC error to a csrow/channel. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);

void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		/* Unregistering a decoder that was never registered is a caller bug. */
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };

/* Family 15h MC1 (instruction fetch) extended error code descriptions. */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",		/* xec = 0xd */
	"Microcode Patch Buffer",		/* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

/* Family 15h MC2 (combined unit / L2) extended error code descriptions. */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",		/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",			/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

/* MC4 (northbridge) extended error code descriptions. */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",		/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

/* MC5 (execution unit) extended error code descriptions. */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

/* MC6 (floating point unit) extended error code descriptions. */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

/*
 * Family 12h MC0 (data cache) decoder: only memory errors at L1/L2 are
 * meaningful here. Returns false when the signature is not recognized.
 */
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}

/* Family 10h MC0: adds the data-scrub case, otherwise decodes like F12h. */
static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

/* K8 MC0: adds the system-linefill case, otherwise decodes like F10h. */
static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

/* MC0 decoder for the "cat" cores (families 14h/16h). */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only D$ errors at L1 are valid memory-error signatures here. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ?
				 "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus errors must target mem or IO at the "LG" level. */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

/* Family 15h MC0 decoder: xec selects the failing array/queue. */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

/* Decode an MCE reported in bank 0 (data cache / load-store). */
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ?
						     "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops.mc0_mce(ec, xec))
		;	/* family-specific decoder already printed the details */
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}

/* K8 MC1 (instruction cache) decoder. */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

/* MC1 decoder for the "cat" cores (families 14h/16h). */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	/* Only instruction-side transactions are valid in MC1. */
	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

/*
 * Family 15h MC1 decoder. The xec ranges with holes (0xb/0xc, 0xe/0xf
 * undefined) are folded onto the contiguous f15h_mc1_mce_desc[] table,
 * hence the -2/-4 index adjustments.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ...
	     0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

/* Decode an MCE reported in bank 1 (instruction fetch). */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/* K8 (family 0xf) signals the system-linefill case via bit 58. */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops.mc1_mce(ec, xec))
		;	/* family-specific decoder already printed the details */
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}

/* K8 MC2 (bus unit / L2) decoder. */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

/* Family 15h MC2 decoder. */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		/* xec ranges map onto f15h_mc2_mce_desc[]; note the 0xd..0xf hole. */
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}

/* Family 16h MC2 decoder: R4 refines which L2 sub-structure failed. */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN) ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb" : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
			((r4 == R4_GEN) ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD) ? "Hit" :
			((r4 == R4_GEN) ?
			 "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}

/* Decode an MCE reported in bank 2 (combined unit / L2). */
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops.mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}

/* Decode an MCE reported in bank 3 (bus unit); bank absent on F14h+. */
static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}

/* Decode an MCE reported in bank 4 (northbridge). */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_amd_node_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			/* Hand the error to EDAC for csrow/channel mapping. */
			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ...
	     0x1f:
		/* 0x1c..0x1f map to the tail of mc4_mce_desc[] (L3/probe filter). */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}

/* Decode an MCE reported in bank 5 (execution unit); not on K8/F11h. */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	/* xec 0x0 and 0xc are not parity errors; the rest of the table is. */
	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}

/* Decode an MCE reported in bank 6 (FP unit); all known errors are parity. */
static void decode_mc6_mce(struct mce *m)
{
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC6 Error: ");

	if (xec > 0x5)
		goto wrong_mc6_mce;

	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
	return;

wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

/* Human-readable names for the SMCA bank types, indexed by bank type. */
static const char * const smca_long_names[] = {
	[SMCA_CS ... SMCA_CS_V2] = "Coherent Station",
	[SMCA_DACC_BE] = "DACC Back-end Unit",
	[SMCA_DACC_FE] = "DACC Front-end Unit",
	[SMCA_DE] = "Decode Unit",
	[SMCA_EDDR5CMN] = "eDDR5 CMN Unit",
	[SMCA_EX] = "Execution Unit",
	[SMCA_FP] = "Floating Point Unit",
	[SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit",
	[SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit",
	[SMCA_IF] = "Instruction Fetch Unit",
	[SMCA_L2_CACHE] = "L2 Cache",
	[SMCA_L3_CACHE] = "L3 Cache",
	[SMCA_LS ... SMCA_LS_V2] = "Load Store Unit",
	[SMCA_MP5] = "Microprocessor 5 Unit",
	[SMCA_MPART] = "MPART Unit",
	[SMCA_MPASP ...
	     SMCA_MPASP_V2] = "MPASP Unit",
	[SMCA_MPDACC] = "MPDACC Unit",
	[SMCA_MPDMA] = "MPDMA Unit",
	[SMCA_MPM] = "MPM Unit",
	[SMCA_MPRAS] = "MPRAS Unit",
	[SMCA_NBIF] = "NBIF Unit",
	[SMCA_NBIO] = "Northbridge IO Unit",
	[SMCA_PB] = "Parameter Block",
	[SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit",
	[SMCA_PCIE_PL] = "PCIe Link Unit",
	[SMCA_PIE] = "Power, Interrupts, etc.",
	[SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor",
	[SMCA_RESERVED] = "Reserved",
	[SMCA_SATA] = "SATA Unit",
	[SMCA_SHUB] = "System Hub Unit",
	[SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit",
	[SMCA_SSBDCI] = "Die to Die Interconnect Unit",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC] = "Unified Memory Controller",
	[SMCA_UMC_V2] = "Unified Memory Controller v2",
	[SMCA_USB] = "USB Unit",
	[SMCA_WAFL_PHY] = "WAFL PHY Unit",
	[SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit",
};

/* Return the long name for a bank type, or NULL for out-of-range types. */
static const char *smca_get_long_name(enum smca_bank_types t)
{
	if (t >= N_SMCA_BANK_TYPES)
		return NULL;

	return smca_long_names[t];
}

/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* Only general UMC ECC errors (xec == 0) are handed to the DRAM decoder. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}

/* Pretty-print the legacy error-code fields (UU/LL/TT/II/R4/PP/TO). */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

/* Summarize severity from the UC/PCC/RIPV/Deferred status bits. */
static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

/*
 * Notifier callback: pretty-print the MCE status/address/syndrome and
 * dispatch to the per-bank (or SMCA) decoder.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct mce_hw_err *err = to_mce_hw_err(m);
	unsigned int fam = x86_family(m->cpuid);
	u32 mca_config_lo = 0, dummy;
	int ecc;

	/* Already consumed by the corrected-errors collector. */
	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
		((m->status & MCI_STATUS_UC) ? "UE" :
		 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
		((m->status & MCI_STATUS_MISCV) ?
		"MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);

		if (mca_config_lo & MCI_CONFIG_MCAX)
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV) {
			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
				/* FRU text: 16 raw bytes from synd1/synd2, NUL-terminated. */
				char frutext[17];

				frutext[16] = '\0';
				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);

				pr_emerg(HW_ERR "FRU Text: %s", frutext);
			}
		}

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}

static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

/* Select the per-family decoders and xec mask; SMCA parts need no fam_ops. */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* Model 0x60 uses a wider extended error code field. */
		xec_mask = c->x86_model == 0x60 ?
			   0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		/* F17h/F18h without the SMCA feature bit cannot be decoded here. */
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);

#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif