/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * "Generic AMD" model-specific support.  If no more-specific support can
 * be found, or such a module declines to initialize, then for AuthenticAMD
 * cpus this module can have a crack at providing some AMD model-specific
 * support that at least goes beyond common MCA architectural features
 * if not down to the nitty-gritty level for a particular model.  We
 * are layered on top of a cpu module, likely cpu.generic, so there is no
 * need for us to perform common architecturally-accessible functions.
*/

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/cpu_module.h>
#include <sys/mca_x86.h>
#include <sys/pci_cfgspace.h>
#include <sys/x86_archext.h>
#include <sys/mc_amd.h>
#include <sys/fm/protocol.h>
#include <sys/fm/cpu/GENAMD.h>
#include <sys/fm/smb/fmsmb.h>
#include <sys/fm/util.h>
#include <sys/nvpair.h>
#include <sys/controlregs.h>
#include <sys/pghw.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/cpu_module_ms_impl.h>

#include "authamd.h"

extern int x86gentopo_legacy;	/* x86 generic topo support */

/*
 * When nonzero, authamd_init() declines to initialize and returns ENOTSUP.
 */
int authamd_ms_support_disable = 0;

/*
 * Convenience groupings of chip revision bits.
 */
#define	AUTHAMD_F_REVS_BCDE \
	(X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \
	X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \
	X86_CHIPREV_AMD_F_REV_E)

#define	AUTHAMD_F_REVS_FG \
	(X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G)

#define	AUTHAMD_10_REVS_AB \
	(X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B)

/*
 * Bitmasks of support for various features.  Try to enable features
 * via inclusion in one of these bitmasks and check that at the
 * feature implementation - that way new family support may often
 * simply need to update these bitmasks.
 */

/*
 * Models that include an on-chip NorthBridge.
 */
#define	AUTHAMD_NBONCHIP(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we can recognise main memory ECC errors.
 */
#define	AUTHAMD_MEMECC_RECOGNISED(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have an Online Spare Control Register
 */
#define	AUTHAMD_HAS_ONLINESPARECTL(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we will perform NB MCA Config changes
 */
#define	AUTHAMD_DO_NBMCACFG(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have chip cache scrubbers.
 */
#define	AUTHAMD_HAS_CHIPSCRUB(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have a NB misc register or registers -
 * evaluates to 0 if no support, otherwise the number of MC4_MISCj.
 * As written this yields 1 for family 0xf rev F and later, and 3 for
 * family 0x10 rev A and later.
 */
#define	AUTHAMD_NBMISC_NUM(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0))

/*
 * Families/revision for which we wish not to machine check for GART
 * table walk errors - bit 10 of NB CTL.
 */
#define	AUTHAMD_NOGARTTBLWLK_MC(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that are potentially L3 capable
 */
#define	AUTHAMD_L3CAPABLE(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED
 * revisions as:
 *
 *	- being reported by the NB
 *	- being a compound bus/interconnect error (external to chip)
 *	- having LL of LG
 *	- having II of MEM (but could still be a master/target abort)
 *	- having CECC or UECC set
 *
 * We do not check the extended error code (first nibble of the
 * model-specific error code on AMD) since this has changed from
 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10).
 * Instead we use CECC/UECC to separate off the master/target
 * abort cases.
 *
 * We insist that the detector be the NorthBridge bank;  although
 * IC/DC can report some main memory errors, they do not capture
 * an address at sufficient resolution to be useful and the NB will
 * report most errors.
157 */ 158 #define AUTHAMD_IS_MEMECCERR(bank, status) \ 159 ((bank) == AMD_MCA_BANK_NB && \ 160 MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \ 161 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 162 MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \ 163 ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC))) 164 165 static authamd_error_disp_t authamd_memce_disp = { 166 FM_EREPORT_CPU_GENAMD, 167 FM_EREPORT_CPU_GENAMD_MEM_CE, 168 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE 169 }; 170 171 static authamd_error_disp_t authamd_memue_disp = { 172 FM_EREPORT_CPU_GENAMD, 173 FM_EREPORT_CPU_GENAMD_MEM_UE, 174 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE 175 }; 176 177 static authamd_error_disp_t authamd_ckmemce_disp = { 178 FM_EREPORT_CPU_GENAMD, 179 FM_EREPORT_CPU_GENAMD_CKMEM_CE, 180 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE 181 }; 182 183 static authamd_error_disp_t authamd_ckmemue_disp = { 184 FM_EREPORT_CPU_GENAMD, 185 FM_EREPORT_CPU_GENAMD_CKMEM_UE, 186 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE 187 }; 188 189 /* 190 * We recognise GART walk errors as: 191 * 192 * - being reported by the NB 193 * - being a compound TLB error 194 * - having LL of LG and TT of GEN 195 * - having UC set 196 * - possibly having PCC set (if source CPU) 197 */ 198 #define AUTHAMD_IS_GARTERR(bank, status) \ 199 ((bank) == AMD_MCA_BANK_NB && \ 200 MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \ 201 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 202 MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \ 203 (status) & MSR_MC_STATUS_UC) 204 205 static authamd_error_disp_t authamd_gart_disp = { 206 FM_EREPORT_CPU_GENAMD, /* use generic subclass */ 207 FM_EREPORT_CPU_GENADM_GARTTBLWLK, /* use generic leafclass */ 208 0 /* no additional payload */ 209 }; 210 211 212 static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS]; 213 214 static int 215 authamd_chip_once(authamd_data_t *authamd, enum 
authamd_cfgonce_bitnum what) 216 { 217 return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce, 218 what) == 0 ? B_TRUE : B_FALSE); 219 } 220 221 static void 222 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val) 223 { 224 ASSERT(chipid + 24 <= 31); 225 ASSERT((func & 7) == func); 226 ASSERT((reg & 3) == 0 && reg < 256); 227 228 cmi_pci_putl(0, chipid + 24, func, reg, 0, val); 229 } 230 231 static uint32_t 232 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg) 233 { 234 ASSERT(chipid + 24 <= 31); 235 ASSERT((func & 7) == func); 236 ASSERT((reg & 3) == 0 && reg < 256); 237 238 return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0)); 239 } 240 241 void 242 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd) 243 { 244 uint64_t hwcr; 245 246 if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS) 247 return; 248 249 authamd->amd_hwcr = hwcr; 250 251 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 252 hwcr |= AMD_HWCR_MCI_STATUS_WREN; 253 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 254 } 255 } 256 257 void 258 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd) 259 { 260 uint64_t hwcr = authamd->amd_hwcr; 261 262 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 263 hwcr &= ~AMD_HWCR_MCI_STATUS_WREN; 264 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 265 } 266 } 267 268 /* 269 * Read EccCnt repeatedly for all possible channel/chip-select combos: 270 * 271 * - read sparectl register 272 * - if EccErrCntWrEn is set, clear that bit in the just-read value 273 * and write it back to sparectl; this *may* clobber the EccCnt 274 * for the channel/chip-select combination currently selected, so 275 * we leave this bit clear if we had to clear it 276 * - cycle through all channel/chip-select combinations writing each 277 * combination to sparectl before reading the register back for 278 * EccCnt for that combination; since EccErrCntWrEn is clear 279 * the writes to select what count to read will not themselves 280 * 
zero any counts 281 */ 282 static int 283 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl) 284 { 285 union mcreg_sparectl sparectl; 286 uint_t chipid = authamd->amd_shared->acs_chipid; 287 uint_t family = authamd->amd_shared->acs_family; 288 uint32_t rev = authamd->amd_shared->acs_rev; 289 int chan, cs; 290 291 /* 292 * Check for feature support; this macro will test down to the 293 * family revision number, whereafter we'll switch on family 294 * assuming that future revisions will use the same register 295 * format. 296 */ 297 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) { 298 bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt)); 299 return (0); 300 } 301 302 MCREG_VAL32(&sparectl) = 303 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 304 305 switch (family) { 306 case AUTHAMD_FAMILY_F: 307 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0; 308 break; 309 310 case AUTHAMD_FAMILY_10: 311 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0; 312 break; 313 } 314 315 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 316 switch (family) { 317 case AUTHAMD_FAMILY_F: 318 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 319 chan; 320 break; 321 322 case AUTHAMD_FAMILY_10: 323 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 324 chan; 325 break; 326 } 327 328 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 329 switch (family) { 330 case AUTHAMD_FAMILY_F: 331 MCREG_FIELD_F_revFG(&sparectl, 332 EccErrCntDramCs) = cs; 333 break; 334 335 case AUTHAMD_FAMILY_10: 336 MCREG_FIELD_10_revAB(&sparectl, 337 EccErrCntDramCs) = cs; 338 break; 339 } 340 341 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 342 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 343 344 MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid, 345 MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 346 347 switch (family) { 348 case AUTHAMD_FAMILY_F: 349 msl->aal_eccerrcnt[chan][cs] = 350 MCREG_FIELD_F_revFG(&sparectl, EccErrCnt); 351 break; 352 case AUTHAMD_FAMILY_10: 353 
msl->aal_eccerrcnt[chan][cs] = 354 MCREG_FIELD_10_revAB(&sparectl, EccErrCnt); 355 break; 356 } 357 } 358 } 359 360 return (1); 361 } 362 363 /* 364 * Clear EccCnt for all possible channel/chip-select combos: 365 * 366 * - set EccErrCntWrEn in sparectl, if necessary 367 * - write 0 to EccCnt for all channel/chip-select combinations 368 * - clear EccErrCntWrEn 369 * 370 * If requested also disable the interrupts taken on counter overflow 371 * and on swap done. 372 */ 373 static void 374 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint) 375 { 376 union mcreg_sparectl sparectl; 377 uint_t chipid = authamd->amd_shared->acs_chipid; 378 uint_t family = authamd->amd_shared->acs_family; 379 uint32_t rev = authamd->amd_shared->acs_rev; 380 int chan, cs; 381 382 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) 383 return; 384 385 MCREG_VAL32(&sparectl) = 386 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 387 388 switch (family) { 389 case AUTHAMD_FAMILY_F: 390 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1; 391 if (clrint) { 392 MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0; 393 MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0; 394 } 395 break; 396 397 case AUTHAMD_FAMILY_10: 398 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1; 399 if (clrint) { 400 MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0; 401 MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0; 402 } 403 break; 404 } 405 406 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 407 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 408 409 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 410 switch (family) { 411 case AUTHAMD_FAMILY_F: 412 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 413 chan; 414 break; 415 416 case AUTHAMD_FAMILY_10: 417 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 418 chan; 419 break; 420 } 421 422 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 423 switch (family) { 424 case AUTHAMD_FAMILY_F: 425 MCREG_FIELD_F_revFG(&sparectl, 426 EccErrCntDramCs) = cs; 427 
MCREG_FIELD_F_revFG(&sparectl, 428 EccErrCnt) = 0; 429 break; 430 431 case AUTHAMD_FAMILY_10: 432 MCREG_FIELD_10_revAB(&sparectl, 433 EccErrCntDramCs) = cs; 434 MCREG_FIELD_10_revAB(&sparectl, 435 EccErrCnt) = 0; 436 break; 437 } 438 439 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 440 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 441 } 442 } 443 } 444 445 446 /* 447 * Return 448 * 1: supported 449 * 0: unsupported 450 */ 451 static int 452 authamd_supported(uint_t family, uint32_t rev, uint_t chipid) 453 { 454 uint32_t nbcap; 455 456 if (family == AUTHAMD_FAMILY_6) 457 return (1); 458 459 if (family == AUTHAMD_FAMILY_F) 460 return (1); 461 462 /* 463 * On Family 10h, authamd is currently unsupported when there are 464 * multiple nodes on a processor chip. 465 */ 466 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_D)) { 467 nbcap = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 468 MC_CTL_REG_NBCAP); 469 if ((nbcap & MC_NBCAP_MULTINODECPU) == 0) 470 return (1); 471 } else { 472 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 473 return (1); 474 } 475 476 return (0); 477 } 478 479 /* 480 * cms_init entry point. 481 * 482 * This module provides broad model-specific support for AMD families 483 * 0x6, 0xf and 0x10. Future families will have to be evaluated once their 484 * documentation is available. 
485 */ 486 int 487 authamd_init(cmi_hdl_t hdl, void **datap) 488 { 489 uint_t chipid = cmi_hdl_chipid(hdl); 490 struct authamd_chipshared *sp, *osp; 491 uint_t family = cmi_hdl_family(hdl); 492 uint32_t rev = cmi_hdl_chiprev(hdl); 493 authamd_data_t *authamd; 494 uint64_t cap; 495 496 if (authamd_ms_support_disable || 497 !authamd_supported(family, rev, chipid)) 498 return (ENOTSUP); 499 500 if (!(x86_feature & X86_MCA)) 501 return (ENOTSUP); 502 503 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 504 return (ENOTSUP); 505 506 if (!(cap & MCG_CAP_CTL_P)) 507 return (ENOTSUP); 508 509 authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP); 510 cmi_hdl_hold(hdl); /* release in fini */ 511 authamd->amd_hdl = hdl; 512 513 if ((sp = authamd_shared[chipid]) == NULL) { 514 sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP); 515 sp->acs_chipid = chipid; 516 sp->acs_family = family; 517 sp->acs_rev = cmi_hdl_chiprev(hdl); 518 membar_producer(); 519 520 osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp); 521 if (osp != NULL) { 522 kmem_free(sp, sizeof (struct authamd_chipshared)); 523 sp = osp; 524 } 525 } 526 authamd->amd_shared = sp; 527 528 return (0); 529 } 530 531 /* 532 * cms_logout_size entry point. 533 */ 534 /*ARGSUSED*/ 535 size_t 536 authamd_logout_size(cmi_hdl_t hdl) 537 { 538 return (sizeof (struct authamd_logout)); 539 } 540 541 /* 542 * cms_mcgctl_val entry point 543 * 544 * Instead of setting all bits to 1 we can set just those for the 545 * error detector banks known to exist. 546 */ 547 /*ARGSUSED*/ 548 uint64_t 549 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed) 550 { 551 return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed); 552 } 553 554 /* 555 * cms_bankctl_skipinit entry point 556 * 557 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC) 558 * may produce spurious machine checks. 559 * 560 * Only allow a single core to setup the NorthBridge MCi_CTL register. 
561 */ 562 /*ARGSUSED*/ 563 boolean_t 564 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank) 565 { 566 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 567 uint32_t rev = authamd->amd_shared->acs_rev; 568 569 if (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6) 570 return (bank == 0 ? B_TRUE : B_FALSE); 571 572 if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) { 573 return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) == 574 B_TRUE ? B_FALSE : B_TRUE); 575 } 576 577 return (B_FALSE); 578 } 579 580 /* 581 * cms_bankctl_val entry point 582 */ 583 uint64_t 584 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed) 585 { 586 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 587 uint32_t rev = authamd->amd_shared->acs_rev; 588 uint64_t val = proposed; 589 590 /* 591 * The Intel MCA says we can write all 1's to enable #MC for 592 * all errors, and AMD docs say much the same. But, depending 593 * perhaps on other config registers, taking machine checks 594 * for some errors such as GART TLB errors and master/target 595 * aborts may be bad - they set UC and sometime also PCC, but 596 * we should not always panic for these error types. 597 * 598 * Our cms_error_action entry point can suppress such panics, 599 * however we can also use the cms_bankctl_val entry point to 600 * veto enabling of some of the known villains in the first place. 601 */ 602 if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev)) 603 val &= ~AMD_NB_EN_GARTTBLWK; 604 605 return (val); 606 } 607 608 /* 609 * Bits to add to NB MCA config (after watchdog config). 610 */ 611 uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN; 612 613 /* 614 * Bits to remove from NB MCA config (after watchdog config) 615 */ 616 uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN; 617 618 /* 619 * NB Watchdog policy, and rate we use if enabling. 
620 */ 621 enum { 622 AUTHAMD_NB_WDOG_LEAVEALONE, 623 AUTHAMD_NB_WDOG_DISABLE, 624 AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED, 625 AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE 626 } authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED; 627 628 uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 | 629 AMD_NB_CFG_WDOGTMRBASESEL_1MS; 630 631 /* 632 * Per-core cache scrubbing policy and rates. 633 */ 634 enum { 635 AUTHAMD_SCRUB_BIOSDEFAULT, /* leave as BIOS configured */ 636 AUTHAMD_SCRUB_FIXED, /* assign our chosen rate */ 637 AUTHAMD_SCRUB_MAX /* use higher of ours and BIOS rate */ 638 } authamd_scrub_policy = AUTHAMD_SCRUB_MAX; 639 640 uint32_t authamd_scrub_rate_dcache = 0xf; /* 64K per 0.67 seconds */ 641 uint32_t authamd_scrub_rate_l2cache = 0xe; /* 1MB per 5.3 seconds */ 642 uint32_t authamd_scrub_rate_l3cache = 0xd; /* 1MB per 2.7 seconds */ 643 644 static uint32_t 645 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm) 646 { 647 uint32_t rate; 648 649 if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) { 650 cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n", 651 varnm, AMD_NB_SCRUBCTL_RATE_MAX); 652 osrate = AMD_NB_SCRUBCTL_RATE_MAX; 653 } 654 655 switch (authamd_scrub_policy) { 656 case AUTHAMD_SCRUB_FIXED: 657 rate = osrate; 658 break; 659 660 default: 661 cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - " 662 "using default policy of AUTHAMD_SCRUB_MAX", 663 authamd_scrub_policy); 664 /*FALLTHRU*/ 665 666 case AUTHAMD_SCRUB_MAX: 667 if (osrate != 0 && biosrate != 0) 668 rate = MIN(osrate, biosrate); /* small is fast */ 669 else 670 rate = osrate ? osrate : biosrate; 671 } 672 673 return (rate); 674 } 675 676 /* 677 * cms_mca_init entry point. 
*
 * Performs the model-specific MCA setup for the chip this handle is on.
 * Each step below is guarded by authamd_chip_once() so that only one
 * handle per chip carries it out:
 *
 *	- clear the online-spare ECC counts and their interrupts
 *	- clear the interrupt type in the NB MISC threshold registers
 *	- apply the watchdog policy and the add/remove tunables to the
 *	  NB MCA configuration register
 *	- program the cache scrub rates
 */
/*ARGSUSED*/
void
authamd_mca_init(cmi_hdl_t hdl, int nbanks)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	uint32_t rev = authamd->amd_shared->acs_rev;
	uint_t chipid = authamd->amd_shared->acs_chipid;

	/*
	 * On chips with a NB online spare control register take control
	 * and clear ECC counts.
	 */
	if (AUTHAMD_HAS_ONLINESPARECTL(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) {
		authamd_clear_ecccnt(authamd, B_TRUE);
	}

	/*
	 * And since we are claiming the telemetry stop the BIOS receiving
	 * an SMI on NB threshold overflow.
	 */
	if (AUTHAMD_NBMISC_NUM(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) {
		union mcmsr_nbmisc nbm;
		int i;

		/* Bracket the MSR writes with the HWCR write-enable */
		authamd_bankstatus_prewrite(hdl, authamd);

		for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) {
			if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i),
			    (uint64_t *)&nbm) != CMI_SUCCESS)
				continue;

			/*
			 * Only touch the interrupt type when the register
			 * reports itself valid with a counter present.
			 */
			if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0;
			} else if (X86_CHIPREV_ATLEAST(rev,
			    X86_CHIPREV_AMD_10_REV_A) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0;
			}

			/* Written back whether or not it was modified above */
			(void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i),
			    MCMSR_VAL(&nbm));
		}

		authamd_bankstatus_postwrite(hdl, authamd);
	}

	/*
	 * NB MCA Configuration Register.
	 */
	if (AUTHAMD_DO_NBMCACFG(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_NBCFG);

		/*
		 * Note the case ordering: an unrecognised policy warns and
		 * then falls into ENABLE_IF_DISABLED, which in turn falls
		 * into ENABLE_FORCE_RATE when the watchdog is disabled.
		 */
		switch (authamd_nb_watchdog_policy) {
		case AUTHAMD_NB_WDOG_LEAVEALONE:
			break;

		case AUTHAMD_NB_WDOG_DISABLE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK);
			val |= AMD_NB_CFG_WDOGTMRDIS;
			break;

		default:
			cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d "
			    "unrecognised, using default policy",
			    authamd_nb_watchdog_policy);
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED:
			if (!(val & AMD_NB_CFG_WDOGTMRDIS))
				break;	/* if enabled leave rate intact */
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK |
			    AMD_NB_CFG_WDOGTMRDIS);
			val |= authamd_nb_mcacfg_wdog;
			break;
		}

		/*
		 * Bit 0 of the NB MCA Config register is reserved on family
		 * 0x10.  Note that this edits the global tunable itself,
		 * not just the value written for this chip.
		 */
		if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
			authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN;

		val &= ~authamd_nb_mcacfg_remove;
		val |= authamd_nb_mcacfg_add;

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG,
		    val);
	}

	/*
	 * Cache scrubbing.  We can't enable DRAM scrubbing since
	 * we don't know the DRAM base for this node.
	 */
	if (AUTHAMD_HAS_CHIPSCRUB(rev) &&
	    authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL);
		int l3cap = 0;

		if (AUTHAMD_L3CAPABLE(rev)) {
			l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0;
		}

		/*
		 * The chosen rates are stored back into the global
		 * tunables as well as being programmed below.
		 */
		authamd_scrub_rate_dcache =
		    authamd_scrubrate(authamd_scrub_rate_dcache,
		    (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT,
		    "authamd_scrub_rate_dcache");

		authamd_scrub_rate_l2cache =
		    authamd_scrubrate(authamd_scrub_rate_l2cache,
		    (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT,
		    "authamd_scrub_rate_l2cache");

		/* No L3 capability means an L3 rate of 0 */
		authamd_scrub_rate_l3cache = l3cap ?
		    authamd_scrubrate(authamd_scrub_rate_l3cache,
		    (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT,
		    "authamd_scrub_rate_l3cache") : 0;

		/* The DRAM scrub field is carried over from the read value */
		val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache,
		    authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache,
		    val & AMD_NB_SCRUBCTL_DRAM_MASK);

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL, val);
	}

}

/*
 * cms_poll_ownermask entry point.
 *
 * Returns a mask of banks this handle should poll.  One handle per chip
 * is treated as the poll owner:  a handle becomes owner if no poll has
 * been recorded yet or the last recorded poll is more than two poll
 * intervals old.  The owner gets all banks (-1ULL);  other handles get
 * every bank except the NB bank.
 */
uint64_t
authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	struct authamd_chipshared *acsp = authamd->amd_shared;
	hrtime_t now = gethrtime_waitfree();
	hrtime_t last = acsp->acs_poll_timestamp;
	int dopoll = 0;

	if (now - last > 2 * pintvl || last == 0) {
		/* No owner yet, or the owner has gone quiet: take over */
		acsp->acs_pollowner = hdl;
		dopoll = 1;
	} else if (acsp->acs_pollowner == hdl) {
		dopoll = 1;
	}

	if (dopoll)
		acsp->acs_poll_timestamp = now;

	/*
	 * The non-owner mask is computed as an int and converted to the
	 * uint64_t return type.
	 */
	return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB));

}

/*
 * cms_bank_logout entry point.
850 */ 851 /*ARGSUSED*/ 852 void 853 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status, 854 uint64_t addr, uint64_t misc, void *mslogout) 855 { 856 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 857 struct authamd_logout *msl = mslogout; 858 uint32_t rev = authamd->amd_shared->acs_rev; 859 860 if (msl == NULL) 861 return; 862 863 /* 864 * For main memory ECC errors on revisions with an Online Spare 865 * Control Register grab the ECC counts by channel and chip-select 866 * and reset them to 0. 867 */ 868 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 869 AUTHAMD_IS_MEMECCERR(bank, status) && 870 AUTHAMD_HAS_ONLINESPARECTL(rev)) { 871 if (authamd_read_ecccnt(authamd, msl)) 872 authamd_clear_ecccnt(authamd, B_FALSE); 873 } 874 } 875 876 /* 877 * cms_error_action entry point 878 */ 879 880 int authamd_forgive_uc = 0; /* For test/debug only */ 881 int authamd_forgive_pcc = 0; /* For test/debug only */ 882 int authamd_fake_poison = 0; /* For test/debug only */ 883 884 /*ARGSUSED*/ 885 uint32_t 886 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank, 887 uint64_t status, uint64_t addr, uint64_t misc, void *mslogout) 888 { 889 authamd_error_disp_t *disp; 890 uint32_t rv = 0; 891 892 if (authamd_forgive_uc) 893 rv |= CMS_ERRSCOPE_CLEARED_UC; 894 895 if (authamd_forgive_pcc) 896 rv |= CMS_ERRSCOPE_CURCONTEXT_OK; 897 898 if (authamd_fake_poison && status & MSR_MC_STATUS_UC) 899 rv |= CMS_ERRSCOPE_POISONED; 900 901 if (rv) 902 return (rv); 903 904 disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout); 905 906 if (disp == &authamd_gart_disp) { 907 /* 908 * GART walk errors set UC and possibly PCC (if source CPU) 909 * but should not be regarded as terminal. 910 */ 911 return (CMS_ERRSCOPE_IGNORE_ERR); 912 } 913 914 /* 915 * May also want to consider master abort and target abort. These 916 * also set UC and PCC (if src CPU) but the requester gets -1 917 * and I believe the IO stuff in Solaris will handle that. 
918 */ 919 920 return (rv); 921 } 922 923 /* 924 * cms_disp_match entry point 925 */ 926 /*ARGSUSED*/ 927 cms_cookie_t 928 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status, 929 uint64_t addr, uint64_t misc, void *mslogout) 930 { 931 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 932 /* uint16_t errcode = MCAX86_ERRCODE(status); */ 933 uint16_t exterrcode = AMD_EXT_ERRCODE(status); 934 uint32_t rev = authamd->amd_shared->acs_rev; 935 936 /* 937 * Recognise main memory ECC errors 938 */ 939 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 940 AUTHAMD_IS_MEMECCERR(bank, status)) { 941 if (status & AMD_BANK_STAT_CECC) { 942 return (exterrcode == 0 ? &authamd_memce_disp : 943 &authamd_ckmemce_disp); 944 } else if (status & AMD_BANK_STAT_UECC) { 945 return (exterrcode == 0 ? &authamd_memue_disp : 946 &authamd_ckmemue_disp); 947 } 948 } 949 950 /* 951 * Recognise GART walk errors 952 */ 953 if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status)) 954 return (&authamd_gart_disp); 955 956 return (NULL); 957 } 958 959 /* 960 * cms_ereport_class entry point 961 */ 962 /*ARGSUSED*/ 963 void 964 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie, 965 const char **cpuclsp, const char **leafclsp) 966 { 967 const authamd_error_disp_t *aed = mscookie; 968 969 if (aed == NULL) 970 return; 971 972 if (aed->aad_subclass != NULL) 973 *cpuclsp = aed->aad_subclass; 974 if (aed->aad_leafclass != NULL) 975 *leafclsp = aed->aad_leafclass; 976 } 977 978 /*ARGSUSED*/ 979 static void 980 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd, 981 nvlist_t *ereport, nv_alloc_t *nva, void *mslogout) 982 { 983 nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 984 uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 985 authamd_logout_t *msl; 986 nvlist_t *nvl; 987 int nelems = 0; 988 int i, chan, cs; 989 nvlist_t *board_list = NULL; 990 991 if ((msl = mslogout) == NULL) 992 return; 993 994 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 
995 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 996 if (msl->aal_eccerrcnt[chan][cs] == 0) 997 continue; 998 999 if ((nvl = fm_nvlist_create(nva)) == NULL) 1000 continue; 1001 1002 elems[nelems] = nvl; 1003 counts[nelems++] = msl->aal_eccerrcnt[chan][cs]; 1004 1005 if (!x86gentopo_legacy) { 1006 board_list = cmi_hdl_smb_bboard(hdl); 1007 if (board_list == NULL) 1008 continue; 1009 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 1010 NULL, NULL, board_list, 4, 1011 "chip", cmi_hdl_smb_chipid(hdl), 1012 "memory-controller", 0, 1013 "dram-channel", chan, 1014 "chip-select", cs); 1015 } else { 1016 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, 1017 NULL, NULL, 5, 1018 "motherboard", 0, 1019 "chip", authamd->amd_shared->acs_chipid, 1020 "memory-controller", 0, 1021 "dram-channel", chan, 1022 "chip-select", cs); 1023 } 1024 } 1025 } 1026 1027 if (nelems == 0) 1028 return; 1029 1030 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE, 1031 DATA_TYPE_NVLIST_ARRAY, nelems, elems, 1032 NULL); 1033 1034 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT, 1035 DATA_TYPE_UINT8_ARRAY, nelems, &counts[0], 1036 NULL); 1037 1038 for (i = 0; i < nelems; i++) 1039 fm_nvlist_destroy(elems[i], nva ? 
FM_NVA_RETAIN : FM_NVA_FREE); 1040 } 1041 1042 /* 1043 * cms_ereport_add_logout entry point 1044 */ 1045 /*ARGSUSED*/ 1046 void 1047 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva, 1048 int bank, uint64_t status, uint64_t addr, uint64_t misc, 1049 void *mslogout, cms_cookie_t mscookie) 1050 { 1051 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1052 const authamd_error_disp_t *aed = mscookie; 1053 uint64_t members; 1054 1055 if (aed == NULL) 1056 return; 1057 1058 members = aed->aad_ereport_members; 1059 1060 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) { 1061 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND, 1062 DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status), 1063 NULL); 1064 1065 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1066 fm_payload_set(ereport, 1067 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1068 DATA_TYPE_STRING, "E", 1069 NULL); 1070 } 1071 } 1072 1073 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) { 1074 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND, 1075 DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status), 1076 NULL); 1077 1078 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1079 fm_payload_set(ereport, 1080 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1081 DATA_TYPE_STRING, "C", 1082 NULL); 1083 } 1084 } 1085 1086 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE && 1087 status & MSR_MC_STATUS_ADDRV) { 1088 authamd_ereport_add_resource(hdl, authamd, ereport, nva, 1089 mslogout); 1090 } 1091 } 1092 1093 /* 1094 * cms_msrinject entry point 1095 */ 1096 cms_errno_t 1097 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val) 1098 { 1099 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1100 cms_errno_t rv = CMSERR_BADMSRWRITE; 1101 1102 authamd_bankstatus_prewrite(hdl, authamd); 1103 if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS) 1104 rv = CMS_SUCCESS; 1105 authamd_bankstatus_postwrite(hdl, authamd); 1106 1107 return (rv); 1108 } 1109 1110 
/* Version of the model-specific interface this module is built against */
cms_api_ver_t _cms_api_version = CMS_API_VERSION_0;

/*
 * Model-specific operations vector.  Entries are positional;  a NULL
 * entry means this module does not supply that entry point.
 *
 * NOTE(review): cms_fini is NULL, yet authamd_init() takes a handle hold
 * annotated "release in fini" and allocates per-handle data - confirm
 * whether a fini entry point is intended.
 */
const cms_ops_t _cms_ops = {
	authamd_init,			/* cms_init */
	NULL,				/* cms_post_startup */
	NULL,				/* cms_post_mpstartup */
	authamd_logout_size,		/* cms_logout_size */
	authamd_mcgctl_val,		/* cms_mcgctl_val */
	authamd_bankctl_skipinit,	/* cms_bankctl_skipinit */
	authamd_bankctl_val,		/* cms_bankctl_val */
	NULL,				/* cms_bankstatus_skipinit */
	NULL,				/* cms_bankstatus_val */
	authamd_mca_init,		/* cms_mca_init */
	authamd_poll_ownermask,		/* cms_poll_ownermask */
	authamd_bank_logout,		/* cms_bank_logout */
	authamd_error_action,		/* cms_error_action */
	authamd_disp_match,		/* cms_disp_match */
	authamd_ereport_class,		/* cms_ereport_class */
	NULL,				/* cms_ereport_detector */
	NULL,				/* cms_ereport_includestack */
	authamd_ereport_add_logout,	/* cms_ereport_add_logout */
	authamd_msrinject,		/* cms_msrinject */
	NULL,				/* cms_fini */
};

/* Module linkage: a cpu module with the description string below */
static struct modlcpu modlcpu = {
	&mod_cpuops,
	"Generic AMD model-specific MCA"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlcpu,
	NULL
};

/* Loadable module entry point: install the module */
int
_init(void)
{
	return (mod_install(&modlinkage));
}

/* Loadable module entry point: report module information */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* Loadable module entry point: remove the module */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}