/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * "Generic AMD" model-specific support.  If no more-specific support can
 * be found, or such a module declines to initialize, then for AuthenticAMD
 * cpus this module can have a crack at providing some AMD model-specific
 * support that at least goes beyond common MCA architectural features
 * if not down to the nitty-gritty level for a particular model.  We
 * are layered on top of a cpu module, likely cpu.generic, so there is no
 * need for us to perform common architecturally-accessible functions.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/cpu_module.h>
#include <sys/mca_x86.h>
#include <sys/pci_cfgspace.h>
#include <sys/x86_archext.h>
#include <sys/mc_amd.h>
#include <sys/fm/protocol.h>
#include <sys/fm/cpu/GENAMD.h>
#include <sys/nvpair.h>
#include <sys/controlregs.h>
#include <sys/pghw.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/cpu_module_ms_impl.h>

#include "authamd.h"

int authamd_ms_support_disable = 0;

#define	AUTHAMD_F_REVS_BCDE \
	(X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \
	X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \
	X86_CHIPREV_AMD_F_REV_E)

#define	AUTHAMD_F_REVS_FG \
	(X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G)

#define	AUTHAMD_10_REVS_AB \
	(X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B)

/*
 * Bitmasks of support for various features.  Try to enable features
 * via inclusion in one of these bitmasks and check that at the
 * feature implementation - that way new family support may often
 * simply need to update these bitmasks.
 */

/*
 * Families that this module will provide some model-specific
 * support for (if no more-specific module claims it first).
 * We try to support whole families rather than differentiate down
 * to revision.
 */
#define	AUTHAMD_SUPPORTED(fam) \
	((fam) == AUTHAMD_FAMILY_6 || (fam) == AUTHAMD_FAMILY_F || \
	(fam) == AUTHAMD_FAMILY_10)

/*
 * Families/revisions for which we can recognise main memory ECC errors.
 */
#define	AUTHAMD_MEMECC_RECOGNISED(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have an Online Spare Control Register
 */
#define	AUTHAMD_HAS_ONLINESPARECTL(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we will perform NB MCA Config changes
 */
#define	AUTHAMD_DO_NBMCACFG(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have chip cache scrubbers.
 */
#define	AUTHAMD_HAS_CHIPSCRUB(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have a NB misc register or registers -
 * evaluates to 0 if no support, otherwise the number of MC4_MISCj.
 */
#define	AUTHAMD_NBMISC_NUM(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) ? 1 : \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0))

/*
 * Families/revisions for which we wish not to machine check for GART
 * table walk errors - bit 10 of NB CTL.
 */
#define	AUTHAMD_NOGARTTBLWLK_MC(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that are potentially L3 capable
 */
#define	AUTHAMD_L3CAPABLE(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED
 * revisions as:
 *
 *	- being reported by the NB
 *	- being a compound bus/interconnect error (external to chip)
 *	- having LL of LG
 *	- having II of MEM (but could still be a master/target abort)
 *	- having CECC or UECC set
 *
 * We do not check the extended error code (first nibble of the
 * model-specific error code on AMD) since this has changed from
 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10).
 * Instead we use CECC/UECC to separate off the master/target
 * abort cases.
 *
 * We insist that the detector be the NorthBridge bank; although
 * IC/DC can report some main memory errors, they do not capture
 * an address at sufficient resolution to be useful and the NB will
 * report most errors.
 */
#define	AUTHAMD_IS_MEMECCERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \
	MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
	MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \
	((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC)))

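/*
 * Dispositions for the memory ECC cases distinguished in
 * authamd_disp_match() below:  correctable vs uncorrectable, and the
 * plain vs ChipKill ("ckmem") variants chosen on the extended error
 * code.  Each selects the ereport leaf class and payload flags to use.
 */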
static authamd_error_disp_t authamd_memce_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_MEM_CE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE
};

static authamd_error_disp_t authamd_memue_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_MEM_UE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE
};

static authamd_error_disp_t authamd_ckmemce_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_CKMEM_CE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE
};

static authamd_error_disp_t authamd_ckmemue_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_CKMEM_UE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE
};

/*
 * We recognise GART walk errors as:
 *
 *	- being reported by the NB
 *	- being a compound TLB error
 *	- having LL of LG and TT of GEN
 *	- having UC set
 *	- possibly having PCC set (if source CPU)
 */
#define	AUTHAMD_IS_GARTERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \
	MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
	MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \
	(status) & MSR_MC_STATUS_UC)

static authamd_error_disp_t authamd_gart_disp = {
	FM_EREPORT_CPU_GENAMD,			/* use generic subclass */
	FM_EREPORT_CPU_GENADM_GARTTBLWLK,	/* use generic leafclass */
	0					/* no additional payload */
};

static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS];

static int
authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what)
{
	return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce,
	    what) == 0 ? B_TRUE : B_FALSE);
}

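/*
 * Northbridge PCI config access.  The on-chip northbridge for chip/node
 * N appears at bus 0, device 24 + N (hence the range assertions below),
 * and the registers we manipulate all live in the miscellaneous control
 * function of that device.
 */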
static void
authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val)
{
	ASSERT(chipid + 24 <= 31);
	ASSERT((func & 7) == func);
	ASSERT((reg & 3) == 0 && reg < 256);

	cmi_pci_putl(0, chipid + 24, func, reg, 0, val);
}

static uint32_t
authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg)
{
	ASSERT(chipid + 24 <= 31);
	ASSERT((func & 7) == func);
	ASSERT((reg & 3) == 0 && reg < 256);

	return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0));
}

void
authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd)
{
	uint64_t hwcr;

	if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS)
		return;

	authamd->amd_hwcr = hwcr;

	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
		hwcr |= AMD_HWCR_MCI_STATUS_WREN;
		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
	}
}

void
authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd)
{
	uint64_t hwcr = authamd->amd_hwcr;

	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
		hwcr &= ~AMD_HWCR_MCI_STATUS_WREN;
		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
	}
}

/*
 * Read EccCnt repeatedly for all possible channel/chip-select combos:
 *
 *	- read sparectl register
 *	- if EccErrCntWrEn is set, clear that bit in the just-read value
 *	  and write it back to sparectl; this *may* clobber the EccCnt
 *	  for the channel/chip-select combination currently selected, so
 *	  we leave this bit clear if we had to clear it
 *	- cycle through all channel/chip-select combinations writing each
 *	  combination to sparectl before reading the register back for
 *	  EccCnt for that combination; since EccErrCntWrEn is clear
 *	  the writes to select what count to read will not themselves
 *	  zero any counts
 */
static int
authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl)
{
	union mcreg_sparectl sparectl;
	uint_t chipid = authamd->amd_shared->acs_chipid;
	uint_t family = authamd->amd_shared->acs_family;
	uint32_t rev = authamd->amd_shared->acs_rev;
	int chan, cs;

	/*
	 * Check for feature support; this macro will test down to the
	 * family revision number, whereafter we'll switch on family
	 * assuming that future revisions will use the same register
	 * format.
	 */
	if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) {
		bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt));
		return (0);
	}

	MCREG_VAL32(&sparectl) =
	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

	switch (family) {
	case AUTHAMD_FAMILY_F:
		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0;
		break;

	case AUTHAMD_FAMILY_10:
		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0;
		break;
	}

	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
		switch (family) {
		case AUTHAMD_FAMILY_F:
			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
			    chan;
			break;

		case AUTHAMD_FAMILY_10:
			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
			    chan;
			break;
		}

		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
			switch (family) {
			case AUTHAMD_FAMILY_F:
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCntDramCs) = cs;
				break;

			case AUTHAMD_FAMILY_10:
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCntDramCs) = cs;
				break;
			}

			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

			MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid,
			    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

			switch (family) {
			case AUTHAMD_FAMILY_F:
				msl->aal_eccerrcnt[chan][cs] =
				    MCREG_FIELD_F_revFG(&sparectl, EccErrCnt);
				break;
			case AUTHAMD_FAMILY_10:
				msl->aal_eccerrcnt[chan][cs] =
				    MCREG_FIELD_10_revAB(&sparectl, EccErrCnt);
				break;
			}
		}
	}

	return (1);
}

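/*
 * authamd_read_ecccnt() and authamd_clear_ecccnt() are normally used as a
 * pair - see authamd_bank_logout(), which snapshots the per-channel/
 * chip-select counts for a recognised memory ECC error and then zeroes
 * them so the next event starts from clean counters.
 */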
/*
 * Clear EccCnt for all possible channel/chip-select combos:
 *
 *	- set EccErrCntWrEn in sparectl, if necessary
 *	- write 0 to EccCnt for all channel/chip-select combinations
 *	- clear EccErrCntWrEn
 *
 * If requested also disable the interrupts taken on counter overflow
 * and on swap done.
 */
static void
authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint)
{
	union mcreg_sparectl sparectl;
	uint_t chipid = authamd->amd_shared->acs_chipid;
	uint_t family = authamd->amd_shared->acs_family;
	uint32_t rev = authamd->amd_shared->acs_rev;
	int chan, cs;

	if (!AUTHAMD_HAS_ONLINESPARECTL(rev))
		return;

	MCREG_VAL32(&sparectl) =
	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

	switch (family) {
	case AUTHAMD_FAMILY_F:
		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1;
		if (clrint) {
			MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0;
			MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0;
		}
		break;

	case AUTHAMD_FAMILY_10:
		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1;
		if (clrint) {
			MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0;
			MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0;
		}
		break;
	}

	authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
	    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
		switch (family) {
		case AUTHAMD_FAMILY_F:
			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
			    chan;
			break;

		case AUTHAMD_FAMILY_10:
			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
			    chan;
			break;
		}

		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
			switch (family) {
			case AUTHAMD_FAMILY_F:
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCntDramCs) = cs;
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCnt) = 0;
				break;

			case AUTHAMD_FAMILY_10:
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCntDramCs) = cs;
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCnt) = 0;
				break;
			}

			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
		}
	}
}

/*
 * cms_init entry point.
 *
 * This module provides broad model-specific support for AMD families
 * 0x6, 0xf and 0x10.  Future families will have to be evaluated once their
 * documentation is available.
 */
int
authamd_init(cmi_hdl_t hdl, void **datap)
{
	uint_t chipid = cmi_hdl_chipid(hdl);
	struct authamd_chipshared *sp, *osp;
	uint_t family = cmi_hdl_family(hdl);
	authamd_data_t *authamd;
	uint64_t cap;

	if (authamd_ms_support_disable || !AUTHAMD_SUPPORTED(family))
		return (ENOTSUP);

	if (!(x86_feature & X86_MCA))
		return (ENOTSUP);

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
		return (ENOTSUP);

	if (!(cap & MCG_CAP_CTL_P))
		return (ENOTSUP);

	authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP);
	cmi_hdl_hold(hdl);	/* release in fini */
	authamd->amd_hdl = hdl;

	if ((sp = authamd_shared[chipid]) == NULL) {
		sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP);
		osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp);
		if (osp != NULL) {
			kmem_free(sp, sizeof (struct authamd_chipshared));
			sp = osp;
		} else {
			sp->acs_chipid = chipid;
			sp->acs_family = family;
			sp->acs_rev = cmi_hdl_chiprev(hdl);
		}
	}
	authamd->amd_shared = sp;

	return (0);
}

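/*
 * The per-chip shared state above is allocated lazily by whichever core
 * of a chip gets here first; a core that loses the atomic_cas_ptr() race
 * frees its own allocation and adopts the winner's, so all cores of a
 * chip end up referencing a single authamd_chipshared structure.
 */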
/*
 * cms_logout_size entry point.
 */
/*ARGSUSED*/
size_t
authamd_logout_size(cmi_hdl_t hdl)
{
	return (sizeof (struct authamd_logout));
}

/*
 * cms_mcgctl_val entry point
 *
 * Instead of setting all bits to 1 we can set just those for the
 * error detector banks known to exist.
 */
/*ARGSUSED*/
uint64_t
authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed)
{
	return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed);
}

/*
 * cms_bankctl_skipinit entry point
 *
 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC)
 * may produce spurious machine checks.
 */
/*ARGSUSED*/
boolean_t
authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);

	return (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6 &&
	    bank == 0 ? B_TRUE : B_FALSE);
}

/*
 * cms_bankctl_val entry point
 */
uint64_t
authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	uint32_t rev = authamd->amd_shared->acs_rev;
	uint64_t val = proposed;

	/*
	 * The Intel MCA documentation says we can write all 1's to enable
	 * #MC for all errors, and AMD docs say much the same.  But,
	 * depending perhaps on other config registers, taking machine
	 * checks for some errors such as GART TLB errors and master/target
	 * aborts may be bad - they set UC and sometimes also PCC, but
	 * we should not always panic for these error types.
	 *
	 * Our cms_error_action entry point can suppress such panics,
	 * however we can also use the cms_bankctl_val entry point to
	 * veto enabling of some of the known villains in the first place.
	 */
	if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev))
		val &= ~AMD_NB_EN_GARTTBLWK;

	return (val);
}

/*
 * Bits to add to NB MCA config (after watchdog config).
 */
uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN;

/*
 * Bits to remove from NB MCA config (after watchdog config)
 */
uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN;

/*
 * NB Watchdog policy, and rate we use if enabling.
 */
enum {
	AUTHAMD_NB_WDOG_LEAVEALONE,
	AUTHAMD_NB_WDOG_DISABLE,
	AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED,
	AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE
} authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED;

uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 |
    AMD_NB_CFG_WDOGTMRBASESEL_1MS;

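/*
 * With the default ENABLE_IF_DISABLED policy the NB watchdog is turned on
 * only when the BIOS has left it disabled, using the count and timebase
 * selections above (4095 counts at a 1ms base, which presumably gives a
 * timeout on the order of four seconds).
 */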
/*
 * Per-core cache scrubbing policy and rates.
 */
enum {
	AUTHAMD_SCRUB_BIOSDEFAULT,	/* leave as BIOS configured */
	AUTHAMD_SCRUB_FIXED,		/* assign our chosen rate */
	AUTHAMD_SCRUB_MAX		/* use higher of ours and BIOS rate */
} authamd_scrub_policy = AUTHAMD_SCRUB_MAX;

uint32_t authamd_scrub_rate_dcache = 0xf;	/* 64K per 0.67 seconds */
uint32_t authamd_scrub_rate_l2cache = 0xe;	/* 1MB per 5.3 seconds */
uint32_t authamd_scrub_rate_l3cache = 0xd;	/* 1MB per 2.7 seconds */

static uint32_t
authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm)
{
	uint32_t rate;

	if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) {
		cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n",
		    varnm, AMD_NB_SCRUBCTL_RATE_MAX);
		osrate = AMD_NB_SCRUBCTL_RATE_MAX;
	}

	switch (authamd_scrub_policy) {
	case AUTHAMD_SCRUB_FIXED:
		rate = osrate;
		break;

	default:
		cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - "
		    "using default policy of AUTHAMD_SCRUB_MAX",
		    authamd_scrub_policy);
		/*FALLTHRU*/

	case AUTHAMD_SCRUB_MAX:
		if (osrate != 0 && biosrate != 0)
			rate = MIN(osrate, biosrate);	/* small is fast */
		else
			rate = osrate ? osrate : biosrate;
	}

	return (rate);
}

/*
 * cms_mca_init entry point.
 */
/*ARGSUSED*/
void
authamd_mca_init(cmi_hdl_t hdl, int nbanks)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	uint32_t rev = authamd->amd_shared->acs_rev;
	uint_t chipid = authamd->amd_shared->acs_chipid;

	/*
	 * On chips with a NB online spare control register take control
	 * and clear ECC counts.
	 */
	if (AUTHAMD_HAS_ONLINESPARECTL(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) {
		authamd_clear_ecccnt(authamd, B_TRUE);
	}

	/*
	 * And since we are claiming the telemetry, stop the BIOS from
	 * receiving an SMI on NB threshold overflow.
	 */
	if (AUTHAMD_NBMISC_NUM(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) {
		union mcmsr_nbmisc nbm;
		int i;

		authamd_bankstatus_prewrite(hdl, authamd);

		for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) {
			if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i),
			    (uint64_t *)&nbm) != CMI_SUCCESS)
				continue;

			if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0;
			} else if (X86_CHIPREV_ATLEAST(rev,
			    X86_CHIPREV_AMD_10_REV_A) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0;
			}

			(void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i),
			    MCMSR_VAL(&nbm));
		}

		authamd_bankstatus_postwrite(hdl, authamd);
	}

	/*
	 * NB MCA Configuration Register.
	 */
	if (AUTHAMD_DO_NBMCACFG(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_NBCFG);

		switch (authamd_nb_watchdog_policy) {
		case AUTHAMD_NB_WDOG_LEAVEALONE:
			break;

		case AUTHAMD_NB_WDOG_DISABLE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK);
			val |= AMD_NB_CFG_WDOGTMRDIS;
			break;

		default:
			cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d "
			    "unrecognised, using default policy",
			    authamd_nb_watchdog_policy);
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED:
			if (!(val & AMD_NB_CFG_WDOGTMRDIS))
				break;	/* if enabled leave rate intact */
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK |
			    AMD_NB_CFG_WDOGTMRDIS);
			val |= authamd_nb_mcacfg_wdog;
			break;
		}

		/*
		 * Bit 0 of the NB MCA Config register is reserved on family
		 * 0x10.
		 */
		if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
			authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN;

		val &= ~authamd_nb_mcacfg_remove;
		val |= authamd_nb_mcacfg_add;

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG,
		    val);
	}

	/*
	 * Cache scrubbing.  We can't enable DRAM scrubbing since
	 * we don't know the DRAM base for this node.
	 */
	if (AUTHAMD_HAS_CHIPSCRUB(rev) &&
	    authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL);
		int l3cap = 0;

		if (AUTHAMD_L3CAPABLE(rev)) {
			l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0;
		}

		authamd_scrub_rate_dcache =
		    authamd_scrubrate(authamd_scrub_rate_dcache,
		    (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT,
		    "authamd_scrub_rate_dcache");

		authamd_scrub_rate_l2cache =
		    authamd_scrubrate(authamd_scrub_rate_l2cache,
		    (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT,
		    "authamd_scrub_rate_l2cache");

		authamd_scrub_rate_l3cache = l3cap ?
		    authamd_scrubrate(authamd_scrub_rate_l3cache,
		    (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT,
		    "authamd_scrub_rate_l3cache") : 0;

		val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache,
		    authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache,
		    val & AMD_NB_SCRUBCTL_DRAM_MASK);

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL, val);
	}
}

/*
 * cms_poll_ownermask entry point.
 */
uint64_t
authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	struct authamd_chipshared *acsp = authamd->amd_shared;
	hrtime_t now = gethrtime_waitfree();
	hrtime_t last = acsp->acs_poll_timestamp;
	int dopoll = 0;

	if (now - last > 2 * pintvl || last == 0) {
		acsp->acs_pollowner = hdl;
		dopoll = 1;
	} else if (acsp->acs_pollowner == hdl) {
		dopoll = 1;
	}

	if (dopoll)
		acsp->acs_poll_timestamp = now;

	return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB));
}

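/*
 * The ownership election above means that only one core per chip polls
 * the shared NorthBridge bank:  the first core to poll within roughly two
 * poll intervals claims ownership and polls every bank (mask of all 1s),
 * while the remaining cores mask out AMD_MCA_BANK_NB and poll only their
 * core-private banks.
 */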
/*
 * cms_bank_logout entry point.
 */
/*ARGSUSED*/
void
authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status,
    uint64_t addr, uint64_t misc, void *mslogout)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	struct authamd_logout *msl = mslogout;
	uint32_t rev = authamd->amd_shared->acs_rev;

	if (msl == NULL)
		return;

	/*
	 * For main memory ECC errors on revisions with an Online Spare
	 * Control Register grab the ECC counts by channel and chip-select
	 * and reset them to 0.
	 */
	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
	    AUTHAMD_IS_MEMECCERR(bank, status) &&
	    AUTHAMD_HAS_ONLINESPARECTL(rev)) {
		if (authamd_read_ecccnt(authamd, msl))
			authamd_clear_ecccnt(authamd, B_FALSE);
	}
}

/*
 * cms_error_action entry point
 */

int authamd_forgive_uc = 0;	/* For test/debug only */
int authamd_forgive_pcc = 0;	/* For test/debug only */
int authamd_fake_poison = 0;	/* For test/debug only */

/*ARGSUSED*/
uint32_t
authamd_error_action(cmi_hdl_t hdl, int ismc, int bank,
    uint64_t status, uint64_t addr, uint64_t misc, void *mslogout)
{
	authamd_error_disp_t *disp;
	uint32_t rv = 0;

	if (authamd_forgive_uc)
		rv |= CMS_ERRSCOPE_CLEARED_UC;

	if (authamd_forgive_pcc)
		rv |= CMS_ERRSCOPE_CURCONTEXT_OK;

	if (authamd_fake_poison && status & MSR_MC_STATUS_UC)
		rv |= CMS_ERRSCOPE_POISONED;

	if (rv)
		return (rv);

	disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout);

	if (disp == &authamd_gart_disp) {
		/*
		 * GART walk errors set UC and possibly PCC (if source CPU)
		 * but should not be regarded as terminal.
		 */
		return (CMS_ERRSCOPE_IGNORE_ERR);
	}

	/*
	 * May also want to consider master abort and target abort.  These
	 * also set UC and PCC (if src CPU) but the requester gets -1
	 * and I believe the IO stuff in Solaris will handle that.
	 */

	return (rv);
}

/*
 * cms_disp_match entry point
 */
/*ARGSUSED*/
cms_cookie_t
authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status,
    uint64_t addr, uint64_t misc, void *mslogout)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	/* uint16_t errcode = MCAX86_ERRCODE(status); */
	uint16_t exterrcode = AMD_EXT_ERRCODE(status);
	uint32_t rev = authamd->amd_shared->acs_rev;

	/*
	 * Recognise main memory ECC errors
	 */
	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
	    AUTHAMD_IS_MEMECCERR(bank, status)) {
		if (status & AMD_BANK_STAT_CECC) {
			return (exterrcode == 0 ? &authamd_memce_disp :
			    &authamd_ckmemce_disp);
		} else if (status & AMD_BANK_STAT_UECC) {
			return (exterrcode == 0 ? &authamd_memue_disp :
			    &authamd_ckmemue_disp);
		}
	}

	/*
	 * Recognise GART walk errors
	 */
	if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status))
		return (&authamd_gart_disp);

	return (NULL);
}

/*
 * cms_ereport_class entry point
 */
/*ARGSUSED*/
void
authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie,
    const char **cpuclsp, const char **leafclsp)
{
	const authamd_error_disp_t *aed = mscookie;

	if (aed == NULL)
		return;

	if (aed->aad_subclass != NULL)
		*cpuclsp = aed->aad_subclass;
	if (aed->aad_leafclass != NULL)
		*leafclsp = aed->aad_leafclass;
}

/*ARGSUSED*/
static void
authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd,
    nvlist_t *ereport, nv_alloc_t *nva, void *mslogout)
{
	nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
	uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
	authamd_logout_t *msl;
	nvlist_t *nvl;
	int nelems = 0;
	int i, chan, cs;

	if ((msl = mslogout) == NULL)
		return;

	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
			if (msl->aal_eccerrcnt[chan][cs] == 0)
				continue;

			if ((nvl = fm_nvlist_create(nva)) == NULL)
				continue;

			elems[nelems] = nvl;
			counts[nelems++] = msl->aal_eccerrcnt[chan][cs];

			fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5,
			    "motherboard", 0,
			    "chip", authamd->amd_shared->acs_chipid,
			    "memory-controller", 0,
			    "dram-channel", chan,
			    "chip-select", cs);
		}
	}

	if (nelems == 0)
		return;

	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE,
	    DATA_TYPE_NVLIST_ARRAY, nelems, elems,
	    NULL);

	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT,
	    DATA_TYPE_UINT8_ARRAY, nelems, &counts[0],
	    NULL);

	for (i = 0; i < nelems; i++)
		fm_nvlist_destroy(elems[i],
		    nva ? FM_NVA_RETAIN : FM_NVA_FREE);
}

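/*
 * Each resource element built above names a DIMM location as an hc-scheme
 * FMRI of the form motherboard/chip/memory-controller/dram-channel/
 * chip-select, and the parallel RESOURCECNT array carries the ECC count
 * observed for that element.
 */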
/*
 * cms_ereport_add_logout entry point
 */
/*ARGSUSED*/
void
authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva,
    int bank, uint64_t status, uint64_t addr, uint64_t misc,
    void *mslogout, cms_cookie_t mscookie)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	const authamd_error_disp_t *aed = mscookie;
	uint64_t members;

	if (aed == NULL)
		return;

	members = aed->aad_ereport_members;

	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) {
		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND,
		    DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status),
		    NULL);

		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
			fm_payload_set(ereport,
			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
			    DATA_TYPE_STRING, "E",
			    NULL);
		}
	}

	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) {
		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND,
		    DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status),
		    NULL);

		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
			fm_payload_set(ereport,
			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
			    DATA_TYPE_STRING, "C",
			    NULL);
		}
	}

	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE &&
	    status & MSR_MC_STATUS_ADDRV) {
		authamd_ereport_add_resource(hdl, authamd, ereport, nva,
		    mslogout);
	}
}

/*
 * cms_msrinject entry point
 */
cms_errno_t
authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	cms_errno_t rv = CMSERR_BADMSRWRITE;

	authamd_bankstatus_prewrite(hdl, authamd);
	if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS)
		rv = CMS_SUCCESS;
	authamd_bankstatus_postwrite(hdl, authamd);

	return (rv);
}

cms_api_ver_t _cms_api_version = CMS_API_VERSION_0;

const cms_ops_t _cms_ops = {
	authamd_init,			/* cms_init */
	NULL,				/* cms_post_startup */
	NULL,				/* cms_post_mpstartup */
	authamd_logout_size,		/* cms_logout_size */
	authamd_mcgctl_val,		/* cms_mcgctl_val */
	authamd_bankctl_skipinit,	/* cms_bankctl_skipinit */
	authamd_bankctl_val,		/* cms_bankctl_val */
	NULL,				/* cms_bankstatus_skipinit */
	NULL,				/* cms_bankstatus_val */
	authamd_mca_init,		/* cms_mca_init */
	authamd_poll_ownermask,		/* cms_poll_ownermask */
	authamd_bank_logout,		/* cms_bank_logout */
	authamd_error_action,		/* cms_error_action */
	authamd_disp_match,		/* cms_disp_match */
	authamd_ereport_class,		/* cms_ereport_class */
	NULL,				/* cms_ereport_detector */
	NULL,				/* cms_ereport_includestack */
	authamd_ereport_add_logout,	/* cms_ereport_add_logout */
	authamd_msrinject,		/* cms_msrinject */
	NULL,				/* cms_fini */
};

static struct modlcpu modlcpu = {
	&mod_cpuops,
	"Generic AMD model-specific MCA"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlcpu,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}