1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * "Generic AMD" model-specific support. If no more-specific support can 31 * be found, or such modules declines to initialize, then for AuthenticAMD 32 * cpus this module can have a crack at providing some AMD model-specific 33 * support that at least goes beyond common MCA architectural features 34 * if not down to the nitty-gritty level for a particular model. We 35 * are layered on top of a cpu module, likely cpu.generic, so there is no 36 * need for us to perform common architecturally-accessible functions. 37 */ 38 39 #include <sys/types.h> 40 #include <sys/cmn_err.h> 41 #include <sys/modctl.h> 42 #include <sys/cpu_module.h> 43 #include <sys/mca_x86.h> 44 #include <sys/pci_cfgspace.h> 45 #include <sys/x86_archext.h> 46 #include <sys/mc_amd.h> 47 #include <sys/fm/protocol.h> 48 #include <sys/fm/cpu/GENAMD.h> 49 #include <sys/nvpair.h> 50 #include <sys/controlregs.h> 51 #include <sys/pghw.h> 52 #include <sys/sunddi.h> 53 #include <sys/sysmacros.h> 54 #include <sys/cpu_module_ms_impl.h> 55 56 #include "authamd.h" 57 58 int authamd_ms_support_disable = 0; 59 60 #define AUTHAMD_F_REVS_BCDE \ 61 (X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \ 62 X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \ 63 X86_CHIPREV_AMD_F_REV_E) 64 65 #define AUTHAMD_F_REVS_FG \ 66 (X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G) 67 68 #define AUTHAMD_10_REVS_AB \ 69 (X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B) 70 71 /* 72 * Bitmasks of support for various features. Try to enable features 73 * via inclusion in one of these bitmasks and check that at the 74 * feature imlementation - that way new family support may often simply 75 * simply need to update these bitmasks. 76 */ 77 78 /* 79 * Families that this module will provide some model-specific 80 * support for (if no more-specific module claims it first). 81 * We try to support whole families rather than differentiate down 82 * to revision. 83 */ 84 #define AUTHAMD_SUPPORTED(fam) \ 85 ((fam) == AUTHAMD_FAMILY_6 || (fam) == AUTHAMD_FAMILY_F || \ 86 (fam) == AUTHAMD_FAMILY_10) 87 88 /* 89 * Models that include an on-chip NorthBridge. 90 */ 91 #define AUTHAMD_NBONCHIP(rev) \ 92 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 93 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 94 95 /* 96 * Families/revisions for which we can recognise main memory ECC errors. 97 */ 98 #define AUTHAMD_MEMECC_RECOGNISED(rev) \ 99 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 100 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 101 102 /* 103 * Families/revisions that have an Online Spare Control Register 104 */ 105 #define AUTHAMD_HAS_ONLINESPARECTL(rev) \ 106 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \ 107 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 108 109 /* 110 * Families/revisions for which we will perform NB MCA Config changes 111 */ 112 #define AUTHAMD_DO_NBMCACFG(rev) \ 113 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 114 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 115 116 /* 117 * Families/revisions that have chip cache scrubbers. 118 */ 119 #define AUTHAMD_HAS_CHIPSCRUB(rev) \ 120 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 121 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 122 123 /* 124 * Families/revisions that have a NB misc register or registers - 125 * evaluates to 0 if no support, otherwise the number of MC4_MISCj. 126 */ 127 #define AUTHAMD_NBMISC_NUM(rev) \ 128 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \ 129 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0)) 130 131 /* 132 * Families/revision for which we wish not to machine check for GART 133 * table walk errors - bit 10 of NB CTL. 134 */ 135 #define AUTHAMD_NOGARTTBLWLK_MC(rev) \ 136 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 137 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 138 139 /* 140 * Families/revisions that are potentially L3 capable 141 */ 142 #define AUTHAMD_L3CAPABLE(rev) \ 143 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 144 145 /* 146 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED 147 * revisions as: 148 * 149 * - being reported by the NB 150 * - being a compound bus/interconnect error (external to chip) 151 * - having LL of LG 152 * - having II of MEM (but could still be a master/target abort) 153 * - having CECC or UECC set 154 * 155 * We do not check the extended error code (first nibble of the 156 * model-specific error code on AMD) since this has changed from 157 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10). 158 * Instead we use CECC/UECC to separate off the master/target 159 * abort cases. 160 * 161 * We insist that the detector be the NorthBridge bank; although 162 * IC/DC can report some main memory errors, they do not capture 163 * an address at sufficient resolution to be useful and the NB will 164 * report most errors. 165 */ 166 #define AUTHAMD_IS_MEMECCERR(bank, status) \ 167 ((bank) == AMD_MCA_BANK_NB && \ 168 MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \ 169 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 170 MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \ 171 ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC))) 172 173 static authamd_error_disp_t authamd_memce_disp = { 174 FM_EREPORT_CPU_GENAMD, 175 FM_EREPORT_CPU_GENAMD_MEM_CE, 176 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE 177 }; 178 179 static authamd_error_disp_t authamd_memue_disp = { 180 FM_EREPORT_CPU_GENAMD, 181 FM_EREPORT_CPU_GENAMD_MEM_UE, 182 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE 183 }; 184 185 static authamd_error_disp_t authamd_ckmemce_disp = { 186 FM_EREPORT_CPU_GENAMD, 187 FM_EREPORT_CPU_GENAMD_CKMEM_CE, 188 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE 189 }; 190 191 static authamd_error_disp_t authamd_ckmemue_disp = { 192 FM_EREPORT_CPU_GENAMD, 193 FM_EREPORT_CPU_GENAMD_CKMEM_UE, 194 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE 195 }; 196 197 /* 198 * We recognise GART walk errors as: 199 * 200 * - being reported by the NB 201 * - being a compound TLB error 202 * - having LL of LG and TT of GEN 203 * - having UC set 204 * - possibly having PCC set (if source CPU) 205 */ 206 #define AUTHAMD_IS_GARTERR(bank, status) \ 207 ((bank) == AMD_MCA_BANK_NB && \ 208 MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \ 209 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 210 MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \ 211 (status) & MSR_MC_STATUS_UC) 212 213 static authamd_error_disp_t authamd_gart_disp = { 214 FM_EREPORT_CPU_GENAMD, /* use generic subclass */ 215 FM_EREPORT_CPU_GENADM_GARTTBLWLK, /* use generic leafclass */ 216 0 /* no additional payload */ 217 }; 218 219 220 static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS]; 221 222 static int 223 authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what) 224 { 225 return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce, 226 what) == 0 ? B_TRUE : B_FALSE); 227 } 228 229 static void 230 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val) 231 { 232 ASSERT(chipid + 24 <= 31); 233 ASSERT((func & 7) == func); 234 ASSERT((reg & 3) == 0 && reg < 256); 235 236 cmi_pci_putl(0, chipid + 24, func, reg, 0, val); 237 } 238 239 static uint32_t 240 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg) 241 { 242 ASSERT(chipid + 24 <= 31); 243 ASSERT((func & 7) == func); 244 ASSERT((reg & 3) == 0 && reg < 256); 245 246 return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0)); 247 } 248 249 void 250 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd) 251 { 252 uint64_t hwcr; 253 254 if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS) 255 return; 256 257 authamd->amd_hwcr = hwcr; 258 259 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 260 hwcr |= AMD_HWCR_MCI_STATUS_WREN; 261 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 262 } 263 } 264 265 void 266 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd) 267 { 268 uint64_t hwcr = authamd->amd_hwcr; 269 270 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 271 hwcr &= ~AMD_HWCR_MCI_STATUS_WREN; 272 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 273 } 274 } 275 276 /* 277 * Read EccCnt repeatedly for all possible channel/chip-select combos: 278 * 279 * - read sparectl register 280 * - if EccErrCntWrEn is set, clear that bit in the just-read value 281 * and write it back to sparectl; this *may* clobber the EccCnt 282 * for the channel/chip-select combination currently selected, so 283 * we leave this bit clear if we had to clear it 284 * - cycle through all channel/chip-select combinations writing each 285 * combination to sparectl before reading the register back for 286 * EccCnt for that combination; since EccErrCntWrEn is clear 287 * the writes to select what count to read will not themselves 288 * zero any counts 289 */ 290 static int 291 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl) 292 { 293 union mcreg_sparectl sparectl; 294 uint_t chipid = authamd->amd_shared->acs_chipid; 295 uint_t family = authamd->amd_shared->acs_family; 296 uint32_t rev = authamd->amd_shared->acs_rev; 297 int chan, cs; 298 299 /* 300 * Check for feature support; this macro will test down to the 301 * family revision number, whereafter we'll switch on family 302 * assuming that future revisions will use the same register 303 * format. 304 */ 305 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) { 306 bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt)); 307 return (0); 308 } 309 310 MCREG_VAL32(&sparectl) = 311 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 312 313 switch (family) { 314 case AUTHAMD_FAMILY_F: 315 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0; 316 break; 317 318 case AUTHAMD_FAMILY_10: 319 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0; 320 break; 321 } 322 323 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 324 switch (family) { 325 case AUTHAMD_FAMILY_F: 326 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 327 chan; 328 break; 329 330 case AUTHAMD_FAMILY_10: 331 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 332 chan; 333 break; 334 } 335 336 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 337 switch (family) { 338 case AUTHAMD_FAMILY_F: 339 MCREG_FIELD_F_revFG(&sparectl, 340 EccErrCntDramCs) = cs; 341 break; 342 343 case AUTHAMD_FAMILY_10: 344 MCREG_FIELD_10_revAB(&sparectl, 345 EccErrCntDramCs) = cs; 346 break; 347 } 348 349 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 350 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 351 352 MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid, 353 MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 354 355 switch (family) { 356 case AUTHAMD_FAMILY_F: 357 msl->aal_eccerrcnt[chan][cs] = 358 MCREG_FIELD_F_revFG(&sparectl, EccErrCnt); 359 break; 360 case AUTHAMD_FAMILY_10: 361 msl->aal_eccerrcnt[chan][cs] = 362 MCREG_FIELD_10_revAB(&sparectl, EccErrCnt); 363 break; 364 } 365 } 366 } 367 368 return (1); 369 } 370 371 /* 372 * Clear EccCnt for all possible channel/chip-select combos: 373 * 374 * - set EccErrCntWrEn in sparectl, if necessary 375 * - write 0 to EccCnt for all channel/chip-select combinations 376 * - clear EccErrCntWrEn 377 * 378 * If requested also disable the interrupts taken on counter overflow 379 * and on swap done. 380 */ 381 static void 382 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint) 383 { 384 union mcreg_sparectl sparectl; 385 uint_t chipid = authamd->amd_shared->acs_chipid; 386 uint_t family = authamd->amd_shared->acs_family; 387 uint32_t rev = authamd->amd_shared->acs_rev; 388 int chan, cs; 389 390 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) 391 return; 392 393 MCREG_VAL32(&sparectl) = 394 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 395 396 switch (family) { 397 case AUTHAMD_FAMILY_F: 398 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1; 399 if (clrint) { 400 MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0; 401 MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0; 402 } 403 break; 404 405 case AUTHAMD_FAMILY_10: 406 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1; 407 if (clrint) { 408 MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0; 409 MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0; 410 } 411 break; 412 } 413 414 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 415 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 416 417 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 418 switch (family) { 419 case AUTHAMD_FAMILY_F: 420 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 421 chan; 422 break; 423 424 case AUTHAMD_FAMILY_10: 425 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 426 chan; 427 break; 428 } 429 430 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 431 switch (family) { 432 case AUTHAMD_FAMILY_F: 433 MCREG_FIELD_F_revFG(&sparectl, 434 EccErrCntDramCs) = cs; 435 MCREG_FIELD_F_revFG(&sparectl, 436 EccErrCnt) = 0; 437 break; 438 439 case AUTHAMD_FAMILY_10: 440 MCREG_FIELD_10_revAB(&sparectl, 441 EccErrCntDramCs) = cs; 442 MCREG_FIELD_10_revAB(&sparectl, 443 EccErrCnt) = 0; 444 break; 445 } 446 447 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 448 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 449 } 450 } 451 } 452 453 /* 454 * cms_init entry point. 455 * 456 * This module provides broad model-specific support for AMD families 457 * 0x6, 0xf and 0x10. Future families will have to be evaluated once their 458 * documentation is available. 459 */ 460 int 461 authamd_init(cmi_hdl_t hdl, void **datap) 462 { 463 uint_t chipid = cmi_hdl_chipid(hdl); 464 struct authamd_chipshared *sp, *osp; 465 uint_t family = cmi_hdl_family(hdl); 466 authamd_data_t *authamd; 467 uint64_t cap; 468 469 if (authamd_ms_support_disable || !AUTHAMD_SUPPORTED(family)) 470 return (ENOTSUP); 471 472 if (!(x86_feature & X86_MCA)) 473 return (ENOTSUP); 474 475 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 476 return (ENOTSUP); 477 478 if (!(cap & MCG_CAP_CTL_P)) 479 return (ENOTSUP); 480 481 authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP); 482 cmi_hdl_hold(hdl); /* release in fini */ 483 authamd->amd_hdl = hdl; 484 485 if ((sp = authamd_shared[chipid]) == NULL) { 486 sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP); 487 sp->acs_chipid = chipid; 488 sp->acs_family = family; 489 sp->acs_rev = cmi_hdl_chiprev(hdl); 490 membar_producer(); 491 492 osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp); 493 if (osp != NULL) { 494 kmem_free(sp, sizeof (struct authamd_chipshared)); 495 sp = osp; 496 } 497 } 498 authamd->amd_shared = sp; 499 500 return (0); 501 } 502 503 /* 504 * cms_logout_size entry point. 505 */ 506 /*ARGSUSED*/ 507 size_t 508 authamd_logout_size(cmi_hdl_t hdl) 509 { 510 return (sizeof (struct authamd_logout)); 511 } 512 513 /* 514 * cms_mcgctl_val entry point 515 * 516 * Instead of setting all bits to 1 we can set just those for the 517 * error detector banks known to exist. 518 */ 519 /*ARGSUSED*/ 520 uint64_t 521 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed) 522 { 523 return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed); 524 } 525 526 /* 527 * cms_bankctl_skipinit entry point 528 * 529 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC) 530 * may produce spurious machine checks. 531 * 532 * Only allow a single core to setup the NorthBridge MCi_CTL register. 533 */ 534 /*ARGSUSED*/ 535 boolean_t 536 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank) 537 { 538 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 539 uint32_t rev = authamd->amd_shared->acs_rev; 540 541 if (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6) 542 return (bank == 0 ? B_TRUE : B_FALSE); 543 544 if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) { 545 return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) == 546 B_TRUE ? B_FALSE : B_TRUE); 547 } 548 549 return (B_FALSE); 550 } 551 552 /* 553 * cms_bankctl_val entry point 554 */ 555 uint64_t 556 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed) 557 { 558 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 559 uint32_t rev = authamd->amd_shared->acs_rev; 560 uint64_t val = proposed; 561 562 /* 563 * The Intel MCA says we can write all 1's to enable #MC for 564 * all errors, and AMD docs say much the same. But, depending 565 * perhaps on other config registers, taking machine checks 566 * for some errors such as GART TLB errors and master/target 567 * aborts may be bad - they set UC and sometime also PCC, but 568 * we should not always panic for these error types. 569 * 570 * Our cms_error_action entry point can suppress such panics, 571 * however we can also use the cms_bankctl_val entry point to 572 * veto enabling of some of the known villains in the first place. 573 */ 574 if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev)) 575 val &= ~AMD_NB_EN_GARTTBLWK; 576 577 return (val); 578 } 579 580 /* 581 * Bits to add to NB MCA config (after watchdog config). 582 */ 583 uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN; 584 585 /* 586 * Bits to remove from NB MCA config (after watchdog config) 587 */ 588 uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN; 589 590 /* 591 * NB Watchdog policy, and rate we use if enabling. 592 */ 593 enum { 594 AUTHAMD_NB_WDOG_LEAVEALONE, 595 AUTHAMD_NB_WDOG_DISABLE, 596 AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED, 597 AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE 598 } authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED; 599 600 uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 | 601 AMD_NB_CFG_WDOGTMRBASESEL_1MS; 602 603 /* 604 * Per-core cache scrubbing policy and rates. 605 */ 606 enum { 607 AUTHAMD_SCRUB_BIOSDEFAULT, /* leave as BIOS configured */ 608 AUTHAMD_SCRUB_FIXED, /* assign our chosen rate */ 609 AUTHAMD_SCRUB_MAX /* use higher of ours and BIOS rate */ 610 } authamd_scrub_policy = AUTHAMD_SCRUB_MAX; 611 612 uint32_t authamd_scrub_rate_dcache = 0xf; /* 64K per 0.67 seconds */ 613 uint32_t authamd_scrub_rate_l2cache = 0xe; /* 1MB per 5.3 seconds */ 614 uint32_t authamd_scrub_rate_l3cache = 0xd; /* 1MB per 2.7 seconds */ 615 616 static uint32_t 617 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm) 618 { 619 uint32_t rate; 620 621 if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) { 622 cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n", 623 varnm, AMD_NB_SCRUBCTL_RATE_MAX); 624 osrate = AMD_NB_SCRUBCTL_RATE_MAX; 625 } 626 627 switch (authamd_scrub_policy) { 628 case AUTHAMD_SCRUB_FIXED: 629 rate = osrate; 630 break; 631 632 default: 633 cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - " 634 "using default policy of AUTHAMD_SCRUB_MAX", 635 authamd_scrub_policy); 636 /*FALLTHRU*/ 637 638 case AUTHAMD_SCRUB_MAX: 639 if (osrate != 0 && biosrate != 0) 640 rate = MIN(osrate, biosrate); /* small is fast */ 641 else 642 rate = osrate ? osrate : biosrate; 643 } 644 645 return (rate); 646 } 647 648 /* 649 * cms_mca_init entry point. 650 */ 651 /*ARGSUSED*/ 652 void 653 authamd_mca_init(cmi_hdl_t hdl, int nbanks) 654 { 655 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 656 uint32_t rev = authamd->amd_shared->acs_rev; 657 uint_t chipid = authamd->amd_shared->acs_chipid; 658 659 /* 660 * On chips with a NB online spare control register take control 661 * and clear ECC counts. 662 */ 663 if (AUTHAMD_HAS_ONLINESPARECTL(rev) && 664 authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) { 665 authamd_clear_ecccnt(authamd, B_TRUE); 666 } 667 668 /* 669 * And since we are claiming the telemetry stop the BIOS receiving 670 * an SMI on NB threshold overflow. 671 */ 672 if (AUTHAMD_NBMISC_NUM(rev) && 673 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) { 674 union mcmsr_nbmisc nbm; 675 int i; 676 677 authamd_bankstatus_prewrite(hdl, authamd); 678 679 for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) { 680 if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i), 681 (uint64_t *)&nbm) != CMI_SUCCESS) 682 continue; 683 684 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) && 685 MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) && 686 MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) { 687 MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0; 688 } else if (X86_CHIPREV_ATLEAST(rev, 689 X86_CHIPREV_AMD_10_REV_A) && 690 MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) && 691 MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) { 692 MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0; 693 } 694 695 (void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i), 696 MCMSR_VAL(&nbm)); 697 } 698 699 authamd_bankstatus_postwrite(hdl, authamd); 700 } 701 702 /* 703 * NB MCA Configuration Register. 704 */ 705 if (AUTHAMD_DO_NBMCACFG(rev) && 706 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) { 707 uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 708 MC_CTL_REG_NBCFG); 709 710 switch (authamd_nb_watchdog_policy) { 711 case AUTHAMD_NB_WDOG_LEAVEALONE: 712 break; 713 714 case AUTHAMD_NB_WDOG_DISABLE: 715 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 716 AMD_NB_CFG_WDOGTMRCNTSEL_MASK); 717 val |= AMD_NB_CFG_WDOGTMRDIS; 718 break; 719 720 default: 721 cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d " 722 "unrecognised, using default policy", 723 authamd_nb_watchdog_policy); 724 /*FALLTHRU*/ 725 726 case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED: 727 if (!(val & AMD_NB_CFG_WDOGTMRDIS)) 728 break; /* if enabled leave rate intact */ 729 /*FALLTHRU*/ 730 731 case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE: 732 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 733 AMD_NB_CFG_WDOGTMRCNTSEL_MASK | 734 AMD_NB_CFG_WDOGTMRDIS); 735 val |= authamd_nb_mcacfg_wdog; 736 break; 737 } 738 739 /* 740 * Bit 0 of the NB MCA Config register is reserved on family 741 * 0x10. 742 */ 743 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 744 authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN; 745 746 val &= ~authamd_nb_mcacfg_remove; 747 val |= authamd_nb_mcacfg_add; 748 749 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG, 750 val); 751 } 752 753 /* 754 * Cache scrubbing. We can't enable DRAM scrubbing since 755 * we don't know the DRAM base for this node. 756 */ 757 if (AUTHAMD_HAS_CHIPSCRUB(rev) && 758 authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT && 759 authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) { 760 uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 761 MC_CTL_REG_SCRUBCTL); 762 int l3cap = 0; 763 764 if (AUTHAMD_L3CAPABLE(rev)) { 765 l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 766 MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0; 767 } 768 769 authamd_scrub_rate_dcache = 770 authamd_scrubrate(authamd_scrub_rate_dcache, 771 (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT, 772 "authamd_scrub_rate_dcache"); 773 774 authamd_scrub_rate_l2cache = 775 authamd_scrubrate(authamd_scrub_rate_l2cache, 776 (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT, 777 "authamd_scrub_rate_l2cache"); 778 779 authamd_scrub_rate_l3cache = l3cap ? 780 authamd_scrubrate(authamd_scrub_rate_l3cache, 781 (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT, 782 "authamd_scrub_rate_l3cache") : 0; 783 784 val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache, 785 authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache, 786 val & AMD_NB_SCRUBCTL_DRAM_MASK); 787 788 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 789 MC_CTL_REG_SCRUBCTL, val); 790 } 791 792 } 793 794 /* 795 * cms_poll_ownermask entry point. 796 */ 797 uint64_t 798 authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl) 799 { 800 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 801 struct authamd_chipshared *acsp = authamd->amd_shared; 802 hrtime_t now = gethrtime_waitfree(); 803 hrtime_t last = acsp->acs_poll_timestamp; 804 int dopoll = 0; 805 806 if (now - last > 2 * pintvl || last == 0) { 807 acsp->acs_pollowner = hdl; 808 dopoll = 1; 809 } else if (acsp->acs_pollowner == hdl) { 810 dopoll = 1; 811 } 812 813 if (dopoll) 814 acsp->acs_poll_timestamp = now; 815 816 return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB)); 817 818 } 819 820 /* 821 * cms_bank_logout entry point. 822 */ 823 /*ARGSUSED*/ 824 void 825 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status, 826 uint64_t addr, uint64_t misc, void *mslogout) 827 { 828 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 829 struct authamd_logout *msl = mslogout; 830 uint32_t rev = authamd->amd_shared->acs_rev; 831 832 if (msl == NULL) 833 return; 834 835 /* 836 * For main memory ECC errors on revisions with an Online Spare 837 * Control Register grab the ECC counts by channel and chip-select 838 * and reset them to 0. 839 */ 840 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 841 AUTHAMD_IS_MEMECCERR(bank, status) && 842 AUTHAMD_HAS_ONLINESPARECTL(rev)) { 843 if (authamd_read_ecccnt(authamd, msl)) 844 authamd_clear_ecccnt(authamd, B_FALSE); 845 } 846 } 847 848 /* 849 * cms_error_action entry point 850 */ 851 852 int authamd_forgive_uc = 0; /* For test/debug only */ 853 int authamd_forgive_pcc = 0; /* For test/debug only */ 854 int authamd_fake_poison = 0; /* For test/debug only */ 855 856 /*ARGSUSED*/ 857 uint32_t 858 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank, 859 uint64_t status, uint64_t addr, uint64_t misc, void *mslogout) 860 { 861 authamd_error_disp_t *disp; 862 uint32_t rv = 0; 863 864 if (authamd_forgive_uc) 865 rv |= CMS_ERRSCOPE_CLEARED_UC; 866 867 if (authamd_forgive_pcc) 868 rv |= CMS_ERRSCOPE_CURCONTEXT_OK; 869 870 if (authamd_fake_poison && status & MSR_MC_STATUS_UC) 871 rv |= CMS_ERRSCOPE_POISONED; 872 873 if (rv) 874 return (rv); 875 876 disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout); 877 878 if (disp == &authamd_gart_disp) { 879 /* 880 * GART walk errors set UC and possibly PCC (if source CPU) 881 * but should not be regarded as terminal. 882 */ 883 return (CMS_ERRSCOPE_IGNORE_ERR); 884 } 885 886 /* 887 * May also want to consider master abort and target abort. These 888 * also set UC and PCC (if src CPU) but the requester gets -1 889 * and I believe the IO stuff in Solaris will handle that. 890 */ 891 892 return (rv); 893 } 894 895 /* 896 * cms_disp_match entry point 897 */ 898 /*ARGSUSED*/ 899 cms_cookie_t 900 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status, 901 uint64_t addr, uint64_t misc, void *mslogout) 902 { 903 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 904 /* uint16_t errcode = MCAX86_ERRCODE(status); */ 905 uint16_t exterrcode = AMD_EXT_ERRCODE(status); 906 uint32_t rev = authamd->amd_shared->acs_rev; 907 908 /* 909 * Recognise main memory ECC errors 910 */ 911 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 912 AUTHAMD_IS_MEMECCERR(bank, status)) { 913 if (status & AMD_BANK_STAT_CECC) { 914 return (exterrcode == 0 ? &authamd_memce_disp : 915 &authamd_ckmemce_disp); 916 } else if (status & AMD_BANK_STAT_UECC) { 917 return (exterrcode == 0 ? &authamd_memue_disp : 918 &authamd_ckmemue_disp); 919 } 920 } 921 922 /* 923 * Recognise GART walk errors 924 */ 925 if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status)) 926 return (&authamd_gart_disp); 927 928 return (NULL); 929 } 930 931 /* 932 * cms_ereport_class entry point 933 */ 934 /*ARGSUSED*/ 935 void 936 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie, 937 const char **cpuclsp, const char **leafclsp) 938 { 939 const authamd_error_disp_t *aed = mscookie; 940 941 if (aed == NULL) 942 return; 943 944 if (aed->aad_subclass != NULL) 945 *cpuclsp = aed->aad_subclass; 946 if (aed->aad_leafclass != NULL) 947 *leafclsp = aed->aad_leafclass; 948 } 949 950 /*ARGSUSED*/ 951 static void 952 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd, 953 nvlist_t *ereport, nv_alloc_t *nva, void *mslogout) 954 { 955 nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 956 uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 957 authamd_logout_t *msl; 958 nvlist_t *nvl; 959 int nelems = 0; 960 int i, chan, cs; 961 962 if ((msl = mslogout) == NULL) 963 return; 964 965 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 966 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 967 if (msl->aal_eccerrcnt[chan][cs] == 0) 968 continue; 969 970 if ((nvl = fm_nvlist_create(nva)) == NULL) 971 continue; 972 973 elems[nelems] = nvl; 974 counts[nelems++] = msl->aal_eccerrcnt[chan][cs]; 975 976 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5, 977 "motherboard", 0, 978 "chip", authamd->amd_shared->acs_chipid, 979 "memory-controller", 0, 980 "dram-channel", chan, 981 "chip-select", cs); 982 } 983 } 984 985 if (nelems == 0) 986 return; 987 988 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE, 989 DATA_TYPE_NVLIST_ARRAY, nelems, elems, 990 NULL); 991 992 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT, 993 DATA_TYPE_UINT8_ARRAY, nelems, &counts[0], 994 NULL); 995 996 for (i = 0; i < nelems; i++) 997 fm_nvlist_destroy(elems[i], nva ? FM_NVA_RETAIN : FM_NVA_FREE); 998 } 999 1000 /* 1001 * cms_ereport_add_logout entry point 1002 */ 1003 /*ARGSUSED*/ 1004 void 1005 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva, 1006 int bank, uint64_t status, uint64_t addr, uint64_t misc, 1007 void *mslogout, cms_cookie_t mscookie) 1008 { 1009 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1010 const authamd_error_disp_t *aed = mscookie; 1011 uint64_t members; 1012 1013 if (aed == NULL) 1014 return; 1015 1016 members = aed->aad_ereport_members; 1017 1018 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) { 1019 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND, 1020 DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status), 1021 NULL); 1022 1023 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1024 fm_payload_set(ereport, 1025 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1026 DATA_TYPE_STRING, "E", 1027 NULL); 1028 } 1029 } 1030 1031 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) { 1032 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND, 1033 DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status), 1034 NULL); 1035 1036 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1037 fm_payload_set(ereport, 1038 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1039 DATA_TYPE_STRING, "C", 1040 NULL); 1041 } 1042 } 1043 1044 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE && 1045 status & MSR_MC_STATUS_ADDRV) { 1046 authamd_ereport_add_resource(hdl, authamd, ereport, nva, 1047 mslogout); 1048 } 1049 } 1050 1051 /* 1052 * cms_msrinject entry point 1053 */ 1054 cms_errno_t 1055 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val) 1056 { 1057 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1058 cms_errno_t rv = CMSERR_BADMSRWRITE; 1059 1060 authamd_bankstatus_prewrite(hdl, authamd); 1061 if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS) 1062 rv = CMS_SUCCESS; 1063 authamd_bankstatus_postwrite(hdl, authamd); 1064 1065 return (rv); 1066 } 1067 1068 cms_api_ver_t _cms_api_version = CMS_API_VERSION_0; 1069 1070 const cms_ops_t _cms_ops = { 1071 authamd_init, /* cms_init */ 1072 NULL, /* cms_post_startup */ 1073 NULL, /* cms_post_mpstartup */ 1074 authamd_logout_size, /* cms_logout_size */ 1075 authamd_mcgctl_val, /* cms_mcgctl_val */ 1076 authamd_bankctl_skipinit, /* cms_bankctl_skipinit */ 1077 authamd_bankctl_val, /* cms_bankctl_val */ 1078 NULL, /* cms_bankstatus_skipinit */ 1079 NULL, /* cms_bankstatus_val */ 1080 authamd_mca_init, /* cms_mca_init */ 1081 authamd_poll_ownermask, /* cms_poll_ownermask */ 1082 authamd_bank_logout, /* cms_bank_logout */ 1083 authamd_error_action, /* cms_error_action */ 1084 authamd_disp_match, /* cms_disp_match */ 1085 authamd_ereport_class, /* cms_ereport_class */ 1086 NULL, /* cms_ereport_detector */ 1087 NULL, /* cms_ereport_includestack */ 1088 authamd_ereport_add_logout, /* cms_ereport_add_logout */ 1089 authamd_msrinject, /* cms_msrinject */ 1090 NULL, /* cms_fini */ 1091 }; 1092 1093 static struct modlcpu modlcpu = { 1094 &mod_cpuops, 1095 "Generic AMD model-specific MCA" 1096 }; 1097 1098 static struct modlinkage modlinkage = { 1099 MODREV_1, 1100 (void *)&modlcpu, 1101 NULL 1102 }; 1103 1104 int 1105 _init(void) 1106 { 1107 return (mod_install(&modlinkage)); 1108 } 1109 1110 int 1111 _info(struct modinfo *modinfop) 1112 { 1113 return (mod_info(&modlinkage, modinfop)); 1114 } 1115 1116 int 1117 _fini(void) 1118 { 1119 return (mod_remove(&modlinkage)); 1120 } 1121