1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * "Generic AMD" model-specific support. If no more-specific support can 29 * be found, or such modules declines to initialize, then for AuthenticAMD 30 * cpus this module can have a crack at providing some AMD model-specific 31 * support that at least goes beyond common MCA architectural features 32 * if not down to the nitty-gritty level for a particular model. We 33 * are layered on top of a cpu module, likely cpu.generic, so there is no 34 * need for us to perform common architecturally-accessible functions. 35 */ 36 37 #include <sys/types.h> 38 #include <sys/cmn_err.h> 39 #include <sys/modctl.h> 40 #include <sys/cpu_module.h> 41 #include <sys/mca_x86.h> 42 #include <sys/pci_cfgspace.h> 43 #include <sys/x86_archext.h> 44 #include <sys/mc_amd.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/cpu/GENAMD.h> 47 #include <sys/nvpair.h> 48 #include <sys/controlregs.h> 49 #include <sys/pghw.h> 50 #include <sys/sunddi.h> 51 #include <sys/sysmacros.h> 52 #include <sys/cpu_module_ms_impl.h> 53 54 #include "authamd.h" 55 56 int authamd_ms_support_disable = 0; 57 58 #define AUTHAMD_F_REVS_BCDE \ 59 (X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \ 60 X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \ 61 X86_CHIPREV_AMD_F_REV_E) 62 63 #define AUTHAMD_F_REVS_FG \ 64 (X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G) 65 66 #define AUTHAMD_10_REVS_AB \ 67 (X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B) 68 69 /* 70 * Bitmasks of support for various features. Try to enable features 71 * via inclusion in one of these bitmasks and check that at the 72 * feature imlementation - that way new family support may often simply 73 * simply need to update these bitmasks. 74 */ 75 76 /* 77 * Models that include an on-chip NorthBridge. 78 */ 79 #define AUTHAMD_NBONCHIP(rev) \ 80 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 81 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 82 83 /* 84 * Families/revisions for which we can recognise main memory ECC errors. 85 */ 86 #define AUTHAMD_MEMECC_RECOGNISED(rev) \ 87 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 88 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 89 90 /* 91 * Families/revisions that have an Online Spare Control Register 92 */ 93 #define AUTHAMD_HAS_ONLINESPARECTL(rev) \ 94 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \ 95 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 96 97 /* 98 * Families/revisions for which we will perform NB MCA Config changes 99 */ 100 #define AUTHAMD_DO_NBMCACFG(rev) \ 101 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 102 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 103 104 /* 105 * Families/revisions that have chip cache scrubbers. 106 */ 107 #define AUTHAMD_HAS_CHIPSCRUB(rev) \ 108 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 109 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 110 111 /* 112 * Families/revisions that have a NB misc register or registers - 113 * evaluates to 0 if no support, otherwise the number of MC4_MISCj. 114 */ 115 #define AUTHAMD_NBMISC_NUM(rev) \ 116 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \ 117 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0)) 118 119 /* 120 * Families/revision for which we wish not to machine check for GART 121 * table walk errors - bit 10 of NB CTL. 122 */ 123 #define AUTHAMD_NOGARTTBLWLK_MC(rev) \ 124 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 125 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 126 127 /* 128 * Families/revisions that are potentially L3 capable 129 */ 130 #define AUTHAMD_L3CAPABLE(rev) \ 131 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 132 133 /* 134 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED 135 * revisions as: 136 * 137 * - being reported by the NB 138 * - being a compound bus/interconnect error (external to chip) 139 * - having LL of LG 140 * - having II of MEM (but could still be a master/target abort) 141 * - having CECC or UECC set 142 * 143 * We do not check the extended error code (first nibble of the 144 * model-specific error code on AMD) since this has changed from 145 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10). 146 * Instead we use CECC/UECC to separate off the master/target 147 * abort cases. 148 * 149 * We insist that the detector be the NorthBridge bank; although 150 * IC/DC can report some main memory errors, they do not capture 151 * an address at sufficient resolution to be useful and the NB will 152 * report most errors. 153 */ 154 #define AUTHAMD_IS_MEMECCERR(bank, status) \ 155 ((bank) == AMD_MCA_BANK_NB && \ 156 MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \ 157 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 158 MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \ 159 ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC))) 160 161 static authamd_error_disp_t authamd_memce_disp = { 162 FM_EREPORT_CPU_GENAMD, 163 FM_EREPORT_CPU_GENAMD_MEM_CE, 164 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE 165 }; 166 167 static authamd_error_disp_t authamd_memue_disp = { 168 FM_EREPORT_CPU_GENAMD, 169 FM_EREPORT_CPU_GENAMD_MEM_UE, 170 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE 171 }; 172 173 static authamd_error_disp_t authamd_ckmemce_disp = { 174 FM_EREPORT_CPU_GENAMD, 175 FM_EREPORT_CPU_GENAMD_CKMEM_CE, 176 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE 177 }; 178 179 static authamd_error_disp_t authamd_ckmemue_disp = { 180 FM_EREPORT_CPU_GENAMD, 181 FM_EREPORT_CPU_GENAMD_CKMEM_UE, 182 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE 183 }; 184 185 /* 186 * We recognise GART walk errors as: 187 * 188 * - being reported by the NB 189 * - being a compound TLB error 190 * - having LL of LG and TT of GEN 191 * - having UC set 192 * - possibly having PCC set (if source CPU) 193 */ 194 #define AUTHAMD_IS_GARTERR(bank, status) \ 195 ((bank) == AMD_MCA_BANK_NB && \ 196 MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \ 197 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 198 MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \ 199 (status) & MSR_MC_STATUS_UC) 200 201 static authamd_error_disp_t authamd_gart_disp = { 202 FM_EREPORT_CPU_GENAMD, /* use generic subclass */ 203 FM_EREPORT_CPU_GENADM_GARTTBLWLK, /* use generic leafclass */ 204 0 /* no additional payload */ 205 }; 206 207 208 static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS]; 209 210 static int 211 authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what) 212 { 213 return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce, 214 what) == 0 ? B_TRUE : B_FALSE); 215 } 216 217 static void 218 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val) 219 { 220 ASSERT(chipid + 24 <= 31); 221 ASSERT((func & 7) == func); 222 ASSERT((reg & 3) == 0 && reg < 256); 223 224 cmi_pci_putl(0, chipid + 24, func, reg, 0, val); 225 } 226 227 static uint32_t 228 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg) 229 { 230 ASSERT(chipid + 24 <= 31); 231 ASSERT((func & 7) == func); 232 ASSERT((reg & 3) == 0 && reg < 256); 233 234 return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0)); 235 } 236 237 void 238 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd) 239 { 240 uint64_t hwcr; 241 242 if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS) 243 return; 244 245 authamd->amd_hwcr = hwcr; 246 247 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 248 hwcr |= AMD_HWCR_MCI_STATUS_WREN; 249 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 250 } 251 } 252 253 void 254 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd) 255 { 256 uint64_t hwcr = authamd->amd_hwcr; 257 258 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 259 hwcr &= ~AMD_HWCR_MCI_STATUS_WREN; 260 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 261 } 262 } 263 264 /* 265 * Read EccCnt repeatedly for all possible channel/chip-select combos: 266 * 267 * - read sparectl register 268 * - if EccErrCntWrEn is set, clear that bit in the just-read value 269 * and write it back to sparectl; this *may* clobber the EccCnt 270 * for the channel/chip-select combination currently selected, so 271 * we leave this bit clear if we had to clear it 272 * - cycle through all channel/chip-select combinations writing each 273 * combination to sparectl before reading the register back for 274 * EccCnt for that combination; since EccErrCntWrEn is clear 275 * the writes to select what count to read will not themselves 276 * zero any counts 277 */ 278 static int 279 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl) 280 { 281 union mcreg_sparectl sparectl; 282 uint_t chipid = authamd->amd_shared->acs_chipid; 283 uint_t family = authamd->amd_shared->acs_family; 284 uint32_t rev = authamd->amd_shared->acs_rev; 285 int chan, cs; 286 287 /* 288 * Check for feature support; this macro will test down to the 289 * family revision number, whereafter we'll switch on family 290 * assuming that future revisions will use the same register 291 * format. 292 */ 293 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) { 294 bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt)); 295 return (0); 296 } 297 298 MCREG_VAL32(&sparectl) = 299 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 300 301 switch (family) { 302 case AUTHAMD_FAMILY_F: 303 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0; 304 break; 305 306 case AUTHAMD_FAMILY_10: 307 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0; 308 break; 309 } 310 311 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 312 switch (family) { 313 case AUTHAMD_FAMILY_F: 314 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 315 chan; 316 break; 317 318 case AUTHAMD_FAMILY_10: 319 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 320 chan; 321 break; 322 } 323 324 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 325 switch (family) { 326 case AUTHAMD_FAMILY_F: 327 MCREG_FIELD_F_revFG(&sparectl, 328 EccErrCntDramCs) = cs; 329 break; 330 331 case AUTHAMD_FAMILY_10: 332 MCREG_FIELD_10_revAB(&sparectl, 333 EccErrCntDramCs) = cs; 334 break; 335 } 336 337 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 338 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 339 340 MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid, 341 MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 342 343 switch (family) { 344 case AUTHAMD_FAMILY_F: 345 msl->aal_eccerrcnt[chan][cs] = 346 MCREG_FIELD_F_revFG(&sparectl, EccErrCnt); 347 break; 348 case AUTHAMD_FAMILY_10: 349 msl->aal_eccerrcnt[chan][cs] = 350 MCREG_FIELD_10_revAB(&sparectl, EccErrCnt); 351 break; 352 } 353 } 354 } 355 356 return (1); 357 } 358 359 /* 360 * Clear EccCnt for all possible channel/chip-select combos: 361 * 362 * - set EccErrCntWrEn in sparectl, if necessary 363 * - write 0 to EccCnt for all channel/chip-select combinations 364 * - clear EccErrCntWrEn 365 * 366 * If requested also disable the interrupts taken on counter overflow 367 * and on swap done. 368 */ 369 static void 370 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint) 371 { 372 union mcreg_sparectl sparectl; 373 uint_t chipid = authamd->amd_shared->acs_chipid; 374 uint_t family = authamd->amd_shared->acs_family; 375 uint32_t rev = authamd->amd_shared->acs_rev; 376 int chan, cs; 377 378 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) 379 return; 380 381 MCREG_VAL32(&sparectl) = 382 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 383 384 switch (family) { 385 case AUTHAMD_FAMILY_F: 386 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1; 387 if (clrint) { 388 MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0; 389 MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0; 390 } 391 break; 392 393 case AUTHAMD_FAMILY_10: 394 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1; 395 if (clrint) { 396 MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0; 397 MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0; 398 } 399 break; 400 } 401 402 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 403 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 404 405 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 406 switch (family) { 407 case AUTHAMD_FAMILY_F: 408 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 409 chan; 410 break; 411 412 case AUTHAMD_FAMILY_10: 413 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 414 chan; 415 break; 416 } 417 418 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 419 switch (family) { 420 case AUTHAMD_FAMILY_F: 421 MCREG_FIELD_F_revFG(&sparectl, 422 EccErrCntDramCs) = cs; 423 MCREG_FIELD_F_revFG(&sparectl, 424 EccErrCnt) = 0; 425 break; 426 427 case AUTHAMD_FAMILY_10: 428 MCREG_FIELD_10_revAB(&sparectl, 429 EccErrCntDramCs) = cs; 430 MCREG_FIELD_10_revAB(&sparectl, 431 EccErrCnt) = 0; 432 break; 433 } 434 435 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 436 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 437 } 438 } 439 } 440 441 442 /* 443 * Return 444 * 1: supported 445 * 0: unsupported 446 */ 447 static int 448 authamd_supported(uint_t family, uint32_t rev, uint_t chipid) 449 { 450 uint32_t nbcap; 451 452 if (family == AUTHAMD_FAMILY_6) 453 return (1); 454 455 if (family == AUTHAMD_FAMILY_F) 456 return (1); 457 458 /* 459 * On Family 10h, authamd is currently unsupported when there are 460 * multiple nodes on a processor chip. 461 */ 462 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_D)) { 463 nbcap = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 464 MC_CTL_REG_NBCAP); 465 if ((nbcap & MC_NBCAP_MULTINODECPU) == 0) 466 return (1); 467 } else { 468 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 469 return (1); 470 } 471 472 return (0); 473 } 474 475 /* 476 * cms_init entry point. 477 * 478 * This module provides broad model-specific support for AMD families 479 * 0x6, 0xf and 0x10. Future families will have to be evaluated once their 480 * documentation is available. 481 */ 482 int 483 authamd_init(cmi_hdl_t hdl, void **datap) 484 { 485 uint_t chipid = cmi_hdl_chipid(hdl); 486 struct authamd_chipshared *sp, *osp; 487 uint_t family = cmi_hdl_family(hdl); 488 uint32_t rev = cmi_hdl_chiprev(hdl); 489 authamd_data_t *authamd; 490 uint64_t cap; 491 492 if (authamd_ms_support_disable || 493 !authamd_supported(family, rev, chipid)) 494 return (ENOTSUP); 495 496 if (!(x86_feature & X86_MCA)) 497 return (ENOTSUP); 498 499 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 500 return (ENOTSUP); 501 502 if (!(cap & MCG_CAP_CTL_P)) 503 return (ENOTSUP); 504 505 authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP); 506 cmi_hdl_hold(hdl); /* release in fini */ 507 authamd->amd_hdl = hdl; 508 509 if ((sp = authamd_shared[chipid]) == NULL) { 510 sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP); 511 sp->acs_chipid = chipid; 512 sp->acs_family = family; 513 sp->acs_rev = cmi_hdl_chiprev(hdl); 514 membar_producer(); 515 516 osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp); 517 if (osp != NULL) { 518 kmem_free(sp, sizeof (struct authamd_chipshared)); 519 sp = osp; 520 } 521 } 522 authamd->amd_shared = sp; 523 524 return (0); 525 } 526 527 /* 528 * cms_logout_size entry point. 529 */ 530 /*ARGSUSED*/ 531 size_t 532 authamd_logout_size(cmi_hdl_t hdl) 533 { 534 return (sizeof (struct authamd_logout)); 535 } 536 537 /* 538 * cms_mcgctl_val entry point 539 * 540 * Instead of setting all bits to 1 we can set just those for the 541 * error detector banks known to exist. 542 */ 543 /*ARGSUSED*/ 544 uint64_t 545 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed) 546 { 547 return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed); 548 } 549 550 /* 551 * cms_bankctl_skipinit entry point 552 * 553 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC) 554 * may produce spurious machine checks. 555 * 556 * Only allow a single core to setup the NorthBridge MCi_CTL register. 557 */ 558 /*ARGSUSED*/ 559 boolean_t 560 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank) 561 { 562 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 563 uint32_t rev = authamd->amd_shared->acs_rev; 564 565 if (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6) 566 return (bank == 0 ? B_TRUE : B_FALSE); 567 568 if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) { 569 return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) == 570 B_TRUE ? B_FALSE : B_TRUE); 571 } 572 573 return (B_FALSE); 574 } 575 576 /* 577 * cms_bankctl_val entry point 578 */ 579 uint64_t 580 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed) 581 { 582 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 583 uint32_t rev = authamd->amd_shared->acs_rev; 584 uint64_t val = proposed; 585 586 /* 587 * The Intel MCA says we can write all 1's to enable #MC for 588 * all errors, and AMD docs say much the same. But, depending 589 * perhaps on other config registers, taking machine checks 590 * for some errors such as GART TLB errors and master/target 591 * aborts may be bad - they set UC and sometime also PCC, but 592 * we should not always panic for these error types. 593 * 594 * Our cms_error_action entry point can suppress such panics, 595 * however we can also use the cms_bankctl_val entry point to 596 * veto enabling of some of the known villains in the first place. 597 */ 598 if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev)) 599 val &= ~AMD_NB_EN_GARTTBLWK; 600 601 return (val); 602 } 603 604 /* 605 * Bits to add to NB MCA config (after watchdog config). 606 */ 607 uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN; 608 609 /* 610 * Bits to remove from NB MCA config (after watchdog config) 611 */ 612 uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN; 613 614 /* 615 * NB Watchdog policy, and rate we use if enabling. 616 */ 617 enum { 618 AUTHAMD_NB_WDOG_LEAVEALONE, 619 AUTHAMD_NB_WDOG_DISABLE, 620 AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED, 621 AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE 622 } authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED; 623 624 uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 | 625 AMD_NB_CFG_WDOGTMRBASESEL_1MS; 626 627 /* 628 * Per-core cache scrubbing policy and rates. 629 */ 630 enum { 631 AUTHAMD_SCRUB_BIOSDEFAULT, /* leave as BIOS configured */ 632 AUTHAMD_SCRUB_FIXED, /* assign our chosen rate */ 633 AUTHAMD_SCRUB_MAX /* use higher of ours and BIOS rate */ 634 } authamd_scrub_policy = AUTHAMD_SCRUB_MAX; 635 636 uint32_t authamd_scrub_rate_dcache = 0xf; /* 64K per 0.67 seconds */ 637 uint32_t authamd_scrub_rate_l2cache = 0xe; /* 1MB per 5.3 seconds */ 638 uint32_t authamd_scrub_rate_l3cache = 0xd; /* 1MB per 2.7 seconds */ 639 640 static uint32_t 641 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm) 642 { 643 uint32_t rate; 644 645 if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) { 646 cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n", 647 varnm, AMD_NB_SCRUBCTL_RATE_MAX); 648 osrate = AMD_NB_SCRUBCTL_RATE_MAX; 649 } 650 651 switch (authamd_scrub_policy) { 652 case AUTHAMD_SCRUB_FIXED: 653 rate = osrate; 654 break; 655 656 default: 657 cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - " 658 "using default policy of AUTHAMD_SCRUB_MAX", 659 authamd_scrub_policy); 660 /*FALLTHRU*/ 661 662 case AUTHAMD_SCRUB_MAX: 663 if (osrate != 0 && biosrate != 0) 664 rate = MIN(osrate, biosrate); /* small is fast */ 665 else 666 rate = osrate ? osrate : biosrate; 667 } 668 669 return (rate); 670 } 671 672 /* 673 * cms_mca_init entry point. 674 */ 675 /*ARGSUSED*/ 676 void 677 authamd_mca_init(cmi_hdl_t hdl, int nbanks) 678 { 679 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 680 uint32_t rev = authamd->amd_shared->acs_rev; 681 uint_t chipid = authamd->amd_shared->acs_chipid; 682 683 /* 684 * On chips with a NB online spare control register take control 685 * and clear ECC counts. 686 */ 687 if (AUTHAMD_HAS_ONLINESPARECTL(rev) && 688 authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) { 689 authamd_clear_ecccnt(authamd, B_TRUE); 690 } 691 692 /* 693 * And since we are claiming the telemetry stop the BIOS receiving 694 * an SMI on NB threshold overflow. 695 */ 696 if (AUTHAMD_NBMISC_NUM(rev) && 697 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) { 698 union mcmsr_nbmisc nbm; 699 int i; 700 701 authamd_bankstatus_prewrite(hdl, authamd); 702 703 for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) { 704 if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i), 705 (uint64_t *)&nbm) != CMI_SUCCESS) 706 continue; 707 708 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) && 709 MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) && 710 MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) { 711 MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0; 712 } else if (X86_CHIPREV_ATLEAST(rev, 713 X86_CHIPREV_AMD_10_REV_A) && 714 MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) && 715 MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) { 716 MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0; 717 } 718 719 (void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i), 720 MCMSR_VAL(&nbm)); 721 } 722 723 authamd_bankstatus_postwrite(hdl, authamd); 724 } 725 726 /* 727 * NB MCA Configuration Register. 728 */ 729 if (AUTHAMD_DO_NBMCACFG(rev) && 730 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) { 731 uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 732 MC_CTL_REG_NBCFG); 733 734 switch (authamd_nb_watchdog_policy) { 735 case AUTHAMD_NB_WDOG_LEAVEALONE: 736 break; 737 738 case AUTHAMD_NB_WDOG_DISABLE: 739 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 740 AMD_NB_CFG_WDOGTMRCNTSEL_MASK); 741 val |= AMD_NB_CFG_WDOGTMRDIS; 742 break; 743 744 default: 745 cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d " 746 "unrecognised, using default policy", 747 authamd_nb_watchdog_policy); 748 /*FALLTHRU*/ 749 750 case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED: 751 if (!(val & AMD_NB_CFG_WDOGTMRDIS)) 752 break; /* if enabled leave rate intact */ 753 /*FALLTHRU*/ 754 755 case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE: 756 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 757 AMD_NB_CFG_WDOGTMRCNTSEL_MASK | 758 AMD_NB_CFG_WDOGTMRDIS); 759 val |= authamd_nb_mcacfg_wdog; 760 break; 761 } 762 763 /* 764 * Bit 0 of the NB MCA Config register is reserved on family 765 * 0x10. 766 */ 767 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 768 authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN; 769 770 val &= ~authamd_nb_mcacfg_remove; 771 val |= authamd_nb_mcacfg_add; 772 773 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG, 774 val); 775 } 776 777 /* 778 * Cache scrubbing. We can't enable DRAM scrubbing since 779 * we don't know the DRAM base for this node. 780 */ 781 if (AUTHAMD_HAS_CHIPSCRUB(rev) && 782 authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT && 783 authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) { 784 uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 785 MC_CTL_REG_SCRUBCTL); 786 int l3cap = 0; 787 788 if (AUTHAMD_L3CAPABLE(rev)) { 789 l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, 790 MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0; 791 } 792 793 authamd_scrub_rate_dcache = 794 authamd_scrubrate(authamd_scrub_rate_dcache, 795 (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT, 796 "authamd_scrub_rate_dcache"); 797 798 authamd_scrub_rate_l2cache = 799 authamd_scrubrate(authamd_scrub_rate_l2cache, 800 (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT, 801 "authamd_scrub_rate_l2cache"); 802 803 authamd_scrub_rate_l3cache = l3cap ? 804 authamd_scrubrate(authamd_scrub_rate_l3cache, 805 (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT, 806 "authamd_scrub_rate_l3cache") : 0; 807 808 val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache, 809 authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache, 810 val & AMD_NB_SCRUBCTL_DRAM_MASK); 811 812 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 813 MC_CTL_REG_SCRUBCTL, val); 814 } 815 816 } 817 818 /* 819 * cms_poll_ownermask entry point. 820 */ 821 uint64_t 822 authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl) 823 { 824 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 825 struct authamd_chipshared *acsp = authamd->amd_shared; 826 hrtime_t now = gethrtime_waitfree(); 827 hrtime_t last = acsp->acs_poll_timestamp; 828 int dopoll = 0; 829 830 if (now - last > 2 * pintvl || last == 0) { 831 acsp->acs_pollowner = hdl; 832 dopoll = 1; 833 } else if (acsp->acs_pollowner == hdl) { 834 dopoll = 1; 835 } 836 837 if (dopoll) 838 acsp->acs_poll_timestamp = now; 839 840 return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB)); 841 842 } 843 844 /* 845 * cms_bank_logout entry point. 846 */ 847 /*ARGSUSED*/ 848 void 849 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status, 850 uint64_t addr, uint64_t misc, void *mslogout) 851 { 852 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 853 struct authamd_logout *msl = mslogout; 854 uint32_t rev = authamd->amd_shared->acs_rev; 855 856 if (msl == NULL) 857 return; 858 859 /* 860 * For main memory ECC errors on revisions with an Online Spare 861 * Control Register grab the ECC counts by channel and chip-select 862 * and reset them to 0. 863 */ 864 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 865 AUTHAMD_IS_MEMECCERR(bank, status) && 866 AUTHAMD_HAS_ONLINESPARECTL(rev)) { 867 if (authamd_read_ecccnt(authamd, msl)) 868 authamd_clear_ecccnt(authamd, B_FALSE); 869 } 870 } 871 872 /* 873 * cms_error_action entry point 874 */ 875 876 int authamd_forgive_uc = 0; /* For test/debug only */ 877 int authamd_forgive_pcc = 0; /* For test/debug only */ 878 int authamd_fake_poison = 0; /* For test/debug only */ 879 880 /*ARGSUSED*/ 881 uint32_t 882 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank, 883 uint64_t status, uint64_t addr, uint64_t misc, void *mslogout) 884 { 885 authamd_error_disp_t *disp; 886 uint32_t rv = 0; 887 888 if (authamd_forgive_uc) 889 rv |= CMS_ERRSCOPE_CLEARED_UC; 890 891 if (authamd_forgive_pcc) 892 rv |= CMS_ERRSCOPE_CURCONTEXT_OK; 893 894 if (authamd_fake_poison && status & MSR_MC_STATUS_UC) 895 rv |= CMS_ERRSCOPE_POISONED; 896 897 if (rv) 898 return (rv); 899 900 disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout); 901 902 if (disp == &authamd_gart_disp) { 903 /* 904 * GART walk errors set UC and possibly PCC (if source CPU) 905 * but should not be regarded as terminal. 906 */ 907 return (CMS_ERRSCOPE_IGNORE_ERR); 908 } 909 910 /* 911 * May also want to consider master abort and target abort. These 912 * also set UC and PCC (if src CPU) but the requester gets -1 913 * and I believe the IO stuff in Solaris will handle that. 914 */ 915 916 return (rv); 917 } 918 919 /* 920 * cms_disp_match entry point 921 */ 922 /*ARGSUSED*/ 923 cms_cookie_t 924 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status, 925 uint64_t addr, uint64_t misc, void *mslogout) 926 { 927 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 928 /* uint16_t errcode = MCAX86_ERRCODE(status); */ 929 uint16_t exterrcode = AMD_EXT_ERRCODE(status); 930 uint32_t rev = authamd->amd_shared->acs_rev; 931 932 /* 933 * Recognise main memory ECC errors 934 */ 935 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 936 AUTHAMD_IS_MEMECCERR(bank, status)) { 937 if (status & AMD_BANK_STAT_CECC) { 938 return (exterrcode == 0 ? &authamd_memce_disp : 939 &authamd_ckmemce_disp); 940 } else if (status & AMD_BANK_STAT_UECC) { 941 return (exterrcode == 0 ? &authamd_memue_disp : 942 &authamd_ckmemue_disp); 943 } 944 } 945 946 /* 947 * Recognise GART walk errors 948 */ 949 if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status)) 950 return (&authamd_gart_disp); 951 952 return (NULL); 953 } 954 955 /* 956 * cms_ereport_class entry point 957 */ 958 /*ARGSUSED*/ 959 void 960 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie, 961 const char **cpuclsp, const char **leafclsp) 962 { 963 const authamd_error_disp_t *aed = mscookie; 964 965 if (aed == NULL) 966 return; 967 968 if (aed->aad_subclass != NULL) 969 *cpuclsp = aed->aad_subclass; 970 if (aed->aad_leafclass != NULL) 971 *leafclsp = aed->aad_leafclass; 972 } 973 974 /*ARGSUSED*/ 975 static void 976 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd, 977 nvlist_t *ereport, nv_alloc_t *nva, void *mslogout) 978 { 979 nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 980 uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 981 authamd_logout_t *msl; 982 nvlist_t *nvl; 983 int nelems = 0; 984 int i, chan, cs; 985 986 if ((msl = mslogout) == NULL) 987 return; 988 989 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 990 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 991 if (msl->aal_eccerrcnt[chan][cs] == 0) 992 continue; 993 994 if ((nvl = fm_nvlist_create(nva)) == NULL) 995 continue; 996 997 elems[nelems] = nvl; 998 counts[nelems++] = msl->aal_eccerrcnt[chan][cs]; 999 1000 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5, 1001 "motherboard", 0, 1002 "chip", authamd->amd_shared->acs_chipid, 1003 "memory-controller", 0, 1004 "dram-channel", chan, 1005 "chip-select", cs); 1006 } 1007 } 1008 1009 if (nelems == 0) 1010 return; 1011 1012 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE, 1013 DATA_TYPE_NVLIST_ARRAY, nelems, elems, 1014 NULL); 1015 1016 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT, 1017 DATA_TYPE_UINT8_ARRAY, nelems, &counts[0], 1018 NULL); 1019 1020 for (i = 0; i < nelems; i++) 1021 fm_nvlist_destroy(elems[i], nva ? FM_NVA_RETAIN : FM_NVA_FREE); 1022 } 1023 1024 /* 1025 * cms_ereport_add_logout entry point 1026 */ 1027 /*ARGSUSED*/ 1028 void 1029 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva, 1030 int bank, uint64_t status, uint64_t addr, uint64_t misc, 1031 void *mslogout, cms_cookie_t mscookie) 1032 { 1033 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1034 const authamd_error_disp_t *aed = mscookie; 1035 uint64_t members; 1036 1037 if (aed == NULL) 1038 return; 1039 1040 members = aed->aad_ereport_members; 1041 1042 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) { 1043 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND, 1044 DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status), 1045 NULL); 1046 1047 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1048 fm_payload_set(ereport, 1049 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1050 DATA_TYPE_STRING, "E", 1051 NULL); 1052 } 1053 } 1054 1055 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) { 1056 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND, 1057 DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status), 1058 NULL); 1059 1060 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1061 fm_payload_set(ereport, 1062 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1063 DATA_TYPE_STRING, "C", 1064 NULL); 1065 } 1066 } 1067 1068 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE && 1069 status & MSR_MC_STATUS_ADDRV) { 1070 authamd_ereport_add_resource(hdl, authamd, ereport, nva, 1071 mslogout); 1072 } 1073 } 1074 1075 /* 1076 * cms_msrinject entry point 1077 */ 1078 cms_errno_t 1079 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val) 1080 { 1081 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1082 cms_errno_t rv = CMSERR_BADMSRWRITE; 1083 1084 authamd_bankstatus_prewrite(hdl, authamd); 1085 if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS) 1086 rv = CMS_SUCCESS; 1087 authamd_bankstatus_postwrite(hdl, authamd); 1088 1089 return (rv); 1090 } 1091 1092 cms_api_ver_t _cms_api_version = CMS_API_VERSION_0; 1093 1094 const cms_ops_t _cms_ops = { 1095 authamd_init, /* cms_init */ 1096 NULL, /* cms_post_startup */ 1097 NULL, /* cms_post_mpstartup */ 1098 authamd_logout_size, /* cms_logout_size */ 1099 authamd_mcgctl_val, /* cms_mcgctl_val */ 1100 authamd_bankctl_skipinit, /* cms_bankctl_skipinit */ 1101 authamd_bankctl_val, /* cms_bankctl_val */ 1102 NULL, /* cms_bankstatus_skipinit */ 1103 NULL, /* cms_bankstatus_val */ 1104 authamd_mca_init, /* cms_mca_init */ 1105 authamd_poll_ownermask, /* cms_poll_ownermask */ 1106 authamd_bank_logout, /* cms_bank_logout */ 1107 authamd_error_action, /* cms_error_action */ 1108 authamd_disp_match, /* cms_disp_match */ 1109 authamd_ereport_class, /* cms_ereport_class */ 1110 NULL, /* cms_ereport_detector */ 1111 NULL, /* cms_ereport_includestack */ 1112 authamd_ereport_add_logout, /* cms_ereport_add_logout */ 1113 authamd_msrinject, /* cms_msrinject */ 1114 NULL, /* cms_fini */ 1115 }; 1116 1117 static struct modlcpu modlcpu = { 1118 &mod_cpuops, 1119 "Generic AMD model-specific MCA" 1120 }; 1121 1122 static struct modlinkage modlinkage = { 1123 MODREV_1, 1124 (void *)&modlcpu, 1125 NULL 1126 }; 1127 1128 int 1129 _init(void) 1130 { 1131 return (mod_install(&modlinkage)); 1132 } 1133 1134 int 1135 _info(struct modinfo *modinfop) 1136 { 1137 return (mod_info(&modlinkage, modinfop)); 1138 } 1139 1140 int 1141 _fini(void) 1142 { 1143 return (mod_remove(&modlinkage)); 1144 } 1145