/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * "Generic AMD" model-specific support.  If no more-specific support can
 * be found, or such a module declines to initialize, then for AuthenticAMD
 * cpus this module can have a crack at providing some AMD model-specific
 * support that at least goes beyond common MCA architectural features,
 * if not down to the nitty-gritty level for a particular model.  We
 * are layered on top of a cpu module, likely cpu.generic, so there is no
 * need for us to perform common architecturally-accessible functions.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/cpu_module.h>
#include <sys/mca_x86.h>
#include <sys/pci_cfgspace.h>
#include <sys/x86_archext.h>
#include <sys/mc_amd.h>
#include <sys/fm/protocol.h>
#include <sys/fm/cpu/GENAMD.h>
#include <sys/fm/smb/fmsmb.h>
#include <sys/fm/util.h>
#include <sys/nvpair.h>
#include <sys/controlregs.h>
#include <sys/pghw.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/cpu_module_ms_impl.h>

#include "authamd.h"

extern int x86gentopo_legacy;	/* x86 generic topo support */

int authamd_ms_support_disable = 0;

#define AUTHAMD_F_REVS_BCDE \
    (X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \
    X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \
    X86_CHIPREV_AMD_F_REV_E)

#define AUTHAMD_F_REVS_FG \
    (X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G)

#define AUTHAMD_10_REVS_AB \
    (X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B)

/*
 * Bitmasks of support for various features.  Try to enable features
 * via inclusion in one of these bitmasks, and check the relevant bitmask
 * at the feature implementation - that way new family support may often
 * simply need to update these bitmasks.
 */

/*
 * Models that include an on-chip NorthBridge.
 */
#define AUTHAMD_NBONCHIP(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we can recognise main memory ECC errors.
 */
#define AUTHAMD_MEMECC_RECOGNISED(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have an Online Spare Control Register
 */
#define AUTHAMD_HAS_ONLINESPARECTL(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we will perform NB MCA Config changes
 */
#define AUTHAMD_DO_NBMCACFG(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have chip cache scrubbers.
 */
#define AUTHAMD_HAS_CHIPSCRUB(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have a NB misc register or registers -
 * evaluates to 0 if no support, otherwise the number of MC4_MISCj.
 */
#define AUTHAMD_NBMISC_NUM(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) ? 1 : \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0))

/*
 * Families/revisions for which we wish not to machine check for GART
 * table walk errors - bit 10 of NB CTL.
 */
#define AUTHAMD_NOGARTTBLWLK_MC(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that are potentially L3 capable
 */
#define AUTHAMD_L3CAPABLE(rev) \
    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED
 * revisions as:
 *
 *	- being reported by the NB
 *	- being a compound bus/interconnect error (external to chip)
 *	- having LL of LG
 *	- having II of MEM (but could still be a master/target abort)
 *	- having CECC or UECC set
 *
 * We do not check the extended error code (first nibble of the
 * model-specific error code on AMD) since this has changed from
 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10).
 * Instead we use CECC/UECC to separate off the master/target
 * abort cases.
 *
 * We insist that the detector be the NorthBridge bank; although
 * IC/DC can report some main memory errors, they do not capture
 * an address at sufficient resolution to be useful and the NB will
 * report most errors.
 */
#define AUTHAMD_IS_MEMECCERR(bank, status) \
    ((bank) == AMD_MCA_BANK_NB && \
    MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \
    MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
    MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \
    ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC)))

static authamd_error_disp_t authamd_memce_disp = {
    FM_EREPORT_CPU_GENAMD,
    FM_EREPORT_CPU_GENAMD_MEM_CE,
    FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE
};

static authamd_error_disp_t authamd_memue_disp = {
    FM_EREPORT_CPU_GENAMD,
    FM_EREPORT_CPU_GENAMD_MEM_UE,
    FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE
};

static authamd_error_disp_t authamd_ckmemce_disp = {
    FM_EREPORT_CPU_GENAMD,
    FM_EREPORT_CPU_GENAMD_CKMEM_CE,
    FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE
};

static authamd_error_disp_t authamd_ckmemue_disp = {
    FM_EREPORT_CPU_GENAMD,
    FM_EREPORT_CPU_GENAMD_CKMEM_UE,
    FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE
};

/*
 * We recognise GART walk errors as:
 *
 *	- being reported by the NB
 *	- being a compound TLB error
 *	- having LL of LG and TT of GEN
 *	- having UC set
 *	- possibly having PCC set (if source CPU)
 */
#define AUTHAMD_IS_GARTERR(bank, status) \
    ((bank) == AMD_MCA_BANK_NB && \
    MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \
    MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
    MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \
    (status) & MSR_MC_STATUS_UC)

static authamd_error_disp_t authamd_gart_disp = {
    FM_EREPORT_CPU_GENAMD,		/* use generic subclass */
    FM_EREPORT_CPU_GENADM_GARTTBLWLK,	/* use generic leafclass */
    0					/* no additional payload */
};

static struct authamd_nodeshared *authamd_shared[AUTHAMD_MAX_NODES];

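/*
 * Claim a once-per-node configuration task.  The atomic exclusive bit
 * set on the node-shared acs_cfgonce word succeeds for only one caller,
 * so of all the cores/strands sharing a processor node exactly one
 * performs each "configure once" action; it alone sees B_TRUE here.
 */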
static int
authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what)
{
    return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce,
        what) == 0 ? B_TRUE : B_FALSE);
}

/*
 * Write a 32-bit value to the given function/register of this node's
 * NorthBridge, which is addressed in PCI config space at bus 0,
 * device 0x18 + procnodeid.
 */
static void
authamd_pcicfg_write(uint_t procnodeid, uint_t func, uint_t reg, uint32_t val)
{
    ASSERT(procnodeid + 24 <= 31);
    ASSERT((func & 7) == func);
    ASSERT((reg & 3) == 0 && reg < 256);

    cmi_pci_putl(0, procnodeid + 24, func, reg, 0, val);
}

/*
 * Read a 32-bit value from the given function/register of this node's
 * NorthBridge.
 */
static uint32_t
authamd_pcicfg_read(uint_t procnodeid, uint_t func, uint_t reg)
{
    ASSERT(procnodeid + 24 <= 31);
    ASSERT((func & 7) == func);
    ASSERT((reg & 3) == 0 && reg < 256);

    return (cmi_pci_getl(0, procnodeid + 24, func, reg, 0, 0));
}

/*
 * Before writing to a bank status MSR, record the current HWCR value
 * and, if necessary, set the McStatusWrEn bit so that the write will
 * be permitted.
 */
void
authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd)
{
    uint64_t hwcr;

    if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS)
        return;

    authamd->amd_hwcr = hwcr;

    if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
        hwcr |= AMD_HWCR_MCI_STATUS_WREN;
        (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
    }
}

/*
 * After a bank status write, restore McStatusWrEn to the state recorded
 * by authamd_bankstatus_prewrite.
 */
void
authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd)
{
    uint64_t hwcr = authamd->amd_hwcr;

    if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
        hwcr &= ~AMD_HWCR_MCI_STATUS_WREN;
        (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
    }
}

/*
 * Read EccCnt repeatedly for all possible channel/chip-select combos:
 *
 *	- read sparectl register
 *	- if EccErrCntWrEn is set, clear that bit in the just-read value
 *	  and write it back to sparectl; this *may* clobber the EccCnt
 *	  for the channel/chip-select combination currently selected, so
 *	  we leave this bit clear if we had to clear it
 *	- cycle through all channel/chip-select combinations writing each
 *	  combination to sparectl before reading the register back for
 *	  EccCnt for that combination; since EccErrCntWrEn is clear
 *	  the writes to select what count to read will not themselves
 *	  zero any counts
 */
static int
authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl)
{
    union mcreg_sparectl sparectl;
    uint_t procnodeid = authamd->amd_shared->acs_procnodeid;
    uint_t family = authamd->amd_shared->acs_family;
    uint32_t rev = authamd->amd_shared->acs_rev;
    int chan, cs;

    /*
     * Check for feature support; this macro will test down to the
     * family revision number, whereafter we'll switch on family
     * assuming that future revisions will use the same register
     * format.
     */
    if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) {
        bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt));
        return (0);
    }

    MCREG_VAL32(&sparectl) =
        authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL,
        MC_CTL_REG_SPARECTL);

    switch (family) {
    case AUTHAMD_FAMILY_F:
        MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0;
        break;

    case AUTHAMD_FAMILY_10:
        MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0;
        break;
    }

    for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
        switch (family) {
        case AUTHAMD_FAMILY_F:
            MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
                chan;
            break;

        case AUTHAMD_FAMILY_10:
            MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
                chan;
            break;
        }

        for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
            switch (family) {
            case AUTHAMD_FAMILY_F:
                MCREG_FIELD_F_revFG(&sparectl,
                    EccErrCntDramCs) = cs;
                break;

            case AUTHAMD_FAMILY_10:
                MCREG_FIELD_10_revAB(&sparectl,
                    EccErrCntDramCs) = cs;
                break;
            }

            authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL,
                MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

            MCREG_VAL32(&sparectl) = authamd_pcicfg_read(procnodeid,
                MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

            switch (family) {
            case AUTHAMD_FAMILY_F:
                msl->aal_eccerrcnt[chan][cs] =
                    MCREG_FIELD_F_revFG(&sparectl, EccErrCnt);
                break;
            case AUTHAMD_FAMILY_10:
                msl->aal_eccerrcnt[chan][cs] =
                    MCREG_FIELD_10_revAB(&sparectl, EccErrCnt);
                break;
            }
        }
    }

    return (1);
}

/*
 * Clear EccCnt for all possible channel/chip-select combos:
 *
 *	- set EccErrCntWrEn in sparectl, if necessary
 *	- write 0 to EccCnt for all channel/chip-select combinations
 *	- clear EccErrCntWrEn
 *
 * If requested also disable the interrupts taken on counter overflow
 * and on swap done.
 */
static void
authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint)
{
    union mcreg_sparectl sparectl;
    uint_t procnodeid = authamd->amd_shared->acs_procnodeid;
    uint_t family = authamd->amd_shared->acs_family;
    uint32_t rev = authamd->amd_shared->acs_rev;
    int chan, cs;

    if (!AUTHAMD_HAS_ONLINESPARECTL(rev))
        return;

    MCREG_VAL32(&sparectl) =
        authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL,
        MC_CTL_REG_SPARECTL);

    switch (family) {
    case AUTHAMD_FAMILY_F:
        MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1;
        if (clrint) {
            MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0;
            MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0;
        }
        break;

    case AUTHAMD_FAMILY_10:
        MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1;
        if (clrint) {
            MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0;
            MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0;
        }
        break;
    }

    authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL,
        MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

    for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
        switch (family) {
        case AUTHAMD_FAMILY_F:
            MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
                chan;
            break;

        case AUTHAMD_FAMILY_10:
            MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
                chan;
            break;
        }

        for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
            switch (family) {
            case AUTHAMD_FAMILY_F:
                MCREG_FIELD_F_revFG(&sparectl,
                    EccErrCntDramCs) = cs;
                MCREG_FIELD_F_revFG(&sparectl,
                    EccErrCnt) = 0;
                break;

            case AUTHAMD_FAMILY_10:
                MCREG_FIELD_10_revAB(&sparectl,
                    EccErrCntDramCs) = cs;
                MCREG_FIELD_10_revAB(&sparectl,
                    EccErrCnt) = 0;
                break;
            }

            authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL,
                MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
        }
    }
}

/*
 * Return
 *	1: supported
 *	0: unsupported
 */
static int
authamd_supported(cmi_hdl_t hdl)
{
    uint_t family = cmi_hdl_family(hdl);

    switch (family) {
    case AUTHAMD_FAMILY_6:
    case AUTHAMD_FAMILY_F:
    case AUTHAMD_FAMILY_10:
        return (1);
    default:
        return (0);
    }
}

/*
 * cms_init entry point.
 *
 * This module provides broad model-specific support for AMD families
 * 0x6, 0xf and 0x10.  Future families will have to be evaluated once their
 * documentation is available.
 */
int
authamd_init(cmi_hdl_t hdl, void **datap)
{
    uint_t chipid = cmi_hdl_chipid(hdl);
    uint_t procnodeid = cmi_hdl_procnodeid(hdl);
    struct authamd_nodeshared *sp, *osp;
    uint_t family = cmi_hdl_family(hdl);
    uint32_t rev = cmi_hdl_chiprev(hdl);
    authamd_data_t *authamd;
    uint64_t cap;

    if (authamd_ms_support_disable ||
        !authamd_supported(hdl))
        return (ENOTSUP);

    if (!(x86_feature & X86_MCA))
        return (ENOTSUP);

    if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
        return (ENOTSUP);

    if (!(cap & MCG_CAP_CTL_P))
        return (ENOTSUP);

    authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP);
    cmi_hdl_hold(hdl);	/* release in fini */
    authamd->amd_hdl = hdl;

    if ((sp = authamd_shared[procnodeid]) == NULL) {
        sp = kmem_zalloc(sizeof (struct authamd_nodeshared), KM_SLEEP);
        sp->acs_chipid = chipid;
        sp->acs_procnodeid = procnodeid;
        sp->acs_family = family;
        sp->acs_rev = rev;
        membar_producer();

        osp = atomic_cas_ptr(&authamd_shared[procnodeid], NULL, sp);
        if (osp != NULL) {
            kmem_free(sp, sizeof (struct authamd_nodeshared));
            sp = osp;
        }
    }
    authamd->amd_shared = sp;

    return (0);
}

/*
 * cms_logout_size entry point.
 */
/*ARGSUSED*/
size_t
authamd_logout_size(cmi_hdl_t hdl)
{
    return (sizeof (struct authamd_logout));
}

/*
 * cms_mcgctl_val entry point
 *
 * Instead of setting all bits to 1 we can set just those for the
 * error detector banks known to exist.
 */
/*ARGSUSED*/
uint64_t
authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed)
{
    return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed);
}

/*
 * cms_bankctl_skipinit entry point
 *
 * On family 6 (K7) cpus we do not initialize MC0_CTL since, reportedly,
 * this bank (for DC) may produce spurious machine checks.
 *
 * Only allow a single core to setup the NorthBridge MCi_CTL register.
 */
/*ARGSUSED*/
boolean_t
authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    uint32_t rev = authamd->amd_shared->acs_rev;

    if (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6)
        return (bank == 0 ? B_TRUE : B_FALSE);

    if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) {
        return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) ==
            B_TRUE ? B_FALSE : B_TRUE);
    }

    return (B_FALSE);
}

/*
 * cms_bankctl_val entry point
 */
uint64_t
authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    uint32_t rev = authamd->amd_shared->acs_rev;
    uint64_t val = proposed;

    /*
     * The Intel MCA documentation says we can write all 1's to enable
     * #MC for all errors, and the AMD docs say much the same.  But,
     * depending perhaps on other config registers, taking machine checks
     * for some errors such as GART TLB errors and master/target
     * aborts may be bad - they set UC and sometimes also PCC, but
     * we should not always panic for these error types.
     *
     * Our cms_error_action entry point can suppress such panics,
     * however we can also use the cms_bankctl_val entry point to
     * veto enabling of some of the known villains in the first place.
     */
    if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev))
        val &= ~AMD_NB_EN_GARTTBLWK;

    return (val);
}

/*
 * Bits to add to NB MCA config (after watchdog config).
 */
uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN;

/*
 * Bits to remove from NB MCA config (after watchdog config)
 */
uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN;

/*
 * NB watchdog policy, and the rate we use if enabling.
 */
enum {
    AUTHAMD_NB_WDOG_LEAVEALONE,
    AUTHAMD_NB_WDOG_DISABLE,
    AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED,
    AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE
} authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED;

uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 |
    AMD_NB_CFG_WDOGTMRBASESEL_1MS;

/*
 * Per-core cache scrubbing policy and rates.
 */
enum {
    AUTHAMD_SCRUB_BIOSDEFAULT,	/* leave as BIOS configured */
    AUTHAMD_SCRUB_FIXED,	/* assign our chosen rate */
    AUTHAMD_SCRUB_MAX		/* use higher of ours and BIOS rate */
} authamd_scrub_policy = AUTHAMD_SCRUB_MAX;

uint32_t authamd_scrub_rate_dcache = 0xf;	/* 64K per 0.67 seconds */
uint32_t authamd_scrub_rate_l2cache = 0xe;	/* 1MB per 5.3 seconds */
uint32_t authamd_scrub_rate_l3cache = 0xd;	/* 1MB per 2.7 seconds */

/*
 * Select the scrub rate to program: clamp the requested OS rate to the
 * hardware maximum and combine it with the BIOS-configured rate according
 * to authamd_scrub_policy.  A smaller non-zero encoding means a faster
 * scrub rate.
 */
static uint32_t
authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm)
{
    uint32_t rate;

    if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) {
        cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n",
            varnm, AMD_NB_SCRUBCTL_RATE_MAX);
        osrate = AMD_NB_SCRUBCTL_RATE_MAX;
    }

    switch (authamd_scrub_policy) {
    case AUTHAMD_SCRUB_FIXED:
        rate = osrate;
        break;

    default:
        cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - "
            "using default policy of AUTHAMD_SCRUB_MAX",
            authamd_scrub_policy);
        /*FALLTHRU*/

    case AUTHAMD_SCRUB_MAX:
        if (osrate != 0 && biosrate != 0)
            rate = MIN(osrate, biosrate);	/* small is fast */
        else
            rate = osrate ? osrate : biosrate;
    }

    return (rate);
}

/*
 * cms_mca_init entry point.
 */
/*ARGSUSED*/
void
authamd_mca_init(cmi_hdl_t hdl, int nbanks)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    uint32_t rev = authamd->amd_shared->acs_rev;
    uint_t procnodeid = authamd->amd_shared->acs_procnodeid;

    /*
     * On chips with a NB online spare control register take control
     * and clear ECC counts.
     */
    if (AUTHAMD_HAS_ONLINESPARECTL(rev) &&
        authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) {
        authamd_clear_ecccnt(authamd, B_TRUE);
    }

    /*
     * And since we are claiming the telemetry, stop the BIOS from
     * receiving an SMI on NB threshold overflow.
     */
    if (AUTHAMD_NBMISC_NUM(rev) &&
        authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) {
        union mcmsr_nbmisc nbm;
        int i;

        authamd_bankstatus_prewrite(hdl, authamd);

        for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) {
            if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i),
                (uint64_t *)&nbm) != CMI_SUCCESS)
                continue;

            if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) &&
                MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) &&
                MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) {
                MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0;
            } else if (X86_CHIPREV_ATLEAST(rev,
                X86_CHIPREV_AMD_10_REV_A) &&
                MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) &&
                MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) {
                MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0;
            }

            (void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i),
                MCMSR_VAL(&nbm));
        }

        authamd_bankstatus_postwrite(hdl, authamd);
    }

    /*
     * NB MCA Configuration Register.
     */
    if (AUTHAMD_DO_NBMCACFG(rev) &&
        authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) {
        uint32_t val = authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL,
            MC_CTL_REG_NBCFG);

        switch (authamd_nb_watchdog_policy) {
        case AUTHAMD_NB_WDOG_LEAVEALONE:
            break;

        case AUTHAMD_NB_WDOG_DISABLE:
            val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
                AMD_NB_CFG_WDOGTMRCNTSEL_MASK);
            val |= AMD_NB_CFG_WDOGTMRDIS;
            break;

        default:
            cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d "
                "unrecognised, using default policy",
                authamd_nb_watchdog_policy);
            /*FALLTHRU*/

        case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED:
            if (!(val & AMD_NB_CFG_WDOGTMRDIS))
                break;	/* if enabled leave rate intact */
            /*FALLTHRU*/

        case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE:
            val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
                AMD_NB_CFG_WDOGTMRCNTSEL_MASK |
                AMD_NB_CFG_WDOGTMRDIS);
            val |= authamd_nb_mcacfg_wdog;
            break;
        }

        /*
         * Bit 0 of the NB MCA Config register is reserved on family
         * 0x10.
         */
        if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
            authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN;

        val &= ~authamd_nb_mcacfg_remove;
        val |= authamd_nb_mcacfg_add;

        authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL,
            MC_CTL_REG_NBCFG, val);
    }

    /*
     * Cache scrubbing.  We can't enable DRAM scrubbing since
     * we don't know the DRAM base for this node.
     */
    if (AUTHAMD_HAS_CHIPSCRUB(rev) &&
        authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT &&
        authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) {
        uint32_t val = authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL,
            MC_CTL_REG_SCRUBCTL);
        int l3cap = 0;

        if (AUTHAMD_L3CAPABLE(rev)) {
            l3cap = (authamd_pcicfg_read(procnodeid,
                MC_FUNC_MISCCTL, MC_CTL_REG_NBCAP) &
                MC_NBCAP_L3CAPABLE) != 0;
        }

        authamd_scrub_rate_dcache =
            authamd_scrubrate(authamd_scrub_rate_dcache,
            (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT,
            "authamd_scrub_rate_dcache");

        authamd_scrub_rate_l2cache =
            authamd_scrubrate(authamd_scrub_rate_l2cache,
            (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT,
            "authamd_scrub_rate_l2cache");

        authamd_scrub_rate_l3cache = l3cap ?
            authamd_scrubrate(authamd_scrub_rate_l3cache,
            (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT,
            "authamd_scrub_rate_l3cache") : 0;

        val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache,
            authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache,
            val & AMD_NB_SCRUBCTL_DRAM_MASK);

        authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL,
            MC_CTL_REG_SCRUBCTL, val);
    }
}

/*
 * cms_poll_ownermask entry point.
 *
 * All the cores of a node share the single NorthBridge bank, so only one
 * core should poll it.  The first caller, or any caller that finds the
 * previous owner has not polled for more than two poll intervals, claims
 * ownership; all other callers are told to skip the NB bank.
 */
uint64_t
authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    struct authamd_nodeshared *acsp = authamd->amd_shared;
    hrtime_t now = gethrtime_waitfree();
    hrtime_t last = acsp->acs_poll_timestamp;
    int dopoll = 0;

    if (now - last > 2 * pintvl || last == 0) {
        acsp->acs_pollowner = hdl;
        dopoll = 1;
    } else if (acsp->acs_pollowner == hdl) {
        dopoll = 1;
    }

    if (dopoll)
        acsp->acs_poll_timestamp = now;

    return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB));
}

/*
 * cms_bank_logout entry point.
 */
/*ARGSUSED*/
void
authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status,
    uint64_t addr, uint64_t misc, void *mslogout)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    struct authamd_logout *msl = mslogout;
    uint32_t rev = authamd->amd_shared->acs_rev;

    if (msl == NULL)
        return;

    /*
     * For main memory ECC errors on revisions with an Online Spare
     * Control Register grab the ECC counts by channel and chip-select
     * and reset them to 0.
     */
    if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
        AUTHAMD_IS_MEMECCERR(bank, status) &&
        AUTHAMD_HAS_ONLINESPARECTL(rev)) {
        if (authamd_read_ecccnt(authamd, msl))
            authamd_clear_ecccnt(authamd, B_FALSE);
    }
}

/*
 * cms_error_action entry point
 */

int authamd_forgive_uc = 0;	/* For test/debug only */
int authamd_forgive_pcc = 0;	/* For test/debug only */
int authamd_fake_poison = 0;	/* For test/debug only */

/*ARGSUSED*/
uint32_t
authamd_error_action(cmi_hdl_t hdl, int ismc, int bank,
    uint64_t status, uint64_t addr, uint64_t misc, void *mslogout)
{
    authamd_error_disp_t *disp;
    uint32_t rv = 0;

    if (authamd_forgive_uc)
        rv |= CMS_ERRSCOPE_CLEARED_UC;

    if (authamd_forgive_pcc)
        rv |= CMS_ERRSCOPE_CURCONTEXT_OK;

    if (authamd_fake_poison && status & MSR_MC_STATUS_UC)
        rv |= CMS_ERRSCOPE_POISONED;

    if (rv)
        return (rv);

    disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout);

    if (disp == &authamd_gart_disp) {
        /*
         * GART walk errors set UC and possibly PCC (if source CPU)
         * but should not be regarded as terminal.
         */
        return (CMS_ERRSCOPE_IGNORE_ERR);
    }

    /*
     * May also want to consider master abort and target abort.  These
     * also set UC and PCC (if src CPU), but the requester gets all-1s
     * data back and we expect the I/O framework in Solaris to handle
     * that.
     */

    return (rv);
}

/*
 * cms_disp_match entry point
 */
/*ARGSUSED*/
cms_cookie_t
authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status,
    uint64_t addr, uint64_t misc, void *mslogout)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    /* uint16_t errcode = MCAX86_ERRCODE(status); */
    uint16_t exterrcode = AMD_EXT_ERRCODE(status);
    uint32_t rev = authamd->amd_shared->acs_rev;

    /*
     * Recognise main memory ECC errors
     */
    if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
        AUTHAMD_IS_MEMECCERR(bank, status)) {
        if (status & AMD_BANK_STAT_CECC) {
            return (exterrcode == 0 ? &authamd_memce_disp :
                &authamd_ckmemce_disp);
        } else if (status & AMD_BANK_STAT_UECC) {
            return (exterrcode == 0 ? &authamd_memue_disp :
                &authamd_ckmemue_disp);
        }
    }

    /*
     * Recognise GART walk errors
     */
    if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status))
        return (&authamd_gart_disp);

    return (NULL);
}

/*
 * cms_ereport_class entry point
 */
/*ARGSUSED*/
void
authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie,
    const char **cpuclsp, const char **leafclsp)
{
    const authamd_error_disp_t *aed = mscookie;

    if (aed == NULL)
        return;

    if (aed->aad_subclass != NULL)
        *cpuclsp = aed->aad_subclass;
    if (aed->aad_leafclass != NULL)
        *leafclsp = aed->aad_leafclass;
}

/*
 * Build hc-scheme resource FMRIs for each DRAM channel/chip-select pair
 * that logged a non-zero ECC count at logout time, and add them together
 * with the per-pair counts to the ereport payload.
 */
/*ARGSUSED*/
static void
authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd,
    nvlist_t *ereport, nv_alloc_t *nva, void *mslogout)
{
    nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
    uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
    authamd_logout_t *msl;
    nvlist_t *nvl;
    int nelems = 0;
    int i, chan, cs, mc;
    nvlist_t *board_list = NULL;

    if ((msl = mslogout) == NULL)
        return;

    /* Assume all processors have the same number of nodes */
    mc = authamd->amd_shared->acs_procnodeid %
        cpuid_get_procnodes_per_pkg(CPU);

    for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
        for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
            if (msl->aal_eccerrcnt[chan][cs] == 0)
                continue;

            if ((nvl = fm_nvlist_create(nva)) == NULL)
                continue;

            elems[nelems] = nvl;
            counts[nelems++] = msl->aal_eccerrcnt[chan][cs];

            if (!x86gentopo_legacy) {
                board_list = cmi_hdl_smb_bboard(hdl);
                if (board_list == NULL)
                    continue;
                fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
                    NULL, NULL, board_list, 4,
                    "chip", cmi_hdl_smb_chipid(hdl),
                    "memory-controller", 0,
                    "dram-channel", chan,
                    "chip-select", cs);
            } else {
                fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION,
                    NULL, NULL, 5,
                    "motherboard", 0,
                    "chip", authamd->amd_shared->acs_chipid,
                    "memory-controller", mc,
                    "dram-channel", chan,
                    "chip-select", cs);
            }
        }
    }

    if (nelems == 0)
        return;

    fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE,
        DATA_TYPE_NVLIST_ARRAY, nelems, elems,
        NULL);

    fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT,
        DATA_TYPE_UINT8_ARRAY, nelems, &counts[0],
        NULL);

    for (i = 0; i < nelems; i++)
        fm_nvlist_destroy(elems[i], nva ?
            FM_NVA_RETAIN : FM_NVA_FREE);
}

/*
 * cms_ereport_add_logout entry point
 */
/*ARGSUSED*/
void
authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva,
    int bank, uint64_t status, uint64_t addr, uint64_t misc,
    void *mslogout, cms_cookie_t mscookie)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    const authamd_error_disp_t *aed = mscookie;
    uint64_t members;

    if (aed == NULL)
        return;

    members = aed->aad_ereport_members;

    if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) {
        fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND,
            DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status),
            NULL);

        if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
            fm_payload_set(ereport,
                FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
                DATA_TYPE_STRING, "E",
                NULL);
        }
    }

    if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) {
        fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND,
            DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status),
            NULL);

        if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
            fm_payload_set(ereport,
                FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
                DATA_TYPE_STRING, "C",
                NULL);
        }
    }

    if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE &&
        status & MSR_MC_STATUS_ADDRV) {
        authamd_ereport_add_resource(hdl, authamd, ereport, nva,
            mslogout);
    }
}

/*
 * cms_msrinject entry point
 */
cms_errno_t
authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val)
{
    authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
    cms_errno_t rv = CMSERR_BADMSRWRITE;

    authamd_bankstatus_prewrite(hdl, authamd);
    if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS)
        rv = CMS_SUCCESS;
    authamd_bankstatus_postwrite(hdl, authamd);

    return (rv);
}

cms_api_ver_t _cms_api_version = CMS_API_VERSION_0;

const cms_ops_t _cms_ops = {
    authamd_init,		/* cms_init */
    NULL,			/* cms_post_startup */
    NULL,			/* cms_post_mpstartup */
    authamd_logout_size,	/* cms_logout_size */
    authamd_mcgctl_val,		/* cms_mcgctl_val */
    authamd_bankctl_skipinit,	/* cms_bankctl_skipinit */
    authamd_bankctl_val,	/* cms_bankctl_val */
    NULL,			/* cms_bankstatus_skipinit */
    NULL,			/* cms_bankstatus_val */
    authamd_mca_init,		/* cms_mca_init */
    authamd_poll_ownermask,	/* cms_poll_ownermask */
    authamd_bank_logout,	/* cms_bank_logout */
    authamd_error_action,	/* cms_error_action */
    authamd_disp_match,		/* cms_disp_match */
    authamd_ereport_class,	/* cms_ereport_class */
    NULL,			/* cms_ereport_detector */
    NULL,			/* cms_ereport_includestack */
    authamd_ereport_add_logout,	/* cms_ereport_add_logout */
    authamd_msrinject,		/* cms_msrinject */
    NULL,			/* cms_fini */
};

static struct modlcpu modlcpu = {
    &mod_cpuops,
    "Generic AMD model-specific MCA"
};

static struct modlinkage modlinkage = {
    MODREV_1,
    (void *)&modlcpu,
    NULL
};

int
_init(void)
{
    return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
    return (mod_remove(&modlinkage));
}