/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * "Generic AMD" model-specific support.  If no more-specific support can
 * be found, or such a module declines to initialize, then for AuthenticAMD
 * cpus this module can have a crack at providing some AMD model-specific
 * support that at least goes beyond common MCA architectural features
 * if not down to the nitty-gritty level for a particular model.  We
 * are layered on top of a cpu module, likely cpu.generic, so there is no
 * need for us to perform common architecturally-accessible functions.
35 */ 36 37 #include <sys/types.h> 38 #include <sys/cmn_err.h> 39 #include <sys/modctl.h> 40 #include <sys/cpu_module.h> 41 #include <sys/mca_x86.h> 42 #include <sys/pci_cfgspace.h> 43 #include <sys/x86_archext.h> 44 #include <sys/mc_amd.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/cpu/GENAMD.h> 47 #include <sys/fm/smb/fmsmb.h> 48 #include <sys/fm/util.h> 49 #include <sys/nvpair.h> 50 #include <sys/controlregs.h> 51 #include <sys/pghw.h> 52 #include <sys/sunddi.h> 53 #include <sys/sysmacros.h> 54 #include <sys/cpu_module_ms_impl.h> 55 56 #include "authamd.h" 57 58 extern int x86gentopo_legacy; /* x86 generic topo support */ 59 60 int authamd_ms_support_disable = 0; 61 62 #define AUTHAMD_F_REVS_BCDE \ 63 (X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \ 64 X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \ 65 X86_CHIPREV_AMD_F_REV_E) 66 67 #define AUTHAMD_F_REVS_FG \ 68 (X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G) 69 70 #define AUTHAMD_10_REVS_AB \ 71 (X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B) 72 73 /* 74 * Bitmasks of support for various features. Try to enable features 75 * via inclusion in one of these bitmasks and check that at the 76 * feature imlementation - that way new family support may often simply 77 * simply need to update these bitmasks. 78 */ 79 80 /* 81 * Models that include an on-chip NorthBridge. 82 */ 83 #define AUTHAMD_NBONCHIP(rev) \ 84 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 85 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 86 87 /* 88 * Families/revisions for which we can recognise main memory ECC errors. 
89 */ 90 #define AUTHAMD_MEMECC_RECOGNISED(rev) \ 91 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 92 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 93 94 /* 95 * Families/revisions that have an Online Spare Control Register 96 */ 97 #define AUTHAMD_HAS_ONLINESPARECTL(rev) \ 98 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \ 99 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 100 101 /* 102 * Families/revisions for which we will perform NB MCA Config changes 103 */ 104 #define AUTHAMD_DO_NBMCACFG(rev) \ 105 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 106 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 107 108 /* 109 * Families/revisions that have chip cache scrubbers. 110 */ 111 #define AUTHAMD_HAS_CHIPSCRUB(rev) \ 112 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 113 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 114 115 /* 116 * Families/revisions that have a NB misc register or registers - 117 * evaluates to 0 if no support, otherwise the number of MC4_MISCj. 118 */ 119 #define AUTHAMD_NBMISC_NUM(rev) \ 120 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \ 121 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0)) 122 123 /* 124 * Families/revision for which we wish not to machine check for GART 125 * table walk errors - bit 10 of NB CTL. 
126 */ 127 #define AUTHAMD_NOGARTTBLWLK_MC(rev) \ 128 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 129 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 130 131 /* 132 * Families/revisions that are potentially L3 capable 133 */ 134 #define AUTHAMD_L3CAPABLE(rev) \ 135 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 136 137 /* 138 * Families/revisions that support x8 ChipKill ECC 139 */ 140 #define AUTHAMD_SUPPORTS_X8ECC(rev) \ 141 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_D)) 142 143 /* 144 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED 145 * revisions as: 146 * 147 * - being reported by the NB 148 * - being a compound bus/interconnect error (external to chip) 149 * - having LL of LG 150 * - having II of MEM (but could still be a master/target abort) 151 * - having CECC or UECC set 152 * 153 * We do not check the extended error code (first nibble of the 154 * model-specific error code on AMD) since this has changed from 155 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10). 156 * Instead we use CECC/UECC to separate off the master/target 157 * abort cases. 158 * 159 * We insist that the detector be the NorthBridge bank; although 160 * IC/DC can report some main memory errors, they do not capture 161 * an address at sufficient resolution to be useful and the NB will 162 * report most errors. 
163 */ 164 #define AUTHAMD_IS_MEMECCERR(bank, status) \ 165 ((bank) == AMD_MCA_BANK_NB && \ 166 MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \ 167 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 168 MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \ 169 ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC))) 170 171 static authamd_error_disp_t authamd_memce_disp = { 172 FM_EREPORT_CPU_GENAMD, 173 FM_EREPORT_CPU_GENAMD_MEM_CE, 174 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE 175 }; 176 177 static authamd_error_disp_t authamd_memue_disp = { 178 FM_EREPORT_CPU_GENAMD, 179 FM_EREPORT_CPU_GENAMD_MEM_UE, 180 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE 181 }; 182 183 static authamd_error_disp_t authamd_ckmemce_disp = { 184 FM_EREPORT_CPU_GENAMD, 185 FM_EREPORT_CPU_GENAMD_CKMEM_CE, 186 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE 187 }; 188 189 static authamd_error_disp_t authamd_ckmemue_disp = { 190 FM_EREPORT_CPU_GENAMD, 191 FM_EREPORT_CPU_GENAMD_CKMEM_UE, 192 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE 193 }; 194 195 /* 196 * We recognise GART walk errors as: 197 * 198 * - being reported by the NB 199 * - being a compound TLB error 200 * - having LL of LG and TT of GEN 201 * - having UC set 202 * - possibly having PCC set (if source CPU) 203 */ 204 #define AUTHAMD_IS_GARTERR(bank, status) \ 205 ((bank) == AMD_MCA_BANK_NB && \ 206 MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \ 207 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 208 MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \ 209 (status) & MSR_MC_STATUS_UC) 210 211 static authamd_error_disp_t authamd_gart_disp = { 212 FM_EREPORT_CPU_GENAMD, /* use generic subclass */ 213 FM_EREPORT_CPU_GENADM_GARTTBLWLK, /* use generic leafclass */ 214 0 /* no additional payload */ 215 }; 216 217 218 static struct authamd_nodeshared *authamd_shared[AUTHAMD_MAX_NODES]; 219 220 static int 221 authamd_chip_once(authamd_data_t *authamd, enum 
authamd_cfgonce_bitnum what) 222 { 223 return (atomic_set_long_excl(&authamd->amd_shared->ans_cfgonce, 224 what) == 0 ? B_TRUE : B_FALSE); 225 } 226 227 static void 228 authamd_pcicfg_write(uint_t procnodeid, uint_t func, uint_t reg, uint32_t val) 229 { 230 ASSERT(procnodeid + 24 <= 31); 231 ASSERT((func & 7) == func); 232 ASSERT((reg & 3) == 0 && reg < 4096); 233 234 cmi_pci_putl(0, procnodeid + 24, func, reg, 0, val); 235 } 236 237 static uint32_t 238 authamd_pcicfg_read(uint_t procnodeid, uint_t func, uint_t reg) 239 { 240 ASSERT(procnodeid + 24 <= 31); 241 ASSERT((func & 7) == func); 242 ASSERT((reg & 3) == 0 && reg < 4096); 243 244 return (cmi_pci_getl(0, procnodeid + 24, func, reg, 0, 0)); 245 } 246 247 void 248 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd) 249 { 250 uint64_t hwcr; 251 252 if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS) 253 return; 254 255 authamd->amd_hwcr = hwcr; 256 257 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 258 hwcr |= AMD_HWCR_MCI_STATUS_WREN; 259 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 260 } 261 } 262 263 void 264 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd) 265 { 266 uint64_t hwcr = authamd->amd_hwcr; 267 268 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 269 hwcr &= ~AMD_HWCR_MCI_STATUS_WREN; 270 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 271 } 272 } 273 274 /* 275 * Read EccCnt repeatedly for all possible channel/chip-select combos: 276 * 277 * - read sparectl register 278 * - if EccErrCntWrEn is set, clear that bit in the just-read value 279 * and write it back to sparectl; this *may* clobber the EccCnt 280 * for the channel/chip-select combination currently selected, so 281 * we leave this bit clear if we had to clear it 282 * - cycle through all channel/chip-select combinations writing each 283 * combination to sparectl before reading the register back for 284 * EccCnt for that combination; since EccErrCntWrEn is clear 285 * the writes to select what count to read 
will not themselves 286 * zero any counts 287 */ 288 static int 289 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl) 290 { 291 union mcreg_sparectl sparectl; 292 uint_t procnodeid = authamd->amd_shared->ans_procnodeid; 293 uint_t family = authamd->amd_shared->ans_family; 294 uint32_t rev = authamd->amd_shared->ans_rev; 295 int chan, cs; 296 297 /* 298 * Check for feature support; this macro will test down to the 299 * family revision number, whereafter we'll switch on family 300 * assuming that future revisions will use the same register 301 * format. 302 */ 303 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) { 304 bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt)); 305 return (0); 306 } 307 308 MCREG_VAL32(&sparectl) = 309 authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL, 310 MC_CTL_REG_SPARECTL); 311 312 switch (family) { 313 case AUTHAMD_FAMILY_F: 314 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0; 315 break; 316 317 case AUTHAMD_FAMILY_10: 318 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0; 319 break; 320 } 321 322 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 323 switch (family) { 324 case AUTHAMD_FAMILY_F: 325 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 326 chan; 327 break; 328 329 case AUTHAMD_FAMILY_10: 330 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 331 chan; 332 break; 333 } 334 335 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 336 switch (family) { 337 case AUTHAMD_FAMILY_F: 338 MCREG_FIELD_F_revFG(&sparectl, 339 EccErrCntDramCs) = cs; 340 break; 341 342 case AUTHAMD_FAMILY_10: 343 MCREG_FIELD_10_revAB(&sparectl, 344 EccErrCntDramCs) = cs; 345 break; 346 } 347 348 authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL, 349 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 350 351 MCREG_VAL32(&sparectl) = authamd_pcicfg_read(procnodeid, 352 MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 353 354 switch (family) { 355 case AUTHAMD_FAMILY_F: 356 msl->aal_eccerrcnt[chan][cs] = 357 MCREG_FIELD_F_revFG(&sparectl, EccErrCnt); 358 
break; 359 case AUTHAMD_FAMILY_10: 360 msl->aal_eccerrcnt[chan][cs] = 361 MCREG_FIELD_10_revAB(&sparectl, EccErrCnt); 362 break; 363 } 364 } 365 } 366 367 return (1); 368 } 369 370 /* 371 * Clear EccCnt for all possible channel/chip-select combos: 372 * 373 * - set EccErrCntWrEn in sparectl, if necessary 374 * - write 0 to EccCnt for all channel/chip-select combinations 375 * - clear EccErrCntWrEn 376 * 377 * If requested also disable the interrupts taken on counter overflow 378 * and on swap done. 379 */ 380 static void 381 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint) 382 { 383 union mcreg_sparectl sparectl; 384 uint_t procnodeid = authamd->amd_shared->ans_procnodeid; 385 uint_t family = authamd->amd_shared->ans_family; 386 uint32_t rev = authamd->amd_shared->ans_rev; 387 int chan, cs; 388 389 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) 390 return; 391 392 MCREG_VAL32(&sparectl) = 393 authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL, 394 MC_CTL_REG_SPARECTL); 395 396 switch (family) { 397 case AUTHAMD_FAMILY_F: 398 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1; 399 if (clrint) { 400 MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0; 401 MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0; 402 } 403 break; 404 405 case AUTHAMD_FAMILY_10: 406 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1; 407 if (clrint) { 408 MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0; 409 MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0; 410 } 411 break; 412 } 413 414 authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL, 415 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 416 417 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 418 switch (family) { 419 case AUTHAMD_FAMILY_F: 420 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 421 chan; 422 break; 423 424 case AUTHAMD_FAMILY_10: 425 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 426 chan; 427 break; 428 } 429 430 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 431 switch (family) { 432 case AUTHAMD_FAMILY_F: 433 
MCREG_FIELD_F_revFG(&sparectl, 434 EccErrCntDramCs) = cs; 435 MCREG_FIELD_F_revFG(&sparectl, 436 EccErrCnt) = 0; 437 break; 438 439 case AUTHAMD_FAMILY_10: 440 MCREG_FIELD_10_revAB(&sparectl, 441 EccErrCntDramCs) = cs; 442 MCREG_FIELD_10_revAB(&sparectl, 443 EccErrCnt) = 0; 444 break; 445 } 446 447 authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL, 448 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 449 } 450 } 451 } 452 453 454 /* 455 * Return 456 * 1: supported 457 * 0: unsupported 458 */ 459 static int 460 authamd_supported(cmi_hdl_t hdl) 461 { 462 uint_t family = cmi_hdl_family(hdl); 463 464 switch (family) { 465 case AUTHAMD_FAMILY_6: 466 case AUTHAMD_FAMILY_F: 467 case AUTHAMD_FAMILY_10: 468 return (1); 469 default: 470 return (0); 471 } 472 } 473 474 /* 475 * cms_init entry point. 476 * 477 * This module provides broad model-specific support for AMD families 478 * 0x6, 0xf and 0x10. Future families will have to be evaluated once their 479 * documentation is available. 480 */ 481 int 482 authamd_init(cmi_hdl_t hdl, void **datap) 483 { 484 uint_t chipid = cmi_hdl_chipid(hdl); 485 uint_t procnodeid = cmi_hdl_procnodeid(hdl); 486 struct authamd_nodeshared *sp, *osp; 487 uint_t family = cmi_hdl_family(hdl); 488 uint32_t rev = cmi_hdl_chiprev(hdl); 489 authamd_data_t *authamd; 490 uint64_t cap; 491 492 if (authamd_ms_support_disable || 493 !authamd_supported(hdl)) 494 return (ENOTSUP); 495 496 if (!(x86_feature & X86_MCA)) 497 return (ENOTSUP); 498 499 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 500 return (ENOTSUP); 501 502 if (!(cap & MCG_CAP_CTL_P)) 503 return (ENOTSUP); 504 505 authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP); 506 cmi_hdl_hold(hdl); /* release in fini */ 507 authamd->amd_hdl = hdl; 508 509 if ((sp = authamd_shared[procnodeid]) == NULL) { 510 sp = kmem_zalloc(sizeof (struct authamd_nodeshared), KM_SLEEP); 511 sp->ans_chipid = chipid; 512 sp->ans_procnodeid = procnodeid; 513 sp->ans_family = family; 514 
sp->ans_rev = rev; 515 membar_producer(); 516 517 osp = atomic_cas_ptr(&authamd_shared[procnodeid], NULL, sp); 518 if (osp != NULL) { 519 kmem_free(sp, sizeof (struct authamd_nodeshared)); 520 sp = osp; 521 } 522 } 523 authamd->amd_shared = sp; 524 525 return (0); 526 } 527 528 /* 529 * cms_logout_size entry point. 530 */ 531 /*ARGSUSED*/ 532 size_t 533 authamd_logout_size(cmi_hdl_t hdl) 534 { 535 return (sizeof (struct authamd_logout)); 536 } 537 538 /* 539 * cms_mcgctl_val entry point 540 * 541 * Instead of setting all bits to 1 we can set just those for the 542 * error detector banks known to exist. 543 */ 544 /*ARGSUSED*/ 545 uint64_t 546 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed) 547 { 548 return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed); 549 } 550 551 /* 552 * cms_bankctl_skipinit entry point 553 * 554 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC) 555 * may produce spurious machine checks. 556 * 557 * Only allow a single core to setup the NorthBridge MCi_CTL register. 558 */ 559 /*ARGSUSED*/ 560 boolean_t 561 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank) 562 { 563 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 564 uint32_t rev = authamd->amd_shared->ans_rev; 565 566 if (authamd->amd_shared->ans_family == AUTHAMD_FAMILY_6) 567 return (bank == 0 ? B_TRUE : B_FALSE); 568 569 if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) { 570 return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) == 571 B_TRUE ? B_FALSE : B_TRUE); 572 } 573 574 return (B_FALSE); 575 } 576 577 /* 578 * cms_bankctl_val entry point 579 */ 580 uint64_t 581 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed) 582 { 583 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 584 uint32_t rev = authamd->amd_shared->ans_rev; 585 uint64_t val = proposed; 586 587 /* 588 * The Intel MCA says we can write all 1's to enable #MC for 589 * all errors, and AMD docs say much the same. 
But, depending 590 * perhaps on other config registers, taking machine checks 591 * for some errors such as GART TLB errors and master/target 592 * aborts may be bad - they set UC and sometime also PCC, but 593 * we should not always panic for these error types. 594 * 595 * Our cms_error_action entry point can suppress such panics, 596 * however we can also use the cms_bankctl_val entry point to 597 * veto enabling of some of the known villains in the first place. 598 */ 599 if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev)) 600 val &= ~AMD_NB_EN_GARTTBLWK; 601 602 return (val); 603 } 604 605 /* 606 * Bits to add to NB MCA config (after watchdog config). 607 */ 608 uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN; 609 610 /* 611 * Bits to remove from NB MCA config (after watchdog config) 612 */ 613 uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN; 614 615 /* 616 * NB Watchdog policy, and rate we use if enabling. 617 */ 618 enum { 619 AUTHAMD_NB_WDOG_LEAVEALONE, 620 AUTHAMD_NB_WDOG_DISABLE, 621 AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED, 622 AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE 623 } authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED; 624 625 uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 | 626 AMD_NB_CFG_WDOGTMRBASESEL_1MS; 627 628 /* 629 * Per-core cache scrubbing policy and rates. 
630 */ 631 enum { 632 AUTHAMD_SCRUB_BIOSDEFAULT, /* leave as BIOS configured */ 633 AUTHAMD_SCRUB_FIXED, /* assign our chosen rate */ 634 AUTHAMD_SCRUB_MAX /* use higher of ours and BIOS rate */ 635 } authamd_scrub_policy = AUTHAMD_SCRUB_MAX; 636 637 uint32_t authamd_scrub_rate_dcache = 0xf; /* 64K per 0.67 seconds */ 638 uint32_t authamd_scrub_rate_l2cache = 0xe; /* 1MB per 5.3 seconds */ 639 uint32_t authamd_scrub_rate_l3cache = 0xd; /* 1MB per 2.7 seconds */ 640 641 static uint32_t 642 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm) 643 { 644 uint32_t rate; 645 646 if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) { 647 cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n", 648 varnm, AMD_NB_SCRUBCTL_RATE_MAX); 649 osrate = AMD_NB_SCRUBCTL_RATE_MAX; 650 } 651 652 switch (authamd_scrub_policy) { 653 case AUTHAMD_SCRUB_FIXED: 654 rate = osrate; 655 break; 656 657 default: 658 cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - " 659 "using default policy of AUTHAMD_SCRUB_MAX", 660 authamd_scrub_policy); 661 /*FALLTHRU*/ 662 663 case AUTHAMD_SCRUB_MAX: 664 if (osrate != 0 && biosrate != 0) 665 rate = MIN(osrate, biosrate); /* small is fast */ 666 else 667 rate = osrate ? osrate : biosrate; 668 } 669 670 return (rate); 671 } 672 673 /* 674 * cms_mca_init entry point. 675 */ 676 /*ARGSUSED*/ 677 void 678 authamd_mca_init(cmi_hdl_t hdl, int nbanks) 679 { 680 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 681 uint32_t rev = authamd->amd_shared->ans_rev; 682 uint_t procnodeid = authamd->amd_shared->ans_procnodeid; 683 684 /* 685 * On chips with a NB online spare control register take control 686 * and clear ECC counts. 687 */ 688 if (AUTHAMD_HAS_ONLINESPARECTL(rev) && 689 authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) { 690 authamd_clear_ecccnt(authamd, B_TRUE); 691 } 692 693 /* 694 * And since we are claiming the telemetry stop the BIOS receiving 695 * an SMI on NB threshold overflow. 
696 */ 697 if (AUTHAMD_NBMISC_NUM(rev) && 698 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) { 699 union mcmsr_nbmisc nbm; 700 int i; 701 702 authamd_bankstatus_prewrite(hdl, authamd); 703 704 for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) { 705 if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i), 706 (uint64_t *)&nbm) != CMI_SUCCESS) 707 continue; 708 709 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) && 710 MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) && 711 MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) { 712 MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0; 713 } else if (X86_CHIPREV_ATLEAST(rev, 714 X86_CHIPREV_AMD_10_REV_A) && 715 MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) && 716 MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) { 717 MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0; 718 } 719 720 (void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i), 721 MCMSR_VAL(&nbm)); 722 } 723 724 authamd_bankstatus_postwrite(hdl, authamd); 725 } 726 727 /* 728 * NB MCA Configuration Register. 729 */ 730 if (AUTHAMD_DO_NBMCACFG(rev) && 731 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) { 732 uint32_t val = authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL, 733 MC_CTL_REG_NBCFG); 734 735 switch (authamd_nb_watchdog_policy) { 736 case AUTHAMD_NB_WDOG_LEAVEALONE: 737 break; 738 739 case AUTHAMD_NB_WDOG_DISABLE: 740 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 741 AMD_NB_CFG_WDOGTMRCNTSEL_MASK); 742 val |= AMD_NB_CFG_WDOGTMRDIS; 743 break; 744 745 default: 746 cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d " 747 "unrecognised, using default policy", 748 authamd_nb_watchdog_policy); 749 /*FALLTHRU*/ 750 751 case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED: 752 if (!(val & AMD_NB_CFG_WDOGTMRDIS)) 753 break; /* if enabled leave rate intact */ 754 /*FALLTHRU*/ 755 756 case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE: 757 val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK | 758 AMD_NB_CFG_WDOGTMRCNTSEL_MASK | 759 AMD_NB_CFG_WDOGTMRDIS); 760 val |= authamd_nb_mcacfg_wdog; 761 break; 762 } 763 764 /* 765 * Bit 0 of the NB MCA Config register 
is reserved on family 766 * 0x10. 767 */ 768 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 769 authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN; 770 771 val &= ~authamd_nb_mcacfg_remove; 772 val |= authamd_nb_mcacfg_add; 773 774 authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL, 775 MC_CTL_REG_NBCFG, val); 776 } 777 778 /* 779 * Cache scrubbing. We can't enable DRAM scrubbing since 780 * we don't know the DRAM base for this node. 781 */ 782 if (AUTHAMD_HAS_CHIPSCRUB(rev) && 783 authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT && 784 authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) { 785 uint32_t val = authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL, 786 MC_CTL_REG_SCRUBCTL); 787 int l3cap = 0; 788 789 if (AUTHAMD_L3CAPABLE(rev)) { 790 l3cap = (authamd_pcicfg_read(procnodeid, 791 MC_FUNC_MISCCTL, MC_CTL_REG_NBCAP) & 792 MC_NBCAP_L3CAPABLE) != 0; 793 } 794 795 authamd_scrub_rate_dcache = 796 authamd_scrubrate(authamd_scrub_rate_dcache, 797 (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT, 798 "authamd_scrub_rate_dcache"); 799 800 authamd_scrub_rate_l2cache = 801 authamd_scrubrate(authamd_scrub_rate_l2cache, 802 (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT, 803 "authamd_scrub_rate_l2cache"); 804 805 authamd_scrub_rate_l3cache = l3cap ? 806 authamd_scrubrate(authamd_scrub_rate_l3cache, 807 (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT, 808 "authamd_scrub_rate_l3cache") : 0; 809 810 val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache, 811 authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache, 812 val & AMD_NB_SCRUBCTL_DRAM_MASK); 813 814 authamd_pcicfg_write(procnodeid, MC_FUNC_MISCCTL, 815 MC_CTL_REG_SCRUBCTL, val); 816 } 817 818 /* 819 * ECC symbol size. Defaults to 4. 820 * Set to 8 on systems that support x8 ECC and have it enabled. 
821 */ 822 if (authamd_chip_once(authamd, AUTHAMD_CFGONCE_ECCSYMSZ)) { 823 authamd->amd_shared->ans_eccsymsz = "C4"; 824 if (AUTHAMD_SUPPORTS_X8ECC(rev) && 825 (authamd_pcicfg_read(procnodeid, MC_FUNC_MISCCTL, 826 MC_CTL_REG_EXTNBCFG) & MC_EXTNBCFG_ECCSYMSZ)) 827 authamd->amd_shared->ans_eccsymsz = "C8"; 828 } 829 } 830 831 /* 832 * cms_poll_ownermask entry point. 833 */ 834 uint64_t 835 authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl) 836 { 837 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 838 struct authamd_nodeshared *ansp = authamd->amd_shared; 839 hrtime_t now = gethrtime_waitfree(); 840 hrtime_t last = ansp->ans_poll_timestamp; 841 int dopoll = 0; 842 843 if (now - last > 2 * pintvl || last == 0) { 844 ansp->ans_pollowner = hdl; 845 dopoll = 1; 846 } else if (ansp->ans_pollowner == hdl) { 847 dopoll = 1; 848 } 849 850 if (dopoll) 851 ansp->ans_poll_timestamp = now; 852 853 return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB)); 854 855 } 856 857 /* 858 * cms_bank_logout entry point. 859 */ 860 /*ARGSUSED*/ 861 void 862 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status, 863 uint64_t addr, uint64_t misc, void *mslogout) 864 { 865 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 866 struct authamd_logout *msl = mslogout; 867 uint32_t rev = authamd->amd_shared->ans_rev; 868 869 if (msl == NULL) 870 return; 871 872 /* 873 * For main memory ECC errors on revisions with an Online Spare 874 * Control Register grab the ECC counts by channel and chip-select 875 * and reset them to 0. 
876 */ 877 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 878 AUTHAMD_IS_MEMECCERR(bank, status) && 879 AUTHAMD_HAS_ONLINESPARECTL(rev)) { 880 if (authamd_read_ecccnt(authamd, msl)) 881 authamd_clear_ecccnt(authamd, B_FALSE); 882 } 883 } 884 885 /* 886 * cms_error_action entry point 887 */ 888 889 int authamd_forgive_uc = 0; /* For test/debug only */ 890 int authamd_forgive_pcc = 0; /* For test/debug only */ 891 int authamd_fake_poison = 0; /* For test/debug only */ 892 893 /*ARGSUSED*/ 894 uint32_t 895 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank, 896 uint64_t status, uint64_t addr, uint64_t misc, void *mslogout) 897 { 898 authamd_error_disp_t *disp; 899 uint32_t rv = 0; 900 901 if (authamd_forgive_uc) 902 rv |= CMS_ERRSCOPE_CLEARED_UC; 903 904 if (authamd_forgive_pcc) 905 rv |= CMS_ERRSCOPE_CURCONTEXT_OK; 906 907 if (authamd_fake_poison && status & MSR_MC_STATUS_UC) 908 rv |= CMS_ERRSCOPE_POISONED; 909 910 if (rv) 911 return (rv); 912 913 disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout); 914 915 if (disp == &authamd_gart_disp) { 916 /* 917 * GART walk errors set UC and possibly PCC (if source CPU) 918 * but should not be regarded as terminal. 919 */ 920 return (CMS_ERRSCOPE_IGNORE_ERR); 921 } 922 923 /* 924 * May also want to consider master abort and target abort. These 925 * also set UC and PCC (if src CPU) but the requester gets -1 926 * and I believe the IO stuff in Solaris will handle that. 
927 */ 928 929 return (rv); 930 } 931 932 /* 933 * cms_disp_match entry point 934 */ 935 /*ARGSUSED*/ 936 cms_cookie_t 937 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status, 938 uint64_t addr, uint64_t misc, void *mslogout) 939 { 940 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 941 /* uint16_t errcode = MCAX86_ERRCODE(status); */ 942 uint16_t exterrcode = AMD_EXT_ERRCODE(status); 943 uint32_t rev = authamd->amd_shared->ans_rev; 944 945 /* 946 * Recognise main memory ECC errors 947 */ 948 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 949 AUTHAMD_IS_MEMECCERR(bank, status)) { 950 if (status & AMD_BANK_STAT_CECC) { 951 return (exterrcode == 0 ? &authamd_memce_disp : 952 &authamd_ckmemce_disp); 953 } else if (status & AMD_BANK_STAT_UECC) { 954 return (exterrcode == 0 ? &authamd_memue_disp : 955 &authamd_ckmemue_disp); 956 } 957 } 958 959 /* 960 * Recognise GART walk errors 961 */ 962 if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status)) 963 return (&authamd_gart_disp); 964 965 return (NULL); 966 } 967 968 /* 969 * cms_ereport_class entry point 970 */ 971 /*ARGSUSED*/ 972 void 973 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie, 974 const char **cpuclsp, const char **leafclsp) 975 { 976 const authamd_error_disp_t *aed = mscookie; 977 978 if (aed == NULL) 979 return; 980 981 if (aed->aad_subclass != NULL) 982 *cpuclsp = aed->aad_subclass; 983 if (aed->aad_leafclass != NULL) 984 *leafclsp = aed->aad_leafclass; 985 } 986 987 /*ARGSUSED*/ 988 static void 989 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd, 990 nvlist_t *ereport, nv_alloc_t *nva, void *mslogout) 991 { 992 nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 993 uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 994 authamd_logout_t *msl; 995 nvlist_t *nvl; 996 int nelems = 0; 997 int i, chan, cs, mc; 998 nvlist_t *board_list = NULL; 999 1000 if ((msl = mslogout) == NULL) 1001 return; 1002 1003 /* Assume all processors have the same number of 
nodes */ 1004 mc = authamd->amd_shared->ans_procnodeid % 1005 cpuid_get_procnodes_per_pkg(CPU); 1006 1007 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 1008 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 1009 if (msl->aal_eccerrcnt[chan][cs] == 0) 1010 continue; 1011 1012 if ((nvl = fm_nvlist_create(nva)) == NULL) 1013 continue; 1014 1015 elems[nelems] = nvl; 1016 counts[nelems++] = msl->aal_eccerrcnt[chan][cs]; 1017 1018 if (!x86gentopo_legacy) { 1019 board_list = cmi_hdl_smb_bboard(hdl); 1020 if (board_list == NULL) 1021 continue; 1022 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 1023 NULL, NULL, board_list, 4, 1024 "chip", cmi_hdl_smb_chipid(hdl), 1025 "memory-controller", 0, 1026 "dram-channel", chan, 1027 "chip-select", cs); 1028 } else { 1029 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, 1030 NULL, NULL, 5, 1031 "motherboard", 0, 1032 "chip", authamd->amd_shared->ans_chipid, 1033 "memory-controller", mc, 1034 "dram-channel", chan, 1035 "chip-select", cs); 1036 } 1037 } 1038 } 1039 1040 if (nelems == 0) 1041 return; 1042 1043 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE, 1044 DATA_TYPE_NVLIST_ARRAY, nelems, elems, 1045 NULL); 1046 1047 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT, 1048 DATA_TYPE_UINT8_ARRAY, nelems, &counts[0], 1049 NULL); 1050 1051 for (i = 0; i < nelems; i++) 1052 fm_nvlist_destroy(elems[i], nva ? 
FM_NVA_RETAIN : FM_NVA_FREE); 1053 } 1054 1055 /* 1056 * cms_ereport_add_logout entry point 1057 */ 1058 /*ARGSUSED*/ 1059 void 1060 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva, 1061 int bank, uint64_t status, uint64_t addr, uint64_t misc, 1062 void *mslogout, cms_cookie_t mscookie) 1063 { 1064 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1065 const authamd_error_disp_t *aed = mscookie; 1066 uint64_t members; 1067 1068 if (aed == NULL) 1069 return; 1070 1071 members = aed->aad_ereport_members; 1072 1073 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) { 1074 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND, 1075 DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status), 1076 NULL); 1077 1078 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1079 fm_payload_set(ereport, 1080 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1081 DATA_TYPE_STRING, "E", 1082 NULL); 1083 } 1084 } 1085 1086 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) { 1087 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND, 1088 DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status), 1089 NULL); 1090 1091 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 1092 fm_payload_set(ereport, 1093 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 1094 DATA_TYPE_STRING, authamd->amd_shared->ans_eccsymsz, 1095 NULL); 1096 } 1097 } 1098 1099 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE && 1100 status & MSR_MC_STATUS_ADDRV) { 1101 authamd_ereport_add_resource(hdl, authamd, ereport, nva, 1102 mslogout); 1103 } 1104 } 1105 1106 /* 1107 * cms_msrinject entry point 1108 */ 1109 cms_errno_t 1110 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val) 1111 { 1112 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 1113 cms_errno_t rv = CMSERR_BADMSRWRITE; 1114 1115 authamd_bankstatus_prewrite(hdl, authamd); 1116 if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS) 1117 rv = CMS_SUCCESS; 1118 authamd_bankstatus_postwrite(hdl, authamd); 1119 1120 return 
(rv); 1121 } 1122 1123 cms_api_ver_t _cms_api_version = CMS_API_VERSION_1; 1124 1125 const cms_ops_t _cms_ops = { 1126 authamd_init, /* cms_init */ 1127 NULL, /* cms_post_startup */ 1128 NULL, /* cms_post_mpstartup */ 1129 authamd_logout_size, /* cms_logout_size */ 1130 authamd_mcgctl_val, /* cms_mcgctl_val */ 1131 authamd_bankctl_skipinit, /* cms_bankctl_skipinit */ 1132 authamd_bankctl_val, /* cms_bankctl_val */ 1133 NULL, /* cms_bankstatus_skipinit */ 1134 NULL, /* cms_bankstatus_val */ 1135 authamd_mca_init, /* cms_mca_init */ 1136 authamd_poll_ownermask, /* cms_poll_ownermask */ 1137 authamd_bank_logout, /* cms_bank_logout */ 1138 authamd_error_action, /* cms_error_action */ 1139 authamd_disp_match, /* cms_disp_match */ 1140 authamd_ereport_class, /* cms_ereport_class */ 1141 NULL, /* cms_ereport_detector */ 1142 NULL, /* cms_ereport_includestack */ 1143 authamd_ereport_add_logout, /* cms_ereport_add_logout */ 1144 authamd_msrinject, /* cms_msrinject */ 1145 NULL, /* cms_fini */ 1146 }; 1147 1148 static struct modlcpu modlcpu = { 1149 &mod_cpuops, 1150 "Generic AMD model-specific MCA" 1151 }; 1152 1153 static struct modlinkage modlinkage = { 1154 MODREV_1, 1155 (void *)&modlcpu, 1156 NULL 1157 }; 1158 1159 int 1160 _init(void) 1161 { 1162 return (mod_install(&modlinkage)); 1163 } 1164 1165 int 1166 _info(struct modinfo *modinfop) 1167 { 1168 return (mod_info(&modlinkage, modinfop)); 1169 } 1170 1171 int 1172 _fini(void) 1173 { 1174 return (mod_remove(&modlinkage)); 1175 } 1176