1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * "Generic AMD" model-specific support. If no more-specific support can 31 * be found, or such modules declines to initialize, then for AuthenticAMD 32 * cpus this module can have a crack at providing some AMD model-specific 33 * support that at least goes beyond common MCA architectural features 34 * if not down to the nitty-gritty level for a particular model. We 35 * are layered on top of a cpu module, likely cpu.generic, so there is no 36 * need for us to perform common architecturally-accessible functions. 37 */ 38 39 #include <sys/types.h> 40 #include <sys/cmn_err.h> 41 #include <sys/modctl.h> 42 #include <sys/cpu_module.h> 43 #include <sys/mca_x86.h> 44 #include <sys/pci_cfgspace.h> 45 #include <sys/x86_archext.h> 46 #include <sys/mc_amd.h> 47 #include <sys/fm/protocol.h> 48 #include <sys/fm/cpu/GENAMD.h> 49 #include <sys/nvpair.h> 50 #include <sys/controlregs.h> 51 #include <sys/pghw.h> 52 #include <sys/sunddi.h> 53 #include <sys/cpu_module_ms_impl.h> 54 55 #include "authamd.h" 56 57 int authamd_ms_support_disable = 0; 58 59 #define AUTHAMD_F_REVS_BCDE \ 60 (X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \ 61 X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \ 62 X86_CHIPREV_AMD_F_REV_E) 63 64 #define AUTHAMD_F_REVS_FG \ 65 (X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G) 66 67 #define AUTHAMD_10_REVS_AB \ 68 (X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B) 69 70 /* 71 * Bitmasks of support for various features. Try to enable features 72 * via inclusion in one of these bitmasks and check that at the 73 * feature imlementation - that way new family support may often simply 74 * simply need to update these bitmasks. 75 */ 76 77 /* 78 * Families that this module will provide some model-specific 79 * support for (if no more-specific module claims it first). 80 * We try to support whole families rather than differentiate down 81 * to revision. 82 */ 83 #define AUTHAMD_SUPPORTED(fam) \ 84 ((fam) == AUTHAMD_FAMILY_6 || (fam) == AUTHAMD_FAMILY_F || \ 85 (fam) == AUTHAMD_FAMILY_10) 86 87 /* 88 * Families/revisions for which we can recognise main memory ECC errors. 89 */ 90 #define AUTHAMD_MEMECC_RECOGNISED(rev) \ 91 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 92 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 93 94 /* 95 * Families/revisions that have an Online Spare Control Register 96 */ 97 #define AUTHAMD_HAS_ONLINESPARECTL(rev) \ 98 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \ 99 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 100 101 /* 102 * Families/revisions that have a NB misc register or registers - 103 * evaluates to 0 if no support, otherwise the number of MC4_MISCj. 104 */ 105 #define AUTHAMD_NBMISC_NUM(rev) \ 106 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \ 107 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0)) 108 109 /* 110 * Families/revision for which we wish not to machine check for GART 111 * table walk errors - bit 10 of NB CTL. 112 */ 113 #define AUTHAMD_NOGARTTBLWLK_MC(rev) \ 114 (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \ 115 X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A)) 116 117 /* 118 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED 119 * revisions as: 120 * 121 * - being reported by the NB 122 * - being a compound bus/interconnect error (external to chip) 123 * - having LL of LG 124 * - having II of MEM (but could still be a master/target abort) 125 * - having CECC or UECC set 126 * 127 * We do not check the extended error code (first nibble of the 128 * model-specific error code on AMD) since this has changed from 129 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10). 130 * Instead we use CECC/UECC to separate off the master/target 131 * abort cases. 132 * 133 * We insist that the detector be the NorthBridge bank; although 134 * IC/DC can report some main memory errors, they do not capture 135 * an address at sufficient resolution to be useful and the NB will 136 * report most errors. 137 */ 138 #define AUTHAMD_IS_MEMECCERR(bank, status) \ 139 ((bank) == AMD_MCA_BANK_NB && \ 140 MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \ 141 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 142 MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \ 143 ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC))) 144 145 static authamd_error_disp_t authamd_memce_disp = { 146 FM_EREPORT_CPU_GENAMD, 147 FM_EREPORT_CPU_GENAMD_MEM_CE, 148 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE 149 }; 150 151 static authamd_error_disp_t authamd_memue_disp = { 152 FM_EREPORT_CPU_GENAMD, 153 FM_EREPORT_CPU_GENAMD_MEM_UE, 154 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE 155 }; 156 157 static authamd_error_disp_t authamd_ckmemce_disp = { 158 FM_EREPORT_CPU_GENAMD, 159 FM_EREPORT_CPU_GENAMD_CKMEM_CE, 160 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE 161 }; 162 163 static authamd_error_disp_t authamd_ckmemue_disp = { 164 FM_EREPORT_CPU_GENAMD, 165 FM_EREPORT_CPU_GENAMD_CKMEM_UE, 166 FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE 167 }; 168 169 /* 170 * We recognise GART walk errors as: 171 * 172 * - being reported by the NB 173 * - being a compound TLB error 174 * - having LL of LG and TT of GEN 175 * - having UC set 176 * - possibly having PCC set (if source CPU) 177 */ 178 #define AUTHAMD_IS_GARTERR(bank, status) \ 179 ((bank) == AMD_MCA_BANK_NB && \ 180 MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \ 181 MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \ 182 MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \ 183 (status) & MSR_MC_STATUS_UC) 184 185 static authamd_error_disp_t authamd_gart_disp = { 186 FM_EREPORT_CPU_GENAMD, /* use generic subclass */ 187 FM_EREPORT_CPU_GENADM_GARTTBLWLK, /* use generic leafclass */ 188 0 /* no additional payload */ 189 }; 190 191 192 static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS]; 193 194 static int 195 authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what) 196 { 197 return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce, 198 what) == 0 ? B_TRUE : B_FALSE); 199 } 200 201 static void 202 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val) 203 { 204 ASSERT(chipid + 24 <= 31); 205 ASSERT((func & 7) == func); 206 ASSERT((reg & 3) == 0 && reg < 256); 207 208 cmi_pci_putl(0, chipid + 24, func, reg, 0, val); 209 } 210 211 static uint32_t 212 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg) 213 { 214 ASSERT(chipid + 24 <= 31); 215 ASSERT((func & 7) == func); 216 ASSERT((reg & 3) == 0 && reg < 256); 217 218 return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0)); 219 } 220 221 void 222 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd) 223 { 224 uint64_t hwcr; 225 226 if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS) 227 return; 228 229 authamd->amd_hwcr = hwcr; 230 231 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 232 hwcr |= AMD_HWCR_MCI_STATUS_WREN; 233 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 234 } 235 } 236 237 void 238 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd) 239 { 240 uint64_t hwcr = authamd->amd_hwcr; 241 242 if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) { 243 hwcr &= ~AMD_HWCR_MCI_STATUS_WREN; 244 (void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr); 245 } 246 } 247 248 /* 249 * Read EccCnt repeatedly for all possible channel/chip-select combos: 250 * 251 * - read sparectl register 252 * - if EccErrCntWrEn is set, clear that bit in the just-read value 253 * and write it back to sparectl; this *may* clobber the EccCnt 254 * for the channel/chip-select combination currently selected, so 255 * we leave this bit clear if we had to clear it 256 * - cycle through all channel/chip-select combinations writing each 257 * combination to sparectl before reading the register back for 258 * EccCnt for that combination; since EccErrCntWrEn is clear 259 * the writes to select what count to read will not themselves 260 * zero any counts 261 */ 262 static int 263 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl) 264 { 265 union mcreg_sparectl sparectl; 266 uint_t chipid = authamd->amd_shared->acs_chipid; 267 uint_t family = authamd->amd_shared->acs_family; 268 uint32_t rev = authamd->amd_shared->acs_rev; 269 int chan, cs; 270 271 /* 272 * Check for feature support; this macro will test down to the 273 * family revision number, whereafter we'll switch on family 274 * assuming that future revisions will use the same register 275 * format. 276 */ 277 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) { 278 bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt)); 279 return (0); 280 } 281 282 MCREG_VAL32(&sparectl) = 283 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 284 285 switch (family) { 286 case AUTHAMD_FAMILY_F: 287 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0; 288 break; 289 290 case AUTHAMD_FAMILY_10: 291 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0; 292 break; 293 } 294 295 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 296 switch (family) { 297 case AUTHAMD_FAMILY_F: 298 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 299 chan; 300 break; 301 302 case AUTHAMD_FAMILY_10: 303 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 304 chan; 305 break; 306 } 307 308 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 309 switch (family) { 310 case AUTHAMD_FAMILY_F: 311 MCREG_FIELD_F_revFG(&sparectl, 312 EccErrCntDramCs) = cs; 313 break; 314 315 case AUTHAMD_FAMILY_10: 316 MCREG_FIELD_10_revAB(&sparectl, 317 EccErrCntDramCs) = cs; 318 break; 319 } 320 321 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 322 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 323 324 MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid, 325 MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 326 327 switch (family) { 328 case AUTHAMD_FAMILY_F: 329 msl->aal_eccerrcnt[chan][cs] = 330 MCREG_FIELD_F_revFG(&sparectl, EccErrCnt); 331 break; 332 case AUTHAMD_FAMILY_10: 333 msl->aal_eccerrcnt[chan][cs] = 334 MCREG_FIELD_10_revAB(&sparectl, EccErrCnt); 335 break; 336 } 337 } 338 } 339 340 return (1); 341 } 342 343 /* 344 * Clear EccCnt for all possible channel/chip-select combos: 345 * 346 * - set EccErrCntWrEn in sparectl, if necessary 347 * - write 0 to EccCnt for all channel/chip-select combinations 348 * - clear EccErrCntWrEn 349 * 350 * If requested also disable the interrupts taken on counter overflow 351 * and on swap done. 352 */ 353 static void 354 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint) 355 { 356 union mcreg_sparectl sparectl; 357 uint_t chipid = authamd->amd_shared->acs_chipid; 358 uint_t family = authamd->amd_shared->acs_family; 359 uint32_t rev = authamd->amd_shared->acs_rev; 360 int chan, cs; 361 362 if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) 363 return; 364 365 MCREG_VAL32(&sparectl) = 366 authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL); 367 368 switch (family) { 369 case AUTHAMD_FAMILY_F: 370 MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1; 371 if (clrint) { 372 MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0; 373 MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0; 374 } 375 break; 376 377 case AUTHAMD_FAMILY_10: 378 MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1; 379 if (clrint) { 380 MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0; 381 MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0; 382 } 383 break; 384 } 385 386 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 387 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 388 389 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 390 switch (family) { 391 case AUTHAMD_FAMILY_F: 392 MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) = 393 chan; 394 break; 395 396 case AUTHAMD_FAMILY_10: 397 MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) = 398 chan; 399 break; 400 } 401 402 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 403 switch (family) { 404 case AUTHAMD_FAMILY_F: 405 MCREG_FIELD_F_revFG(&sparectl, 406 EccErrCntDramCs) = cs; 407 MCREG_FIELD_F_revFG(&sparectl, 408 EccErrCnt) = 0; 409 break; 410 411 case AUTHAMD_FAMILY_10: 412 MCREG_FIELD_10_revAB(&sparectl, 413 EccErrCntDramCs) = cs; 414 MCREG_FIELD_10_revAB(&sparectl, 415 EccErrCnt) = 0; 416 break; 417 } 418 419 authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, 420 MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl)); 421 } 422 } 423 } 424 425 /* 426 * cms_init entry point. 427 * 428 * This module provides broad model-specific support for AMD families 429 * 0x6, 0xf and 0x10. Future families will have to be evaluated once their 430 * documentation is available. 431 */ 432 int 433 authamd_init(cmi_hdl_t hdl, void **datap) 434 { 435 uint_t chipid = cmi_hdl_chipid(hdl); 436 struct authamd_chipshared *sp, *osp; 437 uint_t family = cmi_hdl_family(hdl); 438 authamd_data_t *authamd; 439 uint64_t cap; 440 441 if (authamd_ms_support_disable || !AUTHAMD_SUPPORTED(family)) 442 return (ENOTSUP); 443 444 if (!(x86_feature & X86_MCA)) 445 return (ENOTSUP); 446 447 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 448 return (ENOTSUP); 449 450 if (!(cap & MCG_CAP_CTL_P)) 451 return (ENOTSUP); 452 453 authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP); 454 cmi_hdl_hold(hdl); /* release in fini */ 455 authamd->amd_hdl = hdl; 456 457 if ((sp = authamd_shared[chipid]) == NULL) { 458 sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP); 459 osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp); 460 if (osp != NULL) { 461 kmem_free(sp, sizeof (struct authamd_chipshared)); 462 sp = osp; 463 } else { 464 sp->acs_chipid = chipid; 465 sp->acs_family = family; 466 sp->acs_rev = cmi_hdl_chiprev(hdl); 467 } 468 } 469 authamd->amd_shared = sp; 470 471 return (0); 472 } 473 474 /* 475 * cms_logout_size entry point. 476 */ 477 /*ARGSUSED*/ 478 size_t 479 authamd_logout_size(cmi_hdl_t hdl) 480 { 481 return (sizeof (struct authamd_logout)); 482 } 483 484 /* 485 * cms_mcgctl_val entry point 486 * 487 * Instead of setting all bits to 1 we can set just those for the 488 * error detector banks known to exist. 489 */ 490 /*ARGSUSED*/ 491 uint64_t 492 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed) 493 { 494 return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed); 495 } 496 497 /* 498 * cms_bankctl_skipinit entry point 499 * 500 * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC) 501 * may produce spurious machine checks. 502 */ 503 /*ARGSUSED*/ 504 boolean_t 505 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank) 506 { 507 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 508 509 return (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6 && 510 bank == 0 ? B_TRUE : B_FALSE); 511 } 512 513 /* 514 * cms_bankctl_val entry point 515 */ 516 uint64_t 517 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed) 518 { 519 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 520 uint32_t rev = authamd->amd_shared->acs_rev; 521 uint64_t val = proposed; 522 523 /* 524 * The Intel MCA says we can write all 1's to enable #MC for 525 * all errors, and AMD docs say much the same. But, depending 526 * perhaps on other config registers, taking machine checks 527 * for some errors such as GART TLB errors and master/target 528 * aborts may be bad - they set UC and sometime also PCC, but 529 * we should not always panic for these error types. 530 * 531 * Our cms_error_action entry point can suppress such panics, 532 * however we can also use the cms_bankctl_val entry point to 533 * veto enabling of some of the known villains in the first place. 534 */ 535 if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev)) 536 val &= ~AMD_NB_EN_GARTTBLWK; 537 538 return (val); 539 } 540 541 /* 542 * cms_mca_init entry point. 543 */ 544 /*ARGSUSED*/ 545 void 546 authamd_mca_init(cmi_hdl_t hdl, int nbanks) 547 { 548 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 549 uint32_t rev = authamd->amd_shared->acs_rev; 550 551 /* 552 * On chips with a NB online spare control register take control 553 * and clear ECC counts. 554 */ 555 if (AUTHAMD_HAS_ONLINESPARECTL(rev) && 556 authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) { 557 authamd_clear_ecccnt(authamd, B_TRUE); 558 } 559 560 /* 561 * And since we are claiming the telemetry stop the BIOS receiving 562 * an SMI on NB threshold overflow. 563 */ 564 if (AUTHAMD_NBMISC_NUM(rev) && 565 authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) { 566 union mcmsr_nbmisc nbm; 567 int i; 568 569 authamd_bankstatus_prewrite(hdl, authamd); 570 571 for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) { 572 if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i), 573 (uint64_t *)&nbm) != CMI_SUCCESS) 574 continue; 575 576 if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) && 577 MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) && 578 MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) { 579 MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0; 580 } else if (X86_CHIPREV_ATLEAST(rev, 581 X86_CHIPREV_AMD_10_REV_A) && 582 MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) && 583 MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) { 584 MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0; 585 } 586 587 (void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i), 588 MCMSR_VAL(&nbm)); 589 } 590 591 authamd_bankstatus_postwrite(hdl, authamd); 592 } 593 } 594 595 /* 596 * cms_bank_logout entry point. 597 */ 598 /*ARGSUSED*/ 599 void 600 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status, 601 uint64_t addr, uint64_t misc, void *mslogout) 602 { 603 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 604 struct authamd_logout *msl = mslogout; 605 uint32_t rev = authamd->amd_shared->acs_rev; 606 607 if (msl == NULL) 608 return; 609 610 /* 611 * For main memory ECC errors on revisions with an Online Spare 612 * Control Register grab the ECC counts by channel and chip-select 613 * and reset them to 0. 614 */ 615 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 616 AUTHAMD_IS_MEMECCERR(bank, status) && 617 AUTHAMD_HAS_ONLINESPARECTL(rev)) { 618 if (authamd_read_ecccnt(authamd, msl)) 619 authamd_clear_ecccnt(authamd, B_FALSE); 620 } 621 } 622 623 /* 624 * cms_error_action entry point 625 */ 626 627 int authamd_forgive_uc = 0; /* For test/debug only */ 628 int authamd_forgive_pcc = 0; /* For test/debug only */ 629 int authamd_fake_poison = 0; /* For test/debug only */ 630 631 /*ARGSUSED*/ 632 uint32_t 633 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank, 634 uint64_t status, uint64_t addr, uint64_t misc, void *mslogout) 635 { 636 authamd_error_disp_t *disp; 637 uint32_t rv = 0; 638 639 if (authamd_forgive_uc) 640 rv |= CMS_ERRSCOPE_CLEARED_UC; 641 642 if (authamd_forgive_pcc) 643 rv |= CMS_ERRSCOPE_CURCONTEXT_OK; 644 645 if (authamd_fake_poison && status & MSR_MC_STATUS_UC) 646 rv |= CMS_ERRSCOPE_POISONED; 647 648 if (rv) 649 return (rv); 650 651 disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout); 652 653 if (disp == &authamd_gart_disp) { 654 /* 655 * GART walk errors set UC and possibly PCC (if source CPU) 656 * but should not be regarded as terminal. 657 */ 658 return (CMS_ERRSCOPE_IGNORE_ERR); 659 } 660 661 /* 662 * May also want to consider master abort and target abort. These 663 * also set UC and PCC (if src CPU) but the requester gets -1 664 * and I believe the IO stuff in Solaris will handle that. 665 */ 666 667 return (rv); 668 } 669 670 /* 671 * cms_disp_match entry point 672 */ 673 /*ARGSUSED*/ 674 cms_cookie_t 675 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status, 676 uint64_t addr, uint64_t misc, void *mslogout) 677 { 678 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 679 /* uint16_t errcode = MCAX86_ERRCODE(status); */ 680 uint16_t exterrcode = AMD_EXT_ERRCODE(status); 681 uint32_t rev = authamd->amd_shared->acs_rev; 682 683 /* 684 * Recognise main memory ECC errors 685 */ 686 if (AUTHAMD_MEMECC_RECOGNISED(rev) && 687 AUTHAMD_IS_MEMECCERR(bank, status)) { 688 if (status & AMD_BANK_STAT_CECC) { 689 return (exterrcode == 0 ? &authamd_memce_disp : 690 &authamd_ckmemce_disp); 691 } else if (status & AMD_BANK_STAT_UECC) { 692 return (exterrcode == 0 ? &authamd_memue_disp : 693 &authamd_ckmemue_disp); 694 } 695 } 696 697 /* 698 * Recognise GART walk errors 699 */ 700 if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status)) 701 return (&authamd_gart_disp); 702 703 return (NULL); 704 } 705 706 /* 707 * cms_ereport_class entry point 708 */ 709 /*ARGSUSED*/ 710 void 711 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie, 712 const char **cpuclsp, const char **leafclsp) 713 { 714 const authamd_error_disp_t *aed = mscookie; 715 716 if (aed == NULL) 717 return; 718 719 if (aed->aad_subclass != NULL) 720 *cpuclsp = aed->aad_subclass; 721 if (aed->aad_leafclass != NULL) 722 *leafclsp = aed->aad_leafclass; 723 } 724 725 /*ARGSUSED*/ 726 static void 727 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd, 728 nvlist_t *ereport, nv_alloc_t *nva, void *mslogout) 729 { 730 nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 731 uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS]; 732 authamd_logout_t *msl; 733 nvlist_t *nvl; 734 int nelems = 0; 735 int i, chan, cs; 736 737 if ((msl = mslogout) == NULL) 738 return; 739 740 for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) { 741 for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) { 742 if (msl->aal_eccerrcnt[chan][cs] == 0) 743 continue; 744 745 if ((nvl = fm_nvlist_create(nva)) == NULL) 746 continue; 747 748 elems[nelems] = nvl; 749 counts[nelems++] = msl->aal_eccerrcnt[chan][cs]; 750 751 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5, 752 "motherboard", 0, 753 "chip", authamd->amd_shared->acs_chipid, 754 "memory-controller", 0, 755 "dram-channel", chan, 756 "chip-select", cs); 757 } 758 } 759 760 if (nelems == 0) 761 return; 762 763 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE, 764 DATA_TYPE_NVLIST_ARRAY, nelems, elems, 765 NULL); 766 767 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT, 768 DATA_TYPE_UINT8_ARRAY, nelems, &counts[0], 769 NULL); 770 771 for (i = 0; i < nelems; i++) 772 fm_nvlist_destroy(elems[i], nva ? FM_NVA_RETAIN : FM_NVA_FREE); 773 } 774 775 /* 776 * cms_ereport_add_logout entry point 777 */ 778 /*ARGSUSED*/ 779 void 780 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva, 781 int bank, uint64_t status, uint64_t addr, uint64_t misc, 782 void *mslogout, cms_cookie_t mscookie) 783 { 784 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 785 const authamd_error_disp_t *aed = mscookie; 786 uint64_t members; 787 788 if (aed == NULL) 789 return; 790 791 members = aed->aad_ereport_members; 792 793 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) { 794 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND, 795 DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status), 796 NULL); 797 798 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 799 fm_payload_set(ereport, 800 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 801 DATA_TYPE_STRING, "E", 802 NULL); 803 } 804 } 805 806 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) { 807 fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND, 808 DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status), 809 NULL); 810 811 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) { 812 fm_payload_set(ereport, 813 FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE, 814 DATA_TYPE_STRING, "C", 815 NULL); 816 } 817 } 818 819 if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE && 820 status & MSR_MC_STATUS_ADDRV) { 821 authamd_ereport_add_resource(hdl, authamd, ereport, nva, 822 mslogout); 823 } 824 } 825 826 /* 827 * cms_msrinject entry point 828 */ 829 cms_errno_t 830 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val) 831 { 832 authamd_data_t *authamd = cms_hdl_getcmsdata(hdl); 833 cms_errno_t rv = CMSERR_BADMSRWRITE; 834 835 authamd_bankstatus_prewrite(hdl, authamd); 836 if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS) 837 rv = CMS_SUCCESS; 838 authamd_bankstatus_postwrite(hdl, authamd); 839 840 return (rv); 841 } 842 843 cms_api_ver_t _cms_api_version = CMS_API_VERSION_0; 844 845 const cms_ops_t _cms_ops = { 846 authamd_init, /* cms_init */ 847 NULL, /* cms_post_startup */ 848 NULL, /* cms_post_mpstartup */ 849 authamd_logout_size, /* cms_logout_size */ 850 authamd_mcgctl_val, /* cms_mcgctl_val */ 851 authamd_bankctl_skipinit, /* cms_bankctl_skipinit */ 852 authamd_bankctl_val, /* cms_bankctl_val */ 853 NULL, /* cms_bankstatus_skipinit */ 854 NULL, /* cms_bankstatus_val */ 855 authamd_mca_init, /* cms_mca_init */ 856 NULL, /* cms_poll_ownermask */ 857 authamd_bank_logout, /* cms_bank_logout */ 858 authamd_error_action, /* cms_error_action */ 859 authamd_disp_match, /* cms_disp_match */ 860 authamd_ereport_class, /* cms_ereport_class */ 861 NULL, /* cms_ereport_detector */ 862 NULL, /* cms_ereport_includestack */ 863 authamd_ereport_add_logout, /* cms_ereport_add_logout */ 864 authamd_msrinject, /* cms_msrinject */ 865 NULL, /* cms_fini */ 866 }; 867 868 static struct modlcpu modlcpu = { 869 &mod_cpuops, 870 "Generic AMD model-specific MCA" 871 }; 872 873 static struct modlinkage modlinkage = { 874 MODREV_1, 875 (void *)&modlcpu, 876 NULL 877 }; 878 879 int 880 _init(void) 881 { 882 return (mod_install(&modlinkage)); 883 } 884 885 int 886 _info(struct modinfo *modinfop) 887 { 888 return (mod_info(&modlinkage, modinfop)); 889 } 890 891 int 892 _fini(void) 893 { 894 return (mod_remove(&modlinkage)); 895 } 896