1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/mca_x86.h> 28 #include <sys/cpu_module_impl.h> 29 #include <sys/cpu_module_ms.h> 30 #include <sys/cmn_err.h> 31 #include <sys/cpuvar.h> 32 #include <sys/pghw.h> 33 #include <sys/x86_archext.h> 34 #include <sys/sysmacros.h> 35 #include <sys/regset.h> 36 #include <sys/privregs.h> 37 #include <sys/systm.h> 38 #include <sys/types.h> 39 #include <sys/log.h> 40 #include <sys/psw.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/fm/util.h> 43 #include <sys/errorq.h> 44 #include <sys/mca_x86.h> 45 #include <sys/fm/cpu/GMCA.h> 46 #include <sys/fm/smb/fmsmb.h> 47 #include <sys/sysevent.h> 48 #include <sys/ontrap.h> 49 50 #include "gcpu.h" 51 52 extern int x86gentopo_legacy; /* x86 generic topology support */ 53 54 /* 55 * Clear to log telemetry found at initialization. While processor docs 56 * say you should process this telemetry on all but Intel family 0x6 57 * there are way too many exceptions and we want to avoid bogus 58 * diagnoses. 59 */ 60 int gcpu_suppress_log_on_init = 1; 61 62 /* 63 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 64 * error logout time. The stack will be included in the ereport if the 65 * error type selects stack inclusion, or in all cases if 66 * gcpu_mca_stack_ereport_include is nonzero. 67 */ 68 int gcpu_mca_stack_flag = 0; 69 int gcpu_mca_stack_ereport_include = 0; 70 71 /* 72 * The number of times to re-read MCA telemetry to try to obtain a 73 * consistent snapshot if we find it to be changing under our feet. 74 */ 75 int gcpu_mca_telemetry_retries = 5; 76 77 #ifndef __xpv 78 int gcpu_mca_cmci_throttling_threshold = 10; 79 int gcpu_mca_cmci_reenable_threshold = 1000; 80 #endif 81 82 static gcpu_error_disp_t gcpu_errtypes[] = { 83 84 /* 85 * Unclassified 86 */ 87 { 88 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 89 NULL, 90 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 91 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 92 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 93 }, 94 95 /* 96 * Microcode ROM Parity Error 97 */ 98 { 99 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 100 NULL, 101 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 102 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 103 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 104 }, 105 106 /* 107 * External - BINIT# from another processor during power-on config 108 */ 109 { 110 FM_EREPORT_CPU_GENERIC_EXTERNAL, 111 NULL, 112 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 113 MCAX86_SIMPLE_EXTERNAL_MASKON, 114 MCAX86_SIMPLE_EXTERNAL_MASKOFF 115 }, 116 117 /* 118 * Functional redundancy check master/slave error 119 */ 120 { 121 FM_EREPORT_CPU_GENERIC_FRC, 122 NULL, 123 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 124 MCAX86_SIMPLE_FRC_MASKON, 125 MCAX86_SIMPLE_FRC_MASKOFF 126 }, 127 128 /* 129 * Internal parity error 130 */ 131 { 132 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 133 NULL, 134 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 135 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 136 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 137 }, 138 139 140 /* 141 * Internal timer error 142 */ 143 { 144 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 145 NULL, 146 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 147 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 148 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 149 }, 150 151 /* 152 * Internal unclassified 153 */ 154 { 155 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 156 NULL, 157 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 158 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 159 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 160 }, 161 162 /* 163 * Compound error codes - generic memory hierarchy 164 */ 165 { 166 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 167 NULL, 168 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 169 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 170 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 171 }, 172 173 /* 174 * Compound error codes - TLB errors 175 */ 176 { 177 FM_EREPORT_CPU_GENERIC_TLB, 178 "%1$s" "TLB" "%2$s" "_ERR", 179 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 180 MCAX86_COMPOUND_TLB_MASKON, 181 MCAX86_COMPOUND_TLB_MASKOFF 182 }, 183 184 /* 185 * Compound error codes - memory hierarchy 186 */ 187 { 188 FM_EREPORT_CPU_GENERIC_MEMHIER, 189 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 190 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 191 MCAX86_COMPOUND_MEMHIER_MASKON, 192 MCAX86_COMPOUND_MEMHIER_MASKOFF 193 }, 194 195 /* 196 * Compound error codes - bus and interconnect errors 197 */ 198 { 199 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 200 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 201 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 202 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 203 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 204 }, 205 /* 206 * Compound error codes - memory controller errors 207 */ 208 { 209 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 210 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 211 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 212 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 213 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 214 }, 215 }; 216 217 static gcpu_error_disp_t gcpu_unknown = { 218 FM_EREPORT_CPU_GENERIC_UNKNOWN, 219 "UNKNOWN", 220 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 221 0, 222 0 223 }; 224 225 static errorq_t *gcpu_mca_queue; 226 static kmutex_t gcpu_mca_queue_lock; 227 228 #ifdef __xpv 229 static int isxpv = 1; 230 #else 231 static int isxpv = 0; 232 #endif 233 234 static const gcpu_error_disp_t * 235 gcpu_disp_match(uint16_t code) 236 { 237 const gcpu_error_disp_t *ged = gcpu_errtypes; 238 int i; 239 240 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 241 i++, ged++) { 242 uint16_t on = ged->ged_errcode_mask_on; 243 uint16_t off = ged->ged_errcode_mask_off; 244 245 if ((code & on) == on && (code & off) == 0) 246 return (ged); 247 } 248 249 return (NULL); 250 } 251 252 static uint8_t 253 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 254 { 255 return ((uint8_t)(code & mask) >> shift); 256 } 257 258 #define BIT_STRIP(code, name) \ 259 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 260 MCAX86_ERRCODE_##name##_SHIFT) 261 262 #define GCPU_MNEMONIC_UNDEF "undefined" 263 #define GCPU_MNEMONIC_RESVD "reserved" 264 265 /* 266 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 267 * mnemonics and to ereport class name components. 268 */ 269 270 struct gcpu_mnexp { 271 const char *mne_compound; /* used in expanding compound errname */ 272 const char *mne_ereport; /* used in expanding ereport class */ 273 }; 274 275 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 276 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 277 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 278 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 279 { GCPU_MNEMONIC_UNDEF, "" } 280 }; 281 282 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 283 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 284 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 285 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 286 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 287 }; 288 289 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 290 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 291 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 292 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 293 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 294 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 295 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 296 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 297 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 298 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 299 }; 300 301 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 302 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 303 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 304 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 305 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 306 }; 307 308 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 309 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 310 { GCPU_MNEMONIC_RESVD, "" }, 311 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 312 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 313 }; 314 315 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 316 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 317 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 318 }; 319 320 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 321 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 322 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 323 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 324 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 325 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 326 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 327 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 328 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 329 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 330 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 331 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 332 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 333 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 334 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 335 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 336 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 337 }; 338 339 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 340 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 341 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 342 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 343 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 344 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 345 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 346 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 347 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 348 }; 349 350 enum gcpu_mn_namespace { 351 GCPU_MN_NAMESPACE_COMPOUND, 352 GCPU_MN_NAMESPACE_EREPORT 353 }; 354 355 static const char * 356 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val, 357 enum gcpu_mn_namespace nspace) 358 { 359 if (val >= tbl_sz) 360 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 361 362 switch (nspace) { 363 case GCPU_MN_NAMESPACE_COMPOUND: 364 return (tbl[val].mne_compound); 365 /*NOTREACHED*/ 366 367 case GCPU_MN_NAMESPACE_EREPORT: 368 return (tbl[val].mne_ereport); 369 /*NOTREACHED*/ 370 371 default: 372 return (GCPU_MNEMONIC_UNDEF); 373 /*NOTREACHED*/ 374 } 375 } 376 377 /* 378 * The ereport class leaf component is either a simple string with no 379 * format specifiers, or a string with one or more embedded %n$s specifiers - 380 * positional selection for string arguments. The kernel snprintf does 381 * not support %n$ (and teaching it to do so is too big a headache) so 382 * we will expand this restricted format string ourselves. 383 */ 384 385 #define GCPU_CLASS_VARCOMPS 9 386 387 #define GCPU_MNEMONIC(code, name, nspace) \ 388 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 389 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 390 BIT_STRIP(code, name), nspace) 391 392 static void 393 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 394 enum gcpu_mn_namespace nspace) 395 { 396 uint16_t code = MCAX86_ERRCODE(status); 397 const char *mn[GCPU_CLASS_VARCOMPS]; 398 char *p = buf; /* current position in buf */ 399 char *q = buf + buflen; /* pointer past last char in buf */ 400 int which, expfmtchar, error; 401 char c; 402 403 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 404 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 405 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 406 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 407 mn[4] = GCPU_MNEMONIC(code, II, nspace); 408 mn[5] = GCPU_MNEMONIC(code, T, nspace); 409 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 410 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 411 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 412 413 while (p < q - 1 && (c = *fmt++) != '\0') { 414 if (c != '%') { 415 /* not the beginning of a format specifier - copy */ 416 *p++ = c; 417 continue; 418 } 419 420 error = 0; 421 which = -1; 422 expfmtchar = -1; 423 424 nextfmt: 425 if ((c = *fmt++) == '\0') 426 break; /* early termination of fmt specifier */ 427 428 switch (c) { 429 case '1': 430 case '2': 431 case '3': 432 case '4': 433 case '5': 434 case '6': 435 case '7': 436 case '8': 437 case '9': 438 if (which != -1) { /* allow only one positional digit */ 439 error++; 440 break; 441 } 442 which = c - '1'; 443 goto nextfmt; 444 /*NOTREACHED*/ 445 446 case '$': 447 if (which == -1) { /* no position specified */ 448 error++; 449 break; 450 } 451 expfmtchar = 's'; 452 goto nextfmt; 453 /*NOTREACHED*/ 454 455 case 's': 456 if (expfmtchar != 's') { 457 error++; 458 break; 459 } 460 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 461 mn[which]); 462 p += strlen(p); 463 break; 464 465 default: 466 error++; 467 break; 468 } 469 470 if (error) 471 break; 472 } 473 474 *p = '\0'; /* NUL termination */ 475 } 476 477 static void 478 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 479 const char *cpuclass, const char *leafclass) 480 { 481 char *p = buf; /* current position in buf */ 482 char *q = buf + buflen; /* pointer past last char in buf */ 483 484 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 485 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 486 487 p += strlen(p); 488 if (p >= q) 489 return; 490 491 if (leafclass == NULL) { 492 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 493 GCPU_MN_NAMESPACE_EREPORT); 494 } else { 495 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 496 leafclass); 497 } 498 } 499 500 /* 501 * Create an "hc" scheme FMRI identifying the given cpu with 502 * motherboard/chip/core/strand instance numbers. 503 */ 504 static nvlist_t * 505 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 506 { 507 nvlist_t *nvl, *fmri; 508 509 if ((nvl = fm_nvlist_create(nva)) == NULL) 510 return (NULL); 511 512 if (!x86gentopo_legacy) { 513 fmri = cmi_hdl_smb_bboard(hdl); 514 if (fmri == NULL) 515 return (NULL); 516 517 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 518 NULL, NULL, fmri, 3, 519 "chip", cmi_hdl_smb_chipid(hdl), 520 "core", cmi_hdl_coreid(hdl), 521 "strand", cmi_hdl_strandid(hdl)); 522 } else { 523 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 524 "motherboard", 0, 525 "chip", cmi_hdl_chipid(hdl), 526 "core", cmi_hdl_coreid(hdl), 527 "strand", cmi_hdl_strandid(hdl)); 528 } 529 530 return (nvl); 531 } 532 533 int gcpu_bleat_count_thresh = 5; 534 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 535 536 /* 537 * Called when we are unable to propogate a logout structure onto an 538 * errorq for subsequent ereport preparation and logging etc. The caller 539 * should usually only decide to call this for severe errors - those we 540 * suspect we may need to panic for. 541 */ 542 static void 543 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 544 { 545 hrtime_t now = gethrtime_waitfree(); 546 static hrtime_t gcpu_last_bleat; 547 gcpu_bank_logout_t *gbl; 548 static int bleatcount; 549 int i; 550 551 /* 552 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 553 * can come as fast as we like, but once we've spammed that many 554 * to the console we require a minimum interval to pass before 555 * any more complaints. 556 */ 557 if (++bleatcount > gcpu_bleat_count_thresh) { 558 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 559 return; 560 else 561 bleatcount = 0; 562 } 563 gcpu_last_bleat = now; 564 565 cmn_err(CE_WARN, 566 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 567 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 568 cmi_hdl_strandid(hdl)); 569 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 570 (u_longlong_t)gcl->gcl_mcg_status); 571 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 572 uint64_t status = gbl->gbl_status; 573 574 if (!(status & MSR_MC_STATUS_VAL)) 575 continue; 576 577 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 578 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 579 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 580 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 581 i, IA32_MSR_MC(i, STATUS), 582 (u_longlong_t)status, 583 (u_longlong_t)gbl->gbl_addr, 584 (u_longlong_t)gbl->gbl_misc); 585 break; 586 587 case MSR_MC_STATUS_ADDRV: 588 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 589 "STAT 0x%016llx ADDR 0x%016llx", 590 i, IA32_MSR_MC(i, STATUS), 591 (u_longlong_t)status, 592 (u_longlong_t)gbl->gbl_addr); 593 break; 594 595 case MSR_MC_STATUS_MISCV: 596 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 597 "STAT 0x%016llx MISC 0x%016llx", 598 i, IA32_MSR_MC(i, STATUS), 599 (u_longlong_t)status, 600 (u_longlong_t)gbl->gbl_misc); 601 break; 602 603 default: 604 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 605 "STAT 0x%016llx", 606 i, IA32_MSR_MC(i, STATUS), 607 (u_longlong_t)status); 608 break; 609 610 } 611 } 612 } 613 614 #define _GCPU_BSTATUS(status, what) \ 615 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 616 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 617 618 static void 619 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 620 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 621 { 622 uint64_t members = ged ? ged->ged_ereport_members : 623 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 624 uint64_t mcg = gcl->gcl_mcg_status; 625 int mcip = mcg & MCG_STATUS_MCIP; 626 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 627 uint64_t bstat = gbl->gbl_status; 628 629 /* 630 * Include the compound error name if requested and if this 631 * is a compound error type. 632 */ 633 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 634 ged->ged_compound_fmt != NULL) { 635 char buf[FM_MAX_CLASS]; 636 637 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 638 GCPU_MN_NAMESPACE_COMPOUND); 639 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 640 DATA_TYPE_STRING, buf, NULL); 641 } 642 643 /* 644 * Include disposition information for this error 645 */ 646 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 647 gbl->gbl_disp != 0) { 648 int i, empty = 1; 649 char buf[128]; 650 char *p = buf, *q = buf + 128; 651 static struct _gcpu_disp_name { 652 uint64_t dv; 653 const char *dn; 654 } disp_names[] = { 655 { CMI_ERRDISP_CURCTXBAD, 656 "processor_context_corrupt" }, 657 { CMI_ERRDISP_RIPV_INVALID, 658 "return_ip_invalid" }, 659 { CMI_ERRDISP_UC_UNCONSTRAINED, 660 "unconstrained" }, 661 { CMI_ERRDISP_FORCEFATAL, 662 "forcefatal" }, 663 { CMI_ERRDISP_IGNORED, 664 "ignored" }, 665 { CMI_ERRDISP_PCC_CLEARED, 666 "corrupt_context_cleared" }, 667 { CMI_ERRDISP_UC_CLEARED, 668 "uncorrected_data_cleared" }, 669 { CMI_ERRDISP_POISONED, 670 "poisoned" }, 671 { CMI_ERRDISP_INCONSISTENT, 672 "telemetry_unstable" }, 673 }; 674 675 for (i = 0; i < sizeof (disp_names) / 676 sizeof (struct _gcpu_disp_name); i++) { 677 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 678 continue; 679 680 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 681 "%s%s", empty ? "" : ",", disp_names[i].dn); 682 p += strlen(p); 683 empty = 0; 684 } 685 686 if (p != buf) 687 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 688 DATA_TYPE_STRING, buf, NULL); 689 } 690 691 /* 692 * If MCG_STATUS is included add that and an indication of whether 693 * this ereport was the result of a machine check or poll. 694 */ 695 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 696 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 697 DATA_TYPE_UINT64, mcg, NULL); 698 699 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 700 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 701 } 702 703 /* 704 * If an instruction pointer is to be included add one provided 705 * MCG_STATUS indicated it is valid; meaningless for polled events. 706 */ 707 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 708 mcg & MCG_STATUS_EIPV) { 709 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 710 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 711 } 712 713 /* 714 * Add an indication of whether the trap occured during privileged code. 715 */ 716 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 717 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 718 DATA_TYPE_BOOLEAN_VALUE, 719 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 720 } 721 722 /* 723 * If requested, add the index of the MCA bank. This indicates the 724 * n'th bank of 4 MCA registers, and does not necessarily correspond 725 * to MCi_* - use the bank offset to correlate 726 */ 727 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 728 fm_payload_set(ereport, 729 /* Bank number */ 730 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 731 /* Offset of MCi_CTL */ 732 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 733 IA32_MSR_MC(bankno, CTL), 734 NULL); 735 } 736 737 /* 738 * Add MCi_STATUS if requested, and decode it. 739 */ 740 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 741 const char *tbes[] = { 742 "No tracking", /* 00 */ 743 "Green - below threshold", /* 01 */ 744 "Yellow - above threshold", /* 10 */ 745 "Reserved" /* 11 */ 746 }; 747 748 fm_payload_set(ereport, 749 /* Bank MCi_STATUS */ 750 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 751 /* Overflow? */ 752 _GCPU_BSTATUS(bstat, OVER), 753 /* Uncorrected? */ 754 _GCPU_BSTATUS(bstat, UC), 755 /* Enabled? */ 756 _GCPU_BSTATUS(bstat, EN), 757 /* Processor context corrupt? */ 758 _GCPU_BSTATUS(bstat, PCC), 759 /* Error code */ 760 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 761 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 762 /* Model-specific error code */ 763 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 764 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 765 NULL); 766 767 /* 768 * If MCG_CAP.TES_P indicates that that thresholding info 769 * is present in the architural component of the bank status 770 * then include threshold information for this bank. 771 */ 772 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 773 fm_payload_set(ereport, 774 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 775 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 776 NULL); 777 } 778 } 779 780 /* 781 * MCi_ADDR info if requested and valid. 782 */ 783 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 784 bstat & MSR_MC_STATUS_ADDRV) { 785 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 786 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 787 } 788 789 /* 790 * MCi_MISC if requested and MCi_STATUS.MISCV). 791 */ 792 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 793 bstat & MSR_MC_STATUS_MISCV) { 794 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 795 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 796 } 797 798 } 799 800 /* 801 * Construct and post an ereport based on the logout information from a 802 * single MCA bank. We are not necessarily running on the cpu that 803 * detected the error. 804 */ 805 static void 806 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 807 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 808 { 809 gcpu_data_t *gcpu = gcl->gcl_gcpu; 810 cmi_hdl_t hdl = gcpu->gcpu_hdl; 811 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 812 const char *cpuclass = NULL, *leafclass = NULL; 813 uint16_t code = MCAX86_ERRCODE(status); 814 errorq_elem_t *eqep, *scr_eqep; 815 nvlist_t *ereport, *detector; 816 char buf[FM_MAX_CLASS]; 817 const char *classfmt; 818 nv_alloc_t *nva; 819 820 if (panicstr) { 821 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 822 return; 823 ereport = errorq_elem_nvl(ereport_errorq, eqep); 824 825 /* 826 * Allocate another element for scratch space, but fallback 827 * to the one we have if that fails. We'd like to use the 828 * additional scratch space for nvlist construction. 829 */ 830 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 831 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 832 else 833 nva = errorq_elem_nva(ereport_errorq, eqep); 834 } else { 835 ereport = fm_nvlist_create(NULL); 836 nva = NULL; 837 } 838 839 if (ereport == NULL) 840 return; 841 842 /* 843 * Common payload data required by the protocol: 844 * - ereport class 845 * - detector 846 * - ENA 847 */ 848 849 /* 850 * Ereport class - call into model-specific support to allow it to 851 * provide a cpu class or leaf class, otherwise calculate our own. 852 */ 853 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 854 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 855 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 856 leafclass); 857 858 /* 859 * The detector FMRI. 860 */ 861 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 862 nva)) == NULL) 863 detector = gcpu_fmri_create(hdl, nva); 864 865 /* 866 * Should we define a new ENA format 3?? for chip/core/strand? 867 * It will be better when virtualized. 868 */ 869 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 870 fm_ena_generate_cpu(gcl->gcl_timestamp, 871 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 872 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 873 874 if (panicstr) { 875 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 876 nv_alloc_reset(nva); 877 } else { 878 fm_nvlist_destroy(detector, FM_NVA_FREE); 879 } 880 881 /* 882 * Add the architectural ereport class-specific payload data. 883 */ 884 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 885 886 /* 887 * Allow model-specific code to add ereport members. 888 */ 889 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 890 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 891 892 /* 893 * Include stack if options is turned on and either selected in 894 * the payload member bitmask or inclusion is forced. 895 */ 896 if (gcpu_mca_stack_flag && 897 (cms_ereport_includestack(hdl, mscookie) == 898 B_TRUE || gcpu_mca_stack_ereport_include)) { 899 fm_payload_stack_add(ereport, gcl->gcl_stack, 900 gcl->gcl_stackdepth); 901 } 902 903 /* 904 * If injection has taken place anytime in the past then note this 905 * on the ereport. 906 */ 907 if (cmi_inj_tainted() == B_TRUE) { 908 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 909 B_TRUE, NULL); 910 } 911 912 /* 913 * Post ereport. 914 */ 915 if (panicstr) { 916 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 917 if (scr_eqep) 918 errorq_cancel(ereport_errorq, scr_eqep); 919 } else { 920 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 921 fm_nvlist_destroy(ereport, FM_NVA_FREE); 922 } 923 924 } 925 926 /*ARGSUSED*/ 927 void 928 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 929 { 930 const gcpu_logout_t *gcl = data; 931 const gcpu_bank_logout_t *gbl; 932 int i; 933 934 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 935 const gcpu_error_disp_t *gened; 936 cms_cookie_t mscookie; 937 938 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 939 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 940 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 941 942 /* 943 * Perform a match based on IA32 MCA architectural 944 * components alone. 945 */ 946 gened = gcpu_disp_match(code); /* may be NULL */ 947 948 /* 949 * Now see if an model-specific match can be made. 950 */ 951 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 952 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 953 gcl->gcl_ms_logout); 954 955 /* 956 * Prepare and dispatch an ereport for logging and 957 * diagnosis. 958 */ 959 gcpu_ereport_post(gcl, i, gened, mscookie, 960 gbl->gbl_status); 961 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 962 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 963 /* 964 * Telemetry kept changing as we tried to read 965 * it. Force an unknown ereport leafclass but 966 * keep the telemetry unchanged for logging. 967 */ 968 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 969 gbl->gbl_status); 970 } 971 } 972 } 973 974 static size_t gcpu_mca_queue_datasz = 0; 975 976 /* 977 * The following code is ready to make a weak attempt at growing the 978 * errorq structure size. Since it is not foolproof (we don't know 979 * who may already be producing to the outgoing errorq) our caller 980 * instead assures that we'll always be called with no greater data 981 * size than on our first call. 982 */ 983 static void 984 gcpu_errorq_init(size_t datasz) 985 { 986 int slots; 987 988 mutex_enter(&gcpu_mca_queue_lock); 989 990 if (gcpu_mca_queue_datasz >= datasz) { 991 mutex_exit(&gcpu_mca_queue_lock); 992 return; 993 } 994 995 membar_producer(); 996 if (gcpu_mca_queue) { 997 gcpu_mca_queue_datasz = 0; 998 errorq_destroy(gcpu_mca_queue); 999 } 1000 1001 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1002 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1003 1004 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1005 NULL, slots, datasz, 1, ERRORQ_VITAL); 1006 1007 if (gcpu_mca_queue != NULL) 1008 gcpu_mca_queue_datasz = datasz; 1009 1010 mutex_exit(&gcpu_mca_queue_lock); 1011 } 1012 1013 /* 1014 * Perform MCA initialization as described in section 14.6 of Intel 64 1015 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1016 */ 1017 1018 static uint_t global_nbanks; 1019 1020 void 1021 gcpu_mca_init(cmi_hdl_t hdl) 1022 { 1023 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1024 uint64_t cap; 1025 uint_t vendor = cmi_hdl_vendor(hdl); 1026 uint_t family = cmi_hdl_family(hdl); 1027 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1028 int mcg_ctl_present; 1029 uint_t nbanks; 1030 uint32_t ctl_skip_mask = 0; 1031 uint32_t status_skip_mask = 0; 1032 size_t mslsz; 1033 int i; 1034 #ifndef __xpv 1035 int mcg_ctl2_present; 1036 uint32_t cmci_capable = 0; 1037 #endif 1038 1039 if (gcpu == NULL) 1040 return; 1041 1042 /* 1043 * Protect from some silly /etc/system settings. 1044 */ 1045 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1046 gcpu_mca_telemetry_retries = 5; 1047 1048 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1049 return; 1050 1051 /* 1052 * CPU startup code only calls cmi_mca_init if x86_feature indicates 1053 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 1054 * processors, which have their own * more primitive way of doing 1055 * machine checks, will not have cmi_mca_init called since their 1056 * CPUID information will not indicate both MCA and MCE features. 1057 */ 1058 ASSERT(x86_feature & X86_MCA); 1059 1060 /* 1061 * Determine whether the IA32_MCG_CTL register is present. If it 1062 * is we will enable all features by writing -1 to it towards 1063 * the end of this initialization; if it is absent then volume 3A 1064 * says we must nonetheless continue to initialize the individual 1065 * banks. 1066 */ 1067 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1068 #ifndef __xpv 1069 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1070 #endif 1071 1072 /* 1073 * We squirell values away for inspection/debugging. 1074 */ 1075 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1076 if (mcg_ctl_present) 1077 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1078 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1079 1080 /* 1081 * Determine the number of error-reporting banks implemented. 1082 */ 1083 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1084 1085 if (nbanks != 0 && global_nbanks == 0) 1086 global_nbanks = nbanks; /* no race - BSP will get here first */ 1087 1088 /* 1089 * If someone is hiding the number of banks (perhaps we are fully 1090 * virtualized?) or if this processor has more banks than the 1091 * first to set global_nbanks then bail. The latter requirement 1092 * is because we need to size our errorq data structure and we 1093 * don't want to have to grow the errorq (destroy and recreate) 1094 * which may just lose some telemetry. 1095 */ 1096 if (nbanks == 0 || nbanks > global_nbanks) 1097 return; 1098 1099 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1100 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1101 1102 /* 1103 * Calculate the size we need to allocate for a gcpu_logout_t 1104 * with a gcl_data array big enough for all banks of this cpu. 1105 * Add any space requested by the model-specific logout support. 1106 */ 1107 mslsz = cms_logout_size(hdl); 1108 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1109 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1110 1111 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1112 gcpu_logout_t *gcl; 1113 1114 mca->gcpu_mca_logout[i] = gcl = 1115 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1116 gcl->gcl_gcpu = gcpu; 1117 gcl->gcl_nbanks = nbanks; 1118 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1119 (char *)(&gcl->gcl_data[0]) + nbanks * 1120 sizeof (gcpu_bank_logout_t); 1121 1122 } 1123 1124 #ifdef __xpv 1125 gcpu_xpv_mca_init(nbanks); 1126 #endif 1127 1128 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1129 1130 #ifndef __xpv 1131 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1132 KM_SLEEP); 1133 #endif 1134 1135 /* 1136 * Create our errorq to transport the logout structures. This 1137 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1138 */ 1139 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1140 1141 /* 1142 * Not knowing which, if any, banks are shared between cores we 1143 * assure serialization of MCA bank initialization by each cpu 1144 * on the chip. On chip architectures in which some banks are 1145 * shared this will mean the shared resource is initialized more 1146 * than once - we're simply aiming to avoid simultaneous MSR writes 1147 * to the shared resource. 1148 * 1149 * Even with these precautions, some platforms may yield a GP fault 1150 * if a core other than a designated master tries to write anything 1151 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1152 * those writes under on_trap protection. 1153 */ 1154 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1155 1156 /* 1157 * Initialize poller data, but don't start polling yet. 1158 */ 1159 gcpu_mca_poll_init(hdl); 1160 1161 /* 1162 * Work out which MCA banks we will initialize. In MCA logout 1163 * code we will only read those banks which we initialize here. 1164 */ 1165 for (i = 0; i < nbanks; i++) { 1166 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1167 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1168 1169 if (!cms_present(hdl)) { 1170 /* 1171 * Model-specific support is not present, try to use 1172 * sane defaults. 1173 * 1174 * On AMD family 6 processors, reports about spurious 1175 * machine checks indicate that bank 0 should be 1176 * skipped. 1177 * 1178 * On Intel family 6 processors, the documentation tells 1179 * us not to write to MC0_CTL. 1180 * 1181 */ 1182 if (i == 0 && family == 6) { 1183 switch (vendor) { 1184 case X86_VENDOR_AMD: 1185 skipstatus = B_TRUE; 1186 /*FALLTHRU*/ 1187 case X86_VENDOR_Intel: 1188 skipctl = B_TRUE; 1189 break; 1190 } 1191 } 1192 } 1193 1194 ctl_skip_mask |= skipctl << i; 1195 status_skip_mask |= skipstatus << i; 1196 1197 if (skipctl && skipstatus) 1198 continue; 1199 1200 /* 1201 * Record which MCA banks were enabled, from the point of view 1202 * of the whole chip (if some cores share a bank we must be 1203 * sure either can logout from it). 1204 */ 1205 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1206 1207 #ifndef __xpv 1208 /* 1209 * check CMCI capability 1210 */ 1211 if (mcg_ctl2_present) { 1212 uint64_t ctl2; 1213 uint32_t cap = 0; 1214 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1215 if (ctl2 & MSR_MC_CTL2_EN) 1216 continue; 1217 ctl2 |= MSR_MC_CTL2_EN; 1218 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1219 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1220 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1221 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1222 if (cap) 1223 cmci_capable ++; 1224 /* 1225 * Set threshold to 1 while unset the en field, to avoid 1226 * CMCI trigged before APIC LVT entry init. 1227 */ 1228 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1229 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1230 1231 /* 1232 * init cmci related count 1233 */ 1234 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1235 mca->gcpu_bank_cmci[i].drtcmci = 0; 1236 mca->gcpu_bank_cmci[i].ncmci = 0; 1237 } 1238 #endif 1239 } 1240 1241 #ifndef __xpv 1242 if (cmci_capable) 1243 cmi_enable_cmci = 1; 1244 #endif 1245 1246 #ifndef __xpv 1247 /* 1248 * Log any valid telemetry lurking in the MCA banks, but do not 1249 * clear the status registers. Ignore the disposition returned - 1250 * we have already paniced or reset for any nasty errors found here. 1251 * 1252 * Intel vol 3A says that we should not do this on family 0x6, 1253 * and that for any extended family the BIOS clears things 1254 * on power-on reset so you'll only potentially find valid telemetry 1255 * on warm reset (we do it for both - on power-on reset we should 1256 * just see zeroes). 1257 * 1258 * AMD docs since K7 say we should process anything we find here. 1259 */ 1260 if (!gcpu_suppress_log_on_init && 1261 (vendor == X86_VENDOR_Intel && family >= 0xf || 1262 vendor == X86_VENDOR_AMD)) 1263 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1264 GCPU_MPT_WHAT_POKE_ERR); 1265 1266 /* 1267 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1268 * model-specific module the power of veto. 1269 */ 1270 for (i = 0; i < nbanks; i++) { 1271 struct gcpu_bios_bankcfg *bcfgp = 1272 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1273 1274 /* 1275 * Stash inherited bank MCA state, even for banks we will 1276 * not initialize ourselves. Do not read the MISC register 1277 * unconditionally - on some processors that will #GP on 1278 * banks that do not implement the MISC register (would be 1279 * caught by on_trap, anyway). 1280 */ 1281 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1282 &bcfgp->bios_bank_ctl); 1283 1284 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1285 &bcfgp->bios_bank_status); 1286 1287 if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) 1288 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1289 &bcfgp->bios_bank_addr); 1290 1291 /* 1292 * In some old BIOS the status value after boot can indicate 1293 * MISCV when there is actually no MISC register for 1294 * that bank. The following read could therefore 1295 * aggravate a general protection fault. This should be 1296 * caught by on_trap, but the #GP fault handler is busted 1297 * and can suffer a double fault even before we get to 1298 * trap() to check for on_trap protection. Until that 1299 * issue is fixed we remove the one access that we know 1300 * can cause a #GP. 1301 * 1302 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1303 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1304 * &bcfgp->bios_bank_misc); 1305 */ 1306 bcfgp->bios_bank_misc = 0; 1307 1308 if (!(ctl_skip_mask & (1 << i))) { 1309 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1310 cms_bankctl_val(hdl, i, -1ULL)); 1311 } 1312 1313 if (!(status_skip_mask & (1 << i))) { 1314 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1315 cms_bankstatus_val(hdl, i, 0ULL)); 1316 } 1317 } 1318 #endif 1319 /* 1320 * Now let the model-specific support perform further initialization 1321 * of non-architectural features. 1322 */ 1323 cms_mca_init(hdl, nbanks); 1324 1325 #ifndef __xpv 1326 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1327 membar_producer(); 1328 1329 /* enable all machine-check features */ 1330 if (mcg_ctl_present) 1331 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1332 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1333 #endif 1334 1335 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1336 1337 #ifndef __xpv 1338 /* enable machine-check exception in CR4 */ 1339 cmi_hdl_enable_mce(hdl); 1340 #endif 1341 } 1342 1343 static uint64_t 1344 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1345 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1346 { 1347 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1348 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1349 int nbanks = mca->gcpu_mca_nbanks; 1350 gcpu_mce_status_t mce; 1351 gcpu_bank_logout_t *gbl; 1352 uint64_t disp = 0; 1353 int i; 1354 1355 if (mcesp == NULL) 1356 mcesp = &mce; 1357 1358 mcesp->mce_nerr = nerr; 1359 1360 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1361 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1362 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1363 1364 /* 1365 * If this a machine check then if the return instruction pointer 1366 * is not valid the current context is lost. 1367 */ 1368 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1369 disp |= CMI_ERRDISP_RIPV_INVALID; 1370 1371 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1372 uint64_t mcistatus = gbl->gbl_status; 1373 uint32_t ms_scope; 1374 int pcc, uc; 1375 int poisoned; 1376 1377 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1378 continue; 1379 1380 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1381 continue; 1382 1383 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1384 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1385 mcesp->mce_npcc += pcc; 1386 mcesp->mce_nuc += uc; 1387 1388 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1389 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1390 1391 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1392 pcc = 0; 1393 mcesp->mce_npcc_ok++; 1394 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1395 } 1396 1397 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1398 uc = 0; 1399 mcesp->mce_nuc_ok++; 1400 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1401 } 1402 1403 if (uc) { 1404 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1405 if (poisoned) { 1406 mcesp->mce_nuc_poisoned++; 1407 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1408 } 1409 } 1410 1411 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1412 /* 1413 * We're not being instructed to ignore the error, 1414 * so apply our standard disposition logic to it. 1415 */ 1416 if (uc && !poisoned) { 1417 unconstrained++; 1418 gbl->gbl_disp |= disp | 1419 CMI_ERRDISP_UC_UNCONSTRAINED; 1420 } 1421 1422 if (pcc && ismc) { 1423 curctxbad++; 1424 gbl->gbl_disp |= disp | 1425 CMI_ERRDISP_CURCTXBAD; 1426 } 1427 1428 /* 1429 * Even if the above may not indicate that the error 1430 * is terminal, model-specific support may insist 1431 * that we treat it as such. Such errors wil be 1432 * fatal even if discovered via poll. 1433 */ 1434 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1435 forcefatal++; 1436 mcesp->mce_forcefatal++; 1437 gbl->gbl_disp |= disp | 1438 CMI_ERRDISP_FORCEFATAL; 1439 } 1440 } else { 1441 mcesp->mce_ignored++; 1442 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1443 } 1444 } 1445 1446 if (unconstrained > 0) 1447 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1448 1449 if (curctxbad > 0) 1450 disp |= CMI_ERRDISP_CURCTXBAD; 1451 1452 if (forcefatal > 0) 1453 disp |= CMI_ERRDISP_FORCEFATAL; 1454 1455 if (gcpu_mca_queue != NULL) { 1456 int how; 1457 1458 if (ismc) { 1459 how = cmi_mce_response(rp, disp) ? 1460 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1461 ERRORQ_SYNC; /* panic flow will drain */ 1462 } else { 1463 how = (disp & CMI_ERRDISP_FORCEFATAL && 1464 cmi_panic_on_ue()) ? 1465 ERRORQ_SYNC : /* poller will panic */ 1466 ERRORQ_ASYNC; /* no panic */ 1467 } 1468 1469 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1470 } else if (disp != 0) { 1471 gcpu_bleat(hdl, gcl); 1472 } 1473 1474 mcesp->mce_disp = disp; 1475 1476 return (disp); 1477 } 1478 1479 /* 1480 * Gather error telemetry from our source, and then submit it for 1481 * processing. 1482 */ 1483 1484 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1485 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1486 1487 #define STATUS_EQV(s1, s2) \ 1488 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1489 1490 static uint32_t gcpu_deferrred_polled_clears; 1491 1492 #ifndef __xpv 1493 static void 1494 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1495 uint64_t status, int what) 1496 { 1497 uint64_t ctl2; 1498 1499 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1500 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1501 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1502 1503 if (!(bank_cmci_p->cmci_enabled)) { 1504 /* 1505 * when cmci is disabled, and the bank has no error or 1506 * no corrected error for 1507 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1508 * turn on this bank's cmci. 1509 */ 1510 1511 bank_cmci_p->drtcmci ++; 1512 1513 if (bank_cmci_p->drtcmci >= 1514 gcpu_mca_cmci_reenable_threshold) { 1515 1516 /* turn on cmci */ 1517 1518 (void) cmi_hdl_rdmsr(hdl, 1519 IA32_MSR_MC_CTL2(bank), &ctl2); 1520 ctl2 |= MSR_MC_CTL2_EN; 1521 (void) cmi_hdl_wrmsr(hdl, 1522 IA32_MSR_MC_CTL2(bank), ctl2); 1523 1524 /* reset counter and set flag */ 1525 bank_cmci_p->drtcmci = 0; 1526 bank_cmci_p->cmci_enabled = 1; 1527 } 1528 } else { 1529 /* 1530 * when cmci is enabled,if is in cyclic poll and the 1531 * bank has no error or no corrected error, reset ncmci 1532 * counter 1533 */ 1534 bank_cmci_p->ncmci = 0; 1535 } 1536 } 1537 } 1538 1539 static void 1540 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1541 int what) 1542 { 1543 uint64_t ctl2 = 0; 1544 1545 /* 1546 * if cmci of this bank occurred beyond 1547 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1548 * turn off this bank's CMCI; 1549 */ 1550 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1551 1552 /* if it is cmci trap, increase the count */ 1553 bank_cmci_p->ncmci++; 1554 1555 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1556 1557 /* turn off cmci */ 1558 1559 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1560 &ctl2); 1561 ctl2 &= ~MSR_MC_CTL2_EN; 1562 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1563 ctl2); 1564 1565 /* clear the flag and count */ 1566 1567 bank_cmci_p->cmci_enabled = 0; 1568 bank_cmci_p->ncmci = 0; 1569 } 1570 } 1571 } 1572 #endif 1573 1574 static void 1575 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1576 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1577 { 1578 int i; 1579 gcpu_bank_logout_t *gbl, *pgbl; 1580 uint64_t status; 1581 1582 if (first < 0 || last < 0) 1583 return; 1584 1585 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1586 status = gbl->gbl_status; 1587 if (status == 0) 1588 continue; 1589 if (clrstatus == B_FALSE) 1590 goto serialize; 1591 1592 /* 1593 * For i86xpv we always clear status in order to invalidate 1594 * the interposed telemetry. 1595 * 1596 * For native machine checks we always clear status here. For 1597 * native polls we must be a little more cautious since there 1598 * is an outside chance that we may clear telemetry from a 1599 * shared MCA bank on which a sibling core is machine checking. 1600 * 1601 * For polled observations of errors that look like they may 1602 * produce a machine check (UC/PCC and ENabled, although these 1603 * do not guarantee a machine check on error occurence) 1604 * we will not clear the status at this wakeup unless 1605 * we saw the same status at the previous poll. We will 1606 * always process and log the current observations - it 1607 * is only the clearing of MCi_STATUS which may be 1608 * deferred until the next wakeup. 1609 */ 1610 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1611 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1612 goto serialize; 1613 } 1614 1615 /* 1616 * We have a polled observation of a machine check 1617 * candidate. If we saw essentially the same status at the 1618 * last poll then clear the status now since this appears 1619 * not to be a #MC candidate after all. If we see quite 1620 * different status now then do not clear, but reconsider at 1621 * the next poll. In no actual machine check clears 1622 * the status in the interim then the status should not 1623 * keep changing forever (meaning we'd never clear it) 1624 * since before long we'll simply have latched the highest- 1625 * priority error and set the OVerflow bit. Nonetheless 1626 * we count how many times we defer clearing and after 1627 * a while insist on clearing the status. 1628 */ 1629 pgbl = &pgcl->gcl_data[i]; 1630 if (pgbl->gbl_clrdefcnt != 0) { 1631 /* We deferred clear on this bank at last wakeup */ 1632 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1633 pgbl->gbl_clrdefcnt > 5) { 1634 /* 1635 * Status is unchanged so clear it now and, 1636 * since we have already logged this info, 1637 * avoid logging it again. 1638 */ 1639 gbl->gbl_status = 0; 1640 (void) cmi_hdl_wrmsr(hdl, 1641 IA32_MSR_MC(i, STATUS), 0ULL); 1642 } else { 1643 /* Record deferral for next wakeup */ 1644 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1645 } 1646 } else { 1647 /* Record initial deferral for next wakeup */ 1648 gbl->gbl_clrdefcnt = 1; 1649 gcpu_deferrred_polled_clears++; 1650 } 1651 1652 serialize: 1653 { 1654 #ifdef __xpv 1655 ; 1656 #else 1657 /* 1658 * Intel Vol 3A says to execute a serializing 1659 * instruction here, ie CPUID. Well WRMSR is also 1660 * defined to be serializing, so the status clear above 1661 * should suffice. To be a good citizen, and since 1662 * some clears are deferred, we'll execute a CPUID 1663 * instruction here. 1664 */ 1665 struct cpuid_regs tmp; 1666 (void) __cpuid_insn(&tmp); 1667 #endif 1668 } 1669 } 1670 } 1671 1672 /*ARGSUSED5*/ 1673 void 1674 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1675 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1676 { 1677 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1678 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1679 int nbanks = mca->gcpu_mca_nbanks; 1680 gcpu_bank_logout_t *gbl, *pgbl; 1681 gcpu_logout_t *gcl, *pgcl; 1682 int ismc = (rp != NULL); 1683 int ispoll = !ismc; 1684 int i, nerr = 0; 1685 cmi_errno_t err; 1686 uint64_t mcg_status; 1687 uint64_t disp; 1688 uint64_t cap; 1689 int first = -1; 1690 int last = -1; 1691 int willpanic = 0; 1692 1693 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1694 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1695 CMI_SUCCESS) { 1696 if (mcesp != NULL) 1697 mcesp->mce_nerr = mcesp->mce_disp = 0; 1698 return; 1699 } 1700 1701 if (ismc) { 1702 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1703 } else { 1704 int pidx = mca->gcpu_mca_nextpoll_idx; 1705 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1706 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1707 1708 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1709 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1710 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1711 } 1712 1713 gcl->gcl_timestamp = gethrtime_waitfree(); 1714 gcl->gcl_mcg_status = mcg_status; 1715 gcl->gcl_ip = rp ? rp->r_pc : 0; 1716 1717 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1718 if (cap & MCG_CAP_TES_P) 1719 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1720 1721 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1722 uint64_t status, status2, addr, misc; 1723 int retries = gcpu_mca_telemetry_retries; 1724 1725 gbl->gbl_status = 0; 1726 gbl->gbl_disp = 0; 1727 gbl->gbl_clrdefcnt = 0; 1728 1729 /* 1730 * Only logout from MCA banks we have initialized from at 1731 * least one core. If a core shares an MCA bank with another 1732 * but perhaps lost the race to initialize it, then it must 1733 * still be allowed to logout from the shared bank. 1734 */ 1735 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1736 continue; 1737 1738 /* 1739 * On a poll look only at the banks we've been asked to check. 1740 */ 1741 if (rp == NULL && !(bankmask & 1 << i)) 1742 continue; 1743 1744 1745 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1746 CMI_SUCCESS) 1747 continue; 1748 1749 #ifndef __xpv 1750 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1751 #endif 1752 1753 retry: 1754 if (!(status & MSR_MC_STATUS_VAL)) 1755 continue; 1756 1757 /* First and last bank that have valid status */ 1758 if (first < 0) 1759 first = i; 1760 last = i; 1761 1762 addr = -1; 1763 misc = 0; 1764 1765 if (status & MSR_MC_STATUS_ADDRV) 1766 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1767 1768 if (status & MSR_MC_STATUS_MISCV) 1769 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1770 1771 #ifndef __xpv 1772 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1773 #endif 1774 1775 /* 1776 * Allow the model-specific code to extract bank telemetry. 1777 */ 1778 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1779 1780 /* 1781 * Not all cpu models assure us that the status/address/misc 1782 * data will not change during the above sequence of MSR reads, 1783 * or that it can only change by the addition of the OVerflow 1784 * bit to the status register. If the status has changed 1785 * other than in the overflow bit then we attempt to reread 1786 * for a consistent snapshot, but eventually give up and 1787 * go with what we've got. We only perform this check 1788 * for a poll - a further #MC during a #MC will reset, and 1789 * polled errors should not overwrite higher-priority 1790 * trapping errors (but could set the overflow bit). 1791 */ 1792 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1793 &status2)) == CMI_SUCCESS) { 1794 if (!STATUS_EQV(status, status2)) { 1795 if (retries-- > 0) { 1796 status = status2; 1797 goto retry; 1798 } else { 1799 gbl->gbl_disp |= 1800 CMI_ERRDISP_INCONSISTENT; 1801 } 1802 } 1803 } else if (ispoll && err != CMI_SUCCESS) { 1804 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1805 } 1806 1807 nerr++; 1808 gbl->gbl_status = status; 1809 gbl->gbl_addr = addr; 1810 gbl->gbl_misc = misc; 1811 1812 /* 1813 * For polled observation, if the count of deferred status 1814 * clears updated in the clear_mc() is nonzero and the 1815 * MCi_STATUS has not changed, the last wakeup has produced 1816 * the ereport of the error. Therefore, clear the status in 1817 * this wakeup to avoid duplicate ereport. 1818 */ 1819 pgbl = &pgcl->gcl_data[i]; 1820 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1821 pgbl->gbl_clrdefcnt != 0) { 1822 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1823 gbl->gbl_status = 0; 1824 (void) cmi_hdl_wrmsr(hdl, 1825 IA32_MSR_MC(i, STATUS), 0ULL); 1826 } 1827 } 1828 } 1829 1830 if (gcpu_mca_stack_flag) 1831 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1832 else 1833 gcl->gcl_stackdepth = 0; 1834 1835 /* 1836 * Decide our disposition for this error or errors, and submit for 1837 * logging and subsequent diagnosis. 1838 */ 1839 if (nerr != 0) { 1840 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1841 1842 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1843 1844 if (!willpanic) 1845 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1846 } else { 1847 disp = 0; 1848 if (mcesp) { 1849 mcesp->mce_nerr = mcesp->mce_disp = 0; 1850 } 1851 } 1852 1853 /* 1854 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1855 * If a second #MC had occured before now the system would have 1856 * reset. We can only do thise once gcpu_mca_process has copied 1857 * the logout structure. 1858 */ 1859 if (ismc && mcg_status & MCG_STATUS_MCIP) 1860 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1861 1862 /* 1863 * At this point we have read and logged all telemetry that is visible 1864 * under the MCA. On architectures for which the NorthBridge is 1865 * on-chip this may include NB-observed errors, but where the NB 1866 * is off chip it may have been the source of the #MC request and 1867 * so we must call into the memory-controller driver to give it 1868 * a chance to log errors. 1869 */ 1870 if (ismc) { 1871 cmi_mc_logout(hdl, 1, willpanic); 1872 } 1873 } 1874 1875 #ifndef __xpv 1876 int gcpu_mca_trap_vomit_summary = 0; 1877 1878 /* 1879 * On a native machine check exception we come here from mcetrap via 1880 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1881 * cpus of the chip, so it is possible that another cpu on this chip could 1882 * initiate a poll while we're in the #mc handler; it is also possible that 1883 * this trap has occured during a poll on this cpu. So we must acquire 1884 * the chip-wide poll lock, but be careful to avoid deadlock. 1885 * 1886 * The 'data' pointer cannot be NULL due to init order. 1887 */ 1888 uint64_t 1889 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1890 { 1891 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1892 kmutex_t *poll_lock = NULL; 1893 gcpu_mce_status_t mce; 1894 uint64_t mcg_status; 1895 int tooklock = 0; 1896 1897 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1898 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1899 return (0); 1900 1901 /* 1902 * Synchronize with any poller from another core that may happen 1903 * to share access to one or more of the MCA banks. 1904 */ 1905 if (gcpu->gcpu_shared != NULL) 1906 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1907 1908 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1909 /* 1910 * The lock is not owned by the thread we have 1911 * interrupted. Spin for this adaptive lock. 1912 */ 1913 while (!mutex_tryenter(poll_lock)) { 1914 while (mutex_owner(poll_lock) != NULL) 1915 ; 1916 } 1917 tooklock = 1; 1918 } 1919 1920 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 1921 1922 if (tooklock) 1923 mutex_exit(poll_lock); 1924 1925 /* 1926 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1927 */ 1928 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1929 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1930 "%u PCC (%u ok), " 1931 "%u UC (%d ok, %u poisoned), " 1932 "%u forcefatal, %u ignored", 1933 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1934 mce.mce_npcc, mce.mce_npcc_ok, 1935 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1936 mce.mce_forcefatal, mce.mce_ignored); 1937 } 1938 1939 return (mce.mce_disp); 1940 } 1941 #endif 1942 1943 /*ARGSUSED*/ 1944 void 1945 gcpu_faulted_enter(cmi_hdl_t hdl) 1946 { 1947 /* Nothing to do here */ 1948 } 1949 1950 /*ARGSUSED*/ 1951 void 1952 gcpu_faulted_exit(cmi_hdl_t hdl) 1953 { 1954 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1955 1956 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1957 } 1958 1959 /* 1960 * Write the requested values to the indicated MSRs. Having no knowledge 1961 * of the model-specific requirements for writing to these model-specific 1962 * registers, we will only blindly write to those MSRs if the 'force' 1963 * argument is nonzero. That option should only be used in prototyping 1964 * and debugging. 1965 */ 1966 /*ARGSUSED*/ 1967 cmi_errno_t 1968 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1969 int force) 1970 { 1971 int i, errs = 0; 1972 1973 for (i = 0; i < nregs; i++) { 1974 uint_t msr = regs[i].cmr_msrnum; 1975 uint64_t val = regs[i].cmr_msrval; 1976 1977 if (cms_present(hdl)) { 1978 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1979 errs++; 1980 } else if (force) { 1981 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 1982 } else { 1983 errs++; 1984 } 1985 } 1986 1987 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 1988 } 1989