1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/mca_x86.h> 28 #include <sys/cpu_module_impl.h> 29 #include <sys/cpu_module_ms.h> 30 #include <sys/cmn_err.h> 31 #include <sys/cpuvar.h> 32 #include <sys/pghw.h> 33 #include <sys/x86_archext.h> 34 #include <sys/sysmacros.h> 35 #include <sys/regset.h> 36 #include <sys/privregs.h> 37 #include <sys/systm.h> 38 #include <sys/types.h> 39 #include <sys/log.h> 40 #include <sys/psw.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/fm/util.h> 43 #include <sys/errorq.h> 44 #include <sys/mca_x86.h> 45 #include <sys/fm/cpu/GMCA.h> 46 #include <sys/sysevent.h> 47 #include <sys/ontrap.h> 48 49 #include "gcpu.h" 50 51 /* 52 * Clear to log telemetry found at initialization. While processor docs 53 * say you should process this telemetry on all but Intel family 0x6 54 * there are way too many exceptions and we want to avoid bogus 55 * diagnoses. 56 */ 57 int gcpu_suppress_log_on_init = 1; 58 59 /* 60 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 61 * error logout time. 
/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
/*
 * CMCI tunables, only built when __xpv is not defined.
 * NOTE(review): the code consuming these is not in this part of the file;
 * confirm their units before changing the defaults.
 */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
#endif

/*
 * Table of architectural error dispositions, searched in order by
 * gcpu_disp_match(); the first entry whose masks match wins, so the
 * ordering of entries is significant.
 *
 * Each initializer supplies, in order:
 *   - the ereport class leaf, or a %n$s format from which it is expanded
 *   - the compound error name format, or NULL if there is none
 *   - the set of ereport payload members to include
 *   - the MCi_STATUS error-code bits that must all be set ("mask on")
 *   - the MCi_STATUS error-code bits that must all be clear ("mask off")
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal parity error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
	},


	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
	/*
	 * Compound error codes - memory controller errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
	},
};

/*
 * Disposition used when telemetry could not be read consistently:
 * gcpu_mca_drain() passes this entry to force an "unknown" ereport class.
 * Both masks are zero; this entry is not part of gcpu_errtypes and so is
 * never returned by gcpu_disp_match().
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

/*
 * Errorq used to transport logout structures, and the lock taken by
 * gcpu_errorq_init() while (re)creating it.  The queue pointer may be NULL
 * if creation failed.
 */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

/* isxpv is 1 when built with __xpv defined, 0 otherwise. */
#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif
= 0; 229 #endif 230 231 static const gcpu_error_disp_t * 232 gcpu_disp_match(uint16_t code) 233 { 234 const gcpu_error_disp_t *ged = gcpu_errtypes; 235 int i; 236 237 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 238 i++, ged++) { 239 uint16_t on = ged->ged_errcode_mask_on; 240 uint16_t off = ged->ged_errcode_mask_off; 241 242 if ((code & on) == on && (code & off) == 0) 243 return (ged); 244 } 245 246 return (NULL); 247 } 248 249 static uint8_t 250 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 251 { 252 return ((uint8_t)(code & mask) >> shift); 253 } 254 255 #define BIT_STRIP(code, name) \ 256 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 257 MCAX86_ERRCODE_##name##_SHIFT) 258 259 #define GCPU_MNEMONIC_UNDEF "undefined" 260 #define GCPU_MNEMONIC_RESVD "reserved" 261 262 /* 263 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 264 * mnemonics and to ereport class name components. 265 */ 266 267 struct gcpu_mnexp { 268 const char *mne_compound; /* used in expanding compound errname */ 269 const char *mne_ereport; /* used in expanding ereport class */ 270 }; 271 272 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 273 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 274 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 275 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 276 { GCPU_MNEMONIC_UNDEF, "" } 277 }; 278 279 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 280 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 281 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 282 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 283 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 284 }; 285 286 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 287 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 288 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 289 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 290 { "DRD", 
FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 291 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 292 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 293 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 294 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 295 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 296 }; 297 298 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 299 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 300 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 301 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 302 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 303 }; 304 305 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 306 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 307 { GCPU_MNEMONIC_RESVD, "" }, 308 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 309 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 310 }; 311 312 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 313 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 314 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 315 }; 316 317 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 318 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 319 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 320 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 321 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 322 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 323 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 324 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 325 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 326 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 327 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 328 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 329 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 330 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 331 { "CH13", 
FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 332 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 333 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 334 }; 335 336 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 337 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 338 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 339 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 340 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 341 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 342 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 343 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 344 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 345 }; 346 347 enum gcpu_mn_namespace { 348 GCPU_MN_NAMESPACE_COMPOUND, 349 GCPU_MN_NAMESPACE_EREPORT 350 }; 351 352 static const char * 353 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val, 354 enum gcpu_mn_namespace nspace) 355 { 356 if (val >= tbl_sz) 357 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 358 359 switch (nspace) { 360 case GCPU_MN_NAMESPACE_COMPOUND: 361 return (tbl[val].mne_compound); 362 /*NOTREACHED*/ 363 364 case GCPU_MN_NAMESPACE_EREPORT: 365 return (tbl[val].mne_ereport); 366 /*NOTREACHED*/ 367 368 default: 369 return (GCPU_MNEMONIC_UNDEF); 370 /*NOTREACHED*/ 371 } 372 } 373 374 /* 375 * The ereport class leaf component is either a simple string with no 376 * format specifiers, or a string with one or more embedded %n$s specifiers - 377 * positional selection for string arguments. The kernel snprintf does 378 * not support %n$ (and teaching it to do so is too big a headache) so 379 * we will expand this restricted format string ourselves. 
380 */ 381 382 #define GCPU_CLASS_VARCOMPS 9 383 384 #define GCPU_MNEMONIC(code, name, nspace) \ 385 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 386 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 387 BIT_STRIP(code, name), nspace) 388 389 static void 390 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 391 enum gcpu_mn_namespace nspace) 392 { 393 uint16_t code = MCAX86_ERRCODE(status); 394 const char *mn[GCPU_CLASS_VARCOMPS]; 395 char *p = buf; /* current position in buf */ 396 char *q = buf + buflen; /* pointer past last char in buf */ 397 int which, expfmtchar, error; 398 char c; 399 400 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 401 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 402 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 403 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 404 mn[4] = GCPU_MNEMONIC(code, II, nspace); 405 mn[5] = GCPU_MNEMONIC(code, T, nspace); 406 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 407 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 408 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 409 410 while (p < q - 1 && (c = *fmt++) != '\0') { 411 if (c != '%') { 412 /* not the beginning of a format specifier - copy */ 413 *p++ = c; 414 continue; 415 } 416 417 error = 0; 418 which = -1; 419 expfmtchar = -1; 420 421 nextfmt: 422 if ((c = *fmt++) == '\0') 423 break; /* early termination of fmt specifier */ 424 425 switch (c) { 426 case '1': 427 case '2': 428 case '3': 429 case '4': 430 case '5': 431 case '6': 432 case '7': 433 case '8': 434 case '9': 435 if (which != -1) { /* allow only one positional digit */ 436 error++; 437 break; 438 } 439 which = c - '1'; 440 goto nextfmt; 441 /*NOTREACHED*/ 442 443 case '$': 444 if (which == -1) { /* no position specified */ 445 error++; 446 break; 447 } 448 expfmtchar = 's'; 449 goto nextfmt; 450 /*NOTREACHED*/ 451 452 case 's': 453 if (expfmtchar != 's') { 454 error++; 455 break; 456 } 457 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 458 mn[which]); 459 p += strlen(p); 460 
break; 461 462 default: 463 error++; 464 break; 465 } 466 467 if (error) 468 break; 469 } 470 471 *p = '\0'; /* NUL termination */ 472 } 473 474 static void 475 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 476 const char *cpuclass, const char *leafclass) 477 { 478 char *p = buf; /* current position in buf */ 479 char *q = buf + buflen; /* pointer past last char in buf */ 480 481 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 482 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 483 484 p += strlen(p); 485 if (p >= q) 486 return; 487 488 if (leafclass == NULL) { 489 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 490 GCPU_MN_NAMESPACE_EREPORT); 491 } else { 492 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 493 leafclass); 494 } 495 } 496 497 /* 498 * Create an "hc" scheme FMRI identifying the given cpu with 499 * motherboard/chip/core/strand instance numbers. 500 */ 501 static nvlist_t * 502 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 503 { 504 nvlist_t *nvl; 505 506 if ((nvl = fm_nvlist_create(nva)) == NULL) 507 return (NULL); 508 509 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 510 "motherboard", 0, 511 "chip", cmi_hdl_chipid(hdl), 512 "core", cmi_hdl_coreid(hdl), 513 "strand", cmi_hdl_strandid(hdl)); 514 515 return (nvl); 516 } 517 518 int gcpu_bleat_count_thresh = 5; 519 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 520 521 /* 522 * Called when we are unable to propogate a logout structure onto an 523 * errorq for subsequent ereport preparation and logging etc. The caller 524 * should usually only decide to call this for severe errors - those we 525 * suspect we may need to panic for. 526 */ 527 static void 528 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 529 { 530 hrtime_t now = gethrtime_waitfree(); 531 static hrtime_t gcpu_last_bleat; 532 gcpu_bank_logout_t *gbl; 533 static int bleatcount; 534 int i; 535 536 /* 537 * Throttle spamming of the console. 
The first gcpu_bleat_count_thresh 538 * can come as fast as we like, but once we've spammed that many 539 * to the console we require a minimum interval to pass before 540 * any more complaints. 541 */ 542 if (++bleatcount > gcpu_bleat_count_thresh) { 543 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 544 return; 545 else 546 bleatcount = 0; 547 } 548 gcpu_last_bleat = now; 549 550 cmn_err(CE_WARN, 551 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 552 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 553 cmi_hdl_strandid(hdl)); 554 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 555 (u_longlong_t)gcl->gcl_mcg_status); 556 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 557 uint64_t status = gbl->gbl_status; 558 559 if (!(status & MSR_MC_STATUS_VAL)) 560 continue; 561 562 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 563 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 564 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 565 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 566 i, IA32_MSR_MC(i, STATUS), 567 (u_longlong_t)status, 568 (u_longlong_t)gbl->gbl_addr, 569 (u_longlong_t)gbl->gbl_misc); 570 break; 571 572 case MSR_MC_STATUS_ADDRV: 573 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 574 "STAT 0x%016llx ADDR 0x%016llx", 575 i, IA32_MSR_MC(i, STATUS), 576 (u_longlong_t)status, 577 (u_longlong_t)gbl->gbl_addr); 578 break; 579 580 case MSR_MC_STATUS_MISCV: 581 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 582 "STAT 0x%016llx MISC 0x%016llx", 583 i, IA32_MSR_MC(i, STATUS), 584 (u_longlong_t)status, 585 (u_longlong_t)gbl->gbl_misc); 586 break; 587 588 default: 589 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 590 "STAT 0x%016llx", 591 i, IA32_MSR_MC(i, STATUS), 592 (u_longlong_t)status); 593 break; 594 595 } 596 } 597 } 598 599 #define _GCPU_BSTATUS(status, what) \ 600 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 601 (status) & MSR_MC_STATUS_##what ? 
B_TRUE : B_FALSE 602 603 static void 604 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 605 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 606 { 607 uint64_t members = ged ? ged->ged_ereport_members : 608 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 609 uint64_t mcg = gcl->gcl_mcg_status; 610 int mcip = mcg & MCG_STATUS_MCIP; 611 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 612 uint64_t bstat = gbl->gbl_status; 613 614 /* 615 * Include the compound error name if requested and if this 616 * is a compound error type. 617 */ 618 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 619 ged->ged_compound_fmt != NULL) { 620 char buf[FM_MAX_CLASS]; 621 622 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 623 GCPU_MN_NAMESPACE_COMPOUND); 624 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 625 DATA_TYPE_STRING, buf, NULL); 626 } 627 628 /* 629 * Include disposition information for this error 630 */ 631 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 632 gbl->gbl_disp != 0) { 633 int i, empty = 1; 634 char buf[128]; 635 char *p = buf, *q = buf + 128; 636 static struct _gcpu_disp_name { 637 uint64_t dv; 638 const char *dn; 639 } disp_names[] = { 640 { CMI_ERRDISP_CURCTXBAD, 641 "processor_context_corrupt" }, 642 { CMI_ERRDISP_RIPV_INVALID, 643 "return_ip_invalid" }, 644 { CMI_ERRDISP_UC_UNCONSTRAINED, 645 "unconstrained" }, 646 { CMI_ERRDISP_FORCEFATAL, 647 "forcefatal" }, 648 { CMI_ERRDISP_IGNORED, 649 "ignored" }, 650 { CMI_ERRDISP_PCC_CLEARED, 651 "corrupt_context_cleared" }, 652 { CMI_ERRDISP_UC_CLEARED, 653 "uncorrected_data_cleared" }, 654 { CMI_ERRDISP_POISONED, 655 "poisoned" }, 656 { CMI_ERRDISP_INCONSISTENT, 657 "telemetry_unstable" }, 658 }; 659 660 for (i = 0; i < sizeof (disp_names) / 661 sizeof (struct _gcpu_disp_name); i++) { 662 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 663 continue; 664 665 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 666 "%s%s", empty ? 
"" : ",", disp_names[i].dn); 667 p += strlen(p); 668 empty = 0; 669 } 670 671 if (p != buf) 672 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 673 DATA_TYPE_STRING, buf, NULL); 674 } 675 676 /* 677 * If MCG_STATUS is included add that and an indication of whether 678 * this ereport was the result of a machine check or poll. 679 */ 680 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 681 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 682 DATA_TYPE_UINT64, mcg, NULL); 683 684 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 685 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 686 } 687 688 /* 689 * If an instruction pointer is to be included add one provided 690 * MCG_STATUS indicated it is valid; meaningless for polled events. 691 */ 692 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 693 mcg & MCG_STATUS_EIPV) { 694 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 695 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 696 } 697 698 /* 699 * Add an indication of whether the trap occured during privileged code. 700 */ 701 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 702 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 703 DATA_TYPE_BOOLEAN_VALUE, 704 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 705 } 706 707 /* 708 * If requested, add the index of the MCA bank. This indicates the 709 * n'th bank of 4 MCA registers, and does not necessarily correspond 710 * to MCi_* - use the bank offset to correlate 711 */ 712 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 713 fm_payload_set(ereport, 714 /* Bank number */ 715 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 716 /* Offset of MCi_CTL */ 717 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 718 IA32_MSR_MC(bankno, CTL), 719 NULL); 720 } 721 722 /* 723 * Add MCi_STATUS if requested, and decode it. 
724 */ 725 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 726 const char *tbes[] = { 727 "No tracking", /* 00 */ 728 "Green - below threshold", /* 01 */ 729 "Yellow - above threshold", /* 10 */ 730 "Reserved" /* 11 */ 731 }; 732 733 fm_payload_set(ereport, 734 /* Bank MCi_STATUS */ 735 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 736 /* Overflow? */ 737 _GCPU_BSTATUS(bstat, OVER), 738 /* Uncorrected? */ 739 _GCPU_BSTATUS(bstat, UC), 740 /* Enabled? */ 741 _GCPU_BSTATUS(bstat, EN), 742 /* Processor context corrupt? */ 743 _GCPU_BSTATUS(bstat, PCC), 744 /* Error code */ 745 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 746 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 747 /* Model-specific error code */ 748 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 749 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 750 NULL); 751 752 /* 753 * If MCG_CAP.TES_P indicates that that thresholding info 754 * is present in the architural component of the bank status 755 * then include threshold information for this bank. 756 */ 757 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 758 fm_payload_set(ereport, 759 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 760 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 761 NULL); 762 } 763 } 764 765 /* 766 * MCi_ADDR info if requested and valid. 767 */ 768 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 769 bstat & MSR_MC_STATUS_ADDRV) { 770 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 771 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 772 } 773 774 /* 775 * MCi_MISC if requested and MCi_STATUS.MISCV). 776 */ 777 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 778 bstat & MSR_MC_STATUS_MISCV) { 779 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 780 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 781 } 782 783 } 784 785 /* 786 * Construct and post an ereport based on the logout information from a 787 * single MCA bank. We are not necessarily running on the cpu that 788 * detected the error. 
789 */ 790 static void 791 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 792 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 793 { 794 gcpu_data_t *gcpu = gcl->gcl_gcpu; 795 cmi_hdl_t hdl = gcpu->gcpu_hdl; 796 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 797 const char *cpuclass = NULL, *leafclass = NULL; 798 uint16_t code = MCAX86_ERRCODE(status); 799 errorq_elem_t *eqep, *scr_eqep; 800 nvlist_t *ereport, *detector; 801 char buf[FM_MAX_CLASS]; 802 const char *classfmt; 803 nv_alloc_t *nva; 804 805 if (panicstr) { 806 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 807 return; 808 ereport = errorq_elem_nvl(ereport_errorq, eqep); 809 810 /* 811 * Allocate another element for scratch space, but fallback 812 * to the one we have if that fails. We'd like to use the 813 * additional scratch space for nvlist construction. 814 */ 815 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 816 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 817 else 818 nva = errorq_elem_nva(ereport_errorq, eqep); 819 } else { 820 ereport = fm_nvlist_create(NULL); 821 nva = NULL; 822 } 823 824 if (ereport == NULL) 825 return; 826 827 /* 828 * Common payload data required by the protocol: 829 * - ereport class 830 * - detector 831 * - ENA 832 */ 833 834 /* 835 * Ereport class - call into model-specific support to allow it to 836 * provide a cpu class or leaf class, otherwise calculate our own. 837 */ 838 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 839 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 840 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 841 leafclass); 842 843 /* 844 * The detector FMRI. 845 */ 846 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 847 nva)) == NULL) 848 detector = gcpu_fmri_create(hdl, nva); 849 850 /* 851 * Should we define a new ENA format 3?? for chip/core/strand? 852 * It will be better when virtualized. 
853 */ 854 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 855 fm_ena_generate_cpu(gcl->gcl_timestamp, 856 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 857 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 858 859 if (panicstr) { 860 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 861 nv_alloc_reset(nva); 862 } else { 863 fm_nvlist_destroy(detector, FM_NVA_FREE); 864 } 865 866 /* 867 * Add the architectural ereport class-specific payload data. 868 */ 869 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 870 871 /* 872 * Allow model-specific code to add ereport members. 873 */ 874 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 875 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 876 877 /* 878 * Include stack if options is turned on and either selected in 879 * the payload member bitmask or inclusion is forced. 880 */ 881 if (gcpu_mca_stack_flag && 882 (cms_ereport_includestack(hdl, mscookie) == 883 B_TRUE || gcpu_mca_stack_ereport_include)) { 884 fm_payload_stack_add(ereport, gcl->gcl_stack, 885 gcl->gcl_stackdepth); 886 } 887 888 /* 889 * If injection has taken place anytime in the past then note this 890 * on the ereport. 891 */ 892 if (cmi_inj_tainted() == B_TRUE) { 893 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 894 B_TRUE, NULL); 895 } 896 897 /* 898 * Post ereport. 
899 */ 900 if (panicstr) { 901 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 902 if (scr_eqep) 903 errorq_cancel(ereport_errorq, scr_eqep); 904 } else { 905 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 906 fm_nvlist_destroy(ereport, FM_NVA_FREE); 907 } 908 909 } 910 911 /*ARGSUSED*/ 912 void 913 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 914 { 915 const gcpu_logout_t *gcl = data; 916 const gcpu_bank_logout_t *gbl; 917 int i; 918 919 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 920 const gcpu_error_disp_t *gened; 921 cms_cookie_t mscookie; 922 923 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 924 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 925 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 926 927 /* 928 * Perform a match based on IA32 MCA architectural 929 * components alone. 930 */ 931 gened = gcpu_disp_match(code); /* may be NULL */ 932 933 /* 934 * Now see if an model-specific match can be made. 935 */ 936 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 937 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 938 gcl->gcl_ms_logout); 939 940 /* 941 * Prepare and dispatch an ereport for logging and 942 * diagnosis. 943 */ 944 gcpu_ereport_post(gcl, i, gened, mscookie, 945 gbl->gbl_status); 946 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 947 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 948 /* 949 * Telemetry kept changing as we tried to read 950 * it. Force an unknown ereport leafclass but 951 * keep the telemetry unchanged for logging. 952 */ 953 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 954 gbl->gbl_status); 955 } 956 } 957 } 958 959 static size_t gcpu_mca_queue_datasz = 0; 960 961 /* 962 * The following code is ready to make a weak attempt at growing the 963 * errorq structure size. 
Since it is not foolproof (we don't know 964 * who may already be producing to the outgoing errorq) our caller 965 * instead assures that we'll always be called with no greater data 966 * size than on our first call. 967 */ 968 static void 969 gcpu_errorq_init(size_t datasz) 970 { 971 int slots; 972 973 mutex_enter(&gcpu_mca_queue_lock); 974 975 if (gcpu_mca_queue_datasz >= datasz) { 976 mutex_exit(&gcpu_mca_queue_lock); 977 return; 978 } 979 980 membar_producer(); 981 if (gcpu_mca_queue) { 982 gcpu_mca_queue_datasz = 0; 983 errorq_destroy(gcpu_mca_queue); 984 } 985 986 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 987 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 988 989 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 990 NULL, slots, datasz, 1, ERRORQ_VITAL); 991 992 if (gcpu_mca_queue != NULL) 993 gcpu_mca_queue_datasz = datasz; 994 995 mutex_exit(&gcpu_mca_queue_lock); 996 } 997 998 /* 999 * Perform MCA initialization as described in section 14.6 of Intel 64 1000 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1001 */ 1002 1003 static uint_t global_nbanks; 1004 1005 void 1006 gcpu_mca_init(cmi_hdl_t hdl) 1007 { 1008 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1009 uint64_t cap; 1010 uint_t vendor = cmi_hdl_vendor(hdl); 1011 uint_t family = cmi_hdl_family(hdl); 1012 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1013 int mcg_ctl_present; 1014 uint_t nbanks; 1015 uint32_t ctl_skip_mask = 0; 1016 uint32_t status_skip_mask = 0; 1017 size_t mslsz; 1018 int i; 1019 #ifndef __xpv 1020 int mcg_ctl2_present; 1021 uint32_t cmci_capable = 0; 1022 #endif 1023 1024 if (gcpu == NULL) 1025 return; 1026 1027 /* 1028 * Protect from some silly /etc/system settings. 
1029 */ 1030 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1031 gcpu_mca_telemetry_retries = 5; 1032 1033 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1034 return; 1035 1036 /* 1037 * CPU startup code only calls cmi_mca_init if x86_feature indicates 1038 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 1039 * processors, which have their own * more primitive way of doing 1040 * machine checks, will not have cmi_mca_init called since their 1041 * CPUID information will not indicate both MCA and MCE features. 1042 */ 1043 ASSERT(x86_feature & X86_MCA); 1044 1045 /* 1046 * Determine whether the IA32_MCG_CTL register is present. If it 1047 * is we will enable all features by writing -1 to it towards 1048 * the end of this initialization; if it is absent then volume 3A 1049 * says we must nonetheless continue to initialize the individual 1050 * banks. 1051 */ 1052 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1053 #ifndef __xpv 1054 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1055 #endif 1056 1057 /* 1058 * We squirell values away for inspection/debugging. 1059 */ 1060 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1061 if (mcg_ctl_present) 1062 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1063 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1064 1065 /* 1066 * Determine the number of error-reporting banks implemented. 1067 */ 1068 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1069 1070 if (nbanks != 0 && global_nbanks == 0) 1071 global_nbanks = nbanks; /* no race - BSP will get here first */ 1072 1073 /* 1074 * If someone is hiding the number of banks (perhaps we are fully 1075 * virtualized?) or if this processor has more banks than the 1076 * first to set global_nbanks then bail. The latter requirement 1077 * is because we need to size our errorq data structure and we 1078 * don't want to have to grow the errorq (destroy and recreate) 1079 * which may just lose some telemetry. 
1080 */ 1081 if (nbanks == 0 || nbanks > global_nbanks) 1082 return; 1083 1084 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1085 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1086 1087 /* 1088 * Calculate the size we need to allocate for a gcpu_logout_t 1089 * with a gcl_data array big enough for all banks of this cpu. 1090 * Add any space requested by the model-specific logout support. 1091 */ 1092 mslsz = cms_logout_size(hdl); 1093 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1094 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1095 1096 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1097 gcpu_logout_t *gcl; 1098 1099 mca->gcpu_mca_logout[i] = gcl = 1100 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1101 gcl->gcl_gcpu = gcpu; 1102 gcl->gcl_nbanks = nbanks; 1103 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1104 (char *)(&gcl->gcl_data[0]) + nbanks * 1105 sizeof (gcpu_bank_logout_t); 1106 1107 } 1108 1109 #ifdef __xpv 1110 gcpu_xpv_mca_init(nbanks); 1111 #endif 1112 1113 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1114 1115 #ifndef __xpv 1116 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1117 KM_SLEEP); 1118 #endif 1119 1120 /* 1121 * Create our errorq to transport the logout structures. This 1122 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1123 */ 1124 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1125 1126 /* 1127 * Not knowing which, if any, banks are shared between cores we 1128 * assure serialization of MCA bank initialization by each cpu 1129 * on the chip. On chip architectures in which some banks are 1130 * shared this will mean the shared resource is initialized more 1131 * than once - we're simply aiming to avoid simultaneous MSR writes 1132 * to the shared resource. 1133 * 1134 * Even with these precautions, some platforms may yield a GP fault 1135 * if a core other than a designated master tries to write anything 1136 * but all 0's to MCi_{STATUS,ADDR,CTL}. 
So we will perform 1137 * those writes under on_trap protection. 1138 */ 1139 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1140 1141 /* 1142 * Initialize poller data, but don't start polling yet. 1143 */ 1144 gcpu_mca_poll_init(hdl); 1145 1146 /* 1147 * Work out which MCA banks we will initialize. In MCA logout 1148 * code we will only read those banks which we initialize here. 1149 */ 1150 for (i = 0; i < nbanks; i++) { 1151 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1152 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1153 1154 if (!cms_present(hdl)) { 1155 /* 1156 * Model-specific support is not present, try to use 1157 * sane defaults. 1158 * 1159 * On AMD family 6 processors, reports about spurious 1160 * machine checks indicate that bank 0 should be 1161 * skipped. 1162 * 1163 * On Intel family 6 processors, the documentation tells 1164 * us not to write to MC0_CTL. 1165 * 1166 */ 1167 if (i == 0 && family == 6) { 1168 switch (vendor) { 1169 case X86_VENDOR_AMD: 1170 skipstatus = B_TRUE; 1171 /*FALLTHRU*/ 1172 case X86_VENDOR_Intel: 1173 skipctl = B_TRUE; 1174 break; 1175 } 1176 } 1177 } 1178 1179 ctl_skip_mask |= skipctl << i; 1180 status_skip_mask |= skipstatus << i; 1181 1182 if (skipctl && skipstatus) 1183 continue; 1184 1185 /* 1186 * Record which MCA banks were enabled, from the point of view 1187 * of the whole chip (if some cores share a bank we must be 1188 * sure either can logout from it). 1189 */ 1190 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1191 1192 #ifndef __xpv 1193 /* 1194 * check CMCI capability 1195 */ 1196 if (mcg_ctl2_present) { 1197 uint64_t ctl2; 1198 uint32_t cap = 0; 1199 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1200 if (ctl2 & MSR_MC_CTL2_EN) 1201 continue; 1202 ctl2 |= MSR_MC_CTL2_EN; 1203 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1204 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1205 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1206 (ctl2 & MSR_MC_CTL2_EN) ? 
1 : 0; 1207 if (cap) 1208 cmci_capable ++; 1209 /* 1210 * Set threshold to 1 while unset the en field, to avoid 1211 * CMCI trigged before APIC LVT entry init. 1212 */ 1213 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1214 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1215 1216 /* 1217 * init cmci related count 1218 */ 1219 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1220 mca->gcpu_bank_cmci[i].drtcmci = 0; 1221 mca->gcpu_bank_cmci[i].ncmci = 0; 1222 } 1223 #endif 1224 } 1225 1226 #ifndef __xpv 1227 if (cmci_capable) 1228 cmi_enable_cmci = 1; 1229 #endif 1230 1231 #ifndef __xpv 1232 /* 1233 * Log any valid telemetry lurking in the MCA banks, but do not 1234 * clear the status registers. Ignore the disposition returned - 1235 * we have already paniced or reset for any nasty errors found here. 1236 * 1237 * Intel vol 3A says that we should not do this on family 0x6, 1238 * and that for any extended family the BIOS clears things 1239 * on power-on reset so you'll only potentially find valid telemetry 1240 * on warm reset (we do it for both - on power-on reset we should 1241 * just see zeroes). 1242 * 1243 * AMD docs since K7 say we should process anything we find here. 1244 */ 1245 if (!gcpu_suppress_log_on_init && 1246 (vendor == X86_VENDOR_Intel && family >= 0xf || 1247 vendor == X86_VENDOR_AMD)) 1248 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1249 GCPU_MPT_WHAT_POKE_ERR); 1250 1251 /* 1252 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1253 * model-specific module the power of veto. 1254 */ 1255 for (i = 0; i < nbanks; i++) { 1256 struct gcpu_bios_bankcfg *bcfgp = 1257 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1258 1259 /* 1260 * Stash inherited bank MCA state, even for banks we will 1261 * not initialize ourselves. Do not read the MISC register 1262 * unconditionally - on some processors that will #GP on 1263 * banks that do not implement the MISC register (would be 1264 * caught by on_trap, anyway). 
1265 */ 1266 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1267 &bcfgp->bios_bank_ctl); 1268 1269 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1270 &bcfgp->bios_bank_status); 1271 1272 if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) 1273 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1274 &bcfgp->bios_bank_addr); 1275 1276 /* 1277 * In some old BIOS the status value after boot can indicate 1278 * MISCV when there is actually no MISC register for 1279 * that bank. The following read could therefore 1280 * aggravate a general protection fault. This should be 1281 * caught by on_trap, but the #GP fault handler is busted 1282 * and can suffer a double fault even before we get to 1283 * trap() to check for on_trap protection. Until that 1284 * issue is fixed we remove the one access that we know 1285 * can cause a #GP. 1286 * 1287 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1288 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1289 * &bcfgp->bios_bank_misc); 1290 */ 1291 bcfgp->bios_bank_misc = 0; 1292 1293 if (!(ctl_skip_mask & (1 << i))) { 1294 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1295 cms_bankctl_val(hdl, i, -1ULL)); 1296 } 1297 1298 if (!(status_skip_mask & (1 << i))) { 1299 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1300 cms_bankstatus_val(hdl, i, 0ULL)); 1301 } 1302 } 1303 #endif 1304 /* 1305 * Now let the model-specific support perform further initialization 1306 * of non-architectural features. 
1307 */ 1308 cms_mca_init(hdl, nbanks); 1309 1310 #ifndef __xpv 1311 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1312 membar_producer(); 1313 1314 /* enable all machine-check features */ 1315 if (mcg_ctl_present) 1316 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1317 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1318 #endif 1319 1320 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1321 1322 #ifndef __xpv 1323 /* enable machine-check exception in CR4 */ 1324 cmi_hdl_enable_mce(hdl); 1325 #endif 1326 } 1327 1328 static uint64_t 1329 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1330 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1331 { 1332 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1333 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1334 int nbanks = mca->gcpu_mca_nbanks; 1335 gcpu_mce_status_t mce; 1336 gcpu_bank_logout_t *gbl; 1337 uint64_t disp = 0; 1338 int i; 1339 1340 if (mcesp == NULL) 1341 mcesp = &mce; 1342 1343 mcesp->mce_nerr = nerr; 1344 1345 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1346 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1347 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1348 1349 /* 1350 * If this a machine check then if the return instruction pointer 1351 * is not valid the current context is lost. 
1352 */ 1353 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1354 disp |= CMI_ERRDISP_RIPV_INVALID; 1355 1356 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1357 uint64_t mcistatus = gbl->gbl_status; 1358 uint32_t ms_scope; 1359 int pcc, uc; 1360 int poisoned; 1361 1362 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1363 continue; 1364 1365 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1366 continue; 1367 1368 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1369 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1370 mcesp->mce_npcc += pcc; 1371 mcesp->mce_nuc += uc; 1372 1373 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1374 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1375 1376 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1377 pcc = 0; 1378 mcesp->mce_npcc_ok++; 1379 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1380 } 1381 1382 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1383 uc = 0; 1384 mcesp->mce_nuc_ok++; 1385 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1386 } 1387 1388 if (uc) { 1389 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1390 if (poisoned) { 1391 mcesp->mce_nuc_poisoned++; 1392 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1393 } 1394 } 1395 1396 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1397 /* 1398 * We're not being instructed to ignore the error, 1399 * so apply our standard disposition logic to it. 1400 */ 1401 if (uc && !poisoned) { 1402 unconstrained++; 1403 gbl->gbl_disp |= disp | 1404 CMI_ERRDISP_UC_UNCONSTRAINED; 1405 } 1406 1407 if (pcc && ismc) { 1408 curctxbad++; 1409 gbl->gbl_disp |= disp | 1410 CMI_ERRDISP_CURCTXBAD; 1411 } 1412 1413 /* 1414 * Even if the above may not indicate that the error 1415 * is terminal, model-specific support may insist 1416 * that we treat it as such. Such errors wil be 1417 * fatal even if discovered via poll. 
1418 */ 1419 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1420 forcefatal++; 1421 mcesp->mce_forcefatal++; 1422 gbl->gbl_disp |= disp | 1423 CMI_ERRDISP_FORCEFATAL; 1424 } 1425 } else { 1426 mcesp->mce_ignored++; 1427 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1428 } 1429 } 1430 1431 if (unconstrained > 0) 1432 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1433 1434 if (curctxbad > 0) 1435 disp |= CMI_ERRDISP_CURCTXBAD; 1436 1437 if (forcefatal > 0) 1438 disp |= CMI_ERRDISP_FORCEFATAL; 1439 1440 if (gcpu_mca_queue != NULL) { 1441 int how; 1442 1443 if (ismc) { 1444 how = cmi_mce_response(rp, disp) ? 1445 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1446 ERRORQ_SYNC; /* panic flow will drain */ 1447 } else { 1448 how = (disp & CMI_ERRDISP_FORCEFATAL && 1449 cmi_panic_on_ue()) ? 1450 ERRORQ_SYNC : /* poller will panic */ 1451 ERRORQ_ASYNC; /* no panic */ 1452 } 1453 1454 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1455 } else if (disp != 0) { 1456 gcpu_bleat(hdl, gcl); 1457 } 1458 1459 mcesp->mce_disp = disp; 1460 1461 return (disp); 1462 } 1463 1464 /* 1465 * Gather error telemetry from our source, and then submit it for 1466 * processing. 
1467 */ 1468 1469 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1470 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1471 1472 #define STATUS_EQV(s1, s2) \ 1473 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1474 1475 static uint32_t gcpu_deferrred_polled_clears; 1476 1477 #ifndef __xpv 1478 static void 1479 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1480 uint64_t status, int what) 1481 { 1482 uint64_t ctl2; 1483 1484 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1485 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1486 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1487 1488 if (!(bank_cmci_p->cmci_enabled)) { 1489 /* 1490 * when cmci is disabled, and the bank has no error or 1491 * no corrected error for 1492 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1493 * turn on this bank's cmci. 1494 */ 1495 1496 bank_cmci_p->drtcmci ++; 1497 1498 if (bank_cmci_p->drtcmci >= 1499 gcpu_mca_cmci_reenable_threshold) { 1500 1501 /* turn on cmci */ 1502 1503 (void) cmi_hdl_rdmsr(hdl, 1504 IA32_MSR_MC_CTL2(bank), &ctl2); 1505 ctl2 |= MSR_MC_CTL2_EN; 1506 (void) cmi_hdl_wrmsr(hdl, 1507 IA32_MSR_MC_CTL2(bank), ctl2); 1508 1509 /* reset counter and set flag */ 1510 bank_cmci_p->drtcmci = 0; 1511 bank_cmci_p->cmci_enabled = 1; 1512 } 1513 } else { 1514 /* 1515 * when cmci is enabled,if is in cyclic poll and the 1516 * bank has no error or no corrected error, reset ncmci 1517 * counter 1518 */ 1519 bank_cmci_p->ncmci = 0; 1520 } 1521 } 1522 } 1523 1524 static void 1525 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1526 int what) 1527 { 1528 uint64_t ctl2 = 0; 1529 1530 /* 1531 * if cmci of this bank occurred beyond 1532 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1533 * turn off this bank's CMCI; 1534 */ 1535 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1536 1537 /* if it is cmci trap, increase the count 
*/ 1538 bank_cmci_p->ncmci++; 1539 1540 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1541 1542 /* turn off cmci */ 1543 1544 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1545 &ctl2); 1546 ctl2 &= ~MSR_MC_CTL2_EN; 1547 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1548 ctl2); 1549 1550 /* clear the flag and count */ 1551 1552 bank_cmci_p->cmci_enabled = 0; 1553 bank_cmci_p->ncmci = 0; 1554 } 1555 } 1556 } 1557 #endif 1558 1559 static void 1560 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1561 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1562 { 1563 int i; 1564 gcpu_bank_logout_t *gbl, *pgbl; 1565 uint64_t status; 1566 1567 if (first < 0 || last < 0) 1568 return; 1569 1570 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1571 status = gbl->gbl_status; 1572 if (status == 0) 1573 continue; 1574 if (clrstatus == B_FALSE) 1575 goto serialize; 1576 1577 /* 1578 * For i86xpv we always clear status in order to invalidate 1579 * the interposed telemetry. 1580 * 1581 * For native machine checks we always clear status here. For 1582 * native polls we must be a little more cautious since there 1583 * is an outside chance that we may clear telemetry from a 1584 * shared MCA bank on which a sibling core is machine checking. 1585 * 1586 * For polled observations of errors that look like they may 1587 * produce a machine check (UC/PCC and ENabled, although these 1588 * do not guarantee a machine check on error occurence) 1589 * we will not clear the status at this wakeup unless 1590 * we saw the same status at the previous poll. We will 1591 * always process and log the current observations - it 1592 * is only the clearing of MCi_STATUS which may be 1593 * deferred until the next wakeup. 
1594 */ 1595 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1596 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1597 goto serialize; 1598 } 1599 1600 /* 1601 * We have a polled observation of a machine check 1602 * candidate. If we saw essentially the same status at the 1603 * last poll then clear the status now since this appears 1604 * not to be a #MC candidate after all. If we see quite 1605 * different status now then do not clear, but reconsider at 1606 * the next poll. In no actual machine check clears 1607 * the status in the interim then the status should not 1608 * keep changing forever (meaning we'd never clear it) 1609 * since before long we'll simply have latched the highest- 1610 * priority error and set the OVerflow bit. Nonetheless 1611 * we count how many times we defer clearing and after 1612 * a while insist on clearing the status. 1613 */ 1614 pgbl = &pgcl->gcl_data[i]; 1615 if (pgbl->gbl_clrdefcnt != 0) { 1616 /* We deferred clear on this bank at last wakeup */ 1617 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1618 pgbl->gbl_clrdefcnt > 5) { 1619 /* 1620 * Status is unchanged so clear it now and, 1621 * since we have already logged this info, 1622 * avoid logging it again. 1623 */ 1624 gbl->gbl_status = 0; 1625 (void) cmi_hdl_wrmsr(hdl, 1626 IA32_MSR_MC(i, STATUS), 0ULL); 1627 } else { 1628 /* Record deferral for next wakeup */ 1629 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1630 } 1631 } else { 1632 /* Record initial deferral for next wakeup */ 1633 gbl->gbl_clrdefcnt = 1; 1634 gcpu_deferrred_polled_clears++; 1635 } 1636 1637 serialize: 1638 { 1639 #ifdef __xpv 1640 ; 1641 #else 1642 /* 1643 * Intel Vol 3A says to execute a serializing 1644 * instruction here, ie CPUID. Well WRMSR is also 1645 * defined to be serializing, so the status clear above 1646 * should suffice. To be a good citizen, and since 1647 * some clears are deferred, we'll execute a CPUID 1648 * instruction here. 
1649 */ 1650 struct cpuid_regs tmp; 1651 (void) __cpuid_insn(&tmp); 1652 #endif 1653 } 1654 } 1655 } 1656 1657 /*ARGSUSED5*/ 1658 void 1659 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1660 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1661 { 1662 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1663 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1664 int nbanks = mca->gcpu_mca_nbanks; 1665 gcpu_bank_logout_t *gbl, *pgbl; 1666 gcpu_logout_t *gcl, *pgcl; 1667 int ismc = (rp != NULL); 1668 int ispoll = !ismc; 1669 int i, nerr = 0; 1670 cmi_errno_t err; 1671 uint64_t mcg_status; 1672 uint64_t disp; 1673 uint64_t cap; 1674 int first = -1; 1675 int last = -1; 1676 int willpanic = 0; 1677 1678 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1679 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1680 CMI_SUCCESS) { 1681 if (mcesp != NULL) 1682 mcesp->mce_nerr = mcesp->mce_disp = 0; 1683 return; 1684 } 1685 1686 if (ismc) { 1687 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1688 } else { 1689 int pidx = mca->gcpu_mca_nextpoll_idx; 1690 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1691 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1692 1693 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1694 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1695 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1696 } 1697 1698 gcl->gcl_timestamp = gethrtime_waitfree(); 1699 gcl->gcl_mcg_status = mcg_status; 1700 gcl->gcl_ip = rp ? rp->r_pc : 0; 1701 1702 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? 
GCPU_GCL_F_PRIV : 0; 1703 if (cap & MCG_CAP_TES_P) 1704 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1705 1706 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1707 uint64_t status, status2, addr, misc; 1708 int retries = gcpu_mca_telemetry_retries; 1709 1710 gbl->gbl_status = 0; 1711 gbl->gbl_disp = 0; 1712 gbl->gbl_clrdefcnt = 0; 1713 1714 /* 1715 * Only logout from MCA banks we have initialized from at 1716 * least one core. If a core shares an MCA bank with another 1717 * but perhaps lost the race to initialize it, then it must 1718 * still be allowed to logout from the shared bank. 1719 */ 1720 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1721 continue; 1722 1723 /* 1724 * On a poll look only at the banks we've been asked to check. 1725 */ 1726 if (rp == NULL && !(bankmask & 1 << i)) 1727 continue; 1728 1729 1730 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1731 CMI_SUCCESS) 1732 continue; 1733 1734 #ifndef __xpv 1735 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1736 #endif 1737 1738 retry: 1739 if (!(status & MSR_MC_STATUS_VAL)) 1740 continue; 1741 1742 /* First and last bank that have valid status */ 1743 if (first < 0) 1744 first = i; 1745 last = i; 1746 1747 addr = -1; 1748 misc = 0; 1749 1750 if (status & MSR_MC_STATUS_ADDRV) 1751 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1752 1753 if (status & MSR_MC_STATUS_MISCV) 1754 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1755 1756 #ifndef __xpv 1757 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1758 #endif 1759 1760 /* 1761 * Allow the model-specific code to extract bank telemetry. 1762 */ 1763 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1764 1765 /* 1766 * Not all cpu models assure us that the status/address/misc 1767 * data will not change during the above sequence of MSR reads, 1768 * or that it can only change by the addition of the OVerflow 1769 * bit to the status register. 
If the status has changed 1770 * other than in the overflow bit then we attempt to reread 1771 * for a consistent snapshot, but eventually give up and 1772 * go with what we've got. We only perform this check 1773 * for a poll - a further #MC during a #MC will reset, and 1774 * polled errors should not overwrite higher-priority 1775 * trapping errors (but could set the overflow bit). 1776 */ 1777 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1778 &status2)) == CMI_SUCCESS) { 1779 if (!STATUS_EQV(status, status2)) { 1780 if (retries-- > 0) { 1781 status = status2; 1782 goto retry; 1783 } else { 1784 gbl->gbl_disp |= 1785 CMI_ERRDISP_INCONSISTENT; 1786 } 1787 } 1788 } else if (ispoll && err != CMI_SUCCESS) { 1789 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1790 } 1791 1792 nerr++; 1793 gbl->gbl_status = status; 1794 gbl->gbl_addr = addr; 1795 gbl->gbl_misc = misc; 1796 1797 /* 1798 * For polled observation, if the count of deferred status 1799 * clears updated in the clear_mc() is nonzero and the 1800 * MCi_STATUS has not changed, the last wakeup has produced 1801 * the ereport of the error. Therefore, clear the status in 1802 * this wakeup to avoid duplicate ereport. 1803 */ 1804 pgbl = &pgcl->gcl_data[i]; 1805 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1806 pgbl->gbl_clrdefcnt != 0) { 1807 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1808 gbl->gbl_status = 0; 1809 (void) cmi_hdl_wrmsr(hdl, 1810 IA32_MSR_MC(i, STATUS), 0ULL); 1811 } 1812 } 1813 } 1814 1815 if (gcpu_mca_stack_flag) 1816 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1817 else 1818 gcl->gcl_stackdepth = 0; 1819 1820 /* 1821 * Decide our disposition for this error or errors, and submit for 1822 * logging and subsequent diagnosis. 
1823 */ 1824 if (nerr != 0) { 1825 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1826 1827 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1828 1829 if (!willpanic) 1830 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1831 } else { 1832 disp = 0; 1833 if (mcesp) { 1834 mcesp->mce_nerr = mcesp->mce_disp = 0; 1835 } 1836 } 1837 1838 /* 1839 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1840 * If a second #MC had occured before now the system would have 1841 * reset. We can only do thise once gcpu_mca_process has copied 1842 * the logout structure. 1843 */ 1844 if (ismc && mcg_status & MCG_STATUS_MCIP) 1845 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1846 1847 /* 1848 * At this point we have read and logged all telemetry that is visible 1849 * under the MCA. On architectures for which the NorthBridge is 1850 * on-chip this may include NB-observed errors, but where the NB 1851 * is off chip it may have been the source of the #MC request and 1852 * so we must call into the memory-controller driver to give it 1853 * a chance to log errors. 1854 */ 1855 if (ismc) { 1856 cmi_mc_logout(hdl, 1, willpanic); 1857 } 1858 } 1859 1860 #ifndef __xpv 1861 int gcpu_mca_trap_vomit_summary = 0; 1862 1863 /* 1864 * On a native machine check exception we come here from mcetrap via 1865 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1866 * cpus of the chip, so it is possible that another cpu on this chip could 1867 * initiate a poll while we're in the #mc handler; it is also possible that 1868 * this trap has occured during a poll on this cpu. So we must acquire 1869 * the chip-wide poll lock, but be careful to avoid deadlock. 1870 * 1871 * The 'data' pointer cannot be NULL due to init order. 
1872 */ 1873 uint64_t 1874 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1875 { 1876 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1877 kmutex_t *poll_lock = NULL; 1878 gcpu_mce_status_t mce; 1879 uint64_t mcg_status; 1880 int tooklock = 0; 1881 1882 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1883 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1884 return (0); 1885 1886 /* 1887 * Synchronize with any poller from another core that may happen 1888 * to share access to one or more of the MCA banks. 1889 */ 1890 if (gcpu->gcpu_shared != NULL) 1891 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1892 1893 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1894 /* 1895 * The lock is not owned by the thread we have 1896 * interrupted. Spin for this adaptive lock. 1897 */ 1898 while (!mutex_tryenter(poll_lock)) { 1899 while (mutex_owner(poll_lock) != NULL) 1900 ; 1901 } 1902 tooklock = 1; 1903 } 1904 1905 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 1906 1907 if (tooklock) 1908 mutex_exit(poll_lock); 1909 1910 /* 1911 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1912 */ 1913 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1914 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1915 "%u PCC (%u ok), " 1916 "%u UC (%d ok, %u poisoned), " 1917 "%u forcefatal, %u ignored", 1918 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1919 mce.mce_npcc, mce.mce_npcc_ok, 1920 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1921 mce.mce_forcefatal, mce.mce_ignored); 1922 } 1923 1924 return (mce.mce_disp); 1925 } 1926 #endif 1927 1928 /*ARGSUSED*/ 1929 void 1930 gcpu_faulted_enter(cmi_hdl_t hdl) 1931 { 1932 /* Nothing to do here */ 1933 } 1934 1935 /*ARGSUSED*/ 1936 void 1937 gcpu_faulted_exit(cmi_hdl_t hdl) 1938 { 1939 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1940 1941 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1942 } 1943 1944 /* 1945 * Write the requested values to the indicated MSRs. 
Having no knowledge 1946 * of the model-specific requirements for writing to these model-specific 1947 * registers, we will only blindly write to those MSRs if the 'force' 1948 * argument is nonzero. That option should only be used in prototyping 1949 * and debugging. 1950 */ 1951 /*ARGSUSED*/ 1952 cmi_errno_t 1953 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1954 int force) 1955 { 1956 int i, errs = 0; 1957 1958 for (i = 0; i < nregs; i++) { 1959 uint_t msr = regs[i].cmr_msrnum; 1960 uint64_t val = regs[i].cmr_msrval; 1961 1962 if (cms_present(hdl)) { 1963 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1964 errs++; 1965 } else if (force) { 1966 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 1967 } else { 1968 errs++; 1969 } 1970 } 1971 1972 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 1973 } 1974