1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/mca_x86.h> 30 #include <sys/cpu_module_impl.h> 31 #include <sys/cpu_module_ms.h> 32 #include <sys/cmn_err.h> 33 #include <sys/cpuvar.h> 34 #include <sys/pghw.h> 35 #include <sys/x86_archext.h> 36 #include <sys/sysmacros.h> 37 #include <sys/regset.h> 38 #include <sys/privregs.h> 39 #include <sys/systm.h> 40 #include <sys/types.h> 41 #include <sys/log.h> 42 #include <sys/psw.h> 43 #include <sys/fm/protocol.h> 44 #include <sys/fm/util.h> 45 #include <sys/errorq.h> 46 #include <sys/mca_x86.h> 47 #include <sys/fm/cpu/GMCA.h> 48 #include <sys/sysevent.h> 49 #include <sys/ontrap.h> 50 51 #include "gcpu.h" 52 53 /* 54 * Clear to log telemetry found at initialization. While processor docs 55 * say you should process this telemetry on all but Intel family 0x6 56 * there are way too many exceptions and we want to avoid bogus 57 * diagnoses. 58 */ 59 int gcpu_suppress_log_on_init = 1; 60 61 /* 62 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 63 * error logout time. The stack will be included in the ereport if the 64 * error type selects stack inclusion, or in all cases if 65 * gcpu_mca_stack_ereport_include is nonzero. 66 */ 67 int gcpu_mca_stack_flag = 0; 68 int gcpu_mca_stack_ereport_include = 0; 69 70 /* 71 * The number of times to re-read MCA telemetry to try to obtain a 72 * consistent snapshot if we find it to be changing under our feet. 73 */ 74 int gcpu_mca_telemetry_retries = 5; 75 76 static gcpu_error_disp_t gcpu_errtypes[] = { 77 78 /* 79 * Unclassified 80 */ 81 { 82 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 83 NULL, 84 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 85 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 86 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 87 }, 88 89 /* 90 * Microcode ROM Parity Error 91 */ 92 { 93 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 94 NULL, 95 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 96 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 97 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 98 }, 99 100 /* 101 * External - BINIT# from another processor during power-on config 102 */ 103 { 104 FM_EREPORT_CPU_GENERIC_EXTERNAL, 105 NULL, 106 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 107 MCAX86_SIMPLE_EXTERNAL_MASKON, 108 MCAX86_SIMPLE_EXTERNAL_MASKOFF 109 }, 110 111 /* 112 * Functional redundancy check master/slave error 113 */ 114 { 115 FM_EREPORT_CPU_GENERIC_FRC, 116 NULL, 117 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 118 MCAX86_SIMPLE_FRC_MASKON, 119 MCAX86_SIMPLE_FRC_MASKOFF 120 }, 121 122 /* 123 * Internal timer error 124 */ 125 { 126 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 127 NULL, 128 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 129 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 130 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 131 }, 132 133 /* 134 * Internal unclassified 135 */ 136 { 137 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 138 NULL, 139 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 140 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 141 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 142 }, 143 144 /* 145 * Compound error codes - generic memory hierarchy 146 */ 147 { 148 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 149 NULL, 150 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 151 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 152 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 153 }, 154 155 /* 156 * Compound error codes - TLB errors 157 */ 158 { 159 FM_EREPORT_CPU_GENERIC_TLB, 160 "%1$s" "TLB" "%2$s" "_ERR", 161 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 162 MCAX86_COMPOUND_TLB_MASKON, 163 MCAX86_COMPOUND_TLB_MASKOFF 164 }, 165 166 /* 167 * Compound error codes - memory hierarchy 168 */ 169 { 170 FM_EREPORT_CPU_GENERIC_MEMHIER, 171 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 172 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 173 MCAX86_COMPOUND_MEMHIER_MASKON, 174 MCAX86_COMPOUND_MEMHIER_MASKOFF 175 }, 176 177 /* 178 * Compound error codes - bus and interconnect errors 179 */ 180 { 181 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 182 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 183 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 184 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 185 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 186 }, 187 }; 188 189 static gcpu_error_disp_t gcpu_unknown = { 190 FM_EREPORT_CPU_GENERIC_UNKNOWN, 191 "UNKNOWN", 192 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 193 0, 194 0 195 }; 196 197 static errorq_t *gcpu_mca_queue; 198 static kmutex_t gcpu_mca_queue_lock; 199 200 static const gcpu_error_disp_t * 201 gcpu_disp_match(uint16_t code) 202 { 203 const gcpu_error_disp_t *ged = gcpu_errtypes; 204 int i; 205 206 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 207 i++, ged++) { 208 uint16_t on = ged->ged_errcode_mask_on; 209 uint16_t off = ged->ged_errcode_mask_off; 210 211 if ((code & on) == on && (code & off) == 0) 212 return (ged); 213 } 214 215 return (NULL); 216 } 217 218 static uint8_t 219 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 220 { 221 return ((uint8_t)(code & mask) >> shift); 222 } 223 224 #define BIT_STRIP(code, name) \ 225 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 226 MCAX86_ERRCODE_##name##_SHIFT) 227 228 #define GCPU_MNEMONIC_UNDEF "undefined" 229 #define GCPU_MNEMONIC_RESVD "reserved" 230 231 /* 232 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 233 * mnemonics and to ereport class name components. 234 */ 235 236 struct gcpu_mnexp { 237 const char *mne_compound; /* used in expanding compound errname */ 238 const char *mne_ereport; /* used in expanding ereport class */ 239 }; 240 241 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 242 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 243 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 244 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 245 { GCPU_MNEMONIC_UNDEF, "" } 246 }; 247 248 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 249 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 250 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 251 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 252 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 253 }; 254 255 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 256 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 257 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 258 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 259 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 260 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 261 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 262 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 263 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 264 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 265 }; 266 267 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 268 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 269 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 270 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 271 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 272 }; 273 274 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 275 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 276 { GCPU_MNEMONIC_RESVD, "" }, 277 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 278 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 279 }; 280 281 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 282 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 283 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 284 }; 285 286 enum gcpu_mn_namespace { 287 GCPU_MN_NAMESPACE_COMPOUND, 288 GCPU_MN_NAMESPACE_EREPORT 289 }; 290 291 static const char * 292 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val, 293 enum gcpu_mn_namespace nspace) 294 { 295 if (val >= tbl_sz) 296 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 297 298 switch (nspace) { 299 case GCPU_MN_NAMESPACE_COMPOUND: 300 return (tbl[val].mne_compound); 301 /*NOTREACHED*/ 302 303 case GCPU_MN_NAMESPACE_EREPORT: 304 return (tbl[val].mne_ereport); 305 /*NOTREACHED*/ 306 307 default: 308 return (GCPU_MNEMONIC_UNDEF); 309 /*NOTREACHED*/ 310 } 311 } 312 313 /* 314 * The ereport class leaf component is either a simple string with no 315 * format specifiers, or a string with one or more embedded %n$s specifiers - 316 * positional selection for string arguments. The kernel snprintf does 317 * not support %n$ (and teaching it to do so is too big a headache) so 318 * we will expand this restricted format string ourselves. 319 */ 320 321 #define GCPU_CLASS_VARCOMPS 7 322 323 #define GCPU_MNEMONIC(code, name, nspace) \ 324 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 325 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 326 BIT_STRIP(code, name), nspace) 327 328 static void 329 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 330 enum gcpu_mn_namespace nspace) 331 { 332 uint16_t code = MCAX86_ERRCODE(status); 333 const char *mn[GCPU_CLASS_VARCOMPS]; 334 char *p = buf; /* current position in buf */ 335 char *q = buf + buflen; /* pointer past last char in buf */ 336 int which, expfmtchar, error; 337 char c; 338 339 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 340 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 341 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 342 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 343 mn[4] = GCPU_MNEMONIC(code, II, nspace); 344 mn[5] = GCPU_MNEMONIC(code, T, nspace); 345 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 346 347 while (p < q - 1 && (c = *fmt++) != '\0') { 348 if (c != '%') { 349 /* not the beginning of a format specifier - copy */ 350 *p++ = c; 351 continue; 352 } 353 354 error = 0; 355 which = -1; 356 expfmtchar = -1; 357 358 nextfmt: 359 if ((c = *fmt++) == '\0') 360 break; /* early termination of fmt specifier */ 361 362 switch (c) { 363 case '1': 364 case '2': 365 case '3': 366 case '4': 367 case '5': 368 case '6': 369 case '7': 370 if (which != -1) { /* allow only one positional digit */ 371 error++; 372 break; 373 } 374 which = c - '1'; 375 goto nextfmt; 376 /*NOTREACHED*/ 377 378 case '$': 379 if (which == -1) { /* no position specified */ 380 error++; 381 break; 382 } 383 expfmtchar = 's'; 384 goto nextfmt; 385 /*NOTREACHED*/ 386 387 case 's': 388 if (expfmtchar != 's') { 389 error++; 390 break; 391 } 392 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 393 mn[which]); 394 p += strlen(p); 395 break; 396 397 default: 398 error++; 399 break; 400 } 401 402 if (error) 403 break; 404 } 405 406 *p = '\0'; /* NUL termination */ 407 } 408 409 static void 410 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 411 const char *cpuclass, const char *leafclass) 412 { 413 char *p = buf; /* current position in buf */ 414 char *q = buf + buflen; /* pointer past last char in buf */ 415 416 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 417 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 418 419 p += strlen(p); 420 if (p >= q) 421 return; 422 423 if (leafclass == NULL) { 424 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 425 GCPU_MN_NAMESPACE_EREPORT); 426 } else { 427 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 428 leafclass); 429 } 430 } 431 432 /* 433 * Create an "hc" scheme FMRI identifying the given cpu. We don't know 434 * the actual topology/connectivity of cpus in the system, so we'll 435 * apply /motherboard=0/chip=.../cpu=... in all cases. 436 */ 437 static nvlist_t * 438 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 439 { 440 nvlist_t *nvl; 441 442 if ((nvl = fm_nvlist_create(nva)) == NULL) 443 return (NULL); 444 445 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 3, 446 "motherboard", 0, 447 "chip", cmi_hdl_chipid(hdl), 448 "cpu", cmi_hdl_coreid(hdl)); 449 450 return (nvl); 451 } 452 453 int gcpu_bleat_count_thresh = 5; 454 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 455 456 /* 457 * Called when we are unable to propogate a logout structure onto an 458 * errorq for subsequent ereport preparation and logging etc. The caller 459 * should usually only decide to call this for severe errors - those we 460 * suspect we may need to panic for. 461 */ 462 static void 463 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 464 { 465 hrtime_t now = gethrtime_waitfree(); 466 static hrtime_t gcpu_last_bleat; 467 gcpu_bank_logout_t *gbl; 468 static int bleatcount; 469 int i; 470 471 /* 472 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 473 * can come as fast as we like, but once we've spammed that many 474 * to the console we require a minimum interval to pass before 475 * any more complaints. 476 */ 477 if (++bleatcount > gcpu_bleat_count_thresh) { 478 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 479 return; 480 else 481 bleatcount = 0; 482 } 483 gcpu_last_bleat = now; 484 485 cmn_err(CE_WARN, "Machine-Check Errors unlogged on chip %d core %d, " 486 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl)); 487 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 488 (u_longlong_t)gcl->gcl_mcg_status); 489 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 490 uint64_t status = gbl->gbl_status; 491 492 if (!(status & MSR_MC_STATUS_VAL)) 493 continue; 494 495 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 496 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 497 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 498 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 499 i, IA32_MSR_MC(i, STATUS), 500 (u_longlong_t)status, 501 (u_longlong_t)gbl->gbl_addr, 502 (u_longlong_t)gbl->gbl_misc); 503 break; 504 505 case MSR_MC_STATUS_ADDRV: 506 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 507 "STAT 0x%016llx ADDR 0x%016llx", 508 i, IA32_MSR_MC(i, STATUS), 509 (u_longlong_t)status, 510 (u_longlong_t)gbl->gbl_addr); 511 break; 512 513 case MSR_MC_STATUS_MISCV: 514 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 515 "STAT 0x%016llx MISC 0x%016llx", 516 i, IA32_MSR_MC(i, STATUS), 517 (u_longlong_t)status, 518 (u_longlong_t)gbl->gbl_misc); 519 break; 520 521 default: 522 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 523 "STAT 0x%016llx", 524 i, IA32_MSR_MC(i, STATUS), 525 (u_longlong_t)status); 526 break; 527 528 } 529 } 530 } 531 532 #define _GCPU_BSTATUS(status, what) \ 533 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 534 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 535 536 static void 537 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 538 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 539 { 540 uint64_t members = ged ? ged->ged_ereport_members : 541 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 542 uint64_t mcg = gcl->gcl_mcg_status; 543 int mcip = mcg & MCG_STATUS_MCIP; 544 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 545 uint64_t bstat = gbl->gbl_status; 546 547 /* 548 * Include the compound error name if requested and if this 549 * is a compound error type. 550 */ 551 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 552 ged->ged_compound_fmt != NULL) { 553 char buf[FM_MAX_CLASS]; 554 555 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 556 GCPU_MN_NAMESPACE_COMPOUND); 557 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 558 DATA_TYPE_STRING, buf, NULL); 559 } 560 561 /* 562 * Include disposition information for this error 563 */ 564 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 565 gbl->gbl_disp != 0) { 566 int i, empty = 1; 567 char buf[128]; 568 char *p = buf, *q = buf + 128; 569 static struct _gcpu_disp_name { 570 uint64_t dv; 571 const char *dn; 572 } disp_names[] = { 573 { CMI_ERRDISP_CURCTXBAD, 574 "processor_context_corrupt" }, 575 { CMI_ERRDISP_RIPV_INVALID, 576 "return_ip_invalid" }, 577 { CMI_ERRDISP_UC_UNCONSTRAINED, 578 "unconstrained" }, 579 { CMI_ERRDISP_FORCEFATAL, 580 "forcefatal" }, 581 { CMI_ERRDISP_IGNORED, 582 "ignored" }, 583 { CMI_ERRDISP_PCC_CLEARED, 584 "corrupt_context_cleared" }, 585 { CMI_ERRDISP_UC_CLEARED, 586 "uncorrected_data_cleared" }, 587 { CMI_ERRDISP_POISONED, 588 "poisoned" }, 589 { CMI_ERRDISP_INCONSISTENT, 590 "telemetry_unstable" }, 591 }; 592 593 for (i = 0; i < sizeof (disp_names) / 594 sizeof (struct _gcpu_disp_name); i++) { 595 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 596 continue; 597 598 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 599 "%s%s", empty ? "" : ",", disp_names[i].dn); 600 p += strlen(p); 601 empty = 0; 602 } 603 604 if (p != buf) 605 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 606 DATA_TYPE_STRING, buf, NULL); 607 } 608 609 /* 610 * If MCG_STATUS is included add that and an indication of whether 611 * this ereport was the result of a machine check or poll. 612 */ 613 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 614 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 615 DATA_TYPE_UINT64, mcg, NULL); 616 617 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 618 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 619 } 620 621 /* 622 * If an instruction pointer is to be included add one provided 623 * MCG_STATUS indicated it is valid; meaningless for polled events. 624 */ 625 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 626 mcg & MCG_STATUS_EIPV) { 627 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 628 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 629 } 630 631 /* 632 * Add an indication of whether the trap occured during privileged code. 633 */ 634 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 635 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 636 DATA_TYPE_BOOLEAN_VALUE, 637 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 638 } 639 640 /* 641 * If requested, add the index of the MCA bank. This indicates the 642 * n'th bank of 4 MCA registers, and does not necessarily correspond 643 * to MCi_* - use the bank offset to correlate 644 */ 645 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 646 fm_payload_set(ereport, 647 /* Bank number */ 648 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 649 /* Offset of MCi_CTL */ 650 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 651 IA32_MSR_MC(bankno, CTL), 652 NULL); 653 } 654 655 /* 656 * Add MCi_STATUS if requested, and decode it. 657 */ 658 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 659 const char *tbes[] = { 660 "No tracking", /* 00 */ 661 "Green - below threshold", /* 01 */ 662 "Yellow - above threshold", /* 10 */ 663 "Reserved" /* 11 */ 664 }; 665 666 fm_payload_set(ereport, 667 /* Bank MCi_STATUS */ 668 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 669 /* Overflow? */ 670 _GCPU_BSTATUS(bstat, OVER), 671 /* Uncorrected? */ 672 _GCPU_BSTATUS(bstat, UC), 673 /* Enabled? */ 674 _GCPU_BSTATUS(bstat, EN), 675 /* Processor context corrupt? */ 676 _GCPU_BSTATUS(bstat, PCC), 677 /* Error code */ 678 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 679 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 680 /* Model-specific error code */ 681 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 682 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 683 NULL); 684 685 /* 686 * If MCG_CAP.TES_P indicates that that thresholding info 687 * is present in the architural component of the bank status 688 * then include threshold information for this bank. 689 */ 690 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 691 fm_payload_set(ereport, 692 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 693 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 694 NULL); 695 } 696 } 697 698 /* 699 * MCi_ADDR info if requested and valid. 700 */ 701 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 702 bstat & MSR_MC_STATUS_ADDRV) { 703 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 704 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 705 } 706 707 /* 708 * MCi_MISC if requested and MCi_STATUS.MISCV). 709 */ 710 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 711 bstat & MSR_MC_STATUS_MISCV) { 712 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 713 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 714 } 715 716 } 717 718 /* 719 * Construct and post an ereport based on the logout information from a 720 * single MCA bank. We are not necessarily running on the cpu that 721 * detected the error. 722 */ 723 static void 724 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 725 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 726 { 727 gcpu_data_t *gcpu = gcl->gcl_gcpu; 728 cmi_hdl_t hdl = gcpu->gcpu_hdl; 729 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 730 const char *cpuclass = NULL, *leafclass = NULL; 731 uint16_t code = MCAX86_ERRCODE(status); 732 errorq_elem_t *eqep, *scr_eqep; 733 nvlist_t *ereport, *detector; 734 char buf[FM_MAX_CLASS]; 735 const char *classfmt; 736 nv_alloc_t *nva; 737 738 if (panicstr) { 739 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 740 return; 741 ereport = errorq_elem_nvl(ereport_errorq, eqep); 742 743 /* 744 * Allocate another element for scratch space, but fallback 745 * to the one we have if that fails. We'd like to use the 746 * additional scratch space for nvlist construction. 747 */ 748 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 749 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 750 else 751 nva = errorq_elem_nva(ereport_errorq, eqep); 752 } else { 753 ereport = fm_nvlist_create(NULL); 754 nva = NULL; 755 } 756 757 if (ereport == NULL) 758 return; 759 760 /* 761 * Common payload data required by the protocol: 762 * - ereport class 763 * - detector 764 * - ENA 765 */ 766 767 /* 768 * Ereport class - call into model-specific support to allow it to 769 * provide a cpu class or leaf class, otherwise calculate our own. 770 */ 771 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 772 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 773 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 774 leafclass); 775 776 /* 777 * The detector FMRI. 778 */ 779 if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL) 780 detector = gcpu_fmri_create(hdl, nva); 781 782 /* 783 * Should we define a new ENA format 3?? for chip/core/strand? 784 * It will be better when virtualized. 785 */ 786 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 787 fm_ena_generate_cpu(gcl->gcl_timestamp, 788 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 789 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 790 791 if (panicstr) { 792 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 793 nv_alloc_reset(nva); 794 } else { 795 fm_nvlist_destroy(detector, FM_NVA_FREE); 796 } 797 798 /* 799 * Add the architectural ereport class-specific payload data. 800 */ 801 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 802 803 /* 804 * Allow model-specific code to add ereport members. 805 */ 806 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 807 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 808 809 /* 810 * Include stack if options is turned on and either selected in 811 * the payload member bitmask or inclusion is forced. 812 */ 813 if (gcpu_mca_stack_flag && 814 (cms_ereport_includestack(hdl, mscookie) == 815 B_TRUE || gcpu_mca_stack_ereport_include)) { 816 fm_payload_stack_add(ereport, gcl->gcl_stack, 817 gcl->gcl_stackdepth); 818 } 819 820 /* 821 * Post ereport. 822 */ 823 if (panicstr) { 824 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 825 if (scr_eqep) 826 errorq_cancel(ereport_errorq, scr_eqep); 827 } else { 828 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 829 fm_nvlist_destroy(ereport, FM_NVA_FREE); 830 } 831 832 } 833 834 /*ARGSUSED*/ 835 void 836 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 837 { 838 const gcpu_logout_t *gcl = data; 839 const gcpu_bank_logout_t *gbl; 840 int i; 841 842 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 843 const gcpu_error_disp_t *gened; 844 cms_cookie_t mscookie; 845 846 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 847 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 848 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 849 850 /* 851 * Perform a match based on IA32 MCA architectural 852 * components alone. 853 */ 854 gened = gcpu_disp_match(code); /* may be NULL */ 855 856 /* 857 * Now see if an model-specific match can be made. 858 */ 859 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 860 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 861 gcl->gcl_ms_logout); 862 863 /* 864 * Prepare and dispatch an ereport for logging and 865 * diagnosis. 866 */ 867 gcpu_ereport_post(gcl, i, gened, mscookie, 868 gbl->gbl_status); 869 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 870 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 871 /* 872 * Telemetry kept changing as we tried to read 873 * it. Force an unknown ereport leafclass but 874 * keep the telemetry unchanged for logging. 875 */ 876 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 877 gbl->gbl_status); 878 } 879 } 880 } 881 882 static size_t gcpu_mca_queue_datasz = 0; 883 884 /* 885 * The following code is ready to make a weak attempt at growing the 886 * errorq structure size. Since it is not foolproof (we don't know 887 * who may already be producing to the outgoing errorq) our caller 888 * instead assures that we'll always be called with no greater data 889 * size than on our first call. 890 */ 891 static void 892 gcpu_errorq_init(size_t datasz) 893 { 894 int slots; 895 896 mutex_enter(&gcpu_mca_queue_lock); 897 898 if (gcpu_mca_queue_datasz >= datasz) { 899 mutex_exit(&gcpu_mca_queue_lock); 900 return; 901 } 902 903 membar_producer(); 904 if (gcpu_mca_queue) { 905 gcpu_mca_queue_datasz = 0; 906 errorq_destroy(gcpu_mca_queue); 907 } 908 909 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 910 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 911 912 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 913 NULL, slots, datasz, 1, ERRORQ_VITAL); 914 915 if (gcpu_mca_queue != NULL) 916 gcpu_mca_queue_datasz = datasz; 917 918 mutex_exit(&gcpu_mca_queue_lock); 919 } 920 921 /* 922 * Perform MCA initialization as described in section 14.6 of Intel 64 923 * and IA-32 Architectures Software Developer's Manual Volume 3A. 924 */ 925 926 static uint_t global_nbanks; 927 928 void 929 gcpu_mca_init(cmi_hdl_t hdl) 930 { 931 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 932 uint64_t cap; 933 uint_t vendor = cmi_hdl_vendor(hdl); 934 uint_t family = cmi_hdl_family(hdl); 935 gcpu_mca_t *mca = &gcpu->gcpu_mca; 936 int mcg_ctl_present; 937 uint_t nbanks; 938 size_t mslsz; 939 int i; 940 941 if (gcpu == NULL) 942 return; 943 944 /* 945 * Protect from some silly /etc/system settings. 946 */ 947 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 948 gcpu_mca_telemetry_retries = 5; 949 950 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 951 return; 952 953 /* 954 * CPU startup code only calls cmi_mca_init if x86_feature indicates 955 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 956 * processors, which have their own * more primitive way of doing 957 * machine checks, will not have cmi_mca_init called since their 958 * CPUID information will not indicate both MCA and MCE features. 959 */ 960 #ifndef __xpv 961 ASSERT(x86_feature & X86_MCA); 962 #endif /* __xpv */ 963 964 /* 965 * Determine whether the IA32_MCG_CTL register is present. If it 966 * is we will enable all features by writing -1 to it towards 967 * the end of this initialization; if it is absent then volume 3A 968 * says we must nonetheless continue to initialize the individual 969 * banks. 970 */ 971 mcg_ctl_present = cap & MCG_CAP_CTL_P; 972 973 /* 974 * We squirell values away for inspection/debugging. 975 */ 976 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 977 if (mcg_ctl_present) 978 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 979 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 980 981 /* 982 * Determine the number of error-reporting banks implemented. 983 */ 984 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 985 986 if (nbanks != 0 && global_nbanks == 0) 987 global_nbanks = nbanks; /* no race - BSP will get here first */ 988 989 /* 990 * If someone is hiding the number of banks (perhaps we are fully 991 * virtualized?) or if this processor has more banks than the 992 * first to set global_nbanks then bail. The latter requirement 993 * is because we need to size our errorq data structure and we 994 * don't want to have to grow the errorq (destroy and recreate) 995 * which may just lose some telemetry. 996 */ 997 if (nbanks == 0 || nbanks > global_nbanks) 998 return; 999 1000 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1001 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1002 1003 /* 1004 * Calculate the size we need to allocate for a gcpu_logout_t 1005 * with a gcl_data array big enough for all banks of this cpu. 1006 * Add any space requested by the model-specific logout support. 1007 */ 1008 mslsz = cms_logout_size(hdl); 1009 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1010 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1011 1012 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1013 gcpu_logout_t *gcl; 1014 1015 mca->gcpu_mca_logout[i] = gcl = 1016 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1017 gcl->gcl_gcpu = gcpu; 1018 gcl->gcl_nbanks = nbanks; 1019 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1020 (char *)(&gcl->gcl_data[0]) + nbanks * 1021 sizeof (gcpu_bank_logout_t); 1022 1023 } 1024 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1025 1026 /* 1027 * Create our errorq to transport the logout structures. This 1028 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1029 */ 1030 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1031 1032 /* 1033 * Not knowing which, if any, banks are shared between cores we 1034 * assure serialization of MCA bank initialization by each cpu 1035 * on the chip. On chip architectures in which some banks are 1036 * shared this will mean the shared resource is initialized more 1037 * than once - we're simply aiming to avoid simultaneous MSR writes 1038 * to the shared resource. 1039 * 1040 * Even with these precautions, some platforms may yield a GP fault 1041 * if a core other than a designated master tries to write anything 1042 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1043 * those writes under on_trap protection. 1044 */ 1045 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1046 1047 /* 1048 * Initialize poller data, but don't start polling yet. 1049 */ 1050 gcpu_mca_poll_init(hdl); 1051 1052 /* 1053 * Work out which MCA banks we will initialize. In MCA logout 1054 * code we will only read those banks which we initialize here. 1055 */ 1056 for (i = 0; i < nbanks; i++) { 1057 /* 1058 * On Intel family 6 and AMD family 6 we must not enable 1059 * machine check from bank 0 detectors. In the Intel 1060 * case bank 0 is reserved for the platform, while in the 1061 * AMD case reports are that enabling bank 0 (DC) produces 1062 * spurious machine checks. 1063 */ 1064 if (i == 0 && ((vendor == X86_VENDOR_Intel || 1065 vendor == X86_VENDOR_AMD) && family == 6)) 1066 continue; 1067 1068 if (cms_bankctl_skipinit(hdl, i)) 1069 continue; 1070 1071 /* 1072 * Record which MCA banks were enabled, both from the 1073 * point of view of this core and accumulating for the 1074 * whole chip (if some cores share a bank we must be 1075 * sure either can logout from it). 1076 */ 1077 mca->gcpu_actv_banks |= 1 << i; 1078 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1079 } 1080 1081 /* 1082 * Log any valid telemetry lurking in the MCA banks, but do not 1083 * clear the status registers. Ignore the disposition returned - 1084 * we have already paniced or reset for any nasty errors found here. 1085 * 1086 * Intel vol 3A says that we should not do this on family 0x6, 1087 * and that for any extended family the BIOS clears things 1088 * on power-on reset so you'll only potentially find valid telemetry 1089 * on warm reset (we do it for both - on power-on reset we should 1090 * just see zeroes). 1091 * 1092 * AMD docs since K7 say we should process anything we find here. 1093 */ 1094 if (!gcpu_suppress_log_on_init && 1095 (vendor == X86_VENDOR_Intel && family >= 0xf || 1096 vendor == X86_VENDOR_AMD)) 1097 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE); 1098 1099 /* 1100 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1101 * model-specific module the power of veto. 1102 */ 1103 for (i = 0; i < nbanks; i++) { 1104 struct gcpu_bios_bankcfg *bcfgp = 1105 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1106 1107 /* 1108 * Stash inherited bank MCA state, even for banks we will 1109 * not initialize ourselves. Do not read the MISC register 1110 * unconditionally - on some processors that will #GP on 1111 * banks that do not implement the MISC register (would be 1112 * caught by on_trap, anyway). 1113 */ 1114 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1115 &bcfgp->bios_bank_ctl); 1116 1117 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1118 &bcfgp->bios_bank_status); 1119 1120 if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) 1121 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1122 &bcfgp->bios_bank_addr); 1123 1124 /* 1125 * In some old BIOS the status value after boot can indicate 1126 * MISCV when there is actually no MISC register for 1127 * that bank. The following read could therefore 1128 * aggravate a general protection fault. This should be 1129 * caught by on_trap, but the #GP fault handler is busted 1130 * and can suffer a double fault even before we get to 1131 * trap() to check for on_trap protection. Until that 1132 * issue is fixed we remove the one access that we know 1133 * can cause a #GP. 1134 * 1135 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1136 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1137 * &bcfgp->bios_bank_misc); 1138 */ 1139 bcfgp->bios_bank_misc = 0; 1140 1141 if (!(mca->gcpu_actv_banks & 1 << i)) 1142 continue; 1143 1144 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1145 cms_bankctl_val(hdl, i, -1ULL)); 1146 1147 if (!cms_bankstatus_skipinit(hdl, i)) { 1148 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1149 cms_bankstatus_val(hdl, i, 0ULL)); 1150 } 1151 } 1152 1153 /* 1154 * Now let the model-specific support perform further initialization 1155 * of non-architectural features. 1156 */ 1157 cms_mca_init(hdl, nbanks); 1158 1159 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1160 membar_producer(); 1161 1162 /* enable all machine-check features */ 1163 if (mcg_ctl_present) 1164 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1165 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1166 1167 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1168 1169 /* enable machine-check exception in CR4 */ 1170 cmi_hdl_enable_mce(hdl); 1171 } 1172 1173 static uint64_t 1174 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1175 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1176 { 1177 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1178 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1179 int nbanks = mca->gcpu_mca_nbanks; 1180 gcpu_mce_status_t mce; 1181 gcpu_bank_logout_t *gbl; 1182 uint64_t disp = 0; 1183 int i; 1184 1185 if (mcesp == NULL) 1186 mcesp = &mce; 1187 1188 mcesp->mce_nerr = nerr; 1189 1190 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1191 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1192 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1193 1194 /* 1195 * If this a machine check then if the return instruction pointer 1196 * is not valid the current context is lost. 1197 */ 1198 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1199 disp |= CMI_ERRDISP_RIPV_INVALID; 1200 1201 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1202 uint64_t mcistatus = gbl->gbl_status; 1203 uint32_t ms_scope; 1204 int pcc, uc; 1205 int poisoned; 1206 1207 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1208 continue; 1209 1210 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1211 continue; 1212 1213 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1214 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1215 mcesp->mce_npcc += pcc; 1216 mcesp->mce_nuc += uc; 1217 1218 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1219 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1220 1221 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1222 pcc = 0; 1223 mcesp->mce_npcc_ok++; 1224 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1225 } 1226 1227 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1228 uc = 0; 1229 mcesp->mce_nuc_ok++; 1230 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1231 } 1232 1233 if (uc) { 1234 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1235 if (poisoned) { 1236 mcesp->mce_nuc_poisoned++; 1237 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1238 } 1239 } 1240 1241 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1242 /* 1243 * We're not being instructed to ignore the error, 1244 * so apply our standard disposition logic to it. 1245 */ 1246 if (uc && !poisoned) { 1247 unconstrained++; 1248 gbl->gbl_disp |= disp | 1249 CMI_ERRDISP_UC_UNCONSTRAINED; 1250 } 1251 1252 if (pcc && ismc) { 1253 curctxbad++; 1254 gbl->gbl_disp |= disp | 1255 CMI_ERRDISP_CURCTXBAD; 1256 } 1257 1258 /* 1259 * Even if the above may not indicate that the error 1260 * is terminal, model-specific support may insist 1261 * that we treat it as such. Such errors wil be 1262 * fatal even if discovered via poll. 1263 */ 1264 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1265 forcefatal++; 1266 mcesp->mce_forcefatal++; 1267 gbl->gbl_disp |= disp | 1268 CMI_ERRDISP_FORCEFATAL; 1269 } 1270 } else { 1271 mcesp->mce_ignored++; 1272 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1273 } 1274 } 1275 1276 if (unconstrained > 0) 1277 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1278 1279 if (curctxbad > 0) 1280 disp |= CMI_ERRDISP_CURCTXBAD; 1281 1282 if (forcefatal > 0) 1283 disp |= CMI_ERRDISP_FORCEFATAL; 1284 1285 if (gcpu_mca_queue != NULL) { 1286 int how; 1287 1288 if (ismc) { 1289 how = cmi_mce_response(rp, disp) ? 1290 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1291 ERRORQ_SYNC; /* panic flow will drain */ 1292 } else { 1293 how = (disp & CMI_ERRDISP_FORCEFATAL && 1294 cmi_panic_on_ue()) ? 1295 ERRORQ_SYNC : /* poller will panic */ 1296 ERRORQ_ASYNC; /* no panic */ 1297 } 1298 1299 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1300 } else if (disp != 0) { 1301 gcpu_bleat(hdl, gcl); 1302 } 1303 1304 mcesp->mce_disp = disp; 1305 1306 return (disp); 1307 } 1308 1309 /* 1310 * Gather error telemetry from our source, and then submit it for 1311 * processing. 1312 */ 1313 1314 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1315 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1316 1317 #define STATUS_EQV(s1, s2) \ 1318 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1319 1320 static uint32_t gcpu_deferrred_polled_clears; 1321 1322 void 1323 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1324 gcpu_mce_status_t *mcesp, boolean_t clrstatus) 1325 { 1326 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1327 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1328 int nbanks = mca->gcpu_mca_nbanks; 1329 gcpu_bank_logout_t *gbl, *pgbl; 1330 gcpu_logout_t *gcl, *pgcl; 1331 int ismc = (rp != NULL); 1332 int ispoll = !ismc; 1333 int i, nerr = 0; 1334 cmi_errno_t err; 1335 uint64_t mcg_status; 1336 uint64_t disp; 1337 uint64_t cap; 1338 1339 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1340 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1341 CMI_SUCCESS) { 1342 if (mcesp != NULL) 1343 mcesp->mce_nerr = mcesp->mce_disp = 0; 1344 return; 1345 } 1346 1347 if (ismc) { 1348 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1349 } else { 1350 int pidx = mca->gcpu_mca_nextpoll_idx; 1351 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1352 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1353 1354 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1355 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1356 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1357 } 1358 1359 gcl->gcl_timestamp = gethrtime_waitfree(); 1360 gcl->gcl_mcg_status = mcg_status; 1361 gcl->gcl_ip = rp ? rp->r_pc : 0; 1362 1363 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1364 if (cap & MCG_CAP_TES_P) 1365 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1366 1367 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1368 uint64_t status, status2, addr, misc; 1369 int retries = gcpu_mca_telemetry_retries; 1370 1371 gbl->gbl_status = 0; 1372 gbl->gbl_disp = 0; 1373 gbl->gbl_clrdefcnt = 0; 1374 1375 /* 1376 * Only logout from MCA banks we have initialized from at 1377 * least one core. If a core shares an MCA bank with another 1378 * but perhaps lost the race to initialize it, then it must 1379 * still be allowed to logout from the shared bank. 1380 */ 1381 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1382 continue; 1383 1384 /* 1385 * On a poll look only at the banks we've been asked to check. 1386 */ 1387 if (rp == NULL && !(bankmask & 1 << i)) 1388 continue; 1389 1390 1391 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1392 CMI_SUCCESS) 1393 continue; 1394 retry: 1395 if (!(status & MSR_MC_STATUS_VAL)) 1396 continue; 1397 1398 addr = -1; 1399 misc = 0; 1400 1401 if (status & MSR_MC_STATUS_ADDRV) 1402 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1403 1404 if (status & MSR_MC_STATUS_MISCV) 1405 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1406 1407 /* 1408 * Allow the model-specific code to extract bank telemetry. 1409 */ 1410 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1411 1412 /* 1413 * Not all cpu models assure us that the status/address/misc 1414 * data will not change during the above sequence of MSR reads, 1415 * or that it can only change by the addition of the OVerflow 1416 * bit to the status register. If the status has changed 1417 * other than in the overflow bit then we attempt to reread 1418 * for a consistent snapshot, but eventually give up and 1419 * go with what we've got. We only perform this check 1420 * for a poll - a further #MC during a #MC will reset, and 1421 * polled errors should not overwrite higher-priority 1422 * trapping errors (but could set the overflow bit). 1423 */ 1424 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1425 &status2)) == CMI_SUCCESS) { 1426 if (!STATUS_EQV(status, status2)) { 1427 if (retries-- > 0) { 1428 status = status2; 1429 goto retry; 1430 } else { 1431 gbl->gbl_disp |= 1432 CMI_ERRDISP_INCONSISTENT; 1433 } 1434 } 1435 } else if (ispoll && err != CMI_SUCCESS) { 1436 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1437 } 1438 1439 nerr++; 1440 gbl->gbl_status = status; 1441 gbl->gbl_addr = addr; 1442 gbl->gbl_misc = misc; 1443 1444 if (clrstatus == B_FALSE) 1445 goto serialize; 1446 1447 /* 1448 * For machine checks we always clear status here. For polls 1449 * we must be a little more cautious since there is an 1450 * outside chance that we may clear telemetry from a shared 1451 * MCA bank on which a sibling core is machine checking. 1452 * 1453 * For polled observations of errors that look like they may 1454 * produce a machine check (UC/PCC and ENabled, although these 1455 * do not guarantee a machine check on error occurence) 1456 * we will not clear the status at this wakeup unless 1457 * we saw the same status at the previous poll. We will 1458 * always process and log the current observations - it 1459 * is only the clearing of MCi_STATUS which may be 1460 * deferred until the next wakeup. 1461 */ 1462 if (ismc || !IS_MCE_CANDIDATE(status)) { 1463 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1464 goto serialize; 1465 } 1466 1467 /* 1468 * We have a polled observation of a machine check 1469 * candidate. If we saw essentially the same status at the 1470 * last poll then clear the status now since this appears 1471 * not to be a #MC candidate after all. If we see quite 1472 * different status now then do not clear, but reconsider at 1473 * the next poll. In no actual machine check clears 1474 * the status in the interim then the status should not 1475 * keep changing forever (meaning we'd never clear it) 1476 * since before long we'll simply have latched the highest- 1477 * priority error and set the OVerflow bit. Nonetheless 1478 * we count how many times we defer clearing and after 1479 * a while insist on clearing the status. 1480 */ 1481 pgbl = &pgcl->gcl_data[i]; 1482 if (pgbl->gbl_clrdefcnt != 0) { 1483 /* We deferred clear on this bank at last wakeup */ 1484 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1485 pgbl->gbl_clrdefcnt > 5) { 1486 /* 1487 * Status is unchanged so clear it now and, 1488 * since we have already logged this info, 1489 * avoid logging it again. 1490 */ 1491 gbl->gbl_status = 0; 1492 nerr--; 1493 (void) cmi_hdl_wrmsr(hdl, 1494 IA32_MSR_MC(i, STATUS), 0ULL); 1495 } else { 1496 /* Record deferral for next wakeup */ 1497 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1498 } 1499 } else { 1500 /* Record initial deferral for next wakeup */ 1501 gbl->gbl_clrdefcnt = 1; 1502 gcpu_deferrred_polled_clears++; 1503 } 1504 1505 serialize: 1506 /* 1507 * Intel Vol 3A says to execute a serializing instruction 1508 * here, ie CPUID. Well WRMSR is also defined to be 1509 * serializing, so the status clear above should suffice. 1510 * To be a good citizen, and since some clears are deferred, 1511 * we'll execute a CPUID instruction here. 1512 */ 1513 { 1514 struct cpuid_regs tmp; 1515 (void) __cpuid_insn(&tmp); 1516 } 1517 } 1518 1519 if (gcpu_mca_stack_flag) 1520 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1521 else 1522 gcl->gcl_stackdepth = 0; 1523 1524 /* 1525 * Decide our disposition for this error or errors, and submit for 1526 * logging and subsequent diagnosis. 1527 */ 1528 if (nerr != 0) { 1529 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1530 } else { 1531 disp = 0; 1532 if (mcesp) { 1533 mcesp->mce_nerr = mcesp->mce_disp = 0; 1534 } 1535 } 1536 1537 /* 1538 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1539 * If a second #MC had occured before now the system would have 1540 * reset. We can only do thise once gcpu_mca_process has copied 1541 * the logout structure. 1542 */ 1543 if (ismc && mcg_status & MCG_STATUS_MCIP) 1544 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1545 1546 /* 1547 * At this point we have read and logged all telemetry that is visible 1548 * under the MCA. On architectures for which the NorthBridge is 1549 * on-chip this may include NB-observed errors, but where the NB 1550 * is off chip it may have been the source of the #MC request and 1551 * so we must call into the memory-controller driver to give it 1552 * a chance to log errors. 1553 */ 1554 if (ismc) { 1555 int willpanic = (cmi_mce_response(rp, disp) == 0); 1556 cmi_mc_logout(hdl, 1, willpanic); 1557 } 1558 } 1559 1560 int gcpu_mca_trap_vomit_summary = 0; 1561 1562 /* 1563 * On a native machine check exception we come here from mcetrap via 1564 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1565 * cpus of the chip, so it is possible that another cpu on this chip could 1566 * initiate a poll while we're in the #mc handler; it is also possible that 1567 * this trap has occured during a poll on this cpu. So we must acquire 1568 * the chip-wide poll lock, but be careful to avoid deadlock. 1569 * 1570 * The 'data' pointer cannot be NULL due to init order. 1571 */ 1572 uint64_t 1573 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1574 { 1575 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1576 kmutex_t *poll_lock = NULL; 1577 gcpu_mce_status_t mce; 1578 uint64_t mcg_status; 1579 int tooklock = 0; 1580 1581 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1582 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1583 return (0); 1584 1585 /* 1586 * Synchronize with any poller from another core that may happen 1587 * to share access to one or more of the MCA banks. 1588 */ 1589 if (gcpu->gcpu_shared != NULL) 1590 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1591 1592 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1593 /* 1594 * The lock is not owned by the thread we have 1595 * interrupted. Spin for this adaptive lock. 1596 */ 1597 while (!mutex_tryenter(poll_lock)) { 1598 while (mutex_owner(poll_lock) != NULL) 1599 ; 1600 } 1601 tooklock = 1; 1602 } 1603 1604 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE); 1605 1606 if (tooklock) 1607 mutex_exit(poll_lock); 1608 1609 /* 1610 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1611 */ 1612 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1613 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1614 "%u PCC (%u ok), " 1615 "%u UC (%d ok, %u poisoned), " 1616 "%u forcefatal, %u ignored", 1617 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1618 mce.mce_npcc, mce.mce_npcc_ok, 1619 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1620 mce.mce_forcefatal, mce.mce_ignored); 1621 } 1622 1623 return (mce.mce_disp); 1624 } 1625 1626 /*ARGSUSED*/ 1627 void 1628 gcpu_faulted_enter(cmi_hdl_t hdl) 1629 { 1630 /* Nothing to do here */ 1631 } 1632 1633 /*ARGSUSED*/ 1634 void 1635 gcpu_faulted_exit(cmi_hdl_t hdl) 1636 { 1637 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1638 1639 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1640 } 1641 1642 /* 1643 * Write the requested values to the indicated MSRs. Having no knowledge 1644 * of the model-specific requirements for writing to these model-specific 1645 * registers, we will only blindly write to those MSRs if the 'force' 1646 * argument is nonzero. That option should only be used in prototyping 1647 * and debugging. 1648 */ 1649 /*ARGSUSED*/ 1650 cmi_errno_t 1651 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1652 int force) 1653 { 1654 int i, errs = 0; 1655 1656 for (i = 0; i < nregs; i++) { 1657 uint_t msr = regs[i].cmr_msrnum; 1658 uint64_t val = regs[i].cmr_msrval; 1659 1660 if (cms_present(hdl)) { 1661 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1662 errs++; 1663 } else if (force) { 1664 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 1665 } else { 1666 errs++; 1667 } 1668 } 1669 1670 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 1671 } 1672