1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2010, Intel Corporation. 27 * All rights reserved. 28 */ 29 30 #include <sys/mca_x86.h> 31 #include <sys/cpu_module_impl.h> 32 #include <sys/cpu_module_ms.h> 33 #include <sys/cmn_err.h> 34 #include <sys/cpuvar.h> 35 #include <sys/pghw.h> 36 #include <sys/x86_archext.h> 37 #include <sys/sysmacros.h> 38 #include <sys/regset.h> 39 #include <sys/privregs.h> 40 #include <sys/systm.h> 41 #include <sys/types.h> 42 #include <sys/log.h> 43 #include <sys/psw.h> 44 #include <sys/fm/protocol.h> 45 #include <sys/fm/util.h> 46 #include <sys/errorq.h> 47 #include <sys/mca_x86.h> 48 #include <sys/fm/cpu/GMCA.h> 49 #include <sys/fm/smb/fmsmb.h> 50 #include <sys/sysevent.h> 51 #include <sys/ontrap.h> 52 53 #include "gcpu.h" 54 55 extern int x86gentopo_legacy; /* x86 generic topology support */ 56 57 static uint_t gcpu_force_addr_in_payload = 0; 58 59 /* 60 * Clear to log telemetry found at initialization. While processor docs 61 * say you should process this telemetry on all but Intel family 0x6 62 * there are way too many exceptions and we want to avoid bogus 63 * diagnoses. 64 */ 65 int gcpu_suppress_log_on_init = 1; 66 67 /* 68 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 69 * error logout time. The stack will be included in the ereport if the 70 * error type selects stack inclusion, or in all cases if 71 * gcpu_mca_stack_ereport_include is nonzero. 72 */ 73 int gcpu_mca_stack_flag = 0; 74 int gcpu_mca_stack_ereport_include = 0; 75 76 /* 77 * The number of times to re-read MCA telemetry to try to obtain a 78 * consistent snapshot if we find it to be changing under our feet. 79 */ 80 int gcpu_mca_telemetry_retries = 5; 81 82 #ifndef __xpv 83 int gcpu_mca_cmci_throttling_threshold = 10; 84 int gcpu_mca_cmci_reenable_threshold = 1000; 85 #endif 86 87 static gcpu_error_disp_t gcpu_errtypes[] = { 88 89 /* 90 * Unclassified 91 */ 92 { 93 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 94 NULL, 95 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 96 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 97 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 98 }, 99 100 /* 101 * Microcode ROM Parity Error 102 */ 103 { 104 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 105 NULL, 106 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 107 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 108 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 109 }, 110 111 /* 112 * External - BINIT# from another processor during power-on config 113 */ 114 { 115 FM_EREPORT_CPU_GENERIC_EXTERNAL, 116 NULL, 117 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 118 MCAX86_SIMPLE_EXTERNAL_MASKON, 119 MCAX86_SIMPLE_EXTERNAL_MASKOFF 120 }, 121 122 /* 123 * Functional redundancy check master/slave error 124 */ 125 { 126 FM_EREPORT_CPU_GENERIC_FRC, 127 NULL, 128 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 129 MCAX86_SIMPLE_FRC_MASKON, 130 MCAX86_SIMPLE_FRC_MASKOFF 131 }, 132 133 /* 134 * Internal parity error 135 */ 136 { 137 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 138 NULL, 139 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 140 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 141 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 142 }, 143 144 145 /* 146 * Internal timer error 147 */ 148 { 149 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 150 NULL, 151 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 152 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 153 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 154 }, 155 156 /* 157 * Internal unclassified 158 */ 159 { 160 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 161 NULL, 162 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 163 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 164 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 165 }, 166 167 /* 168 * Compound error codes - generic memory hierarchy 169 */ 170 { 171 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 172 NULL, 173 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 174 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 175 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 176 }, 177 178 /* 179 * Compound error codes - TLB errors 180 */ 181 { 182 FM_EREPORT_CPU_GENERIC_TLB, 183 "%1$s" "TLB" "%2$s" "_ERR", 184 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 185 MCAX86_COMPOUND_TLB_MASKON, 186 MCAX86_COMPOUND_TLB_MASKOFF 187 }, 188 189 /* 190 * Compound error codes - memory hierarchy 191 */ 192 { 193 FM_EREPORT_CPU_GENERIC_MEMHIER, 194 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 195 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 196 MCAX86_COMPOUND_MEMHIER_MASKON, 197 MCAX86_COMPOUND_MEMHIER_MASKOFF 198 }, 199 200 /* 201 * Compound error codes - bus and interconnect errors 202 */ 203 { 204 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 205 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 206 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 207 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 208 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 209 }, 210 /* 211 * Compound error codes - memory controller errors 212 */ 213 { 214 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 215 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 216 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 217 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 218 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 219 }, 220 }; 221 222 static gcpu_error_disp_t gcpu_unknown = { 223 FM_EREPORT_CPU_GENERIC_UNKNOWN, 224 "UNKNOWN", 225 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 226 0, 227 0 228 }; 229 230 static errorq_t *gcpu_mca_queue; 231 static kmutex_t gcpu_mca_queue_lock; 232 233 #ifdef __xpv 234 static int isxpv = 1; 235 #else 236 static int isxpv = 0; 237 #endif 238 239 static const gcpu_error_disp_t * 240 gcpu_disp_match(uint16_t code) 241 { 242 const gcpu_error_disp_t *ged = gcpu_errtypes; 243 int i; 244 245 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 246 i++, ged++) { 247 uint16_t on = ged->ged_errcode_mask_on; 248 uint16_t off = ged->ged_errcode_mask_off; 249 250 if ((code & on) == on && (code & off) == 0) 251 return (ged); 252 } 253 254 return (NULL); 255 } 256 257 static uint16_t 258 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 259 { 260 return ((code & mask) >> shift); 261 } 262 263 #define BIT_STRIP(code, name) \ 264 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 265 MCAX86_ERRCODE_##name##_SHIFT) 266 267 #define GCPU_MNEMONIC_UNDEF "undefined" 268 #define GCPU_MNEMONIC_RESVD "reserved" 269 270 /* 271 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 272 * mnemonics and to ereport class name components. 273 */ 274 275 struct gcpu_mnexp { 276 const char *mne_compound; /* used in expanding compound errname */ 277 const char *mne_ereport; /* used in expanding ereport class */ 278 }; 279 280 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 281 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 282 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 283 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 284 { GCPU_MNEMONIC_UNDEF, "" } 285 }; 286 287 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 288 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 289 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 290 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 291 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 292 }; 293 294 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 295 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 296 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 297 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 298 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 299 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 300 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 301 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 302 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 303 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 304 }; 305 306 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 307 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 308 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 309 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 310 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 311 }; 312 313 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 314 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 315 { GCPU_MNEMONIC_RESVD, "" }, 316 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 317 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 318 }; 319 320 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 321 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 322 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 323 }; 324 325 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 326 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 327 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 328 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 329 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 330 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 331 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 332 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 333 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 334 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 335 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 336 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 337 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 338 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 339 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 340 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 341 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 342 }; 343 344 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 345 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 346 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 347 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 348 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 349 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 350 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 351 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 352 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 353 }; 354 355 enum gcpu_mn_namespace { 356 GCPU_MN_NAMESPACE_COMPOUND, 357 GCPU_MN_NAMESPACE_EREPORT 358 }; 359 360 static const char * 361 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val, 362 enum gcpu_mn_namespace nspace) 363 { 364 if (val >= tbl_sz || val > 0xff) 365 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 366 367 switch (nspace) { 368 case GCPU_MN_NAMESPACE_COMPOUND: 369 return (tbl[val].mne_compound); 370 /*NOTREACHED*/ 371 372 case GCPU_MN_NAMESPACE_EREPORT: 373 return (tbl[val].mne_ereport); 374 /*NOTREACHED*/ 375 376 default: 377 return (GCPU_MNEMONIC_UNDEF); 378 /*NOTREACHED*/ 379 } 380 } 381 382 /* 383 * The ereport class leaf component is either a simple string with no 384 * format specifiers, or a string with one or more embedded %n$s specifiers - 385 * positional selection for string arguments. The kernel snprintf does 386 * not support %n$ (and teaching it to do so is too big a headache) so 387 * we will expand this restricted format string ourselves. 388 */ 389 390 #define GCPU_CLASS_VARCOMPS 9 391 392 #define GCPU_MNEMONIC(code, name, nspace) \ 393 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 394 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 395 BIT_STRIP(code, name), nspace) 396 397 static void 398 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 399 enum gcpu_mn_namespace nspace) 400 { 401 uint16_t code = MCAX86_ERRCODE(status); 402 const char *mn[GCPU_CLASS_VARCOMPS]; 403 char *p = buf; /* current position in buf */ 404 char *q = buf + buflen; /* pointer past last char in buf */ 405 int which, expfmtchar, error; 406 char c; 407 408 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 409 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 410 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 411 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 412 mn[4] = GCPU_MNEMONIC(code, II, nspace); 413 mn[5] = GCPU_MNEMONIC(code, T, nspace); 414 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 415 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 416 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 417 418 while (p < q - 1 && (c = *fmt++) != '\0') { 419 if (c != '%') { 420 /* not the beginning of a format specifier - copy */ 421 *p++ = c; 422 continue; 423 } 424 425 error = 0; 426 which = -1; 427 expfmtchar = -1; 428 429 nextfmt: 430 if ((c = *fmt++) == '\0') 431 break; /* early termination of fmt specifier */ 432 433 switch (c) { 434 case '1': 435 case '2': 436 case '3': 437 case '4': 438 case '5': 439 case '6': 440 case '7': 441 case '8': 442 case '9': 443 if (which != -1) { /* allow only one positional digit */ 444 error++; 445 break; 446 } 447 which = c - '1'; 448 goto nextfmt; 449 /*NOTREACHED*/ 450 451 case '$': 452 if (which == -1) { /* no position specified */ 453 error++; 454 break; 455 } 456 expfmtchar = 's'; 457 goto nextfmt; 458 /*NOTREACHED*/ 459 460 case 's': 461 if (expfmtchar != 's') { 462 error++; 463 break; 464 } 465 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 466 mn[which]); 467 p += strlen(p); 468 break; 469 470 default: 471 error++; 472 break; 473 } 474 475 if (error) 476 break; 477 } 478 479 *p = '\0'; /* NUL termination */ 480 } 481 482 static void 483 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 484 const char *cpuclass, const char *leafclass) 485 { 486 char *p = buf; /* current position in buf */ 487 char *q = buf + buflen; /* pointer past last char in buf */ 488 489 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 490 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 491 492 p += strlen(p); 493 if (p >= q) 494 return; 495 496 if (leafclass == NULL) { 497 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 498 GCPU_MN_NAMESPACE_EREPORT); 499 } else { 500 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 501 leafclass); 502 } 503 } 504 505 /* 506 * Create an "hc" scheme FMRI identifying the given cpu with 507 * motherboard/chip/core/strand instance numbers. 508 */ 509 static nvlist_t * 510 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 511 { 512 nvlist_t *nvl, *fmri; 513 514 if ((nvl = fm_nvlist_create(nva)) == NULL) 515 return (NULL); 516 517 if (!x86gentopo_legacy) { 518 fmri = cmi_hdl_smb_bboard(hdl); 519 if (fmri == NULL) 520 return (NULL); 521 522 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 523 NULL, NULL, fmri, 3, 524 "chip", cmi_hdl_smb_chipid(hdl), 525 "core", cmi_hdl_coreid(hdl), 526 "strand", cmi_hdl_strandid(hdl)); 527 } else { 528 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 529 "motherboard", 0, 530 "chip", cmi_hdl_chipid(hdl), 531 "core", cmi_hdl_coreid(hdl), 532 "strand", cmi_hdl_strandid(hdl)); 533 } 534 535 return (nvl); 536 } 537 538 int gcpu_bleat_count_thresh = 5; 539 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 540 541 /* 542 * Called when we are unable to propogate a logout structure onto an 543 * errorq for subsequent ereport preparation and logging etc. The caller 544 * should usually only decide to call this for severe errors - those we 545 * suspect we may need to panic for. 546 */ 547 static void 548 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 549 { 550 hrtime_t now = gethrtime_waitfree(); 551 static hrtime_t gcpu_last_bleat; 552 gcpu_bank_logout_t *gbl; 553 static int bleatcount; 554 int i; 555 556 /* 557 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 558 * can come as fast as we like, but once we've spammed that many 559 * to the console we require a minimum interval to pass before 560 * any more complaints. 561 */ 562 if (++bleatcount > gcpu_bleat_count_thresh) { 563 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 564 return; 565 else 566 bleatcount = 0; 567 } 568 gcpu_last_bleat = now; 569 570 cmn_err(CE_WARN, 571 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 572 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 573 cmi_hdl_strandid(hdl)); 574 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 575 (u_longlong_t)gcl->gcl_mcg_status); 576 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 577 uint64_t status = gbl->gbl_status; 578 579 if (!(status & MSR_MC_STATUS_VAL)) 580 continue; 581 582 /* Force ADDRV for AMD Family 0xf and above */ 583 if (gcpu_force_addr_in_payload) 584 status = status | MSR_MC_STATUS_ADDRV; 585 586 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 587 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 588 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 589 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 590 i, IA32_MSR_MC(i, STATUS), 591 (u_longlong_t)gbl->gbl_status, 592 (u_longlong_t)gbl->gbl_addr, 593 (u_longlong_t)gbl->gbl_misc); 594 break; 595 596 case MSR_MC_STATUS_ADDRV: 597 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 598 "STAT 0x%016llx ADDR 0x%016llx", 599 i, IA32_MSR_MC(i, STATUS), 600 (u_longlong_t)gbl->gbl_status, 601 (u_longlong_t)gbl->gbl_addr); 602 break; 603 604 case MSR_MC_STATUS_MISCV: 605 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 606 "STAT 0x%016llx MISC 0x%016llx", 607 i, IA32_MSR_MC(i, STATUS), 608 (u_longlong_t)gbl->gbl_status, 609 (u_longlong_t)gbl->gbl_misc); 610 break; 611 612 default: 613 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 614 "STAT 0x%016llx", 615 i, IA32_MSR_MC(i, STATUS), 616 (u_longlong_t)gbl->gbl_status); 617 break; 618 619 } 620 } 621 } 622 623 #define _GCPU_BSTATUS(status, what) \ 624 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 625 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 626 627 static void 628 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 629 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 630 { 631 uint64_t members = ged ? ged->ged_ereport_members : 632 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 633 uint64_t mcg = gcl->gcl_mcg_status; 634 int mcip = mcg & MCG_STATUS_MCIP; 635 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 636 uint64_t bstat = gbl->gbl_status; 637 638 /* 639 * Include the compound error name if requested and if this 640 * is a compound error type. 641 */ 642 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 643 ged->ged_compound_fmt != NULL) { 644 char buf[FM_MAX_CLASS]; 645 646 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 647 GCPU_MN_NAMESPACE_COMPOUND); 648 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 649 DATA_TYPE_STRING, buf, NULL); 650 } 651 652 /* 653 * Include disposition information for this error 654 */ 655 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 656 gbl->gbl_disp != 0) { 657 int i, empty = 1; 658 char buf[128]; 659 char *p = buf, *q = buf + 128; 660 static struct _gcpu_disp_name { 661 uint64_t dv; 662 const char *dn; 663 } disp_names[] = { 664 { CMI_ERRDISP_CURCTXBAD, 665 "processor_context_corrupt" }, 666 { CMI_ERRDISP_RIPV_INVALID, 667 "return_ip_invalid" }, 668 { CMI_ERRDISP_UC_UNCONSTRAINED, 669 "unconstrained" }, 670 { CMI_ERRDISP_FORCEFATAL, 671 "forcefatal" }, 672 { CMI_ERRDISP_IGNORED, 673 "ignored" }, 674 { CMI_ERRDISP_PCC_CLEARED, 675 "corrupt_context_cleared" }, 676 { CMI_ERRDISP_UC_CLEARED, 677 "uncorrected_data_cleared" }, 678 { CMI_ERRDISP_POISONED, 679 "poisoned" }, 680 { CMI_ERRDISP_INCONSISTENT, 681 "telemetry_unstable" }, 682 }; 683 684 for (i = 0; i < sizeof (disp_names) / 685 sizeof (struct _gcpu_disp_name); i++) { 686 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 687 continue; 688 689 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 690 "%s%s", empty ? "" : ",", disp_names[i].dn); 691 p += strlen(p); 692 empty = 0; 693 } 694 695 if (p != buf) 696 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 697 DATA_TYPE_STRING, buf, NULL); 698 } 699 700 /* 701 * If MCG_STATUS is included add that and an indication of whether 702 * this ereport was the result of a machine check or poll. 703 */ 704 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 705 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 706 DATA_TYPE_UINT64, mcg, NULL); 707 708 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 709 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 710 } 711 712 /* 713 * If an instruction pointer is to be included add one provided 714 * MCG_STATUS indicated it is valid; meaningless for polled events. 715 */ 716 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 717 mcg & MCG_STATUS_EIPV) { 718 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 719 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 720 } 721 722 /* 723 * Add an indication of whether the trap occured during privileged code. 724 */ 725 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 726 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 727 DATA_TYPE_BOOLEAN_VALUE, 728 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 729 } 730 731 /* 732 * If requested, add the index of the MCA bank. This indicates the 733 * n'th bank of 4 MCA registers, and does not necessarily correspond 734 * to MCi_* - use the bank offset to correlate 735 */ 736 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 737 fm_payload_set(ereport, 738 /* Bank number */ 739 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 740 /* Offset of MCi_CTL */ 741 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 742 IA32_MSR_MC(bankno, CTL), 743 NULL); 744 } 745 746 /* 747 * Add MCi_STATUS if requested, and decode it. 748 */ 749 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 750 const char *tbes[] = { 751 "No tracking", /* 00 */ 752 "Green - below threshold", /* 01 */ 753 "Yellow - above threshold", /* 10 */ 754 "Reserved" /* 11 */ 755 }; 756 757 fm_payload_set(ereport, 758 /* Bank MCi_STATUS */ 759 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 760 /* Overflow? */ 761 _GCPU_BSTATUS(bstat, OVER), 762 /* Uncorrected? */ 763 _GCPU_BSTATUS(bstat, UC), 764 /* Enabled? */ 765 _GCPU_BSTATUS(bstat, EN), 766 /* Processor context corrupt? */ 767 _GCPU_BSTATUS(bstat, PCC), 768 /* Error code */ 769 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 770 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 771 /* Model-specific error code */ 772 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 773 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 774 NULL); 775 776 /* 777 * If MCG_CAP.TES_P indicates that that thresholding info 778 * is present in the architural component of the bank status 779 * then include threshold information for this bank. 780 */ 781 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 782 fm_payload_set(ereport, 783 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 784 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 785 NULL); 786 } 787 } 788 789 /* 790 * Add MCi_ADDR info if requested and valid. We force addition of 791 * MCi_ADDR, even if its not valid on AMD family 0xf and above, 792 * to aid in analysis of ereports, for WatchDog errors. 793 */ 794 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 795 ((bstat & MSR_MC_STATUS_ADDRV) || 796 gcpu_force_addr_in_payload)) { 797 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 798 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 799 } 800 801 /* 802 * MCi_MISC if requested and MCi_STATUS.MISCV). 803 */ 804 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 805 bstat & MSR_MC_STATUS_MISCV) { 806 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 807 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 808 } 809 810 } 811 812 /* 813 * Construct and post an ereport based on the logout information from a 814 * single MCA bank. We are not necessarily running on the cpu that 815 * detected the error. 816 */ 817 static void 818 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 819 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 820 { 821 gcpu_data_t *gcpu = gcl->gcl_gcpu; 822 cmi_hdl_t hdl = gcpu->gcpu_hdl; 823 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 824 const char *cpuclass = NULL, *leafclass = NULL; 825 uint16_t code = MCAX86_ERRCODE(status); 826 errorq_elem_t *eqep, *scr_eqep; 827 nvlist_t *ereport, *detector; 828 char buf[FM_MAX_CLASS]; 829 const char *classfmt; 830 nv_alloc_t *nva; 831 832 if (panicstr) { 833 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 834 return; 835 ereport = errorq_elem_nvl(ereport_errorq, eqep); 836 837 /* 838 * Allocate another element for scratch space, but fallback 839 * to the one we have if that fails. We'd like to use the 840 * additional scratch space for nvlist construction. 841 */ 842 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 843 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 844 else 845 nva = errorq_elem_nva(ereport_errorq, eqep); 846 } else { 847 ereport = fm_nvlist_create(NULL); 848 nva = NULL; 849 } 850 851 if (ereport == NULL) 852 return; 853 854 /* 855 * Common payload data required by the protocol: 856 * - ereport class 857 * - detector 858 * - ENA 859 */ 860 861 /* 862 * Ereport class - call into model-specific support to allow it to 863 * provide a cpu class or leaf class, otherwise calculate our own. 864 */ 865 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 866 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 867 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 868 leafclass); 869 870 /* 871 * The detector FMRI. 872 */ 873 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 874 nva)) == NULL) 875 detector = gcpu_fmri_create(hdl, nva); 876 877 /* 878 * Should we define a new ENA format 3?? for chip/core/strand? 879 * It will be better when virtualized. 880 */ 881 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 882 fm_ena_generate_cpu(gcl->gcl_timestamp, 883 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 884 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 885 886 if (panicstr) { 887 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 888 nv_alloc_reset(nva); 889 } else { 890 fm_nvlist_destroy(detector, FM_NVA_FREE); 891 } 892 893 /* 894 * Add the architectural ereport class-specific payload data. 895 */ 896 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 897 898 /* 899 * Allow model-specific code to add ereport members. 900 */ 901 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 902 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 903 904 /* 905 * Include stack if options is turned on and either selected in 906 * the payload member bitmask or inclusion is forced. 907 */ 908 if (gcpu_mca_stack_flag && 909 (cms_ereport_includestack(hdl, mscookie) == 910 B_TRUE || gcpu_mca_stack_ereport_include)) { 911 fm_payload_stack_add(ereport, gcl->gcl_stack, 912 gcl->gcl_stackdepth); 913 } 914 915 /* 916 * If injection has taken place anytime in the past then note this 917 * on the ereport. 918 */ 919 if (cmi_inj_tainted() == B_TRUE) { 920 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 921 B_TRUE, NULL); 922 } 923 924 /* 925 * Post ereport. 926 */ 927 if (panicstr) { 928 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 929 if (scr_eqep) 930 errorq_cancel(ereport_errorq, scr_eqep); 931 } else { 932 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 933 fm_nvlist_destroy(ereport, FM_NVA_FREE); 934 } 935 936 } 937 938 /*ARGSUSED*/ 939 void 940 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 941 { 942 const gcpu_logout_t *gcl = data; 943 const gcpu_bank_logout_t *gbl; 944 int ismc; 945 int i; 946 947 ismc = gcl->ismc; 948 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 949 const gcpu_error_disp_t *gened; 950 cms_cookie_t mscookie; 951 952 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 953 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 954 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 955 956 /* 957 * Perform a match based on IA32 MCA architectural 958 * components alone. 959 */ 960 gened = gcpu_disp_match(code); /* may be NULL */ 961 962 /* 963 * Now see if an model-specific match can be made. 964 */ 965 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc, 966 i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 967 gcl->gcl_ms_logout); 968 969 /* 970 * Prepare and dispatch an ereport for logging and 971 * diagnosis. 972 */ 973 gcpu_ereport_post(gcl, i, gened, mscookie, 974 gbl->gbl_status); 975 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 976 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 977 /* 978 * Telemetry kept changing as we tried to read 979 * it. Force an unknown ereport leafclass but 980 * keep the telemetry unchanged for logging. 981 */ 982 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 983 gbl->gbl_status); 984 } 985 } 986 } 987 988 static size_t gcpu_mca_queue_datasz = 0; 989 990 /* 991 * The following code is ready to make a weak attempt at growing the 992 * errorq structure size. Since it is not foolproof (we don't know 993 * who may already be producing to the outgoing errorq) our caller 994 * instead assures that we'll always be called with no greater data 995 * size than on our first call. 996 */ 997 static void 998 gcpu_errorq_init(size_t datasz) 999 { 1000 int slots; 1001 1002 mutex_enter(&gcpu_mca_queue_lock); 1003 1004 if (gcpu_mca_queue_datasz >= datasz) { 1005 mutex_exit(&gcpu_mca_queue_lock); 1006 return; 1007 } 1008 1009 membar_producer(); 1010 if (gcpu_mca_queue) { 1011 gcpu_mca_queue_datasz = 0; 1012 errorq_destroy(gcpu_mca_queue); 1013 } 1014 1015 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1016 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1017 1018 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1019 NULL, slots, datasz, 1, ERRORQ_VITAL); 1020 1021 if (gcpu_mca_queue != NULL) 1022 gcpu_mca_queue_datasz = datasz; 1023 1024 mutex_exit(&gcpu_mca_queue_lock); 1025 } 1026 1027 /* 1028 * Perform MCA initialization as described in section 14.6 of Intel 64 1029 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1030 */ 1031 1032 static uint_t global_nbanks; 1033 1034 void 1035 gcpu_mca_init(cmi_hdl_t hdl) 1036 { 1037 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1038 uint64_t cap; 1039 uint_t vendor = cmi_hdl_vendor(hdl); 1040 uint_t family = cmi_hdl_family(hdl); 1041 uint_t rev = cmi_hdl_chiprev(hdl); 1042 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1043 int mcg_ctl_present; 1044 uint_t nbanks; 1045 uint32_t ctl_skip_mask = 0; 1046 uint32_t status_skip_mask = 0; 1047 size_t mslsz; 1048 int i; 1049 #ifndef __xpv 1050 int mcg_ctl2_present; 1051 uint32_t cmci_capable = 0; 1052 #endif 1053 if (gcpu == NULL) 1054 return; 1055 1056 /* We add MCi_ADDR always for AMD Family 0xf and above */ 1057 if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B)) 1058 gcpu_force_addr_in_payload = 1; 1059 1060 /* 1061 * Protect from some silly /etc/system settings. 1062 */ 1063 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1064 gcpu_mca_telemetry_retries = 5; 1065 1066 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1067 return; 1068 1069 /* 1070 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 1071 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 1072 * processors, which have their own more primitive way of doing 1073 * machine checks, will not have cmi_mca_init called since their 1074 * CPUID information will not indicate both MCA and MCE features. 1075 */ 1076 ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA)); 1077 1078 /* 1079 * Determine whether the IA32_MCG_CTL register is present. If it 1080 * is we will enable all features by writing -1 to it towards 1081 * the end of this initialization; if it is absent then volume 3A 1082 * says we must nonetheless continue to initialize the individual 1083 * banks. 1084 */ 1085 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1086 #ifndef __xpv 1087 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1088 #endif 1089 1090 /* 1091 * We squirell values away for inspection/debugging. 1092 */ 1093 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1094 if (mcg_ctl_present) 1095 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1096 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1097 1098 /* 1099 * Determine the number of error-reporting banks implemented. 1100 */ 1101 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1102 1103 if (nbanks != 0 && global_nbanks == 0) 1104 global_nbanks = nbanks; /* no race - BSP will get here first */ 1105 1106 /* 1107 * If someone is hiding the number of banks (perhaps we are fully 1108 * virtualized?) or if this processor has more banks than the 1109 * first to set global_nbanks then bail. The latter requirement 1110 * is because we need to size our errorq data structure and we 1111 * don't want to have to grow the errorq (destroy and recreate) 1112 * which may just lose some telemetry. 1113 */ 1114 if (nbanks == 0 || nbanks > global_nbanks) 1115 return; 1116 1117 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1118 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1119 1120 /* 1121 * Calculate the size we need to allocate for a gcpu_logout_t 1122 * with a gcl_data array big enough for all banks of this cpu. 1123 * Add any space requested by the model-specific logout support. 1124 */ 1125 mslsz = cms_logout_size(hdl); 1126 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1127 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1128 1129 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1130 gcpu_logout_t *gcl; 1131 1132 mca->gcpu_mca_logout[i] = gcl = 1133 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1134 gcl->gcl_gcpu = gcpu; 1135 gcl->gcl_nbanks = nbanks; 1136 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1137 (char *)(&gcl->gcl_data[0]) + nbanks * 1138 sizeof (gcpu_bank_logout_t); 1139 1140 } 1141 1142 #ifdef __xpv 1143 gcpu_xpv_mca_init(nbanks); 1144 #endif 1145 1146 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1147 1148 #ifndef __xpv 1149 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1150 KM_SLEEP); 1151 #endif 1152 1153 /* 1154 * Create our errorq to transport the logout structures. This 1155 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1156 */ 1157 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1158 1159 /* 1160 * Not knowing which, if any, banks are shared between cores we 1161 * assure serialization of MCA bank initialization by each cpu 1162 * on the chip. On chip architectures in which some banks are 1163 * shared this will mean the shared resource is initialized more 1164 * than once - we're simply aiming to avoid simultaneous MSR writes 1165 * to the shared resource. 1166 * 1167 * Even with these precautions, some platforms may yield a GP fault 1168 * if a core other than a designated master tries to write anything 1169 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1170 * those writes under on_trap protection. 1171 */ 1172 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1173 1174 /* 1175 * Initialize poller data, but don't start polling yet. 1176 */ 1177 gcpu_mca_poll_init(hdl); 1178 1179 /* 1180 * Work out which MCA banks we will initialize. In MCA logout 1181 * code we will only read those banks which we initialize here. 1182 */ 1183 for (i = 0; i < nbanks; i++) { 1184 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1185 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1186 1187 if (!cms_present(hdl)) { 1188 /* 1189 * Model-specific support is not present, try to use 1190 * sane defaults. 1191 * 1192 * On AMD family 6 processors, reports about spurious 1193 * machine checks indicate that bank 0 should be 1194 * skipped. 1195 * 1196 * On Intel family 6 processors, the documentation tells 1197 * us not to write to MC0_CTL. 1198 * 1199 */ 1200 if (i == 0 && family == 6) { 1201 switch (vendor) { 1202 case X86_VENDOR_AMD: 1203 skipstatus = B_TRUE; 1204 /*FALLTHRU*/ 1205 case X86_VENDOR_Intel: 1206 skipctl = B_TRUE; 1207 break; 1208 } 1209 } 1210 } 1211 1212 ctl_skip_mask |= skipctl << i; 1213 status_skip_mask |= skipstatus << i; 1214 1215 if (skipctl && skipstatus) 1216 continue; 1217 1218 /* 1219 * Record which MCA banks were enabled, from the point of view 1220 * of the whole chip (if some cores share a bank we must be 1221 * sure either can logout from it). 1222 */ 1223 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1224 1225 #ifndef __xpv 1226 /* 1227 * check CMCI capability 1228 */ 1229 if (mcg_ctl2_present) { 1230 uint64_t ctl2; 1231 uint32_t cap = 0; 1232 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1233 if (ctl2 & MSR_MC_CTL2_EN) 1234 continue; 1235 ctl2 |= MSR_MC_CTL2_EN; 1236 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1237 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1238 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1239 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1240 if (cap) 1241 cmci_capable ++; 1242 /* 1243 * Set threshold to 1 while unset the en field, to avoid 1244 * CMCI trigged before APIC LVT entry init. 1245 */ 1246 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1247 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1248 1249 /* 1250 * init cmci related count 1251 */ 1252 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1253 mca->gcpu_bank_cmci[i].drtcmci = 0; 1254 mca->gcpu_bank_cmci[i].ncmci = 0; 1255 } 1256 #endif 1257 } 1258 1259 #ifndef __xpv 1260 if (cmci_capable) 1261 cmi_enable_cmci = 1; 1262 #endif 1263 1264 #ifndef __xpv 1265 /* 1266 * Log any valid telemetry lurking in the MCA banks, but do not 1267 * clear the status registers. Ignore the disposition returned - 1268 * we have already paniced or reset for any nasty errors found here. 1269 * 1270 * Intel vol 3A says that we should not do this on family 0x6, 1271 * and that for any extended family the BIOS clears things 1272 * on power-on reset so you'll only potentially find valid telemetry 1273 * on warm reset (we do it for both - on power-on reset we should 1274 * just see zeroes). 1275 * 1276 * AMD docs since K7 say we should process anything we find here. 1277 */ 1278 if (!gcpu_suppress_log_on_init && 1279 (vendor == X86_VENDOR_Intel && family >= 0xf || 1280 vendor == X86_VENDOR_AMD)) 1281 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1282 GCPU_MPT_WHAT_POKE_ERR); 1283 1284 /* 1285 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1286 * model-specific module the power of veto. 1287 */ 1288 for (i = 0; i < nbanks; i++) { 1289 struct gcpu_bios_bankcfg *bcfgp = 1290 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1291 1292 /* 1293 * Stash inherited bank MCA state, even for banks we will 1294 * not initialize ourselves. Do not read the MISC register 1295 * unconditionally - on some processors that will #GP on 1296 * banks that do not implement the MISC register (would be 1297 * caught by on_trap, anyway). 1298 */ 1299 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1300 &bcfgp->bios_bank_ctl); 1301 1302 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1303 &bcfgp->bios_bank_status); 1304 1305 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) || 1306 gcpu_force_addr_in_payload) { 1307 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1308 &bcfgp->bios_bank_addr); 1309 } 1310 1311 /* 1312 * In some old BIOS the status value after boot can indicate 1313 * MISCV when there is actually no MISC register for 1314 * that bank. The following read could therefore 1315 * aggravate a general protection fault. This should be 1316 * caught by on_trap, but the #GP fault handler is busted 1317 * and can suffer a double fault even before we get to 1318 * trap() to check for on_trap protection. Until that 1319 * issue is fixed we remove the one access that we know 1320 * can cause a #GP. 1321 * 1322 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1323 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1324 * &bcfgp->bios_bank_misc); 1325 */ 1326 bcfgp->bios_bank_misc = 0; 1327 1328 if (!(ctl_skip_mask & (1 << i))) { 1329 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1330 cms_bankctl_val(hdl, i, -1ULL)); 1331 } 1332 1333 if (!(status_skip_mask & (1 << i))) { 1334 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1335 cms_bankstatus_val(hdl, i, 0ULL)); 1336 } 1337 } 1338 #endif 1339 /* 1340 * Now let the model-specific support perform further initialization 1341 * of non-architectural features. 1342 */ 1343 cms_mca_init(hdl, nbanks); 1344 1345 #ifndef __xpv 1346 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1347 membar_producer(); 1348 1349 /* enable all machine-check features */ 1350 if (mcg_ctl_present) 1351 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1352 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1353 #endif 1354 1355 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1356 1357 #ifndef __xpv 1358 /* enable machine-check exception in CR4 */ 1359 cmi_hdl_enable_mce(hdl); 1360 #endif 1361 } 1362 1363 static uint64_t 1364 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1365 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1366 { 1367 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1368 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1369 int nbanks = mca->gcpu_mca_nbanks; 1370 gcpu_mce_status_t mce; 1371 gcpu_bank_logout_t *gbl; 1372 uint64_t disp = 0; 1373 int i; 1374 1375 if (mcesp == NULL) 1376 mcesp = &mce; 1377 1378 mcesp->mce_nerr = nerr; 1379 1380 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1381 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1382 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1383 1384 /* 1385 * If this a machine check then if the return instruction pointer 1386 * is not valid the current context is lost. 1387 */ 1388 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1389 disp |= CMI_ERRDISP_RIPV_INVALID; 1390 gcl->ismc = ismc; 1391 1392 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1393 uint64_t mcistatus = gbl->gbl_status; 1394 uint32_t ms_scope; 1395 int pcc, uc; 1396 int poisoned; 1397 1398 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1399 continue; 1400 1401 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1402 continue; 1403 1404 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1405 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1406 mcesp->mce_npcc += pcc; 1407 mcesp->mce_nuc += uc; 1408 1409 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1410 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1411 1412 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1413 pcc = 0; 1414 mcesp->mce_npcc_ok++; 1415 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1416 } 1417 1418 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1419 uc = 0; 1420 mcesp->mce_nuc_ok++; 1421 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1422 } 1423 1424 if (uc) { 1425 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1426 if (poisoned) { 1427 mcesp->mce_nuc_poisoned++; 1428 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1429 } 1430 } 1431 1432 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1433 /* 1434 * We're not being instructed to ignore the error, 1435 * so apply our standard disposition logic to it. 1436 */ 1437 if (uc && !poisoned) { 1438 unconstrained++; 1439 gbl->gbl_disp |= disp | 1440 CMI_ERRDISP_UC_UNCONSTRAINED; 1441 } 1442 1443 if (pcc && ismc) { 1444 curctxbad++; 1445 gbl->gbl_disp |= disp | 1446 CMI_ERRDISP_CURCTXBAD; 1447 } 1448 1449 /* 1450 * Even if the above may not indicate that the error 1451 * is terminal, model-specific support may insist 1452 * that we treat it as such. Such errors wil be 1453 * fatal even if discovered via poll. 1454 */ 1455 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1456 forcefatal++; 1457 mcesp->mce_forcefatal++; 1458 gbl->gbl_disp |= disp | 1459 CMI_ERRDISP_FORCEFATAL; 1460 } 1461 } else { 1462 mcesp->mce_ignored++; 1463 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1464 } 1465 } 1466 1467 if (unconstrained > 0) 1468 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1469 1470 if (curctxbad > 0) 1471 disp |= CMI_ERRDISP_CURCTXBAD; 1472 1473 if (forcefatal > 0) 1474 disp |= CMI_ERRDISP_FORCEFATAL; 1475 1476 if (gcpu_mca_queue != NULL) { 1477 int how; 1478 1479 if (ismc) { 1480 how = cmi_mce_response(rp, disp) ? 1481 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1482 ERRORQ_SYNC; /* panic flow will drain */ 1483 } else { 1484 how = (disp & CMI_ERRDISP_FORCEFATAL && 1485 cmi_panic_on_ue()) ? 1486 ERRORQ_SYNC : /* poller will panic */ 1487 ERRORQ_ASYNC; /* no panic */ 1488 } 1489 1490 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1491 } else if (disp != 0) { 1492 gcpu_bleat(hdl, gcl); 1493 } 1494 1495 mcesp->mce_disp = disp; 1496 1497 return (disp); 1498 } 1499 1500 /* 1501 * Gather error telemetry from our source, and then submit it for 1502 * processing. 1503 */ 1504 1505 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1506 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1507 1508 #define STATUS_EQV(s1, s2) \ 1509 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1510 1511 static uint32_t gcpu_deferrred_polled_clears; 1512 1513 #ifndef __xpv 1514 static void 1515 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1516 uint64_t status, int what) 1517 { 1518 uint64_t ctl2; 1519 1520 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1521 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1522 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1523 1524 if (!(bank_cmci_p->cmci_enabled)) { 1525 /* 1526 * when cmci is disabled, and the bank has no error or 1527 * no corrected error for 1528 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1529 * turn on this bank's cmci. 1530 */ 1531 1532 bank_cmci_p->drtcmci ++; 1533 1534 if (bank_cmci_p->drtcmci >= 1535 gcpu_mca_cmci_reenable_threshold) { 1536 1537 /* turn on cmci */ 1538 1539 (void) cmi_hdl_rdmsr(hdl, 1540 IA32_MSR_MC_CTL2(bank), &ctl2); 1541 ctl2 |= MSR_MC_CTL2_EN; 1542 (void) cmi_hdl_wrmsr(hdl, 1543 IA32_MSR_MC_CTL2(bank), ctl2); 1544 1545 /* reset counter and set flag */ 1546 bank_cmci_p->drtcmci = 0; 1547 bank_cmci_p->cmci_enabled = 1; 1548 } 1549 } else { 1550 /* 1551 * when cmci is enabled,if is in cyclic poll and the 1552 * bank has no error or no corrected error, reset ncmci 1553 * counter 1554 */ 1555 bank_cmci_p->ncmci = 0; 1556 } 1557 } 1558 } 1559 1560 static void 1561 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1562 int what) 1563 { 1564 uint64_t ctl2 = 0; 1565 1566 /* 1567 * if cmci of this bank occurred beyond 1568 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1569 * turn off this bank's CMCI; 1570 */ 1571 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1572 1573 /* if it is cmci trap, increase the count */ 1574 bank_cmci_p->ncmci++; 1575 1576 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1577 1578 /* turn off cmci */ 1579 1580 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1581 &ctl2); 1582 ctl2 &= ~MSR_MC_CTL2_EN; 1583 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1584 ctl2); 1585 1586 /* clear the flag and count */ 1587 1588 bank_cmci_p->cmci_enabled = 0; 1589 bank_cmci_p->ncmci = 0; 1590 } 1591 } 1592 } 1593 #endif 1594 1595 static void 1596 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1597 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1598 { 1599 int i; 1600 gcpu_bank_logout_t *gbl, *pgbl; 1601 uint64_t status; 1602 1603 if (first < 0 || last < 0) 1604 return; 1605 1606 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1607 status = gbl->gbl_status; 1608 if (status == 0) 1609 continue; 1610 if (clrstatus == B_FALSE) 1611 goto serialize; 1612 1613 /* 1614 * For i86xpv we always clear status in order to invalidate 1615 * the interposed telemetry. 1616 * 1617 * For native machine checks we always clear status here. For 1618 * native polls we must be a little more cautious since there 1619 * is an outside chance that we may clear telemetry from a 1620 * shared MCA bank on which a sibling core is machine checking. 1621 * 1622 * For polled observations of errors that look like they may 1623 * produce a machine check (UC/PCC and ENabled, although these 1624 * do not guarantee a machine check on error occurence) 1625 * we will not clear the status at this wakeup unless 1626 * we saw the same status at the previous poll. We will 1627 * always process and log the current observations - it 1628 * is only the clearing of MCi_STATUS which may be 1629 * deferred until the next wakeup. 1630 */ 1631 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1632 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1633 goto serialize; 1634 } 1635 1636 /* 1637 * We have a polled observation of a machine check 1638 * candidate. If we saw essentially the same status at the 1639 * last poll then clear the status now since this appears 1640 * not to be a #MC candidate after all. If we see quite 1641 * different status now then do not clear, but reconsider at 1642 * the next poll. In no actual machine check clears 1643 * the status in the interim then the status should not 1644 * keep changing forever (meaning we'd never clear it) 1645 * since before long we'll simply have latched the highest- 1646 * priority error and set the OVerflow bit. Nonetheless 1647 * we count how many times we defer clearing and after 1648 * a while insist on clearing the status. 1649 */ 1650 pgbl = &pgcl->gcl_data[i]; 1651 if (pgbl->gbl_clrdefcnt != 0) { 1652 /* We deferred clear on this bank at last wakeup */ 1653 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1654 pgbl->gbl_clrdefcnt > 5) { 1655 /* 1656 * Status is unchanged so clear it now and, 1657 * since we have already logged this info, 1658 * avoid logging it again. 1659 */ 1660 gbl->gbl_status = 0; 1661 (void) cmi_hdl_wrmsr(hdl, 1662 IA32_MSR_MC(i, STATUS), 0ULL); 1663 } else { 1664 /* Record deferral for next wakeup */ 1665 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1666 } 1667 } else { 1668 /* Record initial deferral for next wakeup */ 1669 gbl->gbl_clrdefcnt = 1; 1670 gcpu_deferrred_polled_clears++; 1671 } 1672 1673 serialize: 1674 { 1675 #ifdef __xpv 1676 ; 1677 #else 1678 /* 1679 * Intel Vol 3A says to execute a serializing 1680 * instruction here, ie CPUID. Well WRMSR is also 1681 * defined to be serializing, so the status clear above 1682 * should suffice. To be a good citizen, and since 1683 * some clears are deferred, we'll execute a CPUID 1684 * instruction here. 1685 */ 1686 struct cpuid_regs tmp; 1687 (void) __cpuid_insn(&tmp); 1688 #endif 1689 } 1690 } 1691 } 1692 1693 /*ARGSUSED5*/ 1694 void 1695 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1696 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1697 { 1698 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1699 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1700 int nbanks = mca->gcpu_mca_nbanks; 1701 gcpu_bank_logout_t *gbl, *pgbl; 1702 gcpu_logout_t *gcl, *pgcl; 1703 int ismc = (rp != NULL); 1704 int ispoll = !ismc; 1705 int i, nerr = 0; 1706 cmi_errno_t err; 1707 uint64_t mcg_status; 1708 uint64_t disp; 1709 uint64_t cap; 1710 int first = -1; 1711 int last = -1; 1712 int willpanic = 0; 1713 1714 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1715 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1716 CMI_SUCCESS) { 1717 if (mcesp != NULL) 1718 mcesp->mce_nerr = mcesp->mce_disp = 0; 1719 return; 1720 } 1721 1722 if (ismc) { 1723 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1724 } else { 1725 int pidx = mca->gcpu_mca_nextpoll_idx; 1726 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1727 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1728 1729 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1730 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1731 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1732 } 1733 1734 gcl->gcl_timestamp = gethrtime_waitfree(); 1735 gcl->gcl_mcg_status = mcg_status; 1736 gcl->gcl_ip = rp ? rp->r_pc : 0; 1737 1738 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1739 if (cap & MCG_CAP_TES_P) 1740 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1741 1742 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1743 uint64_t status, status2, addr, misc; 1744 int retries = gcpu_mca_telemetry_retries; 1745 1746 gbl->gbl_status = 0; 1747 gbl->gbl_disp = 0; 1748 gbl->gbl_clrdefcnt = 0; 1749 1750 /* 1751 * Only logout from MCA banks we have initialized from at 1752 * least one core. If a core shares an MCA bank with another 1753 * but perhaps lost the race to initialize it, then it must 1754 * still be allowed to logout from the shared bank. 1755 */ 1756 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1757 continue; 1758 1759 /* 1760 * On a poll look only at the banks we've been asked to check. 1761 */ 1762 if (rp == NULL && !(bankmask & 1 << i)) 1763 continue; 1764 1765 1766 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1767 CMI_SUCCESS) 1768 continue; 1769 1770 #ifndef __xpv 1771 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1772 #endif 1773 1774 retry: 1775 if (!(status & MSR_MC_STATUS_VAL)) 1776 continue; 1777 1778 /* First and last bank that have valid status */ 1779 if (first < 0) 1780 first = i; 1781 last = i; 1782 1783 addr = -1; 1784 misc = 0; 1785 1786 if ((status & MSR_MC_STATUS_ADDRV) || 1787 gcpu_force_addr_in_payload) 1788 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1789 1790 if (status & MSR_MC_STATUS_MISCV) 1791 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1792 1793 #ifndef __xpv 1794 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1795 #endif 1796 1797 /* 1798 * Allow the model-specific code to extract bank telemetry. 1799 */ 1800 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1801 1802 /* 1803 * Not all cpu models assure us that the status/address/misc 1804 * data will not change during the above sequence of MSR reads, 1805 * or that it can only change by the addition of the OVerflow 1806 * bit to the status register. If the status has changed 1807 * other than in the overflow bit then we attempt to reread 1808 * for a consistent snapshot, but eventually give up and 1809 * go with what we've got. We only perform this check 1810 * for a poll - a further #MC during a #MC will reset, and 1811 * polled errors should not overwrite higher-priority 1812 * trapping errors (but could set the overflow bit). 1813 */ 1814 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1815 &status2)) == CMI_SUCCESS) { 1816 if (!STATUS_EQV(status, status2)) { 1817 if (retries-- > 0) { 1818 status = status2; 1819 goto retry; 1820 } else { 1821 gbl->gbl_disp |= 1822 CMI_ERRDISP_INCONSISTENT; 1823 } 1824 } 1825 } else if (ispoll && err != CMI_SUCCESS) { 1826 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1827 } 1828 1829 nerr++; 1830 gbl->gbl_status = status; 1831 gbl->gbl_addr = addr; 1832 gbl->gbl_misc = misc; 1833 1834 /* 1835 * For polled observation, if the count of deferred status 1836 * clears updated in the clear_mc() is nonzero and the 1837 * MCi_STATUS has not changed, the last wakeup has produced 1838 * the ereport of the error. Therefore, clear the status in 1839 * this wakeup to avoid duplicate ereport. 1840 */ 1841 pgbl = &pgcl->gcl_data[i]; 1842 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1843 pgbl->gbl_clrdefcnt != 0) { 1844 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1845 gbl->gbl_status = 0; 1846 (void) cmi_hdl_wrmsr(hdl, 1847 IA32_MSR_MC(i, STATUS), 0ULL); 1848 } 1849 } 1850 } 1851 1852 if (gcpu_mca_stack_flag) 1853 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1854 else 1855 gcl->gcl_stackdepth = 0; 1856 1857 /* 1858 * Decide our disposition for this error or errors, and submit for 1859 * logging and subsequent diagnosis. 1860 */ 1861 if (nerr != 0) { 1862 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1863 1864 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1865 1866 if (!willpanic) 1867 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1868 } else { 1869 disp = 0; 1870 if (mcesp) { 1871 mcesp->mce_nerr = mcesp->mce_disp = 0; 1872 } 1873 } 1874 1875 /* 1876 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1877 * If a second #MC had occured before now the system would have 1878 * reset. We can only do thise once gcpu_mca_process has copied 1879 * the logout structure. 1880 */ 1881 if (ismc && mcg_status & MCG_STATUS_MCIP) 1882 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1883 1884 /* 1885 * At this point we have read and logged all telemetry that is visible 1886 * under the MCA. On architectures for which the NorthBridge is 1887 * on-chip this may include NB-observed errors, but where the NB 1888 * is off chip it may have been the source of the #MC request and 1889 * so we must call into the memory-controller driver to give it 1890 * a chance to log errors. 1891 */ 1892 if (ismc) { 1893 cmi_mc_logout(hdl, 1, willpanic); 1894 } 1895 } 1896 1897 #ifndef __xpv 1898 int gcpu_mca_trap_vomit_summary = 0; 1899 1900 /* 1901 * On a native machine check exception we come here from mcetrap via 1902 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1903 * cpus of the chip, so it is possible that another cpu on this chip could 1904 * initiate a poll while we're in the #mc handler; it is also possible that 1905 * this trap has occured during a poll on this cpu. So we must acquire 1906 * the chip-wide poll lock, but be careful to avoid deadlock. 1907 * 1908 * The 'data' pointer cannot be NULL due to init order. 1909 */ 1910 uint64_t 1911 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1912 { 1913 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1914 kmutex_t *poll_lock = NULL; 1915 gcpu_mce_status_t mce; 1916 uint64_t mcg_status; 1917 int tooklock = 0; 1918 1919 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1920 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1921 return (0); 1922 1923 /* 1924 * Synchronize with any poller from another core that may happen 1925 * to share access to one or more of the MCA banks. 1926 */ 1927 if (gcpu->gcpu_shared != NULL) 1928 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1929 1930 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1931 /* 1932 * The lock is not owned by the thread we have 1933 * interrupted. Spin for this adaptive lock. 1934 */ 1935 while (!mutex_tryenter(poll_lock)) { 1936 while (mutex_owner(poll_lock) != NULL) 1937 ; 1938 } 1939 tooklock = 1; 1940 } 1941 1942 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 1943 1944 if (tooklock) 1945 mutex_exit(poll_lock); 1946 1947 /* 1948 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1949 */ 1950 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1951 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1952 "%u PCC (%u ok), " 1953 "%u UC (%d ok, %u poisoned), " 1954 "%u forcefatal, %u ignored", 1955 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1956 mce.mce_npcc, mce.mce_npcc_ok, 1957 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1958 mce.mce_forcefatal, mce.mce_ignored); 1959 } 1960 1961 return (mce.mce_disp); 1962 } 1963 #endif 1964 1965 /*ARGSUSED*/ 1966 void 1967 gcpu_faulted_enter(cmi_hdl_t hdl) 1968 { 1969 /* Nothing to do here */ 1970 } 1971 1972 /*ARGSUSED*/ 1973 void 1974 gcpu_faulted_exit(cmi_hdl_t hdl) 1975 { 1976 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1977 1978 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1979 } 1980 1981 /* 1982 * Write the requested values to the indicated MSRs. Having no knowledge 1983 * of the model-specific requirements for writing to these model-specific 1984 * registers, we will only blindly write to those MSRs if the 'force' 1985 * argument is nonzero. That option should only be used in prototyping 1986 * and debugging. 1987 */ 1988 /*ARGSUSED*/ 1989 cmi_errno_t 1990 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1991 int force) 1992 { 1993 int i, errs = 0; 1994 1995 for (i = 0; i < nregs; i++) { 1996 uint_t msr = regs[i].cmr_msrnum; 1997 uint64_t val = regs[i].cmr_msrval; 1998 1999 if (cms_present(hdl)) { 2000 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 2001 errs++; 2002 } else if (force) { 2003 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 2004 } else { 2005 errs++; 2006 } 2007 } 2008 2009 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 2010 } 2011 2012 /* deconfigure gcpu_mca_init() */ 2013 void 2014 gcpu_mca_fini(cmi_hdl_t hdl) 2015 { 2016 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2017 gcpu_mca_t *mca = &gcpu->gcpu_mca; 2018 int i; 2019 2020 /* 2021 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 2022 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 2023 * processors, which have their own more primitive way of doing 2024 * machine checks, will not have cmi_mca_init called since their 2025 * CPUID information will not indicate both MCA and MCE features. 2026 */ 2027 if (!is_x86_feature(x86_featureset, X86FSET_MCA)) 2028 return; 2029 #ifndef __xpv 2030 /* 2031 * disable machine check in CR4 2032 */ 2033 cmi_ntv_hwdisable_mce(hdl); 2034 #endif 2035 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 2036 gcpu_mca_poll_fini(hdl); 2037 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 2038 2039 /* 2040 * free resources allocated during init 2041 */ 2042 if (mca->gcpu_bank_cmci != NULL) { 2043 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) * 2044 mca->gcpu_mca_nbanks); 2045 } 2046 2047 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 2048 if (mca->gcpu_mca_logout[i] != NULL) { 2049 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz); 2050 } 2051 } 2052 2053 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) { 2054 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg, 2055 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks); 2056 } 2057 } 2058