1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/mca_x86.h> 28 #include <sys/cpu_module_impl.h> 29 #include <sys/cpu_module_ms.h> 30 #include <sys/cmn_err.h> 31 #include <sys/cpuvar.h> 32 #include <sys/pghw.h> 33 #include <sys/x86_archext.h> 34 #include <sys/sysmacros.h> 35 #include <sys/regset.h> 36 #include <sys/privregs.h> 37 #include <sys/systm.h> 38 #include <sys/types.h> 39 #include <sys/log.h> 40 #include <sys/psw.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/fm/util.h> 43 #include <sys/errorq.h> 44 #include <sys/mca_x86.h> 45 #include <sys/fm/cpu/GMCA.h> 46 #include <sys/fm/smb/fmsmb.h> 47 #include <sys/sysevent.h> 48 #include <sys/ontrap.h> 49 50 #include "gcpu.h" 51 52 extern int x86gentopo_legacy; /* x86 generic topology support */ 53 54 static uint_t gcpu_force_addr_in_payload = 0; 55 56 /* 57 * Clear to log telemetry found at initialization. While processor docs 58 * say you should process this telemetry on all but Intel family 0x6 59 * there are way too many exceptions and we want to avoid bogus 60 * diagnoses. 61 */ 62 int gcpu_suppress_log_on_init = 1; 63 64 /* 65 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 66 * error logout time. The stack will be included in the ereport if the 67 * error type selects stack inclusion, or in all cases if 68 * gcpu_mca_stack_ereport_include is nonzero. 69 */ 70 int gcpu_mca_stack_flag = 0; 71 int gcpu_mca_stack_ereport_include = 0; 72 73 /* 74 * The number of times to re-read MCA telemetry to try to obtain a 75 * consistent snapshot if we find it to be changing under our feet. 76 */ 77 int gcpu_mca_telemetry_retries = 5; 78 79 #ifndef __xpv 80 int gcpu_mca_cmci_throttling_threshold = 10; 81 int gcpu_mca_cmci_reenable_threshold = 1000; 82 #endif 83 84 static gcpu_error_disp_t gcpu_errtypes[] = { 85 86 /* 87 * Unclassified 88 */ 89 { 90 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 91 NULL, 92 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 93 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 94 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 95 }, 96 97 /* 98 * Microcode ROM Parity Error 99 */ 100 { 101 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 102 NULL, 103 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 104 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 105 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 106 }, 107 108 /* 109 * External - BINIT# from another processor during power-on config 110 */ 111 { 112 FM_EREPORT_CPU_GENERIC_EXTERNAL, 113 NULL, 114 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 115 MCAX86_SIMPLE_EXTERNAL_MASKON, 116 MCAX86_SIMPLE_EXTERNAL_MASKOFF 117 }, 118 119 /* 120 * Functional redundancy check master/slave error 121 */ 122 { 123 FM_EREPORT_CPU_GENERIC_FRC, 124 NULL, 125 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 126 MCAX86_SIMPLE_FRC_MASKON, 127 MCAX86_SIMPLE_FRC_MASKOFF 128 }, 129 130 /* 131 * Internal parity error 132 */ 133 { 134 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 135 NULL, 136 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 137 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 138 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 139 }, 140 141 142 /* 143 * Internal timer error 144 */ 145 { 146 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 147 NULL, 148 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 149 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 150 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 151 }, 152 153 /* 154 * Internal unclassified 155 */ 156 { 157 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 158 NULL, 159 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 160 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 161 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 162 }, 163 164 /* 165 * Compound error codes - generic memory hierarchy 166 */ 167 { 168 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 169 NULL, 170 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 171 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 172 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 173 }, 174 175 /* 176 * Compound error codes - TLB errors 177 */ 178 { 179 FM_EREPORT_CPU_GENERIC_TLB, 180 "%1$s" "TLB" "%2$s" "_ERR", 181 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 182 MCAX86_COMPOUND_TLB_MASKON, 183 MCAX86_COMPOUND_TLB_MASKOFF 184 }, 185 186 /* 187 * Compound error codes - memory hierarchy 188 */ 189 { 190 FM_EREPORT_CPU_GENERIC_MEMHIER, 191 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 192 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 193 MCAX86_COMPOUND_MEMHIER_MASKON, 194 MCAX86_COMPOUND_MEMHIER_MASKOFF 195 }, 196 197 /* 198 * Compound error codes - bus and interconnect errors 199 */ 200 { 201 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 202 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 203 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 204 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 205 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 206 }, 207 /* 208 * Compound error codes - memory controller errors 209 */ 210 { 211 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 212 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 213 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 214 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 215 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 216 }, 217 }; 218 219 static gcpu_error_disp_t gcpu_unknown = { 220 FM_EREPORT_CPU_GENERIC_UNKNOWN, 221 "UNKNOWN", 222 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 223 0, 224 0 225 }; 226 227 static errorq_t *gcpu_mca_queue; 228 static kmutex_t gcpu_mca_queue_lock; 229 230 #ifdef __xpv 231 static int isxpv = 1; 232 #else 233 static int isxpv = 0; 234 #endif 235 236 static const gcpu_error_disp_t * 237 gcpu_disp_match(uint16_t code) 238 { 239 const gcpu_error_disp_t *ged = gcpu_errtypes; 240 int i; 241 242 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 243 i++, ged++) { 244 uint16_t on = ged->ged_errcode_mask_on; 245 uint16_t off = ged->ged_errcode_mask_off; 246 247 if ((code & on) == on && (code & off) == 0) 248 return (ged); 249 } 250 251 return (NULL); 252 } 253 254 static uint16_t 255 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 256 { 257 return ((code & mask) >> shift); 258 } 259 260 #define BIT_STRIP(code, name) \ 261 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 262 MCAX86_ERRCODE_##name##_SHIFT) 263 264 #define GCPU_MNEMONIC_UNDEF "undefined" 265 #define GCPU_MNEMONIC_RESVD "reserved" 266 267 /* 268 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 269 * mnemonics and to ereport class name components. 270 */ 271 272 struct gcpu_mnexp { 273 const char *mne_compound; /* used in expanding compound errname */ 274 const char *mne_ereport; /* used in expanding ereport class */ 275 }; 276 277 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 278 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 279 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 280 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 281 { GCPU_MNEMONIC_UNDEF, "" } 282 }; 283 284 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 285 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 286 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 287 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 288 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 289 }; 290 291 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 292 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 293 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 294 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 295 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 296 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 297 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 298 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 299 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 300 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 301 }; 302 303 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 304 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 305 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 306 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 307 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 308 }; 309 310 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 311 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 312 { GCPU_MNEMONIC_RESVD, "" }, 313 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 314 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 315 }; 316 317 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 318 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 319 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 320 }; 321 322 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 323 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 324 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 325 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 326 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 327 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 328 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 329 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 330 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 331 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 332 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 333 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 334 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 335 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 336 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 337 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 338 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 339 }; 340 341 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 342 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 343 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 344 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 345 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 346 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 347 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 348 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 349 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 350 }; 351 352 enum gcpu_mn_namespace { 353 GCPU_MN_NAMESPACE_COMPOUND, 354 GCPU_MN_NAMESPACE_EREPORT 355 }; 356 357 static const char * 358 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val, 359 enum gcpu_mn_namespace nspace) 360 { 361 if (val >= tbl_sz || val > 0xff) 362 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 363 364 switch (nspace) { 365 case GCPU_MN_NAMESPACE_COMPOUND: 366 return (tbl[val].mne_compound); 367 /*NOTREACHED*/ 368 369 case GCPU_MN_NAMESPACE_EREPORT: 370 return (tbl[val].mne_ereport); 371 /*NOTREACHED*/ 372 373 default: 374 return (GCPU_MNEMONIC_UNDEF); 375 /*NOTREACHED*/ 376 } 377 } 378 379 /* 380 * The ereport class leaf component is either a simple string with no 381 * format specifiers, or a string with one or more embedded %n$s specifiers - 382 * positional selection for string arguments. The kernel snprintf does 383 * not support %n$ (and teaching it to do so is too big a headache) so 384 * we will expand this restricted format string ourselves. 385 */ 386 387 #define GCPU_CLASS_VARCOMPS 9 388 389 #define GCPU_MNEMONIC(code, name, nspace) \ 390 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 391 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 392 BIT_STRIP(code, name), nspace) 393 394 static void 395 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 396 enum gcpu_mn_namespace nspace) 397 { 398 uint16_t code = MCAX86_ERRCODE(status); 399 const char *mn[GCPU_CLASS_VARCOMPS]; 400 char *p = buf; /* current position in buf */ 401 char *q = buf + buflen; /* pointer past last char in buf */ 402 int which, expfmtchar, error; 403 char c; 404 405 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 406 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 407 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 408 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 409 mn[4] = GCPU_MNEMONIC(code, II, nspace); 410 mn[5] = GCPU_MNEMONIC(code, T, nspace); 411 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 412 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 413 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 414 415 while (p < q - 1 && (c = *fmt++) != '\0') { 416 if (c != '%') { 417 /* not the beginning of a format specifier - copy */ 418 *p++ = c; 419 continue; 420 } 421 422 error = 0; 423 which = -1; 424 expfmtchar = -1; 425 426 nextfmt: 427 if ((c = *fmt++) == '\0') 428 break; /* early termination of fmt specifier */ 429 430 switch (c) { 431 case '1': 432 case '2': 433 case '3': 434 case '4': 435 case '5': 436 case '6': 437 case '7': 438 case '8': 439 case '9': 440 if (which != -1) { /* allow only one positional digit */ 441 error++; 442 break; 443 } 444 which = c - '1'; 445 goto nextfmt; 446 /*NOTREACHED*/ 447 448 case '$': 449 if (which == -1) { /* no position specified */ 450 error++; 451 break; 452 } 453 expfmtchar = 's'; 454 goto nextfmt; 455 /*NOTREACHED*/ 456 457 case 's': 458 if (expfmtchar != 's') { 459 error++; 460 break; 461 } 462 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 463 mn[which]); 464 p += strlen(p); 465 break; 466 467 default: 468 error++; 469 break; 470 } 471 472 if (error) 473 break; 474 } 475 476 *p = '\0'; /* NUL termination */ 477 } 478 479 static void 480 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 481 const char *cpuclass, const char *leafclass) 482 { 483 char *p = buf; /* current position in buf */ 484 char *q = buf + buflen; /* pointer past last char in buf */ 485 486 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 487 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 488 489 p += strlen(p); 490 if (p >= q) 491 return; 492 493 if (leafclass == NULL) { 494 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 495 GCPU_MN_NAMESPACE_EREPORT); 496 } else { 497 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 498 leafclass); 499 } 500 } 501 502 /* 503 * Create an "hc" scheme FMRI identifying the given cpu with 504 * motherboard/chip/core/strand instance numbers. 505 */ 506 static nvlist_t * 507 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 508 { 509 nvlist_t *nvl, *fmri; 510 511 if ((nvl = fm_nvlist_create(nva)) == NULL) 512 return (NULL); 513 514 if (!x86gentopo_legacy) { 515 fmri = cmi_hdl_smb_bboard(hdl); 516 if (fmri == NULL) 517 return (NULL); 518 519 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 520 NULL, NULL, fmri, 3, 521 "chip", cmi_hdl_smb_chipid(hdl), 522 "core", cmi_hdl_coreid(hdl), 523 "strand", cmi_hdl_strandid(hdl)); 524 } else { 525 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 526 "motherboard", 0, 527 "chip", cmi_hdl_chipid(hdl), 528 "core", cmi_hdl_coreid(hdl), 529 "strand", cmi_hdl_strandid(hdl)); 530 } 531 532 return (nvl); 533 } 534 535 int gcpu_bleat_count_thresh = 5; 536 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 537 538 /* 539 * Called when we are unable to propogate a logout structure onto an 540 * errorq for subsequent ereport preparation and logging etc. The caller 541 * should usually only decide to call this for severe errors - those we 542 * suspect we may need to panic for. 543 */ 544 static void 545 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 546 { 547 hrtime_t now = gethrtime_waitfree(); 548 static hrtime_t gcpu_last_bleat; 549 gcpu_bank_logout_t *gbl; 550 static int bleatcount; 551 int i; 552 553 /* 554 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 555 * can come as fast as we like, but once we've spammed that many 556 * to the console we require a minimum interval to pass before 557 * any more complaints. 558 */ 559 if (++bleatcount > gcpu_bleat_count_thresh) { 560 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 561 return; 562 else 563 bleatcount = 0; 564 } 565 gcpu_last_bleat = now; 566 567 cmn_err(CE_WARN, 568 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 569 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 570 cmi_hdl_strandid(hdl)); 571 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 572 (u_longlong_t)gcl->gcl_mcg_status); 573 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 574 uint64_t status = gbl->gbl_status; 575 576 if (!(status & MSR_MC_STATUS_VAL)) 577 continue; 578 579 /* Force ADDRV for AMD Family 0xf and above */ 580 if (gcpu_force_addr_in_payload) 581 status = status | MSR_MC_STATUS_ADDRV; 582 583 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 584 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 585 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 586 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 587 i, IA32_MSR_MC(i, STATUS), 588 (u_longlong_t)gbl->gbl_status, 589 (u_longlong_t)gbl->gbl_addr, 590 (u_longlong_t)gbl->gbl_misc); 591 break; 592 593 case MSR_MC_STATUS_ADDRV: 594 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 595 "STAT 0x%016llx ADDR 0x%016llx", 596 i, IA32_MSR_MC(i, STATUS), 597 (u_longlong_t)gbl->gbl_status, 598 (u_longlong_t)gbl->gbl_addr); 599 break; 600 601 case MSR_MC_STATUS_MISCV: 602 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 603 "STAT 0x%016llx MISC 0x%016llx", 604 i, IA32_MSR_MC(i, STATUS), 605 (u_longlong_t)gbl->gbl_status, 606 (u_longlong_t)gbl->gbl_misc); 607 break; 608 609 default: 610 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 611 "STAT 0x%016llx", 612 i, IA32_MSR_MC(i, STATUS), 613 (u_longlong_t)gbl->gbl_status); 614 break; 615 616 } 617 } 618 } 619 620 #define _GCPU_BSTATUS(status, what) \ 621 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 622 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 623 624 static void 625 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 626 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 627 { 628 uint64_t members = ged ? ged->ged_ereport_members : 629 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 630 uint64_t mcg = gcl->gcl_mcg_status; 631 int mcip = mcg & MCG_STATUS_MCIP; 632 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 633 uint64_t bstat = gbl->gbl_status; 634 635 /* 636 * Include the compound error name if requested and if this 637 * is a compound error type. 638 */ 639 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 640 ged->ged_compound_fmt != NULL) { 641 char buf[FM_MAX_CLASS]; 642 643 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 644 GCPU_MN_NAMESPACE_COMPOUND); 645 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 646 DATA_TYPE_STRING, buf, NULL); 647 } 648 649 /* 650 * Include disposition information for this error 651 */ 652 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 653 gbl->gbl_disp != 0) { 654 int i, empty = 1; 655 char buf[128]; 656 char *p = buf, *q = buf + 128; 657 static struct _gcpu_disp_name { 658 uint64_t dv; 659 const char *dn; 660 } disp_names[] = { 661 { CMI_ERRDISP_CURCTXBAD, 662 "processor_context_corrupt" }, 663 { CMI_ERRDISP_RIPV_INVALID, 664 "return_ip_invalid" }, 665 { CMI_ERRDISP_UC_UNCONSTRAINED, 666 "unconstrained" }, 667 { CMI_ERRDISP_FORCEFATAL, 668 "forcefatal" }, 669 { CMI_ERRDISP_IGNORED, 670 "ignored" }, 671 { CMI_ERRDISP_PCC_CLEARED, 672 "corrupt_context_cleared" }, 673 { CMI_ERRDISP_UC_CLEARED, 674 "uncorrected_data_cleared" }, 675 { CMI_ERRDISP_POISONED, 676 "poisoned" }, 677 { CMI_ERRDISP_INCONSISTENT, 678 "telemetry_unstable" }, 679 }; 680 681 for (i = 0; i < sizeof (disp_names) / 682 sizeof (struct _gcpu_disp_name); i++) { 683 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 684 continue; 685 686 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 687 "%s%s", empty ? "" : ",", disp_names[i].dn); 688 p += strlen(p); 689 empty = 0; 690 } 691 692 if (p != buf) 693 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 694 DATA_TYPE_STRING, buf, NULL); 695 } 696 697 /* 698 * If MCG_STATUS is included add that and an indication of whether 699 * this ereport was the result of a machine check or poll. 700 */ 701 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 702 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 703 DATA_TYPE_UINT64, mcg, NULL); 704 705 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 706 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 707 } 708 709 /* 710 * If an instruction pointer is to be included add one provided 711 * MCG_STATUS indicated it is valid; meaningless for polled events. 712 */ 713 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 714 mcg & MCG_STATUS_EIPV) { 715 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 716 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 717 } 718 719 /* 720 * Add an indication of whether the trap occured during privileged code. 721 */ 722 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 723 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 724 DATA_TYPE_BOOLEAN_VALUE, 725 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 726 } 727 728 /* 729 * If requested, add the index of the MCA bank. This indicates the 730 * n'th bank of 4 MCA registers, and does not necessarily correspond 731 * to MCi_* - use the bank offset to correlate 732 */ 733 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 734 fm_payload_set(ereport, 735 /* Bank number */ 736 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 737 /* Offset of MCi_CTL */ 738 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 739 IA32_MSR_MC(bankno, CTL), 740 NULL); 741 } 742 743 /* 744 * Add MCi_STATUS if requested, and decode it. 745 */ 746 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 747 const char *tbes[] = { 748 "No tracking", /* 00 */ 749 "Green - below threshold", /* 01 */ 750 "Yellow - above threshold", /* 10 */ 751 "Reserved" /* 11 */ 752 }; 753 754 fm_payload_set(ereport, 755 /* Bank MCi_STATUS */ 756 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 757 /* Overflow? */ 758 _GCPU_BSTATUS(bstat, OVER), 759 /* Uncorrected? */ 760 _GCPU_BSTATUS(bstat, UC), 761 /* Enabled? */ 762 _GCPU_BSTATUS(bstat, EN), 763 /* Processor context corrupt? */ 764 _GCPU_BSTATUS(bstat, PCC), 765 /* Error code */ 766 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 767 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 768 /* Model-specific error code */ 769 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 770 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 771 NULL); 772 773 /* 774 * If MCG_CAP.TES_P indicates that that thresholding info 775 * is present in the architural component of the bank status 776 * then include threshold information for this bank. 777 */ 778 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 779 fm_payload_set(ereport, 780 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 781 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 782 NULL); 783 } 784 } 785 786 /* 787 * Add MCi_ADDR info if requested and valid. We force addition of 788 * MCi_ADDR, even if its not valid on AMD family 0xf and above, 789 * to aid in analysis of ereports, for WatchDog errors. 790 */ 791 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 792 ((bstat & MSR_MC_STATUS_ADDRV) || 793 gcpu_force_addr_in_payload)) { 794 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 795 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 796 } 797 798 /* 799 * MCi_MISC if requested and MCi_STATUS.MISCV). 800 */ 801 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 802 bstat & MSR_MC_STATUS_MISCV) { 803 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 804 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 805 } 806 807 } 808 809 /* 810 * Construct and post an ereport based on the logout information from a 811 * single MCA bank. We are not necessarily running on the cpu that 812 * detected the error. 813 */ 814 static void 815 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 816 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 817 { 818 gcpu_data_t *gcpu = gcl->gcl_gcpu; 819 cmi_hdl_t hdl = gcpu->gcpu_hdl; 820 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 821 const char *cpuclass = NULL, *leafclass = NULL; 822 uint16_t code = MCAX86_ERRCODE(status); 823 errorq_elem_t *eqep, *scr_eqep; 824 nvlist_t *ereport, *detector; 825 char buf[FM_MAX_CLASS]; 826 const char *classfmt; 827 nv_alloc_t *nva; 828 829 if (panicstr) { 830 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 831 return; 832 ereport = errorq_elem_nvl(ereport_errorq, eqep); 833 834 /* 835 * Allocate another element for scratch space, but fallback 836 * to the one we have if that fails. We'd like to use the 837 * additional scratch space for nvlist construction. 838 */ 839 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 840 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 841 else 842 nva = errorq_elem_nva(ereport_errorq, eqep); 843 } else { 844 ereport = fm_nvlist_create(NULL); 845 nva = NULL; 846 } 847 848 if (ereport == NULL) 849 return; 850 851 /* 852 * Common payload data required by the protocol: 853 * - ereport class 854 * - detector 855 * - ENA 856 */ 857 858 /* 859 * Ereport class - call into model-specific support to allow it to 860 * provide a cpu class or leaf class, otherwise calculate our own. 861 */ 862 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 863 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 864 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 865 leafclass); 866 867 /* 868 * The detector FMRI. 869 */ 870 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 871 nva)) == NULL) 872 detector = gcpu_fmri_create(hdl, nva); 873 874 /* 875 * Should we define a new ENA format 3?? for chip/core/strand? 876 * It will be better when virtualized. 877 */ 878 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 879 fm_ena_generate_cpu(gcl->gcl_timestamp, 880 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 881 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 882 883 if (panicstr) { 884 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 885 nv_alloc_reset(nva); 886 } else { 887 fm_nvlist_destroy(detector, FM_NVA_FREE); 888 } 889 890 /* 891 * Add the architectural ereport class-specific payload data. 892 */ 893 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 894 895 /* 896 * Allow model-specific code to add ereport members. 897 */ 898 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 899 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 900 901 /* 902 * Include stack if options is turned on and either selected in 903 * the payload member bitmask or inclusion is forced. 904 */ 905 if (gcpu_mca_stack_flag && 906 (cms_ereport_includestack(hdl, mscookie) == 907 B_TRUE || gcpu_mca_stack_ereport_include)) { 908 fm_payload_stack_add(ereport, gcl->gcl_stack, 909 gcl->gcl_stackdepth); 910 } 911 912 /* 913 * If injection has taken place anytime in the past then note this 914 * on the ereport. 915 */ 916 if (cmi_inj_tainted() == B_TRUE) { 917 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 918 B_TRUE, NULL); 919 } 920 921 /* 922 * Post ereport. 923 */ 924 if (panicstr) { 925 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 926 if (scr_eqep) 927 errorq_cancel(ereport_errorq, scr_eqep); 928 } else { 929 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 930 fm_nvlist_destroy(ereport, FM_NVA_FREE); 931 } 932 933 } 934 935 /*ARGSUSED*/ 936 void 937 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 938 { 939 const gcpu_logout_t *gcl = data; 940 const gcpu_bank_logout_t *gbl; 941 int i; 942 943 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 944 const gcpu_error_disp_t *gened; 945 cms_cookie_t mscookie; 946 947 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 948 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 949 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 950 951 /* 952 * Perform a match based on IA32 MCA architectural 953 * components alone. 954 */ 955 gened = gcpu_disp_match(code); /* may be NULL */ 956 957 /* 958 * Now see if an model-specific match can be made. 959 */ 960 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 961 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 962 gcl->gcl_ms_logout); 963 964 /* 965 * Prepare and dispatch an ereport for logging and 966 * diagnosis. 967 */ 968 gcpu_ereport_post(gcl, i, gened, mscookie, 969 gbl->gbl_status); 970 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 971 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 972 /* 973 * Telemetry kept changing as we tried to read 974 * it. Force an unknown ereport leafclass but 975 * keep the telemetry unchanged for logging. 976 */ 977 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 978 gbl->gbl_status); 979 } 980 } 981 } 982 983 static size_t gcpu_mca_queue_datasz = 0; 984 985 /* 986 * The following code is ready to make a weak attempt at growing the 987 * errorq structure size. Since it is not foolproof (we don't know 988 * who may already be producing to the outgoing errorq) our caller 989 * instead assures that we'll always be called with no greater data 990 * size than on our first call. 991 */ 992 static void 993 gcpu_errorq_init(size_t datasz) 994 { 995 int slots; 996 997 mutex_enter(&gcpu_mca_queue_lock); 998 999 if (gcpu_mca_queue_datasz >= datasz) { 1000 mutex_exit(&gcpu_mca_queue_lock); 1001 return; 1002 } 1003 1004 membar_producer(); 1005 if (gcpu_mca_queue) { 1006 gcpu_mca_queue_datasz = 0; 1007 errorq_destroy(gcpu_mca_queue); 1008 } 1009 1010 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1011 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1012 1013 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1014 NULL, slots, datasz, 1, ERRORQ_VITAL); 1015 1016 if (gcpu_mca_queue != NULL) 1017 gcpu_mca_queue_datasz = datasz; 1018 1019 mutex_exit(&gcpu_mca_queue_lock); 1020 } 1021 1022 /* 1023 * Perform MCA initialization as described in section 14.6 of Intel 64 1024 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1025 */ 1026 1027 static uint_t global_nbanks; 1028 1029 void 1030 gcpu_mca_init(cmi_hdl_t hdl) 1031 { 1032 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1033 uint64_t cap; 1034 uint_t vendor = cmi_hdl_vendor(hdl); 1035 uint_t family = cmi_hdl_family(hdl); 1036 uint_t rev = cmi_hdl_chiprev(hdl); 1037 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1038 int mcg_ctl_present; 1039 uint_t nbanks; 1040 uint32_t ctl_skip_mask = 0; 1041 uint32_t status_skip_mask = 0; 1042 size_t mslsz; 1043 int i; 1044 #ifndef __xpv 1045 int mcg_ctl2_present; 1046 uint32_t cmci_capable = 0; 1047 #endif 1048 if (gcpu == NULL) 1049 return; 1050 1051 /* We add MCi_ADDR always for AMD Family 0xf and above */ 1052 if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B)) 1053 gcpu_force_addr_in_payload = 1; 1054 1055 /* 1056 * Protect from some silly /etc/system settings. 1057 */ 1058 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1059 gcpu_mca_telemetry_retries = 5; 1060 1061 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1062 return; 1063 1064 /* 1065 * CPU startup code only calls cmi_mca_init if x86_feature indicates 1066 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 1067 * processors, which have their own * more primitive way of doing 1068 * machine checks, will not have cmi_mca_init called since their 1069 * CPUID information will not indicate both MCA and MCE features. 1070 */ 1071 ASSERT(x86_feature & X86_MCA); 1072 1073 /* 1074 * Determine whether the IA32_MCG_CTL register is present. If it 1075 * is we will enable all features by writing -1 to it towards 1076 * the end of this initialization; if it is absent then volume 3A 1077 * says we must nonetheless continue to initialize the individual 1078 * banks. 1079 */ 1080 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1081 #ifndef __xpv 1082 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1083 #endif 1084 1085 /* 1086 * We squirell values away for inspection/debugging. 1087 */ 1088 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1089 if (mcg_ctl_present) 1090 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1091 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1092 1093 /* 1094 * Determine the number of error-reporting banks implemented. 1095 */ 1096 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1097 1098 if (nbanks != 0 && global_nbanks == 0) 1099 global_nbanks = nbanks; /* no race - BSP will get here first */ 1100 1101 /* 1102 * If someone is hiding the number of banks (perhaps we are fully 1103 * virtualized?) or if this processor has more banks than the 1104 * first to set global_nbanks then bail. The latter requirement 1105 * is because we need to size our errorq data structure and we 1106 * don't want to have to grow the errorq (destroy and recreate) 1107 * which may just lose some telemetry. 1108 */ 1109 if (nbanks == 0 || nbanks > global_nbanks) 1110 return; 1111 1112 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1113 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1114 1115 /* 1116 * Calculate the size we need to allocate for a gcpu_logout_t 1117 * with a gcl_data array big enough for all banks of this cpu. 1118 * Add any space requested by the model-specific logout support. 1119 */ 1120 mslsz = cms_logout_size(hdl); 1121 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1122 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1123 1124 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1125 gcpu_logout_t *gcl; 1126 1127 mca->gcpu_mca_logout[i] = gcl = 1128 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1129 gcl->gcl_gcpu = gcpu; 1130 gcl->gcl_nbanks = nbanks; 1131 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1132 (char *)(&gcl->gcl_data[0]) + nbanks * 1133 sizeof (gcpu_bank_logout_t); 1134 1135 } 1136 1137 #ifdef __xpv 1138 gcpu_xpv_mca_init(nbanks); 1139 #endif 1140 1141 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1142 1143 #ifndef __xpv 1144 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1145 KM_SLEEP); 1146 #endif 1147 1148 /* 1149 * Create our errorq to transport the logout structures. This 1150 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1151 */ 1152 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1153 1154 /* 1155 * Not knowing which, if any, banks are shared between cores we 1156 * assure serialization of MCA bank initialization by each cpu 1157 * on the chip. On chip architectures in which some banks are 1158 * shared this will mean the shared resource is initialized more 1159 * than once - we're simply aiming to avoid simultaneous MSR writes 1160 * to the shared resource. 1161 * 1162 * Even with these precautions, some platforms may yield a GP fault 1163 * if a core other than a designated master tries to write anything 1164 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1165 * those writes under on_trap protection. 1166 */ 1167 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1168 1169 /* 1170 * Initialize poller data, but don't start polling yet. 1171 */ 1172 gcpu_mca_poll_init(hdl); 1173 1174 /* 1175 * Work out which MCA banks we will initialize. In MCA logout 1176 * code we will only read those banks which we initialize here. 1177 */ 1178 for (i = 0; i < nbanks; i++) { 1179 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1180 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1181 1182 if (!cms_present(hdl)) { 1183 /* 1184 * Model-specific support is not present, try to use 1185 * sane defaults. 1186 * 1187 * On AMD family 6 processors, reports about spurious 1188 * machine checks indicate that bank 0 should be 1189 * skipped. 1190 * 1191 * On Intel family 6 processors, the documentation tells 1192 * us not to write to MC0_CTL. 1193 * 1194 */ 1195 if (i == 0 && family == 6) { 1196 switch (vendor) { 1197 case X86_VENDOR_AMD: 1198 skipstatus = B_TRUE; 1199 /*FALLTHRU*/ 1200 case X86_VENDOR_Intel: 1201 skipctl = B_TRUE; 1202 break; 1203 } 1204 } 1205 } 1206 1207 ctl_skip_mask |= skipctl << i; 1208 status_skip_mask |= skipstatus << i; 1209 1210 if (skipctl && skipstatus) 1211 continue; 1212 1213 /* 1214 * Record which MCA banks were enabled, from the point of view 1215 * of the whole chip (if some cores share a bank we must be 1216 * sure either can logout from it). 1217 */ 1218 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1219 1220 #ifndef __xpv 1221 /* 1222 * check CMCI capability 1223 */ 1224 if (mcg_ctl2_present) { 1225 uint64_t ctl2; 1226 uint32_t cap = 0; 1227 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1228 if (ctl2 & MSR_MC_CTL2_EN) 1229 continue; 1230 ctl2 |= MSR_MC_CTL2_EN; 1231 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1232 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1233 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1234 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1235 if (cap) 1236 cmci_capable ++; 1237 /* 1238 * Set threshold to 1 while unset the en field, to avoid 1239 * CMCI trigged before APIC LVT entry init. 1240 */ 1241 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1242 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1243 1244 /* 1245 * init cmci related count 1246 */ 1247 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1248 mca->gcpu_bank_cmci[i].drtcmci = 0; 1249 mca->gcpu_bank_cmci[i].ncmci = 0; 1250 } 1251 #endif 1252 } 1253 1254 #ifndef __xpv 1255 if (cmci_capable) 1256 cmi_enable_cmci = 1; 1257 #endif 1258 1259 #ifndef __xpv 1260 /* 1261 * Log any valid telemetry lurking in the MCA banks, but do not 1262 * clear the status registers. Ignore the disposition returned - 1263 * we have already paniced or reset for any nasty errors found here. 1264 * 1265 * Intel vol 3A says that we should not do this on family 0x6, 1266 * and that for any extended family the BIOS clears things 1267 * on power-on reset so you'll only potentially find valid telemetry 1268 * on warm reset (we do it for both - on power-on reset we should 1269 * just see zeroes). 1270 * 1271 * AMD docs since K7 say we should process anything we find here. 1272 */ 1273 if (!gcpu_suppress_log_on_init && 1274 (vendor == X86_VENDOR_Intel && family >= 0xf || 1275 vendor == X86_VENDOR_AMD)) 1276 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1277 GCPU_MPT_WHAT_POKE_ERR); 1278 1279 /* 1280 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1281 * model-specific module the power of veto. 1282 */ 1283 for (i = 0; i < nbanks; i++) { 1284 struct gcpu_bios_bankcfg *bcfgp = 1285 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1286 1287 /* 1288 * Stash inherited bank MCA state, even for banks we will 1289 * not initialize ourselves. Do not read the MISC register 1290 * unconditionally - on some processors that will #GP on 1291 * banks that do not implement the MISC register (would be 1292 * caught by on_trap, anyway). 1293 */ 1294 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1295 &bcfgp->bios_bank_ctl); 1296 1297 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1298 &bcfgp->bios_bank_status); 1299 1300 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) || 1301 gcpu_force_addr_in_payload) { 1302 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1303 &bcfgp->bios_bank_addr); 1304 } 1305 1306 /* 1307 * In some old BIOS the status value after boot can indicate 1308 * MISCV when there is actually no MISC register for 1309 * that bank. The following read could therefore 1310 * aggravate a general protection fault. This should be 1311 * caught by on_trap, but the #GP fault handler is busted 1312 * and can suffer a double fault even before we get to 1313 * trap() to check for on_trap protection. Until that 1314 * issue is fixed we remove the one access that we know 1315 * can cause a #GP. 1316 * 1317 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1318 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1319 * &bcfgp->bios_bank_misc); 1320 */ 1321 bcfgp->bios_bank_misc = 0; 1322 1323 if (!(ctl_skip_mask & (1 << i))) { 1324 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1325 cms_bankctl_val(hdl, i, -1ULL)); 1326 } 1327 1328 if (!(status_skip_mask & (1 << i))) { 1329 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1330 cms_bankstatus_val(hdl, i, 0ULL)); 1331 } 1332 } 1333 #endif 1334 /* 1335 * Now let the model-specific support perform further initialization 1336 * of non-architectural features. 1337 */ 1338 cms_mca_init(hdl, nbanks); 1339 1340 #ifndef __xpv 1341 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1342 membar_producer(); 1343 1344 /* enable all machine-check features */ 1345 if (mcg_ctl_present) 1346 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1347 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1348 #endif 1349 1350 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1351 1352 #ifndef __xpv 1353 /* enable machine-check exception in CR4 */ 1354 cmi_hdl_enable_mce(hdl); 1355 #endif 1356 } 1357 1358 static uint64_t 1359 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1360 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1361 { 1362 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1363 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1364 int nbanks = mca->gcpu_mca_nbanks; 1365 gcpu_mce_status_t mce; 1366 gcpu_bank_logout_t *gbl; 1367 uint64_t disp = 0; 1368 int i; 1369 1370 if (mcesp == NULL) 1371 mcesp = &mce; 1372 1373 mcesp->mce_nerr = nerr; 1374 1375 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1376 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1377 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1378 1379 /* 1380 * If this a machine check then if the return instruction pointer 1381 * is not valid the current context is lost. 1382 */ 1383 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1384 disp |= CMI_ERRDISP_RIPV_INVALID; 1385 1386 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1387 uint64_t mcistatus = gbl->gbl_status; 1388 uint32_t ms_scope; 1389 int pcc, uc; 1390 int poisoned; 1391 1392 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1393 continue; 1394 1395 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1396 continue; 1397 1398 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1399 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1400 mcesp->mce_npcc += pcc; 1401 mcesp->mce_nuc += uc; 1402 1403 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1404 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1405 1406 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1407 pcc = 0; 1408 mcesp->mce_npcc_ok++; 1409 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1410 } 1411 1412 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1413 uc = 0; 1414 mcesp->mce_nuc_ok++; 1415 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1416 } 1417 1418 if (uc) { 1419 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1420 if (poisoned) { 1421 mcesp->mce_nuc_poisoned++; 1422 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1423 } 1424 } 1425 1426 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1427 /* 1428 * We're not being instructed to ignore the error, 1429 * so apply our standard disposition logic to it. 1430 */ 1431 if (uc && !poisoned) { 1432 unconstrained++; 1433 gbl->gbl_disp |= disp | 1434 CMI_ERRDISP_UC_UNCONSTRAINED; 1435 } 1436 1437 if (pcc && ismc) { 1438 curctxbad++; 1439 gbl->gbl_disp |= disp | 1440 CMI_ERRDISP_CURCTXBAD; 1441 } 1442 1443 /* 1444 * Even if the above may not indicate that the error 1445 * is terminal, model-specific support may insist 1446 * that we treat it as such. Such errors wil be 1447 * fatal even if discovered via poll. 1448 */ 1449 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1450 forcefatal++; 1451 mcesp->mce_forcefatal++; 1452 gbl->gbl_disp |= disp | 1453 CMI_ERRDISP_FORCEFATAL; 1454 } 1455 } else { 1456 mcesp->mce_ignored++; 1457 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1458 } 1459 } 1460 1461 if (unconstrained > 0) 1462 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1463 1464 if (curctxbad > 0) 1465 disp |= CMI_ERRDISP_CURCTXBAD; 1466 1467 if (forcefatal > 0) 1468 disp |= CMI_ERRDISP_FORCEFATAL; 1469 1470 if (gcpu_mca_queue != NULL) { 1471 int how; 1472 1473 if (ismc) { 1474 how = cmi_mce_response(rp, disp) ? 1475 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1476 ERRORQ_SYNC; /* panic flow will drain */ 1477 } else { 1478 how = (disp & CMI_ERRDISP_FORCEFATAL && 1479 cmi_panic_on_ue()) ? 1480 ERRORQ_SYNC : /* poller will panic */ 1481 ERRORQ_ASYNC; /* no panic */ 1482 } 1483 1484 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1485 } else if (disp != 0) { 1486 gcpu_bleat(hdl, gcl); 1487 } 1488 1489 mcesp->mce_disp = disp; 1490 1491 return (disp); 1492 } 1493 1494 /* 1495 * Gather error telemetry from our source, and then submit it for 1496 * processing. 1497 */ 1498 1499 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1500 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1501 1502 #define STATUS_EQV(s1, s2) \ 1503 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1504 1505 static uint32_t gcpu_deferrred_polled_clears; 1506 1507 #ifndef __xpv 1508 static void 1509 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1510 uint64_t status, int what) 1511 { 1512 uint64_t ctl2; 1513 1514 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1515 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1516 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1517 1518 if (!(bank_cmci_p->cmci_enabled)) { 1519 /* 1520 * when cmci is disabled, and the bank has no error or 1521 * no corrected error for 1522 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1523 * turn on this bank's cmci. 1524 */ 1525 1526 bank_cmci_p->drtcmci ++; 1527 1528 if (bank_cmci_p->drtcmci >= 1529 gcpu_mca_cmci_reenable_threshold) { 1530 1531 /* turn on cmci */ 1532 1533 (void) cmi_hdl_rdmsr(hdl, 1534 IA32_MSR_MC_CTL2(bank), &ctl2); 1535 ctl2 |= MSR_MC_CTL2_EN; 1536 (void) cmi_hdl_wrmsr(hdl, 1537 IA32_MSR_MC_CTL2(bank), ctl2); 1538 1539 /* reset counter and set flag */ 1540 bank_cmci_p->drtcmci = 0; 1541 bank_cmci_p->cmci_enabled = 1; 1542 } 1543 } else { 1544 /* 1545 * when cmci is enabled,if is in cyclic poll and the 1546 * bank has no error or no corrected error, reset ncmci 1547 * counter 1548 */ 1549 bank_cmci_p->ncmci = 0; 1550 } 1551 } 1552 } 1553 1554 static void 1555 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1556 int what) 1557 { 1558 uint64_t ctl2 = 0; 1559 1560 /* 1561 * if cmci of this bank occurred beyond 1562 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1563 * turn off this bank's CMCI; 1564 */ 1565 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1566 1567 /* if it is cmci trap, increase the count */ 1568 bank_cmci_p->ncmci++; 1569 1570 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1571 1572 /* turn off cmci */ 1573 1574 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1575 &ctl2); 1576 ctl2 &= ~MSR_MC_CTL2_EN; 1577 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1578 ctl2); 1579 1580 /* clear the flag and count */ 1581 1582 bank_cmci_p->cmci_enabled = 0; 1583 bank_cmci_p->ncmci = 0; 1584 } 1585 } 1586 } 1587 #endif 1588 1589 static void 1590 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1591 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1592 { 1593 int i; 1594 gcpu_bank_logout_t *gbl, *pgbl; 1595 uint64_t status; 1596 1597 if (first < 0 || last < 0) 1598 return; 1599 1600 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1601 status = gbl->gbl_status; 1602 if (status == 0) 1603 continue; 1604 if (clrstatus == B_FALSE) 1605 goto serialize; 1606 1607 /* 1608 * For i86xpv we always clear status in order to invalidate 1609 * the interposed telemetry. 1610 * 1611 * For native machine checks we always clear status here. For 1612 * native polls we must be a little more cautious since there 1613 * is an outside chance that we may clear telemetry from a 1614 * shared MCA bank on which a sibling core is machine checking. 1615 * 1616 * For polled observations of errors that look like they may 1617 * produce a machine check (UC/PCC and ENabled, although these 1618 * do not guarantee a machine check on error occurence) 1619 * we will not clear the status at this wakeup unless 1620 * we saw the same status at the previous poll. We will 1621 * always process and log the current observations - it 1622 * is only the clearing of MCi_STATUS which may be 1623 * deferred until the next wakeup. 1624 */ 1625 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1626 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1627 goto serialize; 1628 } 1629 1630 /* 1631 * We have a polled observation of a machine check 1632 * candidate. If we saw essentially the same status at the 1633 * last poll then clear the status now since this appears 1634 * not to be a #MC candidate after all. If we see quite 1635 * different status now then do not clear, but reconsider at 1636 * the next poll. In no actual machine check clears 1637 * the status in the interim then the status should not 1638 * keep changing forever (meaning we'd never clear it) 1639 * since before long we'll simply have latched the highest- 1640 * priority error and set the OVerflow bit. Nonetheless 1641 * we count how many times we defer clearing and after 1642 * a while insist on clearing the status. 1643 */ 1644 pgbl = &pgcl->gcl_data[i]; 1645 if (pgbl->gbl_clrdefcnt != 0) { 1646 /* We deferred clear on this bank at last wakeup */ 1647 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1648 pgbl->gbl_clrdefcnt > 5) { 1649 /* 1650 * Status is unchanged so clear it now and, 1651 * since we have already logged this info, 1652 * avoid logging it again. 1653 */ 1654 gbl->gbl_status = 0; 1655 (void) cmi_hdl_wrmsr(hdl, 1656 IA32_MSR_MC(i, STATUS), 0ULL); 1657 } else { 1658 /* Record deferral for next wakeup */ 1659 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1660 } 1661 } else { 1662 /* Record initial deferral for next wakeup */ 1663 gbl->gbl_clrdefcnt = 1; 1664 gcpu_deferrred_polled_clears++; 1665 } 1666 1667 serialize: 1668 { 1669 #ifdef __xpv 1670 ; 1671 #else 1672 /* 1673 * Intel Vol 3A says to execute a serializing 1674 * instruction here, ie CPUID. Well WRMSR is also 1675 * defined to be serializing, so the status clear above 1676 * should suffice. To be a good citizen, and since 1677 * some clears are deferred, we'll execute a CPUID 1678 * instruction here. 1679 */ 1680 struct cpuid_regs tmp; 1681 (void) __cpuid_insn(&tmp); 1682 #endif 1683 } 1684 } 1685 } 1686 1687 /*ARGSUSED5*/ 1688 void 1689 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1690 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1691 { 1692 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1693 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1694 int nbanks = mca->gcpu_mca_nbanks; 1695 gcpu_bank_logout_t *gbl, *pgbl; 1696 gcpu_logout_t *gcl, *pgcl; 1697 int ismc = (rp != NULL); 1698 int ispoll = !ismc; 1699 int i, nerr = 0; 1700 cmi_errno_t err; 1701 uint64_t mcg_status; 1702 uint64_t disp; 1703 uint64_t cap; 1704 int first = -1; 1705 int last = -1; 1706 int willpanic = 0; 1707 1708 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1709 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1710 CMI_SUCCESS) { 1711 if (mcesp != NULL) 1712 mcesp->mce_nerr = mcesp->mce_disp = 0; 1713 return; 1714 } 1715 1716 if (ismc) { 1717 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1718 } else { 1719 int pidx = mca->gcpu_mca_nextpoll_idx; 1720 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1721 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1722 1723 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1724 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1725 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1726 } 1727 1728 gcl->gcl_timestamp = gethrtime_waitfree(); 1729 gcl->gcl_mcg_status = mcg_status; 1730 gcl->gcl_ip = rp ? rp->r_pc : 0; 1731 1732 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1733 if (cap & MCG_CAP_TES_P) 1734 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1735 1736 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1737 uint64_t status, status2, addr, misc; 1738 int retries = gcpu_mca_telemetry_retries; 1739 1740 gbl->gbl_status = 0; 1741 gbl->gbl_disp = 0; 1742 gbl->gbl_clrdefcnt = 0; 1743 1744 /* 1745 * Only logout from MCA banks we have initialized from at 1746 * least one core. If a core shares an MCA bank with another 1747 * but perhaps lost the race to initialize it, then it must 1748 * still be allowed to logout from the shared bank. 1749 */ 1750 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1751 continue; 1752 1753 /* 1754 * On a poll look only at the banks we've been asked to check. 1755 */ 1756 if (rp == NULL && !(bankmask & 1 << i)) 1757 continue; 1758 1759 1760 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1761 CMI_SUCCESS) 1762 continue; 1763 1764 #ifndef __xpv 1765 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1766 #endif 1767 1768 retry: 1769 if (!(status & MSR_MC_STATUS_VAL)) 1770 continue; 1771 1772 /* First and last bank that have valid status */ 1773 if (first < 0) 1774 first = i; 1775 last = i; 1776 1777 addr = -1; 1778 misc = 0; 1779 1780 if ((status & MSR_MC_STATUS_ADDRV) || 1781 gcpu_force_addr_in_payload) 1782 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1783 1784 if (status & MSR_MC_STATUS_MISCV) 1785 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1786 1787 #ifndef __xpv 1788 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1789 #endif 1790 1791 /* 1792 * Allow the model-specific code to extract bank telemetry. 1793 */ 1794 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1795 1796 /* 1797 * Not all cpu models assure us that the status/address/misc 1798 * data will not change during the above sequence of MSR reads, 1799 * or that it can only change by the addition of the OVerflow 1800 * bit to the status register. If the status has changed 1801 * other than in the overflow bit then we attempt to reread 1802 * for a consistent snapshot, but eventually give up and 1803 * go with what we've got. We only perform this check 1804 * for a poll - a further #MC during a #MC will reset, and 1805 * polled errors should not overwrite higher-priority 1806 * trapping errors (but could set the overflow bit). 1807 */ 1808 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1809 &status2)) == CMI_SUCCESS) { 1810 if (!STATUS_EQV(status, status2)) { 1811 if (retries-- > 0) { 1812 status = status2; 1813 goto retry; 1814 } else { 1815 gbl->gbl_disp |= 1816 CMI_ERRDISP_INCONSISTENT; 1817 } 1818 } 1819 } else if (ispoll && err != CMI_SUCCESS) { 1820 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1821 } 1822 1823 nerr++; 1824 gbl->gbl_status = status; 1825 gbl->gbl_addr = addr; 1826 gbl->gbl_misc = misc; 1827 1828 /* 1829 * For polled observation, if the count of deferred status 1830 * clears updated in the clear_mc() is nonzero and the 1831 * MCi_STATUS has not changed, the last wakeup has produced 1832 * the ereport of the error. Therefore, clear the status in 1833 * this wakeup to avoid duplicate ereport. 1834 */ 1835 pgbl = &pgcl->gcl_data[i]; 1836 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1837 pgbl->gbl_clrdefcnt != 0) { 1838 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1839 gbl->gbl_status = 0; 1840 (void) cmi_hdl_wrmsr(hdl, 1841 IA32_MSR_MC(i, STATUS), 0ULL); 1842 } 1843 } 1844 } 1845 1846 if (gcpu_mca_stack_flag) 1847 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1848 else 1849 gcl->gcl_stackdepth = 0; 1850 1851 /* 1852 * Decide our disposition for this error or errors, and submit for 1853 * logging and subsequent diagnosis. 1854 */ 1855 if (nerr != 0) { 1856 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1857 1858 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1859 1860 if (!willpanic) 1861 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1862 } else { 1863 disp = 0; 1864 if (mcesp) { 1865 mcesp->mce_nerr = mcesp->mce_disp = 0; 1866 } 1867 } 1868 1869 /* 1870 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1871 * If a second #MC had occured before now the system would have 1872 * reset. We can only do thise once gcpu_mca_process has copied 1873 * the logout structure. 1874 */ 1875 if (ismc && mcg_status & MCG_STATUS_MCIP) 1876 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1877 1878 /* 1879 * At this point we have read and logged all telemetry that is visible 1880 * under the MCA. On architectures for which the NorthBridge is 1881 * on-chip this may include NB-observed errors, but where the NB 1882 * is off chip it may have been the source of the #MC request and 1883 * so we must call into the memory-controller driver to give it 1884 * a chance to log errors. 1885 */ 1886 if (ismc) { 1887 cmi_mc_logout(hdl, 1, willpanic); 1888 } 1889 } 1890 1891 #ifndef __xpv 1892 int gcpu_mca_trap_vomit_summary = 0; 1893 1894 /* 1895 * On a native machine check exception we come here from mcetrap via 1896 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1897 * cpus of the chip, so it is possible that another cpu on this chip could 1898 * initiate a poll while we're in the #mc handler; it is also possible that 1899 * this trap has occured during a poll on this cpu. So we must acquire 1900 * the chip-wide poll lock, but be careful to avoid deadlock. 1901 * 1902 * The 'data' pointer cannot be NULL due to init order. 1903 */ 1904 uint64_t 1905 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1906 { 1907 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1908 kmutex_t *poll_lock = NULL; 1909 gcpu_mce_status_t mce; 1910 uint64_t mcg_status; 1911 int tooklock = 0; 1912 1913 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1914 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1915 return (0); 1916 1917 /* 1918 * Synchronize with any poller from another core that may happen 1919 * to share access to one or more of the MCA banks. 1920 */ 1921 if (gcpu->gcpu_shared != NULL) 1922 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1923 1924 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1925 /* 1926 * The lock is not owned by the thread we have 1927 * interrupted. Spin for this adaptive lock. 1928 */ 1929 while (!mutex_tryenter(poll_lock)) { 1930 while (mutex_owner(poll_lock) != NULL) 1931 ; 1932 } 1933 tooklock = 1; 1934 } 1935 1936 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 1937 1938 if (tooklock) 1939 mutex_exit(poll_lock); 1940 1941 /* 1942 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1943 */ 1944 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1945 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1946 "%u PCC (%u ok), " 1947 "%u UC (%d ok, %u poisoned), " 1948 "%u forcefatal, %u ignored", 1949 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1950 mce.mce_npcc, mce.mce_npcc_ok, 1951 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1952 mce.mce_forcefatal, mce.mce_ignored); 1953 } 1954 1955 return (mce.mce_disp); 1956 } 1957 #endif 1958 1959 /*ARGSUSED*/ 1960 void 1961 gcpu_faulted_enter(cmi_hdl_t hdl) 1962 { 1963 /* Nothing to do here */ 1964 } 1965 1966 /*ARGSUSED*/ 1967 void 1968 gcpu_faulted_exit(cmi_hdl_t hdl) 1969 { 1970 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1971 1972 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1973 } 1974 1975 /* 1976 * Write the requested values to the indicated MSRs. Having no knowledge 1977 * of the model-specific requirements for writing to these model-specific 1978 * registers, we will only blindly write to those MSRs if the 'force' 1979 * argument is nonzero. That option should only be used in prototyping 1980 * and debugging. 1981 */ 1982 /*ARGSUSED*/ 1983 cmi_errno_t 1984 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1985 int force) 1986 { 1987 int i, errs = 0; 1988 1989 for (i = 0; i < nregs; i++) { 1990 uint_t msr = regs[i].cmr_msrnum; 1991 uint64_t val = regs[i].cmr_msrval; 1992 1993 if (cms_present(hdl)) { 1994 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1995 errs++; 1996 } else if (force) { 1997 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 1998 } else { 1999 errs++; 2000 } 2001 } 2002 2003 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 2004 } 2005