1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2018, Joyent, Inc. 25 */ 26 /* 27 * Copyright (c) 2010, Intel Corporation. 28 * All rights reserved. 29 */ 30 31 #include <sys/mca_x86.h> 32 #include <sys/cpu_module_impl.h> 33 #include <sys/cpu_module_ms.h> 34 #include <sys/cmn_err.h> 35 #include <sys/cpuvar.h> 36 #include <sys/pghw.h> 37 #include <sys/x86_archext.h> 38 #include <sys/sysmacros.h> 39 #include <sys/regset.h> 40 #include <sys/privregs.h> 41 #include <sys/systm.h> 42 #include <sys/types.h> 43 #include <sys/log.h> 44 #include <sys/psw.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/util.h> 47 #include <sys/errorq.h> 48 #include <sys/mca_x86.h> 49 #include <sys/fm/cpu/GMCA.h> 50 #include <sys/fm/smb/fmsmb.h> 51 #include <sys/sysevent.h> 52 #include <sys/ontrap.h> 53 #include <sys/smp_impldefs.h> 54 55 #include "gcpu.h" 56 57 extern int x86gentopo_legacy; /* x86 generic topology support */ 58 59 static uint_t gcpu_force_addr_in_payload = 0; 60 61 /* 62 * Clear to log telemetry found at initialization. While processor docs 63 * say you should process this telemetry on all but Intel family 0x6 64 * there are way too many exceptions and we want to avoid bogus 65 * diagnoses. 66 */ 67 int gcpu_suppress_log_on_init = 1; 68 69 /* 70 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 71 * error logout time. The stack will be included in the ereport if the 72 * error type selects stack inclusion, or in all cases if 73 * gcpu_mca_stack_ereport_include is nonzero. 74 */ 75 int gcpu_mca_stack_flag = 0; 76 int gcpu_mca_stack_ereport_include = 0; 77 78 /* 79 * The number of times to re-read MCA telemetry to try to obtain a 80 * consistent snapshot if we find it to be changing under our feet. 81 */ 82 int gcpu_mca_telemetry_retries = 5; 83 84 #ifndef __xpv 85 int gcpu_mca_cmci_throttling_threshold = 10; 86 int gcpu_mca_cmci_reenable_threshold = 1000; 87 88 /* 89 * This is used to determine whether or not we have registered the CMCI CPU 90 * setup function. This is protected by cpu_lock. 91 */ 92 static boolean_t gcpu_mca_cpu_registered = B_FALSE; 93 #endif 94 95 static gcpu_error_disp_t gcpu_errtypes[] = { 96 97 /* 98 * Unclassified 99 */ 100 { 101 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 102 NULL, 103 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 104 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 105 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 106 }, 107 108 /* 109 * Microcode ROM Parity Error 110 */ 111 { 112 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 113 NULL, 114 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 115 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 116 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 117 }, 118 119 /* 120 * External - BINIT# from another processor during power-on config 121 */ 122 { 123 FM_EREPORT_CPU_GENERIC_EXTERNAL, 124 NULL, 125 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 126 MCAX86_SIMPLE_EXTERNAL_MASKON, 127 MCAX86_SIMPLE_EXTERNAL_MASKOFF 128 }, 129 130 /* 131 * Functional redundancy check master/slave error 132 */ 133 { 134 FM_EREPORT_CPU_GENERIC_FRC, 135 NULL, 136 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 137 MCAX86_SIMPLE_FRC_MASKON, 138 MCAX86_SIMPLE_FRC_MASKOFF 139 }, 140 141 /* 142 * Internal parity error 143 */ 144 { 145 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 146 NULL, 147 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 148 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 149 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 150 }, 151 152 153 /* 154 * Internal timer error 155 */ 156 { 157 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 158 NULL, 159 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 160 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 161 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 162 }, 163 164 /* 165 * Internal unclassified 166 */ 167 { 168 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 169 NULL, 170 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 171 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 172 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 173 }, 174 175 /* 176 * Compound error codes - generic memory hierarchy 177 */ 178 { 179 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 180 NULL, 181 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 182 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 183 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 184 }, 185 186 /* 187 * Compound error codes - TLB errors 188 */ 189 { 190 FM_EREPORT_CPU_GENERIC_TLB, 191 "%1$s" "TLB" "%2$s" "_ERR", 192 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 193 MCAX86_COMPOUND_TLB_MASKON, 194 MCAX86_COMPOUND_TLB_MASKOFF 195 }, 196 197 /* 198 * Compound error codes - memory hierarchy 199 */ 200 { 201 FM_EREPORT_CPU_GENERIC_MEMHIER, 202 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 203 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 204 MCAX86_COMPOUND_MEMHIER_MASKON, 205 MCAX86_COMPOUND_MEMHIER_MASKOFF 206 }, 207 208 /* 209 * Compound error codes - bus and interconnect errors 210 */ 211 { 212 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 213 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 214 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 215 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 216 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 217 }, 218 /* 219 * Compound error codes - memory controller errors 220 */ 221 { 222 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 223 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 224 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 225 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 226 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 227 }, 228 }; 229 230 static gcpu_error_disp_t gcpu_unknown = { 231 FM_EREPORT_CPU_GENERIC_UNKNOWN, 232 "UNKNOWN", 233 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 234 0, 235 0 236 }; 237 238 static errorq_t *gcpu_mca_queue; 239 static kmutex_t gcpu_mca_queue_lock; 240 241 #ifdef __xpv 242 static int isxpv = 1; 243 #else 244 static int isxpv = 0; 245 #endif 246 247 static const gcpu_error_disp_t * 248 gcpu_disp_match(uint16_t code) 249 { 250 const gcpu_error_disp_t *ged = gcpu_errtypes; 251 int i; 252 253 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 254 i++, ged++) { 255 uint16_t on = ged->ged_errcode_mask_on; 256 uint16_t off = ged->ged_errcode_mask_off; 257 258 if ((code & on) == on && (code & off) == 0) 259 return (ged); 260 } 261 262 return (NULL); 263 } 264 265 static uint16_t 266 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 267 { 268 return ((code & mask) >> shift); 269 } 270 271 #define BIT_STRIP(code, name) \ 272 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 273 MCAX86_ERRCODE_##name##_SHIFT) 274 275 #define GCPU_MNEMONIC_UNDEF "undefined" 276 #define GCPU_MNEMONIC_RESVD "reserved" 277 278 /* 279 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 280 * mnemonics and to ereport class name components. 281 */ 282 283 struct gcpu_mnexp { 284 const char *mne_compound; /* used in expanding compound errname */ 285 const char *mne_ereport; /* used in expanding ereport class */ 286 }; 287 288 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 289 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 290 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 291 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 292 { GCPU_MNEMONIC_UNDEF, "" } 293 }; 294 295 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 296 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 297 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 298 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 299 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 300 }; 301 302 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 303 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 304 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 305 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 306 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 307 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 308 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 309 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 310 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 311 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 312 }; 313 314 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 315 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 316 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 317 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 318 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 319 }; 320 321 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 322 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 323 { GCPU_MNEMONIC_RESVD, "" }, 324 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 325 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 326 }; 327 328 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 329 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 330 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 331 }; 332 333 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 334 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 335 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 336 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 337 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 338 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 339 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 340 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 341 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 342 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 343 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 344 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 345 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 346 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 347 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 348 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 349 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 350 }; 351 352 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 353 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 354 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 355 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 356 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 357 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 358 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 359 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 360 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 361 }; 362 363 enum gcpu_mn_namespace { 364 GCPU_MN_NAMESPACE_COMPOUND, 365 GCPU_MN_NAMESPACE_EREPORT 366 }; 367 368 static const char * 369 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val, 370 enum gcpu_mn_namespace nspace) 371 { 372 if (val >= tbl_sz || val > 0xff) 373 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 374 375 switch (nspace) { 376 case GCPU_MN_NAMESPACE_COMPOUND: 377 return (tbl[val].mne_compound); 378 /*NOTREACHED*/ 379 380 case GCPU_MN_NAMESPACE_EREPORT: 381 return (tbl[val].mne_ereport); 382 /*NOTREACHED*/ 383 384 default: 385 return (GCPU_MNEMONIC_UNDEF); 386 /*NOTREACHED*/ 387 } 388 } 389 390 /* 391 * The ereport class leaf component is either a simple string with no 392 * format specifiers, or a string with one or more embedded %n$s specifiers - 393 * positional selection for string arguments. The kernel snprintf does 394 * not support %n$ (and teaching it to do so is too big a headache) so 395 * we will expand this restricted format string ourselves. 396 */ 397 398 #define GCPU_CLASS_VARCOMPS 9 399 400 #define GCPU_MNEMONIC(code, name, nspace) \ 401 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 402 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 403 BIT_STRIP(code, name), nspace) 404 405 static void 406 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 407 enum gcpu_mn_namespace nspace) 408 { 409 uint16_t code = MCAX86_ERRCODE(status); 410 const char *mn[GCPU_CLASS_VARCOMPS]; 411 char *p = buf; /* current position in buf */ 412 char *q = buf + buflen; /* pointer past last char in buf */ 413 int which, expfmtchar, error; 414 char c; 415 416 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 417 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 418 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 419 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 420 mn[4] = GCPU_MNEMONIC(code, II, nspace); 421 mn[5] = GCPU_MNEMONIC(code, T, nspace); 422 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 423 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 424 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 425 426 while (p < q - 1 && (c = *fmt++) != '\0') { 427 if (c != '%') { 428 /* not the beginning of a format specifier - copy */ 429 *p++ = c; 430 continue; 431 } 432 433 error = 0; 434 which = -1; 435 expfmtchar = -1; 436 437 nextfmt: 438 if ((c = *fmt++) == '\0') 439 break; /* early termination of fmt specifier */ 440 441 switch (c) { 442 case '1': 443 case '2': 444 case '3': 445 case '4': 446 case '5': 447 case '6': 448 case '7': 449 case '8': 450 case '9': 451 if (which != -1) { /* allow only one positional digit */ 452 error++; 453 break; 454 } 455 which = c - '1'; 456 goto nextfmt; 457 /*NOTREACHED*/ 458 459 case '$': 460 if (which == -1) { /* no position specified */ 461 error++; 462 break; 463 } 464 expfmtchar = 's'; 465 goto nextfmt; 466 /*NOTREACHED*/ 467 468 case 's': 469 if (expfmtchar != 's') { 470 error++; 471 break; 472 } 473 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 474 mn[which]); 475 p += strlen(p); 476 break; 477 478 default: 479 error++; 480 break; 481 } 482 483 if (error) 484 break; 485 } 486 487 *p = '\0'; /* NUL termination */ 488 } 489 490 static void 491 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 492 const char *cpuclass, const char *leafclass) 493 { 494 char *p = buf; /* current position in buf */ 495 char *q = buf + buflen; /* pointer past last char in buf */ 496 497 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 498 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 499 500 p += strlen(p); 501 if (p >= q) 502 return; 503 504 if (leafclass == NULL) { 505 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 506 GCPU_MN_NAMESPACE_EREPORT); 507 } else { 508 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 509 leafclass); 510 } 511 } 512 513 /* 514 * Create an "hc" scheme FMRI identifying the given cpu with 515 * motherboard/chip/core/strand instance numbers. 516 */ 517 static nvlist_t * 518 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 519 { 520 nvlist_t *nvl, *fmri; 521 522 if ((nvl = fm_nvlist_create(nva)) == NULL) 523 return (NULL); 524 525 if (!x86gentopo_legacy) { 526 fmri = cmi_hdl_smb_bboard(hdl); 527 if (fmri == NULL) 528 return (NULL); 529 530 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 531 NULL, NULL, fmri, 3, 532 "chip", cmi_hdl_smb_chipid(hdl), 533 "core", cmi_hdl_coreid(hdl), 534 "strand", cmi_hdl_strandid(hdl)); 535 } else { 536 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 537 "motherboard", 0, 538 "chip", cmi_hdl_chipid(hdl), 539 "core", cmi_hdl_coreid(hdl), 540 "strand", cmi_hdl_strandid(hdl)); 541 } 542 543 return (nvl); 544 } 545 546 int gcpu_bleat_count_thresh = 5; 547 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 548 549 /* 550 * Called when we are unable to propogate a logout structure onto an 551 * errorq for subsequent ereport preparation and logging etc. The caller 552 * should usually only decide to call this for severe errors - those we 553 * suspect we may need to panic for. 554 */ 555 static void 556 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 557 { 558 hrtime_t now = gethrtime_waitfree(); 559 static hrtime_t gcpu_last_bleat; 560 gcpu_bank_logout_t *gbl; 561 static int bleatcount; 562 int i; 563 564 /* 565 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 566 * can come as fast as we like, but once we've spammed that many 567 * to the console we require a minimum interval to pass before 568 * any more complaints. 569 */ 570 if (++bleatcount > gcpu_bleat_count_thresh) { 571 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 572 return; 573 else 574 bleatcount = 0; 575 } 576 gcpu_last_bleat = now; 577 578 cmn_err(CE_WARN, 579 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 580 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 581 cmi_hdl_strandid(hdl)); 582 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 583 (u_longlong_t)gcl->gcl_mcg_status); 584 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 585 uint64_t status = gbl->gbl_status; 586 587 if (!(status & MSR_MC_STATUS_VAL)) 588 continue; 589 590 /* Force ADDRV for AMD Family 0xf and above */ 591 if (gcpu_force_addr_in_payload) 592 status = status | MSR_MC_STATUS_ADDRV; 593 594 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 595 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 596 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 597 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 598 i, IA32_MSR_MC(i, STATUS), 599 (u_longlong_t)gbl->gbl_status, 600 (u_longlong_t)gbl->gbl_addr, 601 (u_longlong_t)gbl->gbl_misc); 602 break; 603 604 case MSR_MC_STATUS_ADDRV: 605 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 606 "STAT 0x%016llx ADDR 0x%016llx", 607 i, IA32_MSR_MC(i, STATUS), 608 (u_longlong_t)gbl->gbl_status, 609 (u_longlong_t)gbl->gbl_addr); 610 break; 611 612 case MSR_MC_STATUS_MISCV: 613 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 614 "STAT 0x%016llx MISC 0x%016llx", 615 i, IA32_MSR_MC(i, STATUS), 616 (u_longlong_t)gbl->gbl_status, 617 (u_longlong_t)gbl->gbl_misc); 618 break; 619 620 default: 621 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 622 "STAT 0x%016llx", 623 i, IA32_MSR_MC(i, STATUS), 624 (u_longlong_t)gbl->gbl_status); 625 break; 626 627 } 628 } 629 } 630 631 #define _GCPU_BSTATUS(status, what) \ 632 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 633 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 634 635 static void 636 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 637 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 638 { 639 uint64_t members = ged ? ged->ged_ereport_members : 640 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 641 uint64_t mcg = gcl->gcl_mcg_status; 642 int mcip = mcg & MCG_STATUS_MCIP; 643 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 644 uint64_t bstat = gbl->gbl_status; 645 646 /* 647 * Include the compound error name if requested and if this 648 * is a compound error type. 649 */ 650 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 651 ged->ged_compound_fmt != NULL) { 652 char buf[FM_MAX_CLASS]; 653 654 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 655 GCPU_MN_NAMESPACE_COMPOUND); 656 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 657 DATA_TYPE_STRING, buf, NULL); 658 } 659 660 /* 661 * Include disposition information for this error 662 */ 663 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 664 gbl->gbl_disp != 0) { 665 int i, empty = 1; 666 char buf[128]; 667 char *p = buf, *q = buf + 128; 668 static struct _gcpu_disp_name { 669 uint64_t dv; 670 const char *dn; 671 } disp_names[] = { 672 { CMI_ERRDISP_CURCTXBAD, 673 "processor_context_corrupt" }, 674 { CMI_ERRDISP_RIPV_INVALID, 675 "return_ip_invalid" }, 676 { CMI_ERRDISP_UC_UNCONSTRAINED, 677 "unconstrained" }, 678 { CMI_ERRDISP_FORCEFATAL, 679 "forcefatal" }, 680 { CMI_ERRDISP_IGNORED, 681 "ignored" }, 682 { CMI_ERRDISP_PCC_CLEARED, 683 "corrupt_context_cleared" }, 684 { CMI_ERRDISP_UC_CLEARED, 685 "uncorrected_data_cleared" }, 686 { CMI_ERRDISP_POISONED, 687 "poisoned" }, 688 { CMI_ERRDISP_INCONSISTENT, 689 "telemetry_unstable" }, 690 }; 691 692 for (i = 0; i < sizeof (disp_names) / 693 sizeof (struct _gcpu_disp_name); i++) { 694 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 695 continue; 696 697 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 698 "%s%s", empty ? "" : ",", disp_names[i].dn); 699 p += strlen(p); 700 empty = 0; 701 } 702 703 if (p != buf) 704 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 705 DATA_TYPE_STRING, buf, NULL); 706 } 707 708 /* 709 * If MCG_STATUS is included add that and an indication of whether 710 * this ereport was the result of a machine check or poll. 711 */ 712 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 713 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 714 DATA_TYPE_UINT64, mcg, NULL); 715 716 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 717 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 718 } 719 720 /* 721 * If an instruction pointer is to be included add one provided 722 * MCG_STATUS indicated it is valid; meaningless for polled events. 723 */ 724 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 725 mcg & MCG_STATUS_EIPV) { 726 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 727 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 728 } 729 730 /* 731 * Add an indication of whether the trap occured during privileged code. 732 */ 733 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 734 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 735 DATA_TYPE_BOOLEAN_VALUE, 736 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 737 } 738 739 /* 740 * If requested, add the index of the MCA bank. This indicates the 741 * n'th bank of 4 MCA registers, and does not necessarily correspond 742 * to MCi_* - use the bank offset to correlate 743 */ 744 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 745 fm_payload_set(ereport, 746 /* Bank number */ 747 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 748 /* Offset of MCi_CTL */ 749 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 750 IA32_MSR_MC(bankno, CTL), 751 NULL); 752 } 753 754 /* 755 * Add MCi_STATUS if requested, and decode it. 756 */ 757 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 758 const char *tbes[] = { 759 "No tracking", /* 00 */ 760 "Green - below threshold", /* 01 */ 761 "Yellow - above threshold", /* 10 */ 762 "Reserved" /* 11 */ 763 }; 764 765 fm_payload_set(ereport, 766 /* Bank MCi_STATUS */ 767 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 768 /* Overflow? */ 769 _GCPU_BSTATUS(bstat, OVER), 770 /* Uncorrected? */ 771 _GCPU_BSTATUS(bstat, UC), 772 /* Enabled? */ 773 _GCPU_BSTATUS(bstat, EN), 774 /* Processor context corrupt? */ 775 _GCPU_BSTATUS(bstat, PCC), 776 /* Error code */ 777 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 778 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 779 /* Model-specific error code */ 780 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 781 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 782 NULL); 783 784 /* 785 * If MCG_CAP.TES_P indicates that that thresholding info 786 * is present in the architural component of the bank status 787 * then include threshold information for this bank. 788 */ 789 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 790 fm_payload_set(ereport, 791 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 792 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 793 NULL); 794 } 795 } 796 797 /* 798 * Add MCi_ADDR info if requested and valid. We force addition of 799 * MCi_ADDR, even if its not valid on AMD family 0xf and above, 800 * to aid in analysis of ereports, for WatchDog errors. 801 */ 802 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 803 ((bstat & MSR_MC_STATUS_ADDRV) || 804 gcpu_force_addr_in_payload)) { 805 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 806 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 807 } 808 809 /* 810 * MCi_MISC if requested and MCi_STATUS.MISCV). 811 */ 812 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 813 bstat & MSR_MC_STATUS_MISCV) { 814 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 815 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 816 } 817 818 } 819 820 /* 821 * Construct and post an ereport based on the logout information from a 822 * single MCA bank. We are not necessarily running on the cpu that 823 * detected the error. 824 */ 825 static void 826 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 827 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 828 { 829 gcpu_data_t *gcpu = gcl->gcl_gcpu; 830 cmi_hdl_t hdl = gcpu->gcpu_hdl; 831 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 832 const char *cpuclass = NULL, *leafclass = NULL; 833 uint16_t code = MCAX86_ERRCODE(status); 834 errorq_elem_t *eqep, *scr_eqep; 835 nvlist_t *ereport, *detector; 836 char buf[FM_MAX_CLASS]; 837 const char *classfmt; 838 nv_alloc_t *nva; 839 840 if (panicstr) { 841 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 842 return; 843 ereport = errorq_elem_nvl(ereport_errorq, eqep); 844 845 /* 846 * Allocate another element for scratch space, but fallback 847 * to the one we have if that fails. We'd like to use the 848 * additional scratch space for nvlist construction. 849 */ 850 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 851 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 852 else 853 nva = errorq_elem_nva(ereport_errorq, eqep); 854 } else { 855 ereport = fm_nvlist_create(NULL); 856 nva = NULL; 857 } 858 859 if (ereport == NULL) 860 return; 861 862 /* 863 * Common payload data required by the protocol: 864 * - ereport class 865 * - detector 866 * - ENA 867 */ 868 869 /* 870 * Ereport class - call into model-specific support to allow it to 871 * provide a cpu class or leaf class, otherwise calculate our own. 872 */ 873 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 874 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 875 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 876 leafclass); 877 878 /* 879 * The detector FMRI. 880 */ 881 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 882 nva)) == NULL) 883 detector = gcpu_fmri_create(hdl, nva); 884 885 /* 886 * Should we define a new ENA format 3?? for chip/core/strand? 887 * It will be better when virtualized. 888 */ 889 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 890 fm_ena_generate_cpu(gcl->gcl_timestamp, 891 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 892 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 893 894 if (panicstr) { 895 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 896 nv_alloc_reset(nva); 897 } else { 898 fm_nvlist_destroy(detector, FM_NVA_FREE); 899 } 900 901 /* 902 * Add the architectural ereport class-specific payload data. 903 */ 904 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 905 906 /* 907 * Allow model-specific code to add ereport members. 908 */ 909 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 910 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 911 912 /* 913 * Include stack if options is turned on and either selected in 914 * the payload member bitmask or inclusion is forced. 915 */ 916 if (gcpu_mca_stack_flag && 917 (cms_ereport_includestack(hdl, mscookie) == 918 B_TRUE || gcpu_mca_stack_ereport_include)) { 919 fm_payload_stack_add(ereport, gcl->gcl_stack, 920 gcl->gcl_stackdepth); 921 } 922 923 /* 924 * If injection has taken place anytime in the past then note this 925 * on the ereport. 926 */ 927 if (cmi_inj_tainted() == B_TRUE) { 928 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 929 B_TRUE, NULL); 930 } 931 932 /* 933 * Post ereport. 934 */ 935 if (panicstr) { 936 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 937 if (scr_eqep) 938 errorq_cancel(ereport_errorq, scr_eqep); 939 } else { 940 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 941 fm_nvlist_destroy(ereport, FM_NVA_FREE); 942 } 943 944 } 945 946 /*ARGSUSED*/ 947 void 948 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 949 { 950 const gcpu_logout_t *gcl = data; 951 const gcpu_bank_logout_t *gbl; 952 int ismc; 953 int i; 954 955 ismc = gcl->ismc; 956 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 957 const gcpu_error_disp_t *gened; 958 cms_cookie_t mscookie; 959 960 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 961 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 962 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 963 964 /* 965 * Perform a match based on IA32 MCA architectural 966 * components alone. 967 */ 968 gened = gcpu_disp_match(code); /* may be NULL */ 969 970 /* 971 * Now see if an model-specific match can be made. 972 */ 973 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc, 974 i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 975 gcl->gcl_ms_logout); 976 977 /* 978 * Prepare and dispatch an ereport for logging and 979 * diagnosis. 980 */ 981 gcpu_ereport_post(gcl, i, gened, mscookie, 982 gbl->gbl_status); 983 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 984 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 985 /* 986 * Telemetry kept changing as we tried to read 987 * it. Force an unknown ereport leafclass but 988 * keep the telemetry unchanged for logging. 989 */ 990 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 991 gbl->gbl_status); 992 } 993 } 994 } 995 996 static size_t gcpu_mca_queue_datasz = 0; 997 998 /* 999 * The following code is ready to make a weak attempt at growing the 1000 * errorq structure size. Since it is not foolproof (we don't know 1001 * who may already be producing to the outgoing errorq) our caller 1002 * instead assures that we'll always be called with no greater data 1003 * size than on our first call. 1004 */ 1005 static void 1006 gcpu_errorq_init(size_t datasz) 1007 { 1008 int slots; 1009 1010 mutex_enter(&gcpu_mca_queue_lock); 1011 1012 if (gcpu_mca_queue_datasz >= datasz) { 1013 mutex_exit(&gcpu_mca_queue_lock); 1014 return; 1015 } 1016 1017 membar_producer(); 1018 if (gcpu_mca_queue) { 1019 gcpu_mca_queue_datasz = 0; 1020 errorq_destroy(gcpu_mca_queue); 1021 } 1022 1023 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1024 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1025 1026 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1027 NULL, slots, datasz, 1, ERRORQ_VITAL); 1028 1029 if (gcpu_mca_queue != NULL) 1030 gcpu_mca_queue_datasz = datasz; 1031 1032 mutex_exit(&gcpu_mca_queue_lock); 1033 } 1034 1035 /* 1036 * Perform MCA initialization as described in section 14.6 of Intel 64 1037 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1038 */ 1039 1040 static uint_t global_nbanks; 1041 1042 #ifndef __xpv 1043 /*ARGSUSED*/ 1044 int 1045 gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg) 1046 { 1047 /* 1048 * In general, we'd expect that in a multi-socket configuration, either 1049 * all CPUs would support CMCI or none of them would. Unfortunately, 1050 * that may not be the case in the wild. While we'd rather check the 1051 * handle's enablement state here, that itself is a bit complicated. We 1052 * don't have a guarantee in a heterogenous situation that the CPU in 1053 * question is using the generic CPU module or not, even though we've 1054 * been registered. As such, we allow the interrupt to be registered and 1055 * written to the local apic anyways. We won't have a CMCI interrupt 1056 * generated anyways because the MCA banks will not be programmed as 1057 * such for that CPU by the polling thread. 1058 */ 1059 switch (what) { 1060 case CPU_ON: 1061 psm_cmci_setup(cpuid, B_TRUE); 1062 break; 1063 case CPU_OFF: 1064 psm_cmci_setup(cpuid, B_FALSE); 1065 break; 1066 default: 1067 break; 1068 } 1069 1070 return (0); 1071 } 1072 1073 void 1074 gcpu_mca_cmci_enable(cmi_hdl_t hdl) 1075 { 1076 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1077 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1078 1079 /* 1080 * If this CPU doesn't support CMCI, don't do anything. 1081 */ 1082 if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0) 1083 return; 1084 1085 /* 1086 * If we don't have support from the PSM module, then there's nothing we 1087 * can do. Note that this changes as we start up the system. The only 1088 * case where it may be mistakenly NULL is for the boot CPU. The boot 1089 * CPU will have this taken care of for it in gcpu_post_startup(), once 1090 * we know for certain whether or not the PSM module supports CMCI. 1091 */ 1092 if (psm_cmci_setup == NULL) { 1093 return; 1094 } 1095 1096 mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE; 1097 if (MUTEX_HELD(&cpu_lock)) { 1098 if (!gcpu_mca_cpu_registered) { 1099 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL); 1100 gcpu_mca_cpu_registered = B_TRUE; 1101 } 1102 } else { 1103 mutex_enter(&cpu_lock); 1104 if (!gcpu_mca_cpu_registered) { 1105 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL); 1106 gcpu_mca_cpu_registered = B_TRUE; 1107 } 1108 mutex_exit(&cpu_lock); 1109 } 1110 1111 /* 1112 * Call the PSM op to make sure that we initialize things on 1113 * this CPU. 1114 */ 1115 psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE); 1116 } 1117 #endif /* !__xpv */ 1118 1119 void 1120 gcpu_mca_init(cmi_hdl_t hdl) 1121 { 1122 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1123 uint64_t cap; 1124 uint_t vendor = cmi_hdl_vendor(hdl); 1125 uint_t family = cmi_hdl_family(hdl); 1126 uint_t rev = cmi_hdl_chiprev(hdl); 1127 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1128 int mcg_ctl_present; 1129 uint_t nbanks; 1130 uint32_t ctl_skip_mask = 0; 1131 uint32_t status_skip_mask = 0; 1132 size_t mslsz; 1133 int i; 1134 #ifndef __xpv 1135 int mcg_ctl2_present; 1136 uint32_t cmci_capable = 0; 1137 #endif 1138 if (gcpu == NULL) 1139 return; 1140 1141 /* We add MCi_ADDR always for AMD Family 0xf and above */ 1142 if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B)) 1143 gcpu_force_addr_in_payload = 1; 1144 1145 /* 1146 * Protect from some silly /etc/system settings. 1147 */ 1148 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1149 gcpu_mca_telemetry_retries = 5; 1150 1151 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1152 return; 1153 1154 /* 1155 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 1156 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 1157 * processors, which have their own more primitive way of doing 1158 * machine checks, will not have cmi_mca_init called since their 1159 * CPUID information will not indicate both MCA and MCE features. 1160 */ 1161 ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA)); 1162 1163 /* 1164 * Determine whether the IA32_MCG_CTL register is present. If it 1165 * is we will enable all features by writing -1 to it towards 1166 * the end of this initialization; if it is absent then volume 3A 1167 * says we must nonetheless continue to initialize the individual 1168 * banks. 1169 */ 1170 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1171 #ifndef __xpv 1172 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1173 #endif 1174 1175 /* 1176 * We squirell values away for inspection/debugging. 1177 */ 1178 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1179 if (mcg_ctl_present) 1180 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1181 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1182 1183 /* 1184 * Determine the number of error-reporting banks implemented. 1185 */ 1186 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1187 1188 if (nbanks != 0 && global_nbanks == 0) 1189 global_nbanks = nbanks; /* no race - BSP will get here first */ 1190 1191 /* 1192 * If someone is hiding the number of banks (perhaps we are fully 1193 * virtualized?) or if this processor has more banks than the 1194 * first to set global_nbanks then bail. The latter requirement 1195 * is because we need to size our errorq data structure and we 1196 * don't want to have to grow the errorq (destroy and recreate) 1197 * which may just lose some telemetry. 1198 */ 1199 if (nbanks == 0 || nbanks > global_nbanks) 1200 return; 1201 1202 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1203 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1204 1205 /* 1206 * Calculate the size we need to allocate for a gcpu_logout_t 1207 * with a gcl_data array big enough for all banks of this cpu. 1208 * Add any space requested by the model-specific logout support. 1209 */ 1210 mslsz = cms_logout_size(hdl); 1211 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1212 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1213 1214 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1215 gcpu_logout_t *gcl; 1216 1217 mca->gcpu_mca_logout[i] = gcl = 1218 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1219 gcl->gcl_gcpu = gcpu; 1220 gcl->gcl_nbanks = nbanks; 1221 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1222 (char *)(&gcl->gcl_data[0]) + nbanks * 1223 sizeof (gcpu_bank_logout_t); 1224 1225 } 1226 1227 #ifdef __xpv 1228 gcpu_xpv_mca_init(nbanks); 1229 #endif 1230 1231 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1232 1233 #ifndef __xpv 1234 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1235 KM_SLEEP); 1236 #endif 1237 1238 /* 1239 * Create our errorq to transport the logout structures. This 1240 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1241 */ 1242 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1243 1244 /* 1245 * Not knowing which, if any, banks are shared between cores we 1246 * assure serialization of MCA bank initialization by each cpu 1247 * on the chip. On chip architectures in which some banks are 1248 * shared this will mean the shared resource is initialized more 1249 * than once - we're simply aiming to avoid simultaneous MSR writes 1250 * to the shared resource. 1251 * 1252 * Even with these precautions, some platforms may yield a GP fault 1253 * if a core other than a designated master tries to write anything 1254 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1255 * those writes under on_trap protection. 1256 */ 1257 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1258 1259 /* 1260 * Initialize poller data, but don't start polling yet. 1261 */ 1262 gcpu_mca_poll_init(hdl); 1263 1264 /* 1265 * Work out which MCA banks we will initialize. In MCA logout 1266 * code we will only read those banks which we initialize here. 1267 */ 1268 for (i = 0; i < nbanks; i++) { 1269 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1270 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1271 1272 if (!cms_present(hdl)) { 1273 /* 1274 * Model-specific support is not present, try to use 1275 * sane defaults. 1276 * 1277 * On AMD family 6 processors, reports about spurious 1278 * machine checks indicate that bank 0 should be 1279 * skipped. 1280 * 1281 * On Intel family 6 processors, the documentation tells 1282 * us not to write to MC0_CTL. 1283 * 1284 */ 1285 if (i == 0 && family == 6) { 1286 switch (vendor) { 1287 case X86_VENDOR_AMD: 1288 skipstatus = B_TRUE; 1289 /*FALLTHRU*/ 1290 case X86_VENDOR_Intel: 1291 skipctl = B_TRUE; 1292 break; 1293 } 1294 } 1295 } 1296 1297 ctl_skip_mask |= skipctl << i; 1298 status_skip_mask |= skipstatus << i; 1299 1300 if (skipctl && skipstatus) 1301 continue; 1302 1303 /* 1304 * Record which MCA banks were enabled, from the point of view 1305 * of the whole chip (if some cores share a bank we must be 1306 * sure either can logout from it). 1307 */ 1308 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1309 1310 #ifndef __xpv 1311 /* 1312 * check CMCI capability 1313 */ 1314 if (mcg_ctl2_present) { 1315 uint64_t ctl2; 1316 uint32_t cap = 0; 1317 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1318 if (ctl2 & MSR_MC_CTL2_EN) 1319 continue; 1320 ctl2 |= MSR_MC_CTL2_EN; 1321 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1322 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1323 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1324 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1325 if (cap) 1326 cmci_capable ++; 1327 /* 1328 * Set threshold to 1 while unset the en field, to avoid 1329 * CMCI trigged before APIC LVT entry init. 1330 */ 1331 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1332 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1333 1334 /* 1335 * init cmci related count 1336 */ 1337 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1338 mca->gcpu_bank_cmci[i].drtcmci = 0; 1339 mca->gcpu_bank_cmci[i].ncmci = 0; 1340 } 1341 #endif 1342 } 1343 1344 #ifndef __xpv 1345 if (cmci_capable) { 1346 mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE; 1347 gcpu_mca_cmci_enable(hdl); 1348 } 1349 #endif 1350 1351 #ifndef __xpv 1352 /* 1353 * Log any valid telemetry lurking in the MCA banks, but do not 1354 * clear the status registers. Ignore the disposition returned - 1355 * we have already paniced or reset for any nasty errors found here. 1356 * 1357 * Intel vol 3A says that we should not do this on family 0x6, 1358 * and that for any extended family the BIOS clears things 1359 * on power-on reset so you'll only potentially find valid telemetry 1360 * on warm reset (we do it for both - on power-on reset we should 1361 * just see zeroes). 1362 * 1363 * AMD docs since K7 say we should process anything we find here. 1364 */ 1365 if (!gcpu_suppress_log_on_init && 1366 (vendor == X86_VENDOR_Intel && family >= 0xf || 1367 vendor == X86_VENDOR_AMD)) 1368 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1369 GCPU_MPT_WHAT_POKE_ERR); 1370 1371 /* 1372 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1373 * model-specific module the power of veto. 1374 */ 1375 for (i = 0; i < nbanks; i++) { 1376 struct gcpu_bios_bankcfg *bcfgp = 1377 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1378 1379 /* 1380 * Stash inherited bank MCA state, even for banks we will 1381 * not initialize ourselves. Do not read the MISC register 1382 * unconditionally - on some processors that will #GP on 1383 * banks that do not implement the MISC register (would be 1384 * caught by on_trap, anyway). 1385 */ 1386 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1387 &bcfgp->bios_bank_ctl); 1388 1389 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1390 &bcfgp->bios_bank_status); 1391 1392 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) || 1393 gcpu_force_addr_in_payload) { 1394 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1395 &bcfgp->bios_bank_addr); 1396 } 1397 1398 /* 1399 * In some old BIOS the status value after boot can indicate 1400 * MISCV when there is actually no MISC register for 1401 * that bank. The following read could therefore 1402 * aggravate a general protection fault. This should be 1403 * caught by on_trap, but the #GP fault handler is busted 1404 * and can suffer a double fault even before we get to 1405 * trap() to check for on_trap protection. Until that 1406 * issue is fixed we remove the one access that we know 1407 * can cause a #GP. 1408 * 1409 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1410 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1411 * &bcfgp->bios_bank_misc); 1412 */ 1413 bcfgp->bios_bank_misc = 0; 1414 1415 if (!(ctl_skip_mask & (1 << i))) { 1416 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1417 cms_bankctl_val(hdl, i, -1ULL)); 1418 } 1419 1420 if (!(status_skip_mask & (1 << i))) { 1421 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1422 cms_bankstatus_val(hdl, i, 0ULL)); 1423 } 1424 } 1425 #endif 1426 /* 1427 * Now let the model-specific support perform further initialization 1428 * of non-architectural features. 1429 */ 1430 cms_mca_init(hdl, nbanks); 1431 1432 #ifndef __xpv 1433 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1434 membar_producer(); 1435 1436 /* enable all machine-check features */ 1437 if (mcg_ctl_present) 1438 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1439 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1440 #endif 1441 1442 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1443 1444 #ifndef __xpv 1445 /* enable machine-check exception in CR4 */ 1446 cmi_hdl_enable_mce(hdl); 1447 #endif 1448 } 1449 1450 static uint64_t 1451 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1452 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1453 { 1454 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1455 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1456 int nbanks = mca->gcpu_mca_nbanks; 1457 gcpu_mce_status_t mce; 1458 gcpu_bank_logout_t *gbl; 1459 uint64_t disp = 0; 1460 int i; 1461 1462 if (mcesp == NULL) 1463 mcesp = &mce; 1464 1465 mcesp->mce_nerr = nerr; 1466 1467 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1468 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1469 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1470 1471 /* 1472 * If this a machine check then if the return instruction pointer 1473 * is not valid the current context is lost. 1474 */ 1475 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1476 disp |= CMI_ERRDISP_RIPV_INVALID; 1477 gcl->ismc = ismc; 1478 1479 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1480 uint64_t mcistatus = gbl->gbl_status; 1481 uint32_t ms_scope; 1482 int pcc, uc; 1483 int poisoned; 1484 1485 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1486 continue; 1487 1488 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1489 continue; 1490 1491 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1492 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1493 mcesp->mce_npcc += pcc; 1494 mcesp->mce_nuc += uc; 1495 1496 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1497 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1498 1499 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1500 pcc = 0; 1501 mcesp->mce_npcc_ok++; 1502 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1503 } 1504 1505 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1506 uc = 0; 1507 mcesp->mce_nuc_ok++; 1508 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1509 } 1510 1511 if (uc) { 1512 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1513 if (poisoned) { 1514 mcesp->mce_nuc_poisoned++; 1515 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1516 } 1517 } 1518 1519 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1520 /* 1521 * We're not being instructed to ignore the error, 1522 * so apply our standard disposition logic to it. 1523 */ 1524 if (uc && !poisoned) { 1525 unconstrained++; 1526 gbl->gbl_disp |= disp | 1527 CMI_ERRDISP_UC_UNCONSTRAINED; 1528 } 1529 1530 if (pcc && ismc) { 1531 curctxbad++; 1532 gbl->gbl_disp |= disp | 1533 CMI_ERRDISP_CURCTXBAD; 1534 } 1535 1536 /* 1537 * Even if the above may not indicate that the error 1538 * is terminal, model-specific support may insist 1539 * that we treat it as such. Such errors wil be 1540 * fatal even if discovered via poll. 1541 */ 1542 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1543 forcefatal++; 1544 mcesp->mce_forcefatal++; 1545 gbl->gbl_disp |= disp | 1546 CMI_ERRDISP_FORCEFATAL; 1547 } 1548 } else { 1549 mcesp->mce_ignored++; 1550 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1551 } 1552 } 1553 1554 if (unconstrained > 0) 1555 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1556 1557 if (curctxbad > 0) 1558 disp |= CMI_ERRDISP_CURCTXBAD; 1559 1560 if (forcefatal > 0) 1561 disp |= CMI_ERRDISP_FORCEFATAL; 1562 1563 if (gcpu_mca_queue != NULL) { 1564 int how; 1565 1566 if (ismc) { 1567 how = cmi_mce_response(rp, disp) ? 1568 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1569 ERRORQ_SYNC; /* panic flow will drain */ 1570 } else { 1571 how = (disp & CMI_ERRDISP_FORCEFATAL && 1572 cmi_panic_on_ue()) ? 1573 ERRORQ_SYNC : /* poller will panic */ 1574 ERRORQ_ASYNC; /* no panic */ 1575 } 1576 1577 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1578 } else if (disp != 0) { 1579 gcpu_bleat(hdl, gcl); 1580 } 1581 1582 mcesp->mce_disp = disp; 1583 1584 return (disp); 1585 } 1586 1587 /* 1588 * Gather error telemetry from our source, and then submit it for 1589 * processing. 1590 */ 1591 1592 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1593 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1594 1595 #define STATUS_EQV(s1, s2) \ 1596 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1597 1598 static uint32_t gcpu_deferrred_polled_clears; 1599 1600 #ifndef __xpv 1601 static void 1602 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1603 uint64_t status, int what) 1604 { 1605 uint64_t ctl2; 1606 1607 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1608 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1609 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1610 1611 if (!(bank_cmci_p->cmci_enabled)) { 1612 /* 1613 * when cmci is disabled, and the bank has no error or 1614 * no corrected error for 1615 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1616 * turn on this bank's cmci. 1617 */ 1618 1619 bank_cmci_p->drtcmci ++; 1620 1621 if (bank_cmci_p->drtcmci >= 1622 gcpu_mca_cmci_reenable_threshold) { 1623 1624 /* turn on cmci */ 1625 1626 (void) cmi_hdl_rdmsr(hdl, 1627 IA32_MSR_MC_CTL2(bank), &ctl2); 1628 ctl2 |= MSR_MC_CTL2_EN; 1629 (void) cmi_hdl_wrmsr(hdl, 1630 IA32_MSR_MC_CTL2(bank), ctl2); 1631 1632 /* reset counter and set flag */ 1633 bank_cmci_p->drtcmci = 0; 1634 bank_cmci_p->cmci_enabled = 1; 1635 } 1636 } else { 1637 /* 1638 * when cmci is enabled,if is in cyclic poll and the 1639 * bank has no error or no corrected error, reset ncmci 1640 * counter 1641 */ 1642 bank_cmci_p->ncmci = 0; 1643 } 1644 } 1645 } 1646 1647 static void 1648 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1649 int what) 1650 { 1651 uint64_t ctl2 = 0; 1652 1653 /* 1654 * if cmci of this bank occurred beyond 1655 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1656 * turn off this bank's CMCI; 1657 */ 1658 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1659 1660 /* if it is cmci trap, increase the count */ 1661 bank_cmci_p->ncmci++; 1662 1663 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1664 1665 /* turn off cmci */ 1666 1667 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1668 &ctl2); 1669 ctl2 &= ~MSR_MC_CTL2_EN; 1670 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1671 ctl2); 1672 1673 /* clear the flag and count */ 1674 1675 bank_cmci_p->cmci_enabled = 0; 1676 bank_cmci_p->ncmci = 0; 1677 } 1678 } 1679 } 1680 #endif 1681 1682 static void 1683 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1684 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1685 { 1686 int i; 1687 gcpu_bank_logout_t *gbl, *pgbl; 1688 uint64_t status; 1689 1690 if (first < 0 || last < 0) 1691 return; 1692 1693 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1694 status = gbl->gbl_status; 1695 if (status == 0) 1696 continue; 1697 if (clrstatus == B_FALSE) 1698 goto serialize; 1699 1700 /* 1701 * For i86xpv we always clear status in order to invalidate 1702 * the interposed telemetry. 1703 * 1704 * For native machine checks we always clear status here. For 1705 * native polls we must be a little more cautious since there 1706 * is an outside chance that we may clear telemetry from a 1707 * shared MCA bank on which a sibling core is machine checking. 1708 * 1709 * For polled observations of errors that look like they may 1710 * produce a machine check (UC/PCC and ENabled, although these 1711 * do not guarantee a machine check on error occurence) 1712 * we will not clear the status at this wakeup unless 1713 * we saw the same status at the previous poll. We will 1714 * always process and log the current observations - it 1715 * is only the clearing of MCi_STATUS which may be 1716 * deferred until the next wakeup. 1717 */ 1718 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1719 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1720 goto serialize; 1721 } 1722 1723 /* 1724 * We have a polled observation of a machine check 1725 * candidate. If we saw essentially the same status at the 1726 * last poll then clear the status now since this appears 1727 * not to be a #MC candidate after all. If we see quite 1728 * different status now then do not clear, but reconsider at 1729 * the next poll. In no actual machine check clears 1730 * the status in the interim then the status should not 1731 * keep changing forever (meaning we'd never clear it) 1732 * since before long we'll simply have latched the highest- 1733 * priority error and set the OVerflow bit. Nonetheless 1734 * we count how many times we defer clearing and after 1735 * a while insist on clearing the status. 1736 */ 1737 pgbl = &pgcl->gcl_data[i]; 1738 if (pgbl->gbl_clrdefcnt != 0) { 1739 /* We deferred clear on this bank at last wakeup */ 1740 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1741 pgbl->gbl_clrdefcnt > 5) { 1742 /* 1743 * Status is unchanged so clear it now and, 1744 * since we have already logged this info, 1745 * avoid logging it again. 1746 */ 1747 gbl->gbl_status = 0; 1748 (void) cmi_hdl_wrmsr(hdl, 1749 IA32_MSR_MC(i, STATUS), 0ULL); 1750 } else { 1751 /* Record deferral for next wakeup */ 1752 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1753 } 1754 } else { 1755 /* Record initial deferral for next wakeup */ 1756 gbl->gbl_clrdefcnt = 1; 1757 gcpu_deferrred_polled_clears++; 1758 } 1759 1760 serialize: 1761 { 1762 #ifdef __xpv 1763 ; 1764 #else 1765 /* 1766 * Intel Vol 3A says to execute a serializing 1767 * instruction here, ie CPUID. Well WRMSR is also 1768 * defined to be serializing, so the status clear above 1769 * should suffice. To be a good citizen, and since 1770 * some clears are deferred, we'll execute a CPUID 1771 * instruction here. 1772 */ 1773 struct cpuid_regs tmp; 1774 (void) __cpuid_insn(&tmp); 1775 #endif 1776 } 1777 } 1778 } 1779 1780 /*ARGSUSED5*/ 1781 void 1782 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1783 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1784 { 1785 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1786 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1787 int nbanks = mca->gcpu_mca_nbanks; 1788 gcpu_bank_logout_t *gbl, *pgbl; 1789 gcpu_logout_t *gcl, *pgcl; 1790 int ismc = (rp != NULL); 1791 int ispoll = !ismc; 1792 int i, nerr = 0; 1793 cmi_errno_t err; 1794 uint64_t mcg_status; 1795 uint64_t disp; 1796 uint64_t cap; 1797 int first = -1; 1798 int last = -1; 1799 int willpanic = 0; 1800 1801 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1802 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1803 CMI_SUCCESS) { 1804 if (mcesp != NULL) 1805 mcesp->mce_nerr = mcesp->mce_disp = 0; 1806 return; 1807 } 1808 1809 if (ismc) { 1810 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1811 } else { 1812 int pidx = mca->gcpu_mca_nextpoll_idx; 1813 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1814 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1815 1816 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1817 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1818 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1819 } 1820 1821 gcl->gcl_timestamp = gethrtime_waitfree(); 1822 gcl->gcl_mcg_status = mcg_status; 1823 gcl->gcl_ip = rp ? rp->r_pc : 0; 1824 1825 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1826 if (cap & MCG_CAP_TES_P) 1827 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1828 1829 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1830 uint64_t status, status2, addr, misc; 1831 int retries = gcpu_mca_telemetry_retries; 1832 1833 gbl->gbl_status = 0; 1834 gbl->gbl_disp = 0; 1835 gbl->gbl_clrdefcnt = 0; 1836 1837 /* 1838 * Only logout from MCA banks we have initialized from at 1839 * least one core. If a core shares an MCA bank with another 1840 * but perhaps lost the race to initialize it, then it must 1841 * still be allowed to logout from the shared bank. 1842 */ 1843 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1844 continue; 1845 1846 /* 1847 * On a poll look only at the banks we've been asked to check. 1848 */ 1849 if (rp == NULL && !(bankmask & 1 << i)) 1850 continue; 1851 1852 1853 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1854 CMI_SUCCESS) 1855 continue; 1856 1857 #ifndef __xpv 1858 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1859 #endif 1860 1861 retry: 1862 if (!(status & MSR_MC_STATUS_VAL)) 1863 continue; 1864 1865 /* First and last bank that have valid status */ 1866 if (first < 0) 1867 first = i; 1868 last = i; 1869 1870 addr = -1; 1871 misc = 0; 1872 1873 if ((status & MSR_MC_STATUS_ADDRV) || 1874 gcpu_force_addr_in_payload) 1875 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1876 1877 if (status & MSR_MC_STATUS_MISCV) 1878 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1879 1880 #ifndef __xpv 1881 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1882 #endif 1883 1884 /* 1885 * Allow the model-specific code to extract bank telemetry. 1886 */ 1887 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1888 1889 /* 1890 * Not all cpu models assure us that the status/address/misc 1891 * data will not change during the above sequence of MSR reads, 1892 * or that it can only change by the addition of the OVerflow 1893 * bit to the status register. If the status has changed 1894 * other than in the overflow bit then we attempt to reread 1895 * for a consistent snapshot, but eventually give up and 1896 * go with what we've got. We only perform this check 1897 * for a poll - a further #MC during a #MC will reset, and 1898 * polled errors should not overwrite higher-priority 1899 * trapping errors (but could set the overflow bit). 1900 */ 1901 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1902 &status2)) == CMI_SUCCESS) { 1903 if (!STATUS_EQV(status, status2)) { 1904 if (retries-- > 0) { 1905 status = status2; 1906 goto retry; 1907 } else { 1908 gbl->gbl_disp |= 1909 CMI_ERRDISP_INCONSISTENT; 1910 } 1911 } 1912 } else if (ispoll && err != CMI_SUCCESS) { 1913 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1914 } 1915 1916 nerr++; 1917 gbl->gbl_status = status; 1918 gbl->gbl_addr = addr; 1919 gbl->gbl_misc = misc; 1920 1921 /* 1922 * For polled observation, if the count of deferred status 1923 * clears updated in the clear_mc() is nonzero and the 1924 * MCi_STATUS has not changed, the last wakeup has produced 1925 * the ereport of the error. Therefore, clear the status in 1926 * this wakeup to avoid duplicate ereport. 1927 */ 1928 pgbl = &pgcl->gcl_data[i]; 1929 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1930 pgbl->gbl_clrdefcnt != 0) { 1931 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1932 gbl->gbl_status = 0; 1933 (void) cmi_hdl_wrmsr(hdl, 1934 IA32_MSR_MC(i, STATUS), 0ULL); 1935 } 1936 } 1937 } 1938 1939 if (gcpu_mca_stack_flag) 1940 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1941 else 1942 gcl->gcl_stackdepth = 0; 1943 1944 /* 1945 * Decide our disposition for this error or errors, and submit for 1946 * logging and subsequent diagnosis. 1947 */ 1948 if (nerr != 0) { 1949 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1950 1951 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1952 1953 if (!willpanic) 1954 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1955 } else { 1956 disp = 0; 1957 if (mcesp) { 1958 mcesp->mce_nerr = mcesp->mce_disp = 0; 1959 } 1960 } 1961 1962 /* 1963 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1964 * If a second #MC had occured before now the system would have 1965 * reset. We can only do thise once gcpu_mca_process has copied 1966 * the logout structure. 1967 */ 1968 if (ismc && mcg_status & MCG_STATUS_MCIP) 1969 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1970 1971 /* 1972 * At this point we have read and logged all telemetry that is visible 1973 * under the MCA. On architectures for which the NorthBridge is 1974 * on-chip this may include NB-observed errors, but where the NB 1975 * is off chip it may have been the source of the #MC request and 1976 * so we must call into the memory-controller driver to give it 1977 * a chance to log errors. 1978 */ 1979 if (ismc) { 1980 cmi_mc_logout(hdl, 1, willpanic); 1981 } 1982 } 1983 1984 #ifndef __xpv 1985 int gcpu_mca_trap_vomit_summary = 0; 1986 1987 /* 1988 * On a native machine check exception we come here from mcetrap via 1989 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1990 * cpus of the chip, so it is possible that another cpu on this chip could 1991 * initiate a poll while we're in the #mc handler; it is also possible that 1992 * this trap has occured during a poll on this cpu. So we must acquire 1993 * the chip-wide poll lock, but be careful to avoid deadlock. 1994 * 1995 * The 'data' pointer cannot be NULL due to init order. 1996 */ 1997 uint64_t 1998 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1999 { 2000 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2001 kmutex_t *poll_lock = NULL; 2002 gcpu_mce_status_t mce; 2003 uint64_t mcg_status; 2004 int tooklock = 0; 2005 2006 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 2007 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 2008 return (0); 2009 2010 /* 2011 * Synchronize with any poller from another core that may happen 2012 * to share access to one or more of the MCA banks. 2013 */ 2014 if (gcpu->gcpu_shared != NULL) 2015 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 2016 2017 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 2018 /* 2019 * The lock is not owned by the thread we have 2020 * interrupted. Spin for this adaptive lock. 2021 */ 2022 while (!mutex_tryenter(poll_lock)) { 2023 while (mutex_owner(poll_lock) != NULL) 2024 ; 2025 } 2026 tooklock = 1; 2027 } 2028 2029 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 2030 2031 if (tooklock) 2032 mutex_exit(poll_lock); 2033 2034 /* 2035 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 2036 */ 2037 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 2038 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 2039 "%u PCC (%u ok), " 2040 "%u UC (%d ok, %u poisoned), " 2041 "%u forcefatal, %u ignored", 2042 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 2043 mce.mce_npcc, mce.mce_npcc_ok, 2044 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 2045 mce.mce_forcefatal, mce.mce_ignored); 2046 } 2047 2048 return (mce.mce_disp); 2049 } 2050 #endif 2051 2052 /*ARGSUSED*/ 2053 void 2054 gcpu_faulted_enter(cmi_hdl_t hdl) 2055 { 2056 /* Nothing to do here */ 2057 } 2058 2059 /*ARGSUSED*/ 2060 void 2061 gcpu_faulted_exit(cmi_hdl_t hdl) 2062 { 2063 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2064 2065 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 2066 } 2067 2068 /* 2069 * Write the requested values to the indicated MSRs. Having no knowledge 2070 * of the model-specific requirements for writing to these model-specific 2071 * registers, we will only blindly write to those MSRs if the 'force' 2072 * argument is nonzero. That option should only be used in prototyping 2073 * and debugging. 2074 */ 2075 /*ARGSUSED*/ 2076 cmi_errno_t 2077 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 2078 int force) 2079 { 2080 int i, errs = 0; 2081 2082 for (i = 0; i < nregs; i++) { 2083 uint_t msr = regs[i].cmr_msrnum; 2084 uint64_t val = regs[i].cmr_msrval; 2085 2086 if (cms_present(hdl)) { 2087 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 2088 errs++; 2089 } else if (force) { 2090 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 2091 } else { 2092 errs++; 2093 } 2094 } 2095 2096 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 2097 } 2098 2099 /* deconfigure gcpu_mca_init() */ 2100 void 2101 gcpu_mca_fini(cmi_hdl_t hdl) 2102 { 2103 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2104 gcpu_mca_t *mca = &gcpu->gcpu_mca; 2105 int i; 2106 2107 /* 2108 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 2109 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 2110 * processors, which have their own more primitive way of doing 2111 * machine checks, will not have cmi_mca_init called since their 2112 * CPUID information will not indicate both MCA and MCE features. 2113 */ 2114 if (!is_x86_feature(x86_featureset, X86FSET_MCA)) 2115 return; 2116 #ifndef __xpv 2117 /* 2118 * disable machine check in CR4 2119 */ 2120 cmi_ntv_hwdisable_mce(hdl); 2121 #endif 2122 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 2123 gcpu_mca_poll_fini(hdl); 2124 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 2125 2126 /* 2127 * free resources allocated during init 2128 */ 2129 if (mca->gcpu_bank_cmci != NULL) { 2130 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) * 2131 mca->gcpu_mca_nbanks); 2132 } 2133 2134 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 2135 if (mca->gcpu_mca_logout[i] != NULL) { 2136 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz); 2137 } 2138 } 2139 2140 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) { 2141 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg, 2142 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks); 2143 } 2144 } 2145