1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* 27 * Copyright (c) 2010, Intel Corporation. 28 * All rights reserved. 29 */ 30 31 #include <sys/mca_x86.h> 32 #include <sys/cpu_module_impl.h> 33 #include <sys/cpu_module_ms.h> 34 #include <sys/cmn_err.h> 35 #include <sys/cpuvar.h> 36 #include <sys/pghw.h> 37 #include <sys/x86_archext.h> 38 #include <sys/sysmacros.h> 39 #include <sys/regset.h> 40 #include <sys/privregs.h> 41 #include <sys/systm.h> 42 #include <sys/types.h> 43 #include <sys/log.h> 44 #include <sys/psw.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/fm/util.h> 47 #include <sys/errorq.h> 48 #include <sys/mca_x86.h> 49 #include <sys/fm/cpu/GMCA.h> 50 #include <sys/fm/smb/fmsmb.h> 51 #include <sys/sysevent.h> 52 #include <sys/ontrap.h> 53 54 #include "gcpu.h" 55 56 extern int x86gentopo_legacy; /* x86 generic topology support */ 57 58 static uint_t gcpu_force_addr_in_payload = 0; 59 60 /* 61 * Clear to log telemetry found at initialization. While processor docs 62 * say you should process this telemetry on all but Intel family 0x6 63 * there are way too many exceptions and we want to avoid bogus 64 * diagnoses. 65 */ 66 int gcpu_suppress_log_on_init = 1; 67 68 /* 69 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 70 * error logout time. The stack will be included in the ereport if the 71 * error type selects stack inclusion, or in all cases if 72 * gcpu_mca_stack_ereport_include is nonzero. 73 */ 74 int gcpu_mca_stack_flag = 0; 75 int gcpu_mca_stack_ereport_include = 0; 76 77 /* 78 * The number of times to re-read MCA telemetry to try to obtain a 79 * consistent snapshot if we find it to be changing under our feet. 80 */ 81 int gcpu_mca_telemetry_retries = 5; 82 83 #ifndef __xpv 84 int gcpu_mca_cmci_throttling_threshold = 10; 85 int gcpu_mca_cmci_reenable_threshold = 1000; 86 #endif 87 88 static gcpu_error_disp_t gcpu_errtypes[] = { 89 90 /* 91 * Unclassified 92 */ 93 { 94 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 95 NULL, 96 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 97 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 98 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 99 }, 100 101 /* 102 * Microcode ROM Parity Error 103 */ 104 { 105 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 106 NULL, 107 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 108 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 109 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 110 }, 111 112 /* 113 * External - BINIT# from another processor during power-on config 114 */ 115 { 116 FM_EREPORT_CPU_GENERIC_EXTERNAL, 117 NULL, 118 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 119 MCAX86_SIMPLE_EXTERNAL_MASKON, 120 MCAX86_SIMPLE_EXTERNAL_MASKOFF 121 }, 122 123 /* 124 * Functional redundancy check master/slave error 125 */ 126 { 127 FM_EREPORT_CPU_GENERIC_FRC, 128 NULL, 129 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 130 MCAX86_SIMPLE_FRC_MASKON, 131 MCAX86_SIMPLE_FRC_MASKOFF 132 }, 133 134 /* 135 * Internal parity error 136 */ 137 { 138 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 139 NULL, 140 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 141 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 142 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 143 }, 144 145 146 /* 147 * Internal timer error 148 */ 149 { 150 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 151 NULL, 152 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 153 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 154 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 155 }, 156 157 /* 158 * Internal unclassified 159 */ 160 { 161 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 162 NULL, 163 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 164 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 165 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 166 }, 167 168 /* 169 * Compound error codes - generic memory hierarchy 170 */ 171 { 172 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 173 NULL, 174 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 175 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 176 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 177 }, 178 179 /* 180 * Compound error codes - TLB errors 181 */ 182 { 183 FM_EREPORT_CPU_GENERIC_TLB, 184 "%1$s" "TLB" "%2$s" "_ERR", 185 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 186 MCAX86_COMPOUND_TLB_MASKON, 187 MCAX86_COMPOUND_TLB_MASKOFF 188 }, 189 190 /* 191 * Compound error codes - memory hierarchy 192 */ 193 { 194 FM_EREPORT_CPU_GENERIC_MEMHIER, 195 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 196 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 197 MCAX86_COMPOUND_MEMHIER_MASKON, 198 MCAX86_COMPOUND_MEMHIER_MASKOFF 199 }, 200 201 /* 202 * Compound error codes - bus and interconnect errors 203 */ 204 { 205 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 206 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 207 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 208 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 209 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 210 }, 211 /* 212 * Compound error codes - memory controller errors 213 */ 214 { 215 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 216 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 217 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 218 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 219 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 220 }, 221 }; 222 223 static gcpu_error_disp_t gcpu_unknown = { 224 FM_EREPORT_CPU_GENERIC_UNKNOWN, 225 "UNKNOWN", 226 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 227 0, 228 0 229 }; 230 231 static errorq_t *gcpu_mca_queue; 232 static kmutex_t gcpu_mca_queue_lock; 233 234 #ifdef __xpv 235 static int isxpv = 1; 236 #else 237 static int isxpv = 0; 238 #endif 239 240 static const gcpu_error_disp_t * 241 gcpu_disp_match(uint16_t code) 242 { 243 const gcpu_error_disp_t *ged = gcpu_errtypes; 244 int i; 245 246 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 247 i++, ged++) { 248 uint16_t on = ged->ged_errcode_mask_on; 249 uint16_t off = ged->ged_errcode_mask_off; 250 251 if ((code & on) == on && (code & off) == 0) 252 return (ged); 253 } 254 255 return (NULL); 256 } 257 258 static uint16_t 259 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 260 { 261 return ((code & mask) >> shift); 262 } 263 264 #define BIT_STRIP(code, name) \ 265 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 266 MCAX86_ERRCODE_##name##_SHIFT) 267 268 #define GCPU_MNEMONIC_UNDEF "undefined" 269 #define GCPU_MNEMONIC_RESVD "reserved" 270 271 /* 272 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 273 * mnemonics and to ereport class name components. 274 */ 275 276 struct gcpu_mnexp { 277 const char *mne_compound; /* used in expanding compound errname */ 278 const char *mne_ereport; /* used in expanding ereport class */ 279 }; 280 281 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 282 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 283 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 284 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 285 { GCPU_MNEMONIC_UNDEF, "" } 286 }; 287 288 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 289 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 290 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 291 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 292 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 293 }; 294 295 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 296 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 297 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 298 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 299 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 300 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 301 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 302 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 303 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 304 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 305 }; 306 307 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 308 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 309 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 310 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 311 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 312 }; 313 314 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 315 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 316 { GCPU_MNEMONIC_RESVD, "" }, 317 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 318 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 319 }; 320 321 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 322 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 323 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 324 }; 325 326 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 327 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 328 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 329 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 330 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 331 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 332 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 333 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 334 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 335 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 336 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 337 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 338 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 339 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 340 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 341 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 342 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 343 }; 344 345 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 346 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 347 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 348 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 349 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 350 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 351 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 352 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 353 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 354 }; 355 356 enum gcpu_mn_namespace { 357 GCPU_MN_NAMESPACE_COMPOUND, 358 GCPU_MN_NAMESPACE_EREPORT 359 }; 360 361 static const char * 362 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val, 363 enum gcpu_mn_namespace nspace) 364 { 365 if (val >= tbl_sz || val > 0xff) 366 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 367 368 switch (nspace) { 369 case GCPU_MN_NAMESPACE_COMPOUND: 370 return (tbl[val].mne_compound); 371 /*NOTREACHED*/ 372 373 case GCPU_MN_NAMESPACE_EREPORT: 374 return (tbl[val].mne_ereport); 375 /*NOTREACHED*/ 376 377 default: 378 return (GCPU_MNEMONIC_UNDEF); 379 /*NOTREACHED*/ 380 } 381 } 382 383 /* 384 * The ereport class leaf component is either a simple string with no 385 * format specifiers, or a string with one or more embedded %n$s specifiers - 386 * positional selection for string arguments. The kernel snprintf does 387 * not support %n$ (and teaching it to do so is too big a headache) so 388 * we will expand this restricted format string ourselves. 389 */ 390 391 #define GCPU_CLASS_VARCOMPS 9 392 393 #define GCPU_MNEMONIC(code, name, nspace) \ 394 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 395 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 396 BIT_STRIP(code, name), nspace) 397 398 static void 399 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 400 enum gcpu_mn_namespace nspace) 401 { 402 uint16_t code = MCAX86_ERRCODE(status); 403 const char *mn[GCPU_CLASS_VARCOMPS]; 404 char *p = buf; /* current position in buf */ 405 char *q = buf + buflen; /* pointer past last char in buf */ 406 int which, expfmtchar, error; 407 char c; 408 409 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 410 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 411 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 412 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 413 mn[4] = GCPU_MNEMONIC(code, II, nspace); 414 mn[5] = GCPU_MNEMONIC(code, T, nspace); 415 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 416 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 417 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 418 419 while (p < q - 1 && (c = *fmt++) != '\0') { 420 if (c != '%') { 421 /* not the beginning of a format specifier - copy */ 422 *p++ = c; 423 continue; 424 } 425 426 error = 0; 427 which = -1; 428 expfmtchar = -1; 429 430 nextfmt: 431 if ((c = *fmt++) == '\0') 432 break; /* early termination of fmt specifier */ 433 434 switch (c) { 435 case '1': 436 case '2': 437 case '3': 438 case '4': 439 case '5': 440 case '6': 441 case '7': 442 case '8': 443 case '9': 444 if (which != -1) { /* allow only one positional digit */ 445 error++; 446 break; 447 } 448 which = c - '1'; 449 goto nextfmt; 450 /*NOTREACHED*/ 451 452 case '$': 453 if (which == -1) { /* no position specified */ 454 error++; 455 break; 456 } 457 expfmtchar = 's'; 458 goto nextfmt; 459 /*NOTREACHED*/ 460 461 case 's': 462 if (expfmtchar != 's') { 463 error++; 464 break; 465 } 466 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 467 mn[which]); 468 p += strlen(p); 469 break; 470 471 default: 472 error++; 473 break; 474 } 475 476 if (error) 477 break; 478 } 479 480 *p = '\0'; /* NUL termination */ 481 } 482 483 static void 484 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 485 const char *cpuclass, const char *leafclass) 486 { 487 char *p = buf; /* current position in buf */ 488 char *q = buf + buflen; /* pointer past last char in buf */ 489 490 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 491 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 492 493 p += strlen(p); 494 if (p >= q) 495 return; 496 497 if (leafclass == NULL) { 498 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 499 GCPU_MN_NAMESPACE_EREPORT); 500 } else { 501 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 502 leafclass); 503 } 504 } 505 506 /* 507 * Create an "hc" scheme FMRI identifying the given cpu with 508 * motherboard/chip/core/strand instance numbers. 509 */ 510 static nvlist_t * 511 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 512 { 513 nvlist_t *nvl, *fmri; 514 515 if ((nvl = fm_nvlist_create(nva)) == NULL) 516 return (NULL); 517 518 if (!x86gentopo_legacy) { 519 fmri = cmi_hdl_smb_bboard(hdl); 520 if (fmri == NULL) 521 return (NULL); 522 523 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 524 NULL, NULL, fmri, 3, 525 "chip", cmi_hdl_smb_chipid(hdl), 526 "core", cmi_hdl_coreid(hdl), 527 "strand", cmi_hdl_strandid(hdl)); 528 } else { 529 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 530 "motherboard", 0, 531 "chip", cmi_hdl_chipid(hdl), 532 "core", cmi_hdl_coreid(hdl), 533 "strand", cmi_hdl_strandid(hdl)); 534 } 535 536 return (nvl); 537 } 538 539 int gcpu_bleat_count_thresh = 5; 540 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 541 542 /* 543 * Called when we are unable to propogate a logout structure onto an 544 * errorq for subsequent ereport preparation and logging etc. The caller 545 * should usually only decide to call this for severe errors - those we 546 * suspect we may need to panic for. 547 */ 548 static void 549 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 550 { 551 hrtime_t now = gethrtime_waitfree(); 552 static hrtime_t gcpu_last_bleat; 553 gcpu_bank_logout_t *gbl; 554 static int bleatcount; 555 int i; 556 557 /* 558 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 559 * can come as fast as we like, but once we've spammed that many 560 * to the console we require a minimum interval to pass before 561 * any more complaints. 562 */ 563 if (++bleatcount > gcpu_bleat_count_thresh) { 564 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 565 return; 566 else 567 bleatcount = 0; 568 } 569 gcpu_last_bleat = now; 570 571 cmn_err(CE_WARN, 572 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 573 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 574 cmi_hdl_strandid(hdl)); 575 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 576 (u_longlong_t)gcl->gcl_mcg_status); 577 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 578 uint64_t status = gbl->gbl_status; 579 580 if (!(status & MSR_MC_STATUS_VAL)) 581 continue; 582 583 /* Force ADDRV for AMD Family 0xf and above */ 584 if (gcpu_force_addr_in_payload) 585 status = status | MSR_MC_STATUS_ADDRV; 586 587 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 588 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 589 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 590 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 591 i, IA32_MSR_MC(i, STATUS), 592 (u_longlong_t)gbl->gbl_status, 593 (u_longlong_t)gbl->gbl_addr, 594 (u_longlong_t)gbl->gbl_misc); 595 break; 596 597 case MSR_MC_STATUS_ADDRV: 598 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 599 "STAT 0x%016llx ADDR 0x%016llx", 600 i, IA32_MSR_MC(i, STATUS), 601 (u_longlong_t)gbl->gbl_status, 602 (u_longlong_t)gbl->gbl_addr); 603 break; 604 605 case MSR_MC_STATUS_MISCV: 606 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 607 "STAT 0x%016llx MISC 0x%016llx", 608 i, IA32_MSR_MC(i, STATUS), 609 (u_longlong_t)gbl->gbl_status, 610 (u_longlong_t)gbl->gbl_misc); 611 break; 612 613 default: 614 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 615 "STAT 0x%016llx", 616 i, IA32_MSR_MC(i, STATUS), 617 (u_longlong_t)gbl->gbl_status); 618 break; 619 620 } 621 } 622 } 623 624 #define _GCPU_BSTATUS(status, what) \ 625 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 626 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 627 628 static void 629 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 630 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 631 { 632 uint64_t members = ged ? ged->ged_ereport_members : 633 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 634 uint64_t mcg = gcl->gcl_mcg_status; 635 int mcip = mcg & MCG_STATUS_MCIP; 636 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 637 uint64_t bstat = gbl->gbl_status; 638 639 /* 640 * Include the compound error name if requested and if this 641 * is a compound error type. 642 */ 643 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 644 ged->ged_compound_fmt != NULL) { 645 char buf[FM_MAX_CLASS]; 646 647 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 648 GCPU_MN_NAMESPACE_COMPOUND); 649 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 650 DATA_TYPE_STRING, buf, NULL); 651 } 652 653 /* 654 * Include disposition information for this error 655 */ 656 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 657 gbl->gbl_disp != 0) { 658 int i, empty = 1; 659 char buf[128]; 660 char *p = buf, *q = buf + 128; 661 static struct _gcpu_disp_name { 662 uint64_t dv; 663 const char *dn; 664 } disp_names[] = { 665 { CMI_ERRDISP_CURCTXBAD, 666 "processor_context_corrupt" }, 667 { CMI_ERRDISP_RIPV_INVALID, 668 "return_ip_invalid" }, 669 { CMI_ERRDISP_UC_UNCONSTRAINED, 670 "unconstrained" }, 671 { CMI_ERRDISP_FORCEFATAL, 672 "forcefatal" }, 673 { CMI_ERRDISP_IGNORED, 674 "ignored" }, 675 { CMI_ERRDISP_PCC_CLEARED, 676 "corrupt_context_cleared" }, 677 { CMI_ERRDISP_UC_CLEARED, 678 "uncorrected_data_cleared" }, 679 { CMI_ERRDISP_POISONED, 680 "poisoned" }, 681 { CMI_ERRDISP_INCONSISTENT, 682 "telemetry_unstable" }, 683 }; 684 685 for (i = 0; i < sizeof (disp_names) / 686 sizeof (struct _gcpu_disp_name); i++) { 687 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 688 continue; 689 690 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 691 "%s%s", empty ? "" : ",", disp_names[i].dn); 692 p += strlen(p); 693 empty = 0; 694 } 695 696 if (p != buf) 697 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 698 DATA_TYPE_STRING, buf, NULL); 699 } 700 701 /* 702 * If MCG_STATUS is included add that and an indication of whether 703 * this ereport was the result of a machine check or poll. 704 */ 705 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 706 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 707 DATA_TYPE_UINT64, mcg, NULL); 708 709 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 710 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 711 } 712 713 /* 714 * If an instruction pointer is to be included add one provided 715 * MCG_STATUS indicated it is valid; meaningless for polled events. 716 */ 717 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 718 mcg & MCG_STATUS_EIPV) { 719 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 720 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 721 } 722 723 /* 724 * Add an indication of whether the trap occured during privileged code. 725 */ 726 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 727 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 728 DATA_TYPE_BOOLEAN_VALUE, 729 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 730 } 731 732 /* 733 * If requested, add the index of the MCA bank. This indicates the 734 * n'th bank of 4 MCA registers, and does not necessarily correspond 735 * to MCi_* - use the bank offset to correlate 736 */ 737 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 738 fm_payload_set(ereport, 739 /* Bank number */ 740 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 741 /* Offset of MCi_CTL */ 742 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 743 IA32_MSR_MC(bankno, CTL), 744 NULL); 745 } 746 747 /* 748 * Add MCi_STATUS if requested, and decode it. 749 */ 750 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 751 const char *tbes[] = { 752 "No tracking", /* 00 */ 753 "Green - below threshold", /* 01 */ 754 "Yellow - above threshold", /* 10 */ 755 "Reserved" /* 11 */ 756 }; 757 758 fm_payload_set(ereport, 759 /* Bank MCi_STATUS */ 760 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 761 /* Overflow? */ 762 _GCPU_BSTATUS(bstat, OVER), 763 /* Uncorrected? */ 764 _GCPU_BSTATUS(bstat, UC), 765 /* Enabled? */ 766 _GCPU_BSTATUS(bstat, EN), 767 /* Processor context corrupt? */ 768 _GCPU_BSTATUS(bstat, PCC), 769 /* Error code */ 770 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 771 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 772 /* Model-specific error code */ 773 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 774 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 775 NULL); 776 777 /* 778 * If MCG_CAP.TES_P indicates that that thresholding info 779 * is present in the architural component of the bank status 780 * then include threshold information for this bank. 781 */ 782 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 783 fm_payload_set(ereport, 784 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 785 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 786 NULL); 787 } 788 } 789 790 /* 791 * Add MCi_ADDR info if requested and valid. We force addition of 792 * MCi_ADDR, even if its not valid on AMD family 0xf and above, 793 * to aid in analysis of ereports, for WatchDog errors. 794 */ 795 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 796 ((bstat & MSR_MC_STATUS_ADDRV) || 797 gcpu_force_addr_in_payload)) { 798 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 799 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 800 } 801 802 /* 803 * MCi_MISC if requested and MCi_STATUS.MISCV). 804 */ 805 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 806 bstat & MSR_MC_STATUS_MISCV) { 807 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 808 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 809 } 810 811 } 812 813 /* 814 * Construct and post an ereport based on the logout information from a 815 * single MCA bank. We are not necessarily running on the cpu that 816 * detected the error. 817 */ 818 static void 819 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 820 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 821 { 822 gcpu_data_t *gcpu = gcl->gcl_gcpu; 823 cmi_hdl_t hdl = gcpu->gcpu_hdl; 824 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 825 const char *cpuclass = NULL, *leafclass = NULL; 826 uint16_t code = MCAX86_ERRCODE(status); 827 errorq_elem_t *eqep, *scr_eqep; 828 nvlist_t *ereport, *detector; 829 char buf[FM_MAX_CLASS]; 830 const char *classfmt; 831 nv_alloc_t *nva; 832 833 if (panicstr) { 834 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 835 return; 836 ereport = errorq_elem_nvl(ereport_errorq, eqep); 837 838 /* 839 * Allocate another element for scratch space, but fallback 840 * to the one we have if that fails. We'd like to use the 841 * additional scratch space for nvlist construction. 842 */ 843 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 844 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 845 else 846 nva = errorq_elem_nva(ereport_errorq, eqep); 847 } else { 848 ereport = fm_nvlist_create(NULL); 849 nva = NULL; 850 } 851 852 if (ereport == NULL) 853 return; 854 855 /* 856 * Common payload data required by the protocol: 857 * - ereport class 858 * - detector 859 * - ENA 860 */ 861 862 /* 863 * Ereport class - call into model-specific support to allow it to 864 * provide a cpu class or leaf class, otherwise calculate our own. 865 */ 866 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 867 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 868 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 869 leafclass); 870 871 /* 872 * The detector FMRI. 873 */ 874 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 875 nva)) == NULL) 876 detector = gcpu_fmri_create(hdl, nva); 877 878 /* 879 * Should we define a new ENA format 3?? for chip/core/strand? 880 * It will be better when virtualized. 881 */ 882 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 883 fm_ena_generate_cpu(gcl->gcl_timestamp, 884 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 885 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 886 887 if (panicstr) { 888 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 889 nv_alloc_reset(nva); 890 } else { 891 fm_nvlist_destroy(detector, FM_NVA_FREE); 892 } 893 894 /* 895 * Add the architectural ereport class-specific payload data. 896 */ 897 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 898 899 /* 900 * Allow model-specific code to add ereport members. 901 */ 902 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 903 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 904 905 /* 906 * Include stack if options is turned on and either selected in 907 * the payload member bitmask or inclusion is forced. 908 */ 909 if (gcpu_mca_stack_flag && 910 (cms_ereport_includestack(hdl, mscookie) == 911 B_TRUE || gcpu_mca_stack_ereport_include)) { 912 fm_payload_stack_add(ereport, gcl->gcl_stack, 913 gcl->gcl_stackdepth); 914 } 915 916 /* 917 * If injection has taken place anytime in the past then note this 918 * on the ereport. 919 */ 920 if (cmi_inj_tainted() == B_TRUE) { 921 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 922 B_TRUE, NULL); 923 } 924 925 /* 926 * Post ereport. 927 */ 928 if (panicstr) { 929 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 930 if (scr_eqep) 931 errorq_cancel(ereport_errorq, scr_eqep); 932 } else { 933 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 934 fm_nvlist_destroy(ereport, FM_NVA_FREE); 935 } 936 937 } 938 939 /*ARGSUSED*/ 940 void 941 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 942 { 943 const gcpu_logout_t *gcl = data; 944 const gcpu_bank_logout_t *gbl; 945 int i; 946 947 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 948 const gcpu_error_disp_t *gened; 949 cms_cookie_t mscookie; 950 951 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 952 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 953 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 954 955 /* 956 * Perform a match based on IA32 MCA architectural 957 * components alone. 958 */ 959 gened = gcpu_disp_match(code); /* may be NULL */ 960 961 /* 962 * Now see if an model-specific match can be made. 963 */ 964 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 965 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 966 gcl->gcl_ms_logout); 967 968 /* 969 * Prepare and dispatch an ereport for logging and 970 * diagnosis. 971 */ 972 gcpu_ereport_post(gcl, i, gened, mscookie, 973 gbl->gbl_status); 974 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 975 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 976 /* 977 * Telemetry kept changing as we tried to read 978 * it. Force an unknown ereport leafclass but 979 * keep the telemetry unchanged for logging. 980 */ 981 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 982 gbl->gbl_status); 983 } 984 } 985 } 986 987 static size_t gcpu_mca_queue_datasz = 0; 988 989 /* 990 * The following code is ready to make a weak attempt at growing the 991 * errorq structure size. Since it is not foolproof (we don't know 992 * who may already be producing to the outgoing errorq) our caller 993 * instead assures that we'll always be called with no greater data 994 * size than on our first call. 995 */ 996 static void 997 gcpu_errorq_init(size_t datasz) 998 { 999 int slots; 1000 1001 mutex_enter(&gcpu_mca_queue_lock); 1002 1003 if (gcpu_mca_queue_datasz >= datasz) { 1004 mutex_exit(&gcpu_mca_queue_lock); 1005 return; 1006 } 1007 1008 membar_producer(); 1009 if (gcpu_mca_queue) { 1010 gcpu_mca_queue_datasz = 0; 1011 errorq_destroy(gcpu_mca_queue); 1012 } 1013 1014 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1015 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1016 1017 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1018 NULL, slots, datasz, 1, ERRORQ_VITAL); 1019 1020 if (gcpu_mca_queue != NULL) 1021 gcpu_mca_queue_datasz = datasz; 1022 1023 mutex_exit(&gcpu_mca_queue_lock); 1024 } 1025 1026 /* 1027 * Perform MCA initialization as described in section 14.6 of Intel 64 1028 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1029 */ 1030 1031 static uint_t global_nbanks; 1032 1033 void 1034 gcpu_mca_init(cmi_hdl_t hdl) 1035 { 1036 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1037 uint64_t cap; 1038 uint_t vendor = cmi_hdl_vendor(hdl); 1039 uint_t family = cmi_hdl_family(hdl); 1040 uint_t rev = cmi_hdl_chiprev(hdl); 1041 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1042 int mcg_ctl_present; 1043 uint_t nbanks; 1044 uint32_t ctl_skip_mask = 0; 1045 uint32_t status_skip_mask = 0; 1046 size_t mslsz; 1047 int i; 1048 #ifndef __xpv 1049 int mcg_ctl2_present; 1050 uint32_t cmci_capable = 0; 1051 #endif 1052 if (gcpu == NULL) 1053 return; 1054 1055 /* We add MCi_ADDR always for AMD Family 0xf and above */ 1056 if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B)) 1057 gcpu_force_addr_in_payload = 1; 1058 1059 /* 1060 * Protect from some silly /etc/system settings. 1061 */ 1062 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1063 gcpu_mca_telemetry_retries = 5; 1064 1065 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1066 return; 1067 1068 /* 1069 * CPU startup code only calls cmi_mca_init if x86_feature indicates 1070 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 1071 * processors, which have their own more primitive way of doing 1072 * machine checks, will not have cmi_mca_init called since their 1073 * CPUID information will not indicate both MCA and MCE features. 1074 */ 1075 ASSERT(x86_feature & X86_MCA); 1076 1077 /* 1078 * Determine whether the IA32_MCG_CTL register is present. If it 1079 * is we will enable all features by writing -1 to it towards 1080 * the end of this initialization; if it is absent then volume 3A 1081 * says we must nonetheless continue to initialize the individual 1082 * banks. 1083 */ 1084 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1085 #ifndef __xpv 1086 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1087 #endif 1088 1089 /* 1090 * We squirell values away for inspection/debugging. 1091 */ 1092 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1093 if (mcg_ctl_present) 1094 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1095 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1096 1097 /* 1098 * Determine the number of error-reporting banks implemented. 1099 */ 1100 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1101 1102 if (nbanks != 0 && global_nbanks == 0) 1103 global_nbanks = nbanks; /* no race - BSP will get here first */ 1104 1105 /* 1106 * If someone is hiding the number of banks (perhaps we are fully 1107 * virtualized?) or if this processor has more banks than the 1108 * first to set global_nbanks then bail. The latter requirement 1109 * is because we need to size our errorq data structure and we 1110 * don't want to have to grow the errorq (destroy and recreate) 1111 * which may just lose some telemetry. 1112 */ 1113 if (nbanks == 0 || nbanks > global_nbanks) 1114 return; 1115 1116 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1117 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1118 1119 /* 1120 * Calculate the size we need to allocate for a gcpu_logout_t 1121 * with a gcl_data array big enough for all banks of this cpu. 1122 * Add any space requested by the model-specific logout support. 1123 */ 1124 mslsz = cms_logout_size(hdl); 1125 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1126 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1127 1128 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1129 gcpu_logout_t *gcl; 1130 1131 mca->gcpu_mca_logout[i] = gcl = 1132 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1133 gcl->gcl_gcpu = gcpu; 1134 gcl->gcl_nbanks = nbanks; 1135 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1136 (char *)(&gcl->gcl_data[0]) + nbanks * 1137 sizeof (gcpu_bank_logout_t); 1138 1139 } 1140 1141 #ifdef __xpv 1142 gcpu_xpv_mca_init(nbanks); 1143 #endif 1144 1145 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1146 1147 #ifndef __xpv 1148 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1149 KM_SLEEP); 1150 #endif 1151 1152 /* 1153 * Create our errorq to transport the logout structures. This 1154 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1155 */ 1156 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1157 1158 /* 1159 * Not knowing which, if any, banks are shared between cores we 1160 * assure serialization of MCA bank initialization by each cpu 1161 * on the chip. On chip architectures in which some banks are 1162 * shared this will mean the shared resource is initialized more 1163 * than once - we're simply aiming to avoid simultaneous MSR writes 1164 * to the shared resource. 1165 * 1166 * Even with these precautions, some platforms may yield a GP fault 1167 * if a core other than a designated master tries to write anything 1168 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1169 * those writes under on_trap protection. 1170 */ 1171 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1172 1173 /* 1174 * Initialize poller data, but don't start polling yet. 1175 */ 1176 gcpu_mca_poll_init(hdl); 1177 1178 /* 1179 * Work out which MCA banks we will initialize. In MCA logout 1180 * code we will only read those banks which we initialize here. 1181 */ 1182 for (i = 0; i < nbanks; i++) { 1183 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1184 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1185 1186 if (!cms_present(hdl)) { 1187 /* 1188 * Model-specific support is not present, try to use 1189 * sane defaults. 1190 * 1191 * On AMD family 6 processors, reports about spurious 1192 * machine checks indicate that bank 0 should be 1193 * skipped. 1194 * 1195 * On Intel family 6 processors, the documentation tells 1196 * us not to write to MC0_CTL. 1197 * 1198 */ 1199 if (i == 0 && family == 6) { 1200 switch (vendor) { 1201 case X86_VENDOR_AMD: 1202 skipstatus = B_TRUE; 1203 /*FALLTHRU*/ 1204 case X86_VENDOR_Intel: 1205 skipctl = B_TRUE; 1206 break; 1207 } 1208 } 1209 } 1210 1211 ctl_skip_mask |= skipctl << i; 1212 status_skip_mask |= skipstatus << i; 1213 1214 if (skipctl && skipstatus) 1215 continue; 1216 1217 /* 1218 * Record which MCA banks were enabled, from the point of view 1219 * of the whole chip (if some cores share a bank we must be 1220 * sure either can logout from it). 1221 */ 1222 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1223 1224 #ifndef __xpv 1225 /* 1226 * check CMCI capability 1227 */ 1228 if (mcg_ctl2_present) { 1229 uint64_t ctl2; 1230 uint32_t cap = 0; 1231 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1232 if (ctl2 & MSR_MC_CTL2_EN) 1233 continue; 1234 ctl2 |= MSR_MC_CTL2_EN; 1235 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1236 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1237 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1238 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1239 if (cap) 1240 cmci_capable ++; 1241 /* 1242 * Set threshold to 1 while unset the en field, to avoid 1243 * CMCI trigged before APIC LVT entry init. 1244 */ 1245 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1; 1246 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1247 1248 /* 1249 * init cmci related count 1250 */ 1251 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1252 mca->gcpu_bank_cmci[i].drtcmci = 0; 1253 mca->gcpu_bank_cmci[i].ncmci = 0; 1254 } 1255 #endif 1256 } 1257 1258 #ifndef __xpv 1259 if (cmci_capable) 1260 cmi_enable_cmci = 1; 1261 #endif 1262 1263 #ifndef __xpv 1264 /* 1265 * Log any valid telemetry lurking in the MCA banks, but do not 1266 * clear the status registers. Ignore the disposition returned - 1267 * we have already paniced or reset for any nasty errors found here. 1268 * 1269 * Intel vol 3A says that we should not do this on family 0x6, 1270 * and that for any extended family the BIOS clears things 1271 * on power-on reset so you'll only potentially find valid telemetry 1272 * on warm reset (we do it for both - on power-on reset we should 1273 * just see zeroes). 1274 * 1275 * AMD docs since K7 say we should process anything we find here. 1276 */ 1277 if (!gcpu_suppress_log_on_init && 1278 (vendor == X86_VENDOR_Intel && family >= 0xf || 1279 vendor == X86_VENDOR_AMD)) 1280 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1281 GCPU_MPT_WHAT_POKE_ERR); 1282 1283 /* 1284 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1285 * model-specific module the power of veto. 1286 */ 1287 for (i = 0; i < nbanks; i++) { 1288 struct gcpu_bios_bankcfg *bcfgp = 1289 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1290 1291 /* 1292 * Stash inherited bank MCA state, even for banks we will 1293 * not initialize ourselves. Do not read the MISC register 1294 * unconditionally - on some processors that will #GP on 1295 * banks that do not implement the MISC register (would be 1296 * caught by on_trap, anyway). 1297 */ 1298 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1299 &bcfgp->bios_bank_ctl); 1300 1301 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1302 &bcfgp->bios_bank_status); 1303 1304 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) || 1305 gcpu_force_addr_in_payload) { 1306 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1307 &bcfgp->bios_bank_addr); 1308 } 1309 1310 /* 1311 * In some old BIOS the status value after boot can indicate 1312 * MISCV when there is actually no MISC register for 1313 * that bank. The following read could therefore 1314 * aggravate a general protection fault. This should be 1315 * caught by on_trap, but the #GP fault handler is busted 1316 * and can suffer a double fault even before we get to 1317 * trap() to check for on_trap protection. Until that 1318 * issue is fixed we remove the one access that we know 1319 * can cause a #GP. 1320 * 1321 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1322 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1323 * &bcfgp->bios_bank_misc); 1324 */ 1325 bcfgp->bios_bank_misc = 0; 1326 1327 if (!(ctl_skip_mask & (1 << i))) { 1328 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1329 cms_bankctl_val(hdl, i, -1ULL)); 1330 } 1331 1332 if (!(status_skip_mask & (1 << i))) { 1333 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1334 cms_bankstatus_val(hdl, i, 0ULL)); 1335 } 1336 } 1337 #endif 1338 /* 1339 * Now let the model-specific support perform further initialization 1340 * of non-architectural features. 1341 */ 1342 cms_mca_init(hdl, nbanks); 1343 1344 #ifndef __xpv 1345 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1346 membar_producer(); 1347 1348 /* enable all machine-check features */ 1349 if (mcg_ctl_present) 1350 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1351 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1352 #endif 1353 1354 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1355 1356 #ifndef __xpv 1357 /* enable machine-check exception in CR4 */ 1358 cmi_hdl_enable_mce(hdl); 1359 #endif 1360 } 1361 1362 static uint64_t 1363 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1364 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1365 { 1366 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1367 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1368 int nbanks = mca->gcpu_mca_nbanks; 1369 gcpu_mce_status_t mce; 1370 gcpu_bank_logout_t *gbl; 1371 uint64_t disp = 0; 1372 int i; 1373 1374 if (mcesp == NULL) 1375 mcesp = &mce; 1376 1377 mcesp->mce_nerr = nerr; 1378 1379 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1380 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1381 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1382 1383 /* 1384 * If this a machine check then if the return instruction pointer 1385 * is not valid the current context is lost. 1386 */ 1387 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1388 disp |= CMI_ERRDISP_RIPV_INVALID; 1389 1390 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1391 uint64_t mcistatus = gbl->gbl_status; 1392 uint32_t ms_scope; 1393 int pcc, uc; 1394 int poisoned; 1395 1396 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1397 continue; 1398 1399 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1400 continue; 1401 1402 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1403 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1404 mcesp->mce_npcc += pcc; 1405 mcesp->mce_nuc += uc; 1406 1407 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1408 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1409 1410 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1411 pcc = 0; 1412 mcesp->mce_npcc_ok++; 1413 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1414 } 1415 1416 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1417 uc = 0; 1418 mcesp->mce_nuc_ok++; 1419 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1420 } 1421 1422 if (uc) { 1423 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1424 if (poisoned) { 1425 mcesp->mce_nuc_poisoned++; 1426 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1427 } 1428 } 1429 1430 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1431 /* 1432 * We're not being instructed to ignore the error, 1433 * so apply our standard disposition logic to it. 1434 */ 1435 if (uc && !poisoned) { 1436 unconstrained++; 1437 gbl->gbl_disp |= disp | 1438 CMI_ERRDISP_UC_UNCONSTRAINED; 1439 } 1440 1441 if (pcc && ismc) { 1442 curctxbad++; 1443 gbl->gbl_disp |= disp | 1444 CMI_ERRDISP_CURCTXBAD; 1445 } 1446 1447 /* 1448 * Even if the above may not indicate that the error 1449 * is terminal, model-specific support may insist 1450 * that we treat it as such. Such errors wil be 1451 * fatal even if discovered via poll. 1452 */ 1453 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1454 forcefatal++; 1455 mcesp->mce_forcefatal++; 1456 gbl->gbl_disp |= disp | 1457 CMI_ERRDISP_FORCEFATAL; 1458 } 1459 } else { 1460 mcesp->mce_ignored++; 1461 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1462 } 1463 } 1464 1465 if (unconstrained > 0) 1466 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1467 1468 if (curctxbad > 0) 1469 disp |= CMI_ERRDISP_CURCTXBAD; 1470 1471 if (forcefatal > 0) 1472 disp |= CMI_ERRDISP_FORCEFATAL; 1473 1474 if (gcpu_mca_queue != NULL) { 1475 int how; 1476 1477 if (ismc) { 1478 how = cmi_mce_response(rp, disp) ? 1479 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1480 ERRORQ_SYNC; /* panic flow will drain */ 1481 } else { 1482 how = (disp & CMI_ERRDISP_FORCEFATAL && 1483 cmi_panic_on_ue()) ? 1484 ERRORQ_SYNC : /* poller will panic */ 1485 ERRORQ_ASYNC; /* no panic */ 1486 } 1487 1488 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1489 } else if (disp != 0) { 1490 gcpu_bleat(hdl, gcl); 1491 } 1492 1493 mcesp->mce_disp = disp; 1494 1495 return (disp); 1496 } 1497 1498 /* 1499 * Gather error telemetry from our source, and then submit it for 1500 * processing. 1501 */ 1502 1503 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1504 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1505 1506 #define STATUS_EQV(s1, s2) \ 1507 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1508 1509 static uint32_t gcpu_deferrred_polled_clears; 1510 1511 #ifndef __xpv 1512 static void 1513 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1514 uint64_t status, int what) 1515 { 1516 uint64_t ctl2; 1517 1518 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1519 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1520 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1521 1522 if (!(bank_cmci_p->cmci_enabled)) { 1523 /* 1524 * when cmci is disabled, and the bank has no error or 1525 * no corrected error for 1526 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1527 * turn on this bank's cmci. 1528 */ 1529 1530 bank_cmci_p->drtcmci ++; 1531 1532 if (bank_cmci_p->drtcmci >= 1533 gcpu_mca_cmci_reenable_threshold) { 1534 1535 /* turn on cmci */ 1536 1537 (void) cmi_hdl_rdmsr(hdl, 1538 IA32_MSR_MC_CTL2(bank), &ctl2); 1539 ctl2 |= MSR_MC_CTL2_EN; 1540 (void) cmi_hdl_wrmsr(hdl, 1541 IA32_MSR_MC_CTL2(bank), ctl2); 1542 1543 /* reset counter and set flag */ 1544 bank_cmci_p->drtcmci = 0; 1545 bank_cmci_p->cmci_enabled = 1; 1546 } 1547 } else { 1548 /* 1549 * when cmci is enabled,if is in cyclic poll and the 1550 * bank has no error or no corrected error, reset ncmci 1551 * counter 1552 */ 1553 bank_cmci_p->ncmci = 0; 1554 } 1555 } 1556 } 1557 1558 static void 1559 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1560 int what) 1561 { 1562 uint64_t ctl2 = 0; 1563 1564 /* 1565 * if cmci of this bank occurred beyond 1566 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1567 * turn off this bank's CMCI; 1568 */ 1569 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1570 1571 /* if it is cmci trap, increase the count */ 1572 bank_cmci_p->ncmci++; 1573 1574 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1575 1576 /* turn off cmci */ 1577 1578 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1579 &ctl2); 1580 ctl2 &= ~MSR_MC_CTL2_EN; 1581 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1582 ctl2); 1583 1584 /* clear the flag and count */ 1585 1586 bank_cmci_p->cmci_enabled = 0; 1587 bank_cmci_p->ncmci = 0; 1588 } 1589 } 1590 } 1591 #endif 1592 1593 static void 1594 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1595 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1596 { 1597 int i; 1598 gcpu_bank_logout_t *gbl, *pgbl; 1599 uint64_t status; 1600 1601 if (first < 0 || last < 0) 1602 return; 1603 1604 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1605 status = gbl->gbl_status; 1606 if (status == 0) 1607 continue; 1608 if (clrstatus == B_FALSE) 1609 goto serialize; 1610 1611 /* 1612 * For i86xpv we always clear status in order to invalidate 1613 * the interposed telemetry. 1614 * 1615 * For native machine checks we always clear status here. For 1616 * native polls we must be a little more cautious since there 1617 * is an outside chance that we may clear telemetry from a 1618 * shared MCA bank on which a sibling core is machine checking. 1619 * 1620 * For polled observations of errors that look like they may 1621 * produce a machine check (UC/PCC and ENabled, although these 1622 * do not guarantee a machine check on error occurence) 1623 * we will not clear the status at this wakeup unless 1624 * we saw the same status at the previous poll. We will 1625 * always process and log the current observations - it 1626 * is only the clearing of MCi_STATUS which may be 1627 * deferred until the next wakeup. 1628 */ 1629 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1630 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1631 goto serialize; 1632 } 1633 1634 /* 1635 * We have a polled observation of a machine check 1636 * candidate. If we saw essentially the same status at the 1637 * last poll then clear the status now since this appears 1638 * not to be a #MC candidate after all. If we see quite 1639 * different status now then do not clear, but reconsider at 1640 * the next poll. In no actual machine check clears 1641 * the status in the interim then the status should not 1642 * keep changing forever (meaning we'd never clear it) 1643 * since before long we'll simply have latched the highest- 1644 * priority error and set the OVerflow bit. Nonetheless 1645 * we count how many times we defer clearing and after 1646 * a while insist on clearing the status. 1647 */ 1648 pgbl = &pgcl->gcl_data[i]; 1649 if (pgbl->gbl_clrdefcnt != 0) { 1650 /* We deferred clear on this bank at last wakeup */ 1651 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1652 pgbl->gbl_clrdefcnt > 5) { 1653 /* 1654 * Status is unchanged so clear it now and, 1655 * since we have already logged this info, 1656 * avoid logging it again. 1657 */ 1658 gbl->gbl_status = 0; 1659 (void) cmi_hdl_wrmsr(hdl, 1660 IA32_MSR_MC(i, STATUS), 0ULL); 1661 } else { 1662 /* Record deferral for next wakeup */ 1663 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1664 } 1665 } else { 1666 /* Record initial deferral for next wakeup */ 1667 gbl->gbl_clrdefcnt = 1; 1668 gcpu_deferrred_polled_clears++; 1669 } 1670 1671 serialize: 1672 { 1673 #ifdef __xpv 1674 ; 1675 #else 1676 /* 1677 * Intel Vol 3A says to execute a serializing 1678 * instruction here, ie CPUID. Well WRMSR is also 1679 * defined to be serializing, so the status clear above 1680 * should suffice. To be a good citizen, and since 1681 * some clears are deferred, we'll execute a CPUID 1682 * instruction here. 1683 */ 1684 struct cpuid_regs tmp; 1685 (void) __cpuid_insn(&tmp); 1686 #endif 1687 } 1688 } 1689 } 1690 1691 /*ARGSUSED5*/ 1692 void 1693 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1694 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1695 { 1696 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1697 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1698 int nbanks = mca->gcpu_mca_nbanks; 1699 gcpu_bank_logout_t *gbl, *pgbl; 1700 gcpu_logout_t *gcl, *pgcl; 1701 int ismc = (rp != NULL); 1702 int ispoll = !ismc; 1703 int i, nerr = 0; 1704 cmi_errno_t err; 1705 uint64_t mcg_status; 1706 uint64_t disp; 1707 uint64_t cap; 1708 int first = -1; 1709 int last = -1; 1710 int willpanic = 0; 1711 1712 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1713 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1714 CMI_SUCCESS) { 1715 if (mcesp != NULL) 1716 mcesp->mce_nerr = mcesp->mce_disp = 0; 1717 return; 1718 } 1719 1720 if (ismc) { 1721 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1722 } else { 1723 int pidx = mca->gcpu_mca_nextpoll_idx; 1724 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1725 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1726 1727 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1728 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1729 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1730 } 1731 1732 gcl->gcl_timestamp = gethrtime_waitfree(); 1733 gcl->gcl_mcg_status = mcg_status; 1734 gcl->gcl_ip = rp ? rp->r_pc : 0; 1735 1736 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1737 if (cap & MCG_CAP_TES_P) 1738 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1739 1740 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1741 uint64_t status, status2, addr, misc; 1742 int retries = gcpu_mca_telemetry_retries; 1743 1744 gbl->gbl_status = 0; 1745 gbl->gbl_disp = 0; 1746 gbl->gbl_clrdefcnt = 0; 1747 1748 /* 1749 * Only logout from MCA banks we have initialized from at 1750 * least one core. If a core shares an MCA bank with another 1751 * but perhaps lost the race to initialize it, then it must 1752 * still be allowed to logout from the shared bank. 1753 */ 1754 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1755 continue; 1756 1757 /* 1758 * On a poll look only at the banks we've been asked to check. 1759 */ 1760 if (rp == NULL && !(bankmask & 1 << i)) 1761 continue; 1762 1763 1764 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1765 CMI_SUCCESS) 1766 continue; 1767 1768 #ifndef __xpv 1769 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1770 #endif 1771 1772 retry: 1773 if (!(status & MSR_MC_STATUS_VAL)) 1774 continue; 1775 1776 /* First and last bank that have valid status */ 1777 if (first < 0) 1778 first = i; 1779 last = i; 1780 1781 addr = -1; 1782 misc = 0; 1783 1784 if ((status & MSR_MC_STATUS_ADDRV) || 1785 gcpu_force_addr_in_payload) 1786 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1787 1788 if (status & MSR_MC_STATUS_MISCV) 1789 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1790 1791 #ifndef __xpv 1792 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1793 #endif 1794 1795 /* 1796 * Allow the model-specific code to extract bank telemetry. 1797 */ 1798 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1799 1800 /* 1801 * Not all cpu models assure us that the status/address/misc 1802 * data will not change during the above sequence of MSR reads, 1803 * or that it can only change by the addition of the OVerflow 1804 * bit to the status register. If the status has changed 1805 * other than in the overflow bit then we attempt to reread 1806 * for a consistent snapshot, but eventually give up and 1807 * go with what we've got. We only perform this check 1808 * for a poll - a further #MC during a #MC will reset, and 1809 * polled errors should not overwrite higher-priority 1810 * trapping errors (but could set the overflow bit). 1811 */ 1812 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1813 &status2)) == CMI_SUCCESS) { 1814 if (!STATUS_EQV(status, status2)) { 1815 if (retries-- > 0) { 1816 status = status2; 1817 goto retry; 1818 } else { 1819 gbl->gbl_disp |= 1820 CMI_ERRDISP_INCONSISTENT; 1821 } 1822 } 1823 } else if (ispoll && err != CMI_SUCCESS) { 1824 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1825 } 1826 1827 nerr++; 1828 gbl->gbl_status = status; 1829 gbl->gbl_addr = addr; 1830 gbl->gbl_misc = misc; 1831 1832 /* 1833 * For polled observation, if the count of deferred status 1834 * clears updated in the clear_mc() is nonzero and the 1835 * MCi_STATUS has not changed, the last wakeup has produced 1836 * the ereport of the error. Therefore, clear the status in 1837 * this wakeup to avoid duplicate ereport. 1838 */ 1839 pgbl = &pgcl->gcl_data[i]; 1840 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1841 pgbl->gbl_clrdefcnt != 0) { 1842 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1843 gbl->gbl_status = 0; 1844 (void) cmi_hdl_wrmsr(hdl, 1845 IA32_MSR_MC(i, STATUS), 0ULL); 1846 } 1847 } 1848 } 1849 1850 if (gcpu_mca_stack_flag) 1851 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1852 else 1853 gcl->gcl_stackdepth = 0; 1854 1855 /* 1856 * Decide our disposition for this error or errors, and submit for 1857 * logging and subsequent diagnosis. 1858 */ 1859 if (nerr != 0) { 1860 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1861 1862 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1863 1864 if (!willpanic) 1865 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1866 } else { 1867 disp = 0; 1868 if (mcesp) { 1869 mcesp->mce_nerr = mcesp->mce_disp = 0; 1870 } 1871 } 1872 1873 /* 1874 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1875 * If a second #MC had occured before now the system would have 1876 * reset. We can only do thise once gcpu_mca_process has copied 1877 * the logout structure. 1878 */ 1879 if (ismc && mcg_status & MCG_STATUS_MCIP) 1880 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1881 1882 /* 1883 * At this point we have read and logged all telemetry that is visible 1884 * under the MCA. On architectures for which the NorthBridge is 1885 * on-chip this may include NB-observed errors, but where the NB 1886 * is off chip it may have been the source of the #MC request and 1887 * so we must call into the memory-controller driver to give it 1888 * a chance to log errors. 1889 */ 1890 if (ismc) { 1891 cmi_mc_logout(hdl, 1, willpanic); 1892 } 1893 } 1894 1895 #ifndef __xpv 1896 int gcpu_mca_trap_vomit_summary = 0; 1897 1898 /* 1899 * On a native machine check exception we come here from mcetrap via 1900 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1901 * cpus of the chip, so it is possible that another cpu on this chip could 1902 * initiate a poll while we're in the #mc handler; it is also possible that 1903 * this trap has occured during a poll on this cpu. So we must acquire 1904 * the chip-wide poll lock, but be careful to avoid deadlock. 1905 * 1906 * The 'data' pointer cannot be NULL due to init order. 1907 */ 1908 uint64_t 1909 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 1910 { 1911 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1912 kmutex_t *poll_lock = NULL; 1913 gcpu_mce_status_t mce; 1914 uint64_t mcg_status; 1915 int tooklock = 0; 1916 1917 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1918 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 1919 return (0); 1920 1921 /* 1922 * Synchronize with any poller from another core that may happen 1923 * to share access to one or more of the MCA banks. 1924 */ 1925 if (gcpu->gcpu_shared != NULL) 1926 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 1927 1928 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 1929 /* 1930 * The lock is not owned by the thread we have 1931 * interrupted. Spin for this adaptive lock. 1932 */ 1933 while (!mutex_tryenter(poll_lock)) { 1934 while (mutex_owner(poll_lock) != NULL) 1935 ; 1936 } 1937 tooklock = 1; 1938 } 1939 1940 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 1941 1942 if (tooklock) 1943 mutex_exit(poll_lock); 1944 1945 /* 1946 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 1947 */ 1948 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 1949 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 1950 "%u PCC (%u ok), " 1951 "%u UC (%d ok, %u poisoned), " 1952 "%u forcefatal, %u ignored", 1953 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 1954 mce.mce_npcc, mce.mce_npcc_ok, 1955 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 1956 mce.mce_forcefatal, mce.mce_ignored); 1957 } 1958 1959 return (mce.mce_disp); 1960 } 1961 #endif 1962 1963 /*ARGSUSED*/ 1964 void 1965 gcpu_faulted_enter(cmi_hdl_t hdl) 1966 { 1967 /* Nothing to do here */ 1968 } 1969 1970 /*ARGSUSED*/ 1971 void 1972 gcpu_faulted_exit(cmi_hdl_t hdl) 1973 { 1974 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1975 1976 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 1977 } 1978 1979 /* 1980 * Write the requested values to the indicated MSRs. Having no knowledge 1981 * of the model-specific requirements for writing to these model-specific 1982 * registers, we will only blindly write to those MSRs if the 'force' 1983 * argument is nonzero. That option should only be used in prototyping 1984 * and debugging. 1985 */ 1986 /*ARGSUSED*/ 1987 cmi_errno_t 1988 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1989 int force) 1990 { 1991 int i, errs = 0; 1992 1993 for (i = 0; i < nregs; i++) { 1994 uint_t msr = regs[i].cmr_msrnum; 1995 uint64_t val = regs[i].cmr_msrval; 1996 1997 if (cms_present(hdl)) { 1998 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1999 errs++; 2000 } else if (force) { 2001 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 2002 } else { 2003 errs++; 2004 } 2005 } 2006 2007 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 2008 } 2009 2010 /* deconfigure gcpu_mca_init() */ 2011 void 2012 gcpu_mca_fini(cmi_hdl_t hdl) 2013 { 2014 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2015 gcpu_mca_t *mca = &gcpu->gcpu_mca; 2016 int i; 2017 2018 /* 2019 * CPU startup code only calls cmi_mca_init if x86_feature indicates 2020 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 2021 * processors, which have their own more primitive way of doing 2022 * machine checks, will not have cmi_mca_init called since their 2023 * CPUID information will not indicate both MCA and MCE features. 2024 */ 2025 if ((x86_feature & X86_MCA) == 0) 2026 return; 2027 #ifndef __xpv 2028 /* 2029 * disable machine check in CR4 2030 */ 2031 cmi_ntv_hwdisable_mce(hdl); 2032 #endif 2033 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 2034 gcpu_mca_poll_fini(hdl); 2035 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 2036 2037 /* 2038 * free resources allocated during init 2039 */ 2040 if (mca->gcpu_bank_cmci != NULL) { 2041 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) * 2042 mca->gcpu_mca_nbanks); 2043 } 2044 2045 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 2046 if (mca->gcpu_mca_logout[i] != NULL) { 2047 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz); 2048 } 2049 } 2050 2051 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) { 2052 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg, 2053 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks); 2054 } 2055 } 2056