1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2018, Joyent, Inc. 25 * Copyright 2022 Oxide Computer Co. 26 */ 27 /* 28 * Copyright (c) 2010, Intel Corporation. 29 * All rights reserved. 30 */ 31 32 #include <sys/mca_x86.h> 33 #include <sys/cpu_module_impl.h> 34 #include <sys/cpu_module_ms.h> 35 #include <sys/cmn_err.h> 36 #include <sys/cpuvar.h> 37 #include <sys/pghw.h> 38 #include <sys/x86_archext.h> 39 #include <sys/sysmacros.h> 40 #include <sys/regset.h> 41 #include <sys/privregs.h> 42 #include <sys/systm.h> 43 #include <sys/types.h> 44 #include <sys/log.h> 45 #include <sys/psw.h> 46 #include <sys/fm/protocol.h> 47 #include <sys/fm/util.h> 48 #include <sys/errorq.h> 49 #include <sys/mca_x86.h> 50 #include <sys/fm/cpu/GMCA.h> 51 #include <sys/fm/smb/fmsmb.h> 52 #include <sys/sysevent.h> 53 #include <sys/ontrap.h> 54 #include <sys/smp_impldefs.h> 55 56 #include "gcpu.h" 57 58 extern int x86gentopo_legacy; /* x86 generic topology support */ 59 60 static uint_t gcpu_force_addr_in_payload = 0; 61 62 /* 63 * Clear to log telemetry found at initialization. While processor docs 64 * say you should process this telemetry on all but Intel family 0x6 65 * there are way too many exceptions and we want to avoid bogus 66 * diagnoses. 67 */ 68 int gcpu_suppress_log_on_init = 1; 69 70 /* 71 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 72 * error logout time. The stack will be included in the ereport if the 73 * error type selects stack inclusion, or in all cases if 74 * gcpu_mca_stack_ereport_include is nonzero. 75 */ 76 int gcpu_mca_stack_flag = 0; 77 int gcpu_mca_stack_ereport_include = 0; 78 79 /* 80 * The number of times to re-read MCA telemetry to try to obtain a 81 * consistent snapshot if we find it to be changing under our feet. 82 */ 83 int gcpu_mca_telemetry_retries = 5; 84 85 #ifndef __xpv 86 int gcpu_mca_cmci_throttling_threshold = 10; 87 int gcpu_mca_cmci_reenable_threshold = 1000; 88 89 /* 90 * This is used to determine whether or not we have registered the CMCI CPU 91 * setup function. This is protected by cpu_lock. 92 */ 93 static boolean_t gcpu_mca_cpu_registered = B_FALSE; 94 #endif 95 96 static gcpu_error_disp_t gcpu_errtypes[] = { 97 98 /* 99 * Unclassified 100 */ 101 { 102 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 103 NULL, 104 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 105 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 106 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 107 }, 108 109 /* 110 * Microcode ROM Parity Error 111 */ 112 { 113 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 114 NULL, 115 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 116 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 117 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 118 }, 119 120 /* 121 * External - BINIT# from another processor during power-on config 122 */ 123 { 124 FM_EREPORT_CPU_GENERIC_EXTERNAL, 125 NULL, 126 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 127 MCAX86_SIMPLE_EXTERNAL_MASKON, 128 MCAX86_SIMPLE_EXTERNAL_MASKOFF 129 }, 130 131 /* 132 * Functional redundancy check master/slave error 133 */ 134 { 135 FM_EREPORT_CPU_GENERIC_FRC, 136 NULL, 137 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 138 MCAX86_SIMPLE_FRC_MASKON, 139 MCAX86_SIMPLE_FRC_MASKOFF 140 }, 141 142 /* 143 * Internal parity error 144 */ 145 { 146 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY, 147 NULL, 148 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 149 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON, 150 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF 151 }, 152 153 154 /* 155 * Internal timer error 156 */ 157 { 158 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 159 NULL, 160 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 161 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 162 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 163 }, 164 165 /* 166 * Internal unclassified 167 */ 168 { 169 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 170 NULL, 171 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 172 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 173 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 174 }, 175 176 /* 177 * Compound error codes - generic memory hierarchy 178 */ 179 { 180 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 181 NULL, 182 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 183 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 184 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 185 }, 186 187 /* 188 * Compound error codes - TLB errors 189 */ 190 { 191 FM_EREPORT_CPU_GENERIC_TLB, 192 "%1$s" "TLB" "%2$s" "_ERR", 193 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 194 MCAX86_COMPOUND_TLB_MASKON, 195 MCAX86_COMPOUND_TLB_MASKOFF 196 }, 197 198 /* 199 * Compound error codes - memory hierarchy 200 */ 201 { 202 FM_EREPORT_CPU_GENERIC_MEMHIER, 203 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 204 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 205 MCAX86_COMPOUND_MEMHIER_MASKON, 206 MCAX86_COMPOUND_MEMHIER_MASKOFF 207 }, 208 209 /* 210 * Compound error codes - bus and interconnect errors 211 */ 212 { 213 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 214 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 215 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 216 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 217 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 218 }, 219 /* 220 * Compound error codes - memory controller errors 221 */ 222 { 223 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER, 224 "MC" "_" "%8$s" "_" "%9$s" "_ERR", 225 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 226 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON, 227 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF 228 }, 229 }; 230 231 static gcpu_error_disp_t gcpu_unknown = { 232 FM_EREPORT_CPU_GENERIC_UNKNOWN, 233 "UNKNOWN", 234 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 235 0, 236 0 237 }; 238 239 static errorq_t *gcpu_mca_queue; 240 static kmutex_t gcpu_mca_queue_lock; 241 242 #ifdef __xpv 243 static int isxpv = 1; 244 #else 245 static int isxpv = 0; 246 #endif 247 248 static const gcpu_error_disp_t * 249 gcpu_disp_match(uint16_t code) 250 { 251 const gcpu_error_disp_t *ged = gcpu_errtypes; 252 int i; 253 254 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 255 i++, ged++) { 256 uint16_t on = ged->ged_errcode_mask_on; 257 uint16_t off = ged->ged_errcode_mask_off; 258 259 if ((code & on) == on && (code & off) == 0) 260 return (ged); 261 } 262 263 return (NULL); 264 } 265 266 static uint16_t 267 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 268 { 269 return ((code & mask) >> shift); 270 } 271 272 #define BIT_STRIP(code, name) \ 273 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 274 MCAX86_ERRCODE_##name##_SHIFT) 275 276 #define GCPU_MNEMONIC_UNDEF "undefined" 277 #define GCPU_MNEMONIC_RESVD "reserved" 278 279 /* 280 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 281 * mnemonics and to ereport class name components. 282 */ 283 284 struct gcpu_mnexp { 285 const char *mne_compound; /* used in expanding compound errname */ 286 const char *mne_ereport; /* used in expanding ereport class */ 287 }; 288 289 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 290 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 291 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 292 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 293 { GCPU_MNEMONIC_UNDEF, "" } 294 }; 295 296 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 297 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 298 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 299 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 300 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 301 }; 302 303 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 304 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 305 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 306 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 307 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 308 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 309 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 310 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 311 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 312 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 313 }; 314 315 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 316 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 317 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 318 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 319 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 320 }; 321 322 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 323 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 324 { GCPU_MNEMONIC_RESVD, "" }, 325 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 326 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 327 }; 328 329 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 330 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 331 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 332 }; 333 334 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */ 335 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */ 336 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */ 337 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */ 338 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */ 339 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */ 340 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */ 341 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */ 342 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */ 343 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */ 344 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */ 345 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */ 346 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */ 347 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */ 348 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */ 349 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */ 350 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */ 351 }; 352 353 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */ 354 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */ 355 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */ 356 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */ 357 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */ 358 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB }, 359 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 360 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */ 361 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */ 362 }; 363 364 enum gcpu_mn_namespace { 365 GCPU_MN_NAMESPACE_COMPOUND, 366 GCPU_MN_NAMESPACE_EREPORT 367 }; 368 369 static const char * 370 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val, 371 enum gcpu_mn_namespace nspace) 372 { 373 if (val >= tbl_sz || val > 0xff) 374 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 375 376 switch (nspace) { 377 case GCPU_MN_NAMESPACE_COMPOUND: 378 return (tbl[val].mne_compound); 379 /*NOTREACHED*/ 380 381 case GCPU_MN_NAMESPACE_EREPORT: 382 return (tbl[val].mne_ereport); 383 /*NOTREACHED*/ 384 385 default: 386 return (GCPU_MNEMONIC_UNDEF); 387 /*NOTREACHED*/ 388 } 389 } 390 391 /* 392 * The ereport class leaf component is either a simple string with no 393 * format specifiers, or a string with one or more embedded %n$s specifiers - 394 * positional selection for string arguments. The kernel snprintf does 395 * not support %n$ (and teaching it to do so is too big a headache) so 396 * we will expand this restricted format string ourselves. 397 */ 398 399 #define GCPU_CLASS_VARCOMPS 9 400 401 #define GCPU_MNEMONIC(code, name, nspace) \ 402 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 403 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 404 BIT_STRIP(code, name), nspace) 405 406 static void 407 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 408 enum gcpu_mn_namespace nspace) 409 { 410 uint16_t code = MCAX86_ERRCODE(status); 411 const char *mn[GCPU_CLASS_VARCOMPS]; 412 char *p = buf; /* current position in buf */ 413 char *q = buf + buflen; /* pointer past last char in buf */ 414 int which, expfmtchar, error; 415 char c; 416 417 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 418 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 419 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 420 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 421 mn[4] = GCPU_MNEMONIC(code, II, nspace); 422 mn[5] = GCPU_MNEMONIC(code, T, nspace); 423 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 424 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace); 425 mn[8] = GCPU_MNEMONIC(code, MMM, nspace); 426 427 while (p < q - 1 && (c = *fmt++) != '\0') { 428 if (c != '%') { 429 /* not the beginning of a format specifier - copy */ 430 *p++ = c; 431 continue; 432 } 433 434 error = 0; 435 which = -1; 436 expfmtchar = -1; 437 438 nextfmt: 439 if ((c = *fmt++) == '\0') 440 break; /* early termination of fmt specifier */ 441 442 switch (c) { 443 case '1': 444 case '2': 445 case '3': 446 case '4': 447 case '5': 448 case '6': 449 case '7': 450 case '8': 451 case '9': 452 if (which != -1) { /* allow only one positional digit */ 453 error++; 454 break; 455 } 456 which = c - '1'; 457 goto nextfmt; 458 /*NOTREACHED*/ 459 460 case '$': 461 if (which == -1) { /* no position specified */ 462 error++; 463 break; 464 } 465 expfmtchar = 's'; 466 goto nextfmt; 467 /*NOTREACHED*/ 468 469 case 's': 470 if (expfmtchar != 's') { 471 error++; 472 break; 473 } 474 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 475 mn[which]); 476 p += strlen(p); 477 break; 478 479 default: 480 error++; 481 break; 482 } 483 484 if (error) 485 break; 486 } 487 488 *p = '\0'; /* NUL termination */ 489 } 490 491 static void 492 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 493 const char *cpuclass, const char *leafclass) 494 { 495 char *p = buf; /* current position in buf */ 496 char *q = buf + buflen; /* pointer past last char in buf */ 497 498 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 499 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 500 501 p += strlen(p); 502 if (p >= q) 503 return; 504 505 if (leafclass == NULL) { 506 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 507 GCPU_MN_NAMESPACE_EREPORT); 508 } else { 509 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 510 leafclass); 511 } 512 } 513 514 /* 515 * Create an "hc" scheme FMRI identifying the given cpu with 516 * motherboard/chip/core/strand instance numbers. 517 */ 518 static nvlist_t * 519 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 520 { 521 nvlist_t *nvl, *fmri; 522 523 if ((nvl = fm_nvlist_create(nva)) == NULL) 524 return (NULL); 525 526 if (!x86gentopo_legacy) { 527 fmri = cmi_hdl_smb_bboard(hdl); 528 if (fmri == NULL) 529 return (NULL); 530 531 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION, 532 NULL, NULL, fmri, 3, 533 "chip", cmi_hdl_smb_chipid(hdl), 534 "core", cmi_hdl_coreid(hdl), 535 "strand", cmi_hdl_strandid(hdl)); 536 } else { 537 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4, 538 "motherboard", 0, 539 "chip", cmi_hdl_chipid(hdl), 540 "core", cmi_hdl_coreid(hdl), 541 "strand", cmi_hdl_strandid(hdl)); 542 } 543 544 return (nvl); 545 } 546 547 int gcpu_bleat_count_thresh = 5; 548 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 549 550 /* 551 * Called when we are unable to propogate a logout structure onto an 552 * errorq for subsequent ereport preparation and logging etc. The caller 553 * should usually only decide to call this for severe errors - those we 554 * suspect we may need to panic for. 555 */ 556 static void 557 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 558 { 559 hrtime_t now = gethrtime_waitfree(); 560 static hrtime_t gcpu_last_bleat; 561 gcpu_bank_logout_t *gbl; 562 static int bleatcount; 563 int i; 564 565 /* 566 * Throttle spamming of the console. The first gcpu_bleat_count_thresh 567 * can come as fast as we like, but once we've spammed that many 568 * to the console we require a minimum interval to pass before 569 * any more complaints. 570 */ 571 if (++bleatcount > gcpu_bleat_count_thresh) { 572 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 573 return; 574 else 575 bleatcount = 0; 576 } 577 gcpu_last_bleat = now; 578 579 cmn_err(CE_WARN, 580 "Machine-Check Errors unlogged on chip %d core %d strand %d, " 581 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl), 582 cmi_hdl_strandid(hdl)); 583 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 584 (u_longlong_t)gcl->gcl_mcg_status); 585 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 586 uint64_t status = gbl->gbl_status; 587 588 if (!(status & MSR_MC_STATUS_VAL)) 589 continue; 590 591 /* Force ADDRV for AMD Family 0xf and above */ 592 if (gcpu_force_addr_in_payload) 593 status = status | MSR_MC_STATUS_ADDRV; 594 595 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 596 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 597 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 598 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 599 i, IA32_MSR_MC(i, STATUS), 600 (u_longlong_t)gbl->gbl_status, 601 (u_longlong_t)gbl->gbl_addr, 602 (u_longlong_t)gbl->gbl_misc); 603 break; 604 605 case MSR_MC_STATUS_ADDRV: 606 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 607 "STAT 0x%016llx ADDR 0x%016llx", 608 i, IA32_MSR_MC(i, STATUS), 609 (u_longlong_t)gbl->gbl_status, 610 (u_longlong_t)gbl->gbl_addr); 611 break; 612 613 case MSR_MC_STATUS_MISCV: 614 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 615 "STAT 0x%016llx MISC 0x%016llx", 616 i, IA32_MSR_MC(i, STATUS), 617 (u_longlong_t)gbl->gbl_status, 618 (u_longlong_t)gbl->gbl_misc); 619 break; 620 621 default: 622 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 623 "STAT 0x%016llx", 624 i, IA32_MSR_MC(i, STATUS), 625 (u_longlong_t)gbl->gbl_status); 626 break; 627 628 } 629 } 630 } 631 632 #define _GCPU_BSTATUS(status, what) \ 633 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 634 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE 635 636 static void 637 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 638 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 639 { 640 uint64_t members = ged ? ged->ged_ereport_members : 641 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 642 uint64_t mcg = gcl->gcl_mcg_status; 643 int mcip = mcg & MCG_STATUS_MCIP; 644 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 645 uint64_t bstat = gbl->gbl_status; 646 647 /* 648 * Include the compound error name if requested and if this 649 * is a compound error type. 650 */ 651 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 652 ged->ged_compound_fmt != NULL) { 653 char buf[FM_MAX_CLASS]; 654 655 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 656 GCPU_MN_NAMESPACE_COMPOUND); 657 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 658 DATA_TYPE_STRING, buf, NULL); 659 } 660 661 /* 662 * Include disposition information for this error 663 */ 664 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 665 gbl->gbl_disp != 0) { 666 int i, empty = 1; 667 char buf[128]; 668 char *p = buf, *q = buf + 128; 669 static struct _gcpu_disp_name { 670 uint64_t dv; 671 const char *dn; 672 } disp_names[] = { 673 { CMI_ERRDISP_CURCTXBAD, 674 "processor_context_corrupt" }, 675 { CMI_ERRDISP_RIPV_INVALID, 676 "return_ip_invalid" }, 677 { CMI_ERRDISP_UC_UNCONSTRAINED, 678 "unconstrained" }, 679 { CMI_ERRDISP_FORCEFATAL, 680 "forcefatal" }, 681 { CMI_ERRDISP_IGNORED, 682 "ignored" }, 683 { CMI_ERRDISP_PCC_CLEARED, 684 "corrupt_context_cleared" }, 685 { CMI_ERRDISP_UC_CLEARED, 686 "uncorrected_data_cleared" }, 687 { CMI_ERRDISP_POISONED, 688 "poisoned" }, 689 { CMI_ERRDISP_INCONSISTENT, 690 "telemetry_unstable" }, 691 }; 692 693 for (i = 0; i < sizeof (disp_names) / 694 sizeof (struct _gcpu_disp_name); i++) { 695 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 696 continue; 697 698 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 699 "%s%s", empty ? "" : ",", disp_names[i].dn); 700 p += strlen(p); 701 empty = 0; 702 } 703 704 if (p != buf) 705 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 706 DATA_TYPE_STRING, buf, NULL); 707 } 708 709 /* 710 * If MCG_STATUS is included add that and an indication of whether 711 * this ereport was the result of a machine check or poll. 712 */ 713 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 714 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 715 DATA_TYPE_UINT64, mcg, NULL); 716 717 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 718 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 719 } 720 721 /* 722 * If an instruction pointer is to be included add one provided 723 * MCG_STATUS indicated it is valid; meaningless for polled events. 724 */ 725 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 726 mcg & MCG_STATUS_EIPV) { 727 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 728 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 729 } 730 731 /* 732 * Add an indication of whether the trap occured during privileged code. 733 */ 734 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 735 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 736 DATA_TYPE_BOOLEAN_VALUE, 737 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 738 } 739 740 /* 741 * If requested, add the index of the MCA bank. This indicates the 742 * n'th bank of 4 MCA registers, and does not necessarily correspond 743 * to MCi_* - use the bank offset to correlate 744 */ 745 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 746 fm_payload_set(ereport, 747 /* Bank number */ 748 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 749 /* Offset of MCi_CTL */ 750 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 751 IA32_MSR_MC(bankno, CTL), 752 NULL); 753 } 754 755 /* 756 * Add MCi_STATUS if requested, and decode it. 757 */ 758 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 759 const char *tbes[] = { 760 "No tracking", /* 00 */ 761 "Green - below threshold", /* 01 */ 762 "Yellow - above threshold", /* 10 */ 763 "Reserved" /* 11 */ 764 }; 765 766 fm_payload_set(ereport, 767 /* Bank MCi_STATUS */ 768 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 769 /* Overflow? */ 770 _GCPU_BSTATUS(bstat, OVER), 771 /* Uncorrected? */ 772 _GCPU_BSTATUS(bstat, UC), 773 /* Enabled? */ 774 _GCPU_BSTATUS(bstat, EN), 775 /* Processor context corrupt? */ 776 _GCPU_BSTATUS(bstat, PCC), 777 /* Error code */ 778 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 779 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 780 /* Model-specific error code */ 781 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 782 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 783 NULL); 784 785 /* 786 * If MCG_CAP.TES_P indicates that that thresholding info 787 * is present in the architural component of the bank status 788 * then include threshold information for this bank. 789 */ 790 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 791 fm_payload_set(ereport, 792 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 793 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 794 NULL); 795 } 796 } 797 798 /* 799 * Add MCi_ADDR info if requested and valid. We force addition of 800 * MCi_ADDR, even if its not valid on AMD family 0xf and above, 801 * to aid in analysis of ereports, for WatchDog errors. 802 */ 803 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 804 ((bstat & MSR_MC_STATUS_ADDRV) || 805 gcpu_force_addr_in_payload)) { 806 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 807 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 808 } 809 810 /* 811 * MCi_MISC if requested and MCi_STATUS.MISCV). 812 */ 813 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 814 bstat & MSR_MC_STATUS_MISCV) { 815 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 816 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 817 } 818 819 } 820 821 /* 822 * Construct and post an ereport based on the logout information from a 823 * single MCA bank. We are not necessarily running on the cpu that 824 * detected the error. 825 */ 826 static void 827 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 828 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 829 { 830 gcpu_data_t *gcpu = gcl->gcl_gcpu; 831 cmi_hdl_t hdl = gcpu->gcpu_hdl; 832 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 833 const char *cpuclass = NULL, *leafclass = NULL; 834 uint16_t code = MCAX86_ERRCODE(status); 835 errorq_elem_t *eqep, *scr_eqep; 836 nvlist_t *ereport, *detector; 837 char buf[FM_MAX_CLASS]; 838 const char *classfmt; 839 nv_alloc_t *nva; 840 841 if (panicstr) { 842 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 843 return; 844 ereport = errorq_elem_nvl(ereport_errorq, eqep); 845 846 /* 847 * Allocate another element for scratch space, but fallback 848 * to the one we have if that fails. We'd like to use the 849 * additional scratch space for nvlist construction. 850 */ 851 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 852 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 853 else 854 nva = errorq_elem_nva(ereport_errorq, eqep); 855 } else { 856 ereport = fm_nvlist_create(NULL); 857 nva = NULL; 858 eqep = NULL; 859 scr_eqep = NULL; 860 } 861 862 if (ereport == NULL) 863 return; 864 865 /* 866 * Common payload data required by the protocol: 867 * - ereport class 868 * - detector 869 * - ENA 870 */ 871 872 /* 873 * Ereport class - call into model-specific support to allow it to 874 * provide a cpu class or leaf class, otherwise calculate our own. 875 */ 876 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 877 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 878 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 879 leafclass); 880 881 /* 882 * The detector FMRI. 883 */ 884 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie, 885 nva)) == NULL) 886 detector = gcpu_fmri_create(hdl, nva); 887 888 /* 889 * Should we define a new ENA format 3?? for chip/core/strand? 890 * It will be better when virtualized. 891 */ 892 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 893 fm_ena_generate_cpu(gcl->gcl_timestamp, 894 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 895 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 896 897 if (panicstr) { 898 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 899 nv_alloc_reset(nva); 900 } else { 901 fm_nvlist_destroy(detector, FM_NVA_FREE); 902 } 903 904 /* 905 * Add the architectural ereport class-specific payload data. 906 */ 907 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 908 909 /* 910 * Allow model-specific code to add ereport members. 911 */ 912 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 913 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 914 915 /* 916 * Include stack if options is turned on and either selected in 917 * the payload member bitmask or inclusion is forced. 918 */ 919 if (gcpu_mca_stack_flag && 920 (cms_ereport_includestack(hdl, mscookie) == 921 B_TRUE || gcpu_mca_stack_ereport_include)) { 922 fm_payload_stack_add(ereport, gcl->gcl_stack, 923 gcl->gcl_stackdepth); 924 } 925 926 /* 927 * If injection has taken place anytime in the past then note this 928 * on the ereport. 929 */ 930 if (cmi_inj_tainted() == B_TRUE) { 931 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE, 932 B_TRUE, NULL); 933 } 934 935 /* 936 * Post ereport. 937 */ 938 if (panicstr) { 939 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 940 if (scr_eqep) 941 errorq_cancel(ereport_errorq, scr_eqep); 942 } else { 943 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 944 fm_nvlist_destroy(ereport, FM_NVA_FREE); 945 } 946 947 } 948 949 /*ARGSUSED*/ 950 void 951 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 952 { 953 const gcpu_logout_t *gcl = data; 954 const gcpu_bank_logout_t *gbl; 955 int ismc; 956 int i; 957 958 ismc = gcl->ismc; 959 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 960 const gcpu_error_disp_t *gened; 961 cms_cookie_t mscookie; 962 963 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 964 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 965 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 966 967 /* 968 * Perform a match based on IA32 MCA architectural 969 * components alone. 970 */ 971 gened = gcpu_disp_match(code); /* may be NULL */ 972 973 /* 974 * Now see if an model-specific match can be made. 975 */ 976 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc, 977 i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 978 gcl->gcl_ms_logout); 979 980 /* 981 * Prepare and dispatch an ereport for logging and 982 * diagnosis. 983 */ 984 gcpu_ereport_post(gcl, i, gened, mscookie, 985 gbl->gbl_status); 986 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 987 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 988 /* 989 * Telemetry kept changing as we tried to read 990 * it. Force an unknown ereport leafclass but 991 * keep the telemetry unchanged for logging. 992 */ 993 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 994 gbl->gbl_status); 995 } 996 } 997 } 998 999 static size_t gcpu_mca_queue_datasz = 0; 1000 1001 /* 1002 * The following code is ready to make a weak attempt at growing the 1003 * errorq structure size. Since it is not foolproof (we don't know 1004 * who may already be producing to the outgoing errorq) our caller 1005 * instead assures that we'll always be called with no greater data 1006 * size than on our first call. 1007 */ 1008 static void 1009 gcpu_errorq_init(size_t datasz) 1010 { 1011 int slots; 1012 1013 mutex_enter(&gcpu_mca_queue_lock); 1014 1015 if (gcpu_mca_queue_datasz >= datasz) { 1016 mutex_exit(&gcpu_mca_queue_lock); 1017 return; 1018 } 1019 1020 membar_producer(); 1021 if (gcpu_mca_queue) { 1022 gcpu_mca_queue_datasz = 0; 1023 errorq_destroy(gcpu_mca_queue); 1024 } 1025 1026 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 1027 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 1028 1029 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 1030 NULL, slots, datasz, 1, ERRORQ_VITAL); 1031 1032 if (gcpu_mca_queue != NULL) 1033 gcpu_mca_queue_datasz = datasz; 1034 1035 mutex_exit(&gcpu_mca_queue_lock); 1036 } 1037 1038 /* 1039 * Perform MCA initialization as described in section 14.6 of Intel 64 1040 * and IA-32 Architectures Software Developer's Manual Volume 3A. 1041 */ 1042 1043 static uint_t global_nbanks; 1044 1045 #ifndef __xpv 1046 /*ARGSUSED*/ 1047 int 1048 gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg) 1049 { 1050 /* 1051 * In general, we'd expect that in a multi-socket configuration, either 1052 * all CPUs would support CMCI or none of them would. Unfortunately, 1053 * that may not be the case in the wild. While we'd rather check the 1054 * handle's enablement state here, that itself is a bit complicated. We 1055 * don't have a guarantee in a heterogenous situation that the CPU in 1056 * question is using the generic CPU module or not, even though we've 1057 * been registered. As such, we allow the interrupt to be registered and 1058 * written to the local apic anyways. We won't have a CMCI interrupt 1059 * generated anyways because the MCA banks will not be programmed as 1060 * such for that CPU by the polling thread. 1061 */ 1062 switch (what) { 1063 case CPU_ON: 1064 psm_cmci_setup(cpuid, B_TRUE); 1065 break; 1066 case CPU_OFF: 1067 psm_cmci_setup(cpuid, B_FALSE); 1068 break; 1069 default: 1070 break; 1071 } 1072 1073 return (0); 1074 } 1075 1076 void 1077 gcpu_mca_cmci_enable(cmi_hdl_t hdl) 1078 { 1079 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1080 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1081 1082 /* 1083 * If this CPU doesn't support CMCI, don't do anything. 1084 */ 1085 if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0) 1086 return; 1087 1088 /* 1089 * If we don't have support from the PSM module, then there's nothing we 1090 * can do. Note that this changes as we start up the system. The only 1091 * case where it may be mistakenly NULL is for the boot CPU. The boot 1092 * CPU will have this taken care of for it in gcpu_post_startup(), once 1093 * we know for certain whether or not the PSM module supports CMCI. 1094 */ 1095 if (psm_cmci_setup == NULL) { 1096 return; 1097 } 1098 1099 mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE; 1100 if (MUTEX_HELD(&cpu_lock)) { 1101 if (!gcpu_mca_cpu_registered) { 1102 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL); 1103 gcpu_mca_cpu_registered = B_TRUE; 1104 } 1105 } else { 1106 mutex_enter(&cpu_lock); 1107 if (!gcpu_mca_cpu_registered) { 1108 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL); 1109 gcpu_mca_cpu_registered = B_TRUE; 1110 } 1111 mutex_exit(&cpu_lock); 1112 } 1113 1114 /* 1115 * Call the PSM op to make sure that we initialize things on 1116 * this CPU. 1117 */ 1118 psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE); 1119 } 1120 #endif /* !__xpv */ 1121 1122 void 1123 gcpu_mca_init(cmi_hdl_t hdl) 1124 { 1125 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1126 uint64_t cap; 1127 uint_t vendor = cmi_hdl_vendor(hdl); 1128 uint_t family = cmi_hdl_family(hdl); 1129 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1130 int mcg_ctl_present; 1131 uint_t nbanks; 1132 uint32_t ctl_skip_mask = 0; 1133 uint32_t status_skip_mask = 0; 1134 size_t mslsz; 1135 int i; 1136 #ifndef __xpv 1137 int mcg_ctl2_present; 1138 uint32_t cmci_capable = 0; 1139 #endif 1140 if (gcpu == NULL) 1141 return; 1142 1143 /* We add MCi_ADDR always for AMD Family 0xf and above */ 1144 if (family >= 0xf) 1145 gcpu_force_addr_in_payload = 1; 1146 1147 /* 1148 * Protect from some silly /etc/system settings. 1149 */ 1150 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 1151 gcpu_mca_telemetry_retries = 5; 1152 1153 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 1154 return; 1155 1156 /* 1157 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 1158 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 1159 * processors, which have their own more primitive way of doing 1160 * machine checks, will not have cmi_mca_init called since their 1161 * CPUID information will not indicate both MCA and MCE features. 1162 */ 1163 ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA)); 1164 1165 /* 1166 * Determine whether the IA32_MCG_CTL register is present. If it 1167 * is we will enable all features by writing -1 to it towards 1168 * the end of this initialization; if it is absent then volume 3A 1169 * says we must nonetheless continue to initialize the individual 1170 * banks. 1171 */ 1172 mcg_ctl_present = cap & MCG_CAP_CTL_P; 1173 #ifndef __xpv 1174 mcg_ctl2_present = cap & MCG_CAP_CTL2_P; 1175 #endif 1176 1177 /* 1178 * We squirell values away for inspection/debugging. 1179 */ 1180 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 1181 if (mcg_ctl_present) 1182 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 1183 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 1184 1185 /* 1186 * Determine the number of error-reporting banks implemented. 1187 */ 1188 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 1189 1190 if (nbanks != 0 && global_nbanks == 0) 1191 global_nbanks = nbanks; /* no race - BSP will get here first */ 1192 1193 /* 1194 * If someone is hiding the number of banks (perhaps we are fully 1195 * virtualized?) or if this processor has more banks than the 1196 * first to set global_nbanks then bail. The latter requirement 1197 * is because we need to size our errorq data structure and we 1198 * don't want to have to grow the errorq (destroy and recreate) 1199 * which may just lose some telemetry. 1200 */ 1201 if (nbanks == 0 || nbanks > global_nbanks) 1202 return; 1203 1204 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 1205 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 1206 1207 /* 1208 * Calculate the size we need to allocate for a gcpu_logout_t 1209 * with a gcl_data array big enough for all banks of this cpu. 1210 * Add any space requested by the model-specific logout support. 1211 */ 1212 mslsz = cms_logout_size(hdl); 1213 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) + 1214 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz; 1215 1216 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 1217 gcpu_logout_t *gcl; 1218 1219 mca->gcpu_mca_logout[i] = gcl = 1220 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP); 1221 gcl->gcl_gcpu = gcpu; 1222 gcl->gcl_nbanks = nbanks; 1223 gcl->gcl_ms_logout = (mslsz == 0) ? NULL : 1224 (char *)(&gcl->gcl_data[0]) + nbanks * 1225 sizeof (gcpu_bank_logout_t); 1226 1227 } 1228 1229 #ifdef __xpv 1230 gcpu_xpv_mca_init(nbanks); 1231 #endif 1232 1233 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1; 1234 1235 #ifndef __xpv 1236 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks, 1237 KM_SLEEP); 1238 #endif 1239 1240 /* 1241 * Create our errorq to transport the logout structures. This 1242 * can fail so users of gcpu_mca_queue must be prepared for NULL. 1243 */ 1244 gcpu_errorq_init(mca->gcpu_mca_lgsz); 1245 1246 /* 1247 * Not knowing which, if any, banks are shared between cores we 1248 * assure serialization of MCA bank initialization by each cpu 1249 * on the chip. On chip architectures in which some banks are 1250 * shared this will mean the shared resource is initialized more 1251 * than once - we're simply aiming to avoid simultaneous MSR writes 1252 * to the shared resource. 1253 * 1254 * Even with these precautions, some platforms may yield a GP fault 1255 * if a core other than a designated master tries to write anything 1256 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform 1257 * those writes under on_trap protection. 1258 */ 1259 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 1260 1261 /* 1262 * Initialize poller data, but don't start polling yet. 1263 */ 1264 gcpu_mca_poll_init(hdl); 1265 1266 /* 1267 * Work out which MCA banks we will initialize. In MCA logout 1268 * code we will only read those banks which we initialize here. 1269 */ 1270 for (i = 0; i < nbanks; i++) { 1271 boolean_t skipctl = cms_bankctl_skipinit(hdl, i); 1272 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i); 1273 1274 if (!cms_present(hdl)) { 1275 /* 1276 * Model-specific support is not present, try to use 1277 * sane defaults. 1278 * 1279 * On AMD family 6 processors, reports about spurious 1280 * machine checks indicate that bank 0 should be 1281 * skipped. 1282 * 1283 * On Intel family 6 processors, the documentation tells 1284 * us not to write to MC0_CTL. 1285 * 1286 */ 1287 if (i == 0 && family == 6) { 1288 switch (vendor) { 1289 case X86_VENDOR_AMD: 1290 skipstatus = B_TRUE; 1291 /*FALLTHRU*/ 1292 case X86_VENDOR_Intel: 1293 skipctl = B_TRUE; 1294 break; 1295 } 1296 } 1297 } 1298 1299 ctl_skip_mask |= skipctl << i; 1300 status_skip_mask |= skipstatus << i; 1301 1302 if (skipctl && skipstatus) 1303 continue; 1304 1305 /* 1306 * Record which MCA banks were enabled, from the point of view 1307 * of the whole chip (if some cores share a bank we must be 1308 * sure either can logout from it). 1309 */ 1310 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i); 1311 1312 #ifndef __xpv 1313 /* 1314 * check CMCI capability 1315 */ 1316 if (mcg_ctl2_present) { 1317 uint64_t ctl2; 1318 uint32_t cap = 0; 1319 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1320 if (ctl2 & MSR_MC_CTL2_EN) 1321 continue; 1322 ctl2 |= MSR_MC_CTL2_EN; 1323 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1324 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2); 1325 mca->gcpu_bank_cmci[i].cmci_cap = cap = 1326 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0; 1327 if (cap) 1328 cmci_capable ++; 1329 /* 1330 * Set threshold to 1 while unset the en field, to avoid 1331 * CMCI trigged before APIC LVT entry init. 1332 */ 1333 ctl2 = (ctl2 & (~MSR_MC_CTL2_EN)) | 1; 1334 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2); 1335 1336 /* 1337 * init cmci related count 1338 */ 1339 mca->gcpu_bank_cmci[i].cmci_enabled = 0; 1340 mca->gcpu_bank_cmci[i].drtcmci = 0; 1341 mca->gcpu_bank_cmci[i].ncmci = 0; 1342 } 1343 #endif 1344 } 1345 1346 #ifndef __xpv 1347 if (cmci_capable) { 1348 mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE; 1349 gcpu_mca_cmci_enable(hdl); 1350 } 1351 #endif 1352 1353 #ifndef __xpv 1354 /* 1355 * Log any valid telemetry lurking in the MCA banks, but do not 1356 * clear the status registers. Ignore the disposition returned - 1357 * we have already paniced or reset for any nasty errors found here. 1358 * 1359 * Intel vol 3A says that we should not do this on family 0x6, 1360 * and that for any extended family the BIOS clears things 1361 * on power-on reset so you'll only potentially find valid telemetry 1362 * on warm reset (we do it for both - on power-on reset we should 1363 * just see zeroes). 1364 * 1365 * AMD docs since K7 say we should process anything we find here. 1366 */ 1367 if (!gcpu_suppress_log_on_init && 1368 ((vendor == X86_VENDOR_Intel && family >= 0xf) || 1369 vendor == X86_VENDOR_AMD || 1370 vendor == X86_VENDOR_HYGON)) 1371 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE, 1372 GCPU_MPT_WHAT_POKE_ERR); 1373 1374 /* 1375 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the 1376 * model-specific module the power of veto. 1377 */ 1378 for (i = 0; i < nbanks; i++) { 1379 struct gcpu_bios_bankcfg *bcfgp = 1380 mca->gcpu_mca_bioscfg.bios_bankcfg + i; 1381 1382 /* 1383 * Stash inherited bank MCA state, even for banks we will 1384 * not initialize ourselves. Do not read the MISC register 1385 * unconditionally - on some processors that will #GP on 1386 * banks that do not implement the MISC register (would be 1387 * caught by on_trap, anyway). 1388 */ 1389 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1390 &bcfgp->bios_bank_ctl); 1391 1392 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1393 &bcfgp->bios_bank_status); 1394 1395 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) || 1396 gcpu_force_addr_in_payload) { 1397 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1398 &bcfgp->bios_bank_addr); 1399 } 1400 1401 /* 1402 * In some old BIOS the status value after boot can indicate 1403 * MISCV when there is actually no MISC register for 1404 * that bank. The following read could therefore 1405 * aggravate a general protection fault. This should be 1406 * caught by on_trap, but the #GP fault handler is busted 1407 * and can suffer a double fault even before we get to 1408 * trap() to check for on_trap protection. Until that 1409 * issue is fixed we remove the one access that we know 1410 * can cause a #GP. 1411 * 1412 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1413 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1414 * &bcfgp->bios_bank_misc); 1415 */ 1416 bcfgp->bios_bank_misc = 0; 1417 1418 if (!(ctl_skip_mask & (1 << i))) { 1419 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1420 cms_bankctl_val(hdl, i, -1ULL)); 1421 } 1422 1423 if (!(status_skip_mask & (1 << i))) { 1424 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1425 cms_bankstatus_val(hdl, i, 0ULL)); 1426 } 1427 } 1428 #endif 1429 /* 1430 * Now let the model-specific support perform further initialization 1431 * of non-architectural features. 1432 */ 1433 cms_mca_init(hdl, nbanks); 1434 1435 #ifndef __xpv 1436 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1437 membar_producer(); 1438 1439 /* enable all machine-check features */ 1440 if (mcg_ctl_present) 1441 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1442 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1443 #endif 1444 1445 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1446 1447 #ifndef __xpv 1448 /* enable machine-check exception in CR4 */ 1449 cmi_hdl_enable_mce(hdl); 1450 #endif 1451 } 1452 1453 static uint64_t 1454 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1455 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1456 { 1457 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1458 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1459 int nbanks = mca->gcpu_mca_nbanks; 1460 gcpu_mce_status_t mce; 1461 gcpu_bank_logout_t *gbl; 1462 uint64_t disp = 0; 1463 int i; 1464 1465 if (mcesp == NULL) 1466 mcesp = &mce; 1467 1468 mcesp->mce_nerr = nerr; 1469 1470 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1471 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1472 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1473 1474 /* 1475 * If this a machine check then if the return instruction pointer 1476 * is not valid the current context is lost. 1477 */ 1478 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1479 disp |= CMI_ERRDISP_RIPV_INVALID; 1480 gcl->ismc = ismc; 1481 1482 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1483 uint64_t mcistatus = gbl->gbl_status; 1484 uint32_t ms_scope; 1485 int pcc, uc; 1486 int poisoned; 1487 1488 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1489 continue; 1490 1491 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1492 continue; 1493 1494 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1495 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1496 mcesp->mce_npcc += pcc; 1497 mcesp->mce_nuc += uc; 1498 1499 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1500 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1501 1502 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1503 pcc = 0; 1504 mcesp->mce_npcc_ok++; 1505 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1506 } 1507 1508 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1509 uc = 0; 1510 mcesp->mce_nuc_ok++; 1511 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1512 } 1513 1514 if (uc) { 1515 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1516 if (poisoned) { 1517 mcesp->mce_nuc_poisoned++; 1518 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1519 } 1520 } 1521 1522 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1523 /* 1524 * We're not being instructed to ignore the error, 1525 * so apply our standard disposition logic to it. 1526 */ 1527 if (uc && !poisoned) { 1528 unconstrained++; 1529 gbl->gbl_disp |= disp | 1530 CMI_ERRDISP_UC_UNCONSTRAINED; 1531 } 1532 1533 if (pcc && ismc) { 1534 curctxbad++; 1535 gbl->gbl_disp |= disp | 1536 CMI_ERRDISP_CURCTXBAD; 1537 } 1538 1539 /* 1540 * Even if the above may not indicate that the error 1541 * is terminal, model-specific support may insist 1542 * that we treat it as such. Such errors wil be 1543 * fatal even if discovered via poll. 1544 */ 1545 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1546 forcefatal++; 1547 mcesp->mce_forcefatal++; 1548 gbl->gbl_disp |= disp | 1549 CMI_ERRDISP_FORCEFATAL; 1550 } 1551 } else { 1552 mcesp->mce_ignored++; 1553 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1554 } 1555 } 1556 1557 if (unconstrained > 0) 1558 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1559 1560 if (curctxbad > 0) 1561 disp |= CMI_ERRDISP_CURCTXBAD; 1562 1563 if (forcefatal > 0) 1564 disp |= CMI_ERRDISP_FORCEFATAL; 1565 1566 if (gcpu_mca_queue != NULL) { 1567 int how; 1568 1569 if (ismc) { 1570 how = cmi_mce_response(rp, disp) ? 1571 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1572 ERRORQ_SYNC; /* panic flow will drain */ 1573 } else { 1574 how = (disp & CMI_ERRDISP_FORCEFATAL && 1575 cmi_panic_on_ue()) ? 1576 ERRORQ_SYNC : /* poller will panic */ 1577 ERRORQ_ASYNC; /* no panic */ 1578 } 1579 1580 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1581 } else if (disp != 0) { 1582 gcpu_bleat(hdl, gcl); 1583 } 1584 1585 mcesp->mce_disp = disp; 1586 1587 return (disp); 1588 } 1589 1590 /* 1591 * Gather error telemetry from our source, and then submit it for 1592 * processing. 1593 */ 1594 1595 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1596 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1597 1598 #define STATUS_EQV(s1, s2) \ 1599 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1600 1601 static uint32_t gcpu_deferrred_polled_clears; 1602 1603 #ifndef __xpv 1604 static void 1605 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1606 uint64_t status, int what) 1607 { 1608 uint64_t ctl2; 1609 1610 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) && 1611 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) && 1612 !(status & MSR_MC_STATUS_CEC_MASK)))) { 1613 1614 if (!(bank_cmci_p->cmci_enabled)) { 1615 /* 1616 * when cmci is disabled, and the bank has no error or 1617 * no corrected error for 1618 * gcpu_mca_cmci_reenable_threshold consecutive polls, 1619 * turn on this bank's cmci. 1620 */ 1621 1622 bank_cmci_p->drtcmci ++; 1623 1624 if (bank_cmci_p->drtcmci >= 1625 gcpu_mca_cmci_reenable_threshold) { 1626 1627 /* turn on cmci */ 1628 1629 (void) cmi_hdl_rdmsr(hdl, 1630 IA32_MSR_MC_CTL2(bank), &ctl2); 1631 ctl2 |= MSR_MC_CTL2_EN; 1632 (void) cmi_hdl_wrmsr(hdl, 1633 IA32_MSR_MC_CTL2(bank), ctl2); 1634 1635 /* reset counter and set flag */ 1636 bank_cmci_p->drtcmci = 0; 1637 bank_cmci_p->cmci_enabled = 1; 1638 } 1639 } else { 1640 /* 1641 * when cmci is enabled,if is in cyclic poll and the 1642 * bank has no error or no corrected error, reset ncmci 1643 * counter 1644 */ 1645 bank_cmci_p->ncmci = 0; 1646 } 1647 } 1648 } 1649 1650 static void 1651 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p, 1652 int what) 1653 { 1654 uint64_t ctl2 = 0; 1655 1656 /* 1657 * if cmci of this bank occurred beyond 1658 * gcpu_mca_cmci_throttling_threshold between 2 polls, 1659 * turn off this bank's CMCI; 1660 */ 1661 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) { 1662 1663 /* if it is cmci trap, increase the count */ 1664 bank_cmci_p->ncmci++; 1665 1666 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) { 1667 1668 /* turn off cmci */ 1669 1670 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank), 1671 &ctl2); 1672 ctl2 &= ~MSR_MC_CTL2_EN; 1673 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank), 1674 ctl2); 1675 1676 /* clear the flag and count */ 1677 1678 bank_cmci_p->cmci_enabled = 0; 1679 bank_cmci_p->ncmci = 0; 1680 } 1681 } 1682 } 1683 #endif 1684 1685 static void 1686 clear_mc(int first, int last, int ismc, boolean_t clrstatus, 1687 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl) 1688 { 1689 int i; 1690 gcpu_bank_logout_t *gbl, *pgbl; 1691 uint64_t status; 1692 1693 if (first < 0 || last < 0) 1694 return; 1695 1696 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) { 1697 status = gbl->gbl_status; 1698 if (status == 0) 1699 continue; 1700 if (clrstatus == B_FALSE) 1701 goto serialize; 1702 1703 /* 1704 * For i86xpv we always clear status in order to invalidate 1705 * the interposed telemetry. 1706 * 1707 * For native machine checks we always clear status here. For 1708 * native polls we must be a little more cautious since there 1709 * is an outside chance that we may clear telemetry from a 1710 * shared MCA bank on which a sibling core is machine checking. 1711 * 1712 * For polled observations of errors that look like they may 1713 * produce a machine check (UC/PCC and ENabled, although these 1714 * do not guarantee a machine check on error occurence) 1715 * we will not clear the status at this wakeup unless 1716 * we saw the same status at the previous poll. We will 1717 * always process and log the current observations - it 1718 * is only the clearing of MCi_STATUS which may be 1719 * deferred until the next wakeup. 1720 */ 1721 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) { 1722 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1723 goto serialize; 1724 } 1725 1726 /* 1727 * We have a polled observation of a machine check 1728 * candidate. If we saw essentially the same status at the 1729 * last poll then clear the status now since this appears 1730 * not to be a #MC candidate after all. If we see quite 1731 * different status now then do not clear, but reconsider at 1732 * the next poll. In no actual machine check clears 1733 * the status in the interim then the status should not 1734 * keep changing forever (meaning we'd never clear it) 1735 * since before long we'll simply have latched the highest- 1736 * priority error and set the OVerflow bit. Nonetheless 1737 * we count how many times we defer clearing and after 1738 * a while insist on clearing the status. 1739 */ 1740 pgbl = &pgcl->gcl_data[i]; 1741 if (pgbl->gbl_clrdefcnt != 0) { 1742 /* We deferred clear on this bank at last wakeup */ 1743 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1744 pgbl->gbl_clrdefcnt > 5) { 1745 /* 1746 * Status is unchanged so clear it now and, 1747 * since we have already logged this info, 1748 * avoid logging it again. 1749 */ 1750 gbl->gbl_status = 0; 1751 (void) cmi_hdl_wrmsr(hdl, 1752 IA32_MSR_MC(i, STATUS), 0ULL); 1753 } else { 1754 /* Record deferral for next wakeup */ 1755 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1756 } 1757 } else { 1758 /* Record initial deferral for next wakeup */ 1759 gbl->gbl_clrdefcnt = 1; 1760 gcpu_deferrred_polled_clears++; 1761 } 1762 1763 serialize: 1764 { 1765 #ifdef __xpv 1766 ; 1767 #else 1768 /* 1769 * Intel Vol 3A says to execute a serializing 1770 * instruction here, ie CPUID. Well WRMSR is also 1771 * defined to be serializing, so the status clear above 1772 * should suffice. To be a good citizen, and since 1773 * some clears are deferred, we'll execute a CPUID 1774 * instruction here. 1775 */ 1776 struct cpuid_regs tmp; 1777 (void) __cpuid_insn(&tmp); 1778 #endif 1779 } 1780 } 1781 } 1782 1783 /*ARGSUSED5*/ 1784 void 1785 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1786 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what) 1787 { 1788 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1789 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1790 int nbanks = mca->gcpu_mca_nbanks; 1791 gcpu_bank_logout_t *gbl, *pgbl; 1792 gcpu_logout_t *gcl, *pgcl; 1793 int ismc = (rp != NULL); 1794 int ispoll = !ismc; 1795 int i, nerr = 0; 1796 cmi_errno_t err; 1797 uint64_t mcg_status; 1798 uint64_t disp; 1799 uint64_t cap; 1800 int first = -1; 1801 int last = -1; 1802 int willpanic = 0; 1803 1804 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1805 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1806 CMI_SUCCESS) { 1807 if (mcesp != NULL) 1808 mcesp->mce_nerr = mcesp->mce_disp = 0; 1809 return; 1810 } 1811 1812 if (ismc) { 1813 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1814 pgcl = NULL; 1815 } else { 1816 int pidx = mca->gcpu_mca_nextpoll_idx; 1817 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1818 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1819 1820 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1821 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1822 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1823 } 1824 1825 gcl->gcl_timestamp = gethrtime_waitfree(); 1826 gcl->gcl_mcg_status = mcg_status; 1827 gcl->gcl_ip = rp ? rp->r_pc : 0; 1828 1829 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0; 1830 if (cap & MCG_CAP_TES_P) 1831 gcl->gcl_flags |= GCPU_GCL_F_TES_P; 1832 1833 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1834 uint64_t status, status2, addr, misc; 1835 int retries = gcpu_mca_telemetry_retries; 1836 1837 gbl->gbl_status = 0; 1838 gbl->gbl_disp = 0; 1839 gbl->gbl_clrdefcnt = 0; 1840 1841 /* 1842 * Only logout from MCA banks we have initialized from at 1843 * least one core. If a core shares an MCA bank with another 1844 * but perhaps lost the race to initialize it, then it must 1845 * still be allowed to logout from the shared bank. 1846 */ 1847 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i)) 1848 continue; 1849 1850 /* 1851 * On a poll look only at the banks we've been asked to check. 1852 */ 1853 if (rp == NULL && !(bankmask & 1 << i)) 1854 continue; 1855 1856 1857 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) != 1858 CMI_SUCCESS) 1859 continue; 1860 1861 #ifndef __xpv 1862 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what); 1863 #endif 1864 1865 retry: 1866 if (!(status & MSR_MC_STATUS_VAL)) 1867 continue; 1868 1869 /* First and last bank that have valid status */ 1870 if (first < 0) 1871 first = i; 1872 last = i; 1873 1874 addr = -1; 1875 misc = 0; 1876 1877 if ((status & MSR_MC_STATUS_ADDRV) || 1878 gcpu_force_addr_in_payload) 1879 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr); 1880 1881 if (status & MSR_MC_STATUS_MISCV) 1882 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc); 1883 1884 #ifndef __xpv 1885 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what); 1886 #endif 1887 1888 /* 1889 * Allow the model-specific code to extract bank telemetry. 1890 */ 1891 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout); 1892 1893 /* 1894 * Not all cpu models assure us that the status/address/misc 1895 * data will not change during the above sequence of MSR reads, 1896 * or that it can only change by the addition of the OVerflow 1897 * bit to the status register. If the status has changed 1898 * other than in the overflow bit then we attempt to reread 1899 * for a consistent snapshot, but eventually give up and 1900 * go with what we've got. We only perform this check 1901 * for a poll - a further #MC during a #MC will reset, and 1902 * polled errors should not overwrite higher-priority 1903 * trapping errors (but could set the overflow bit). 1904 */ 1905 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1906 &status2)) == CMI_SUCCESS) { 1907 if (!STATUS_EQV(status, status2)) { 1908 if (retries-- > 0) { 1909 status = status2; 1910 goto retry; 1911 } else { 1912 gbl->gbl_disp |= 1913 CMI_ERRDISP_INCONSISTENT; 1914 } 1915 } 1916 } else if (ispoll && err != CMI_SUCCESS) { 1917 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT; 1918 } 1919 1920 nerr++; 1921 gbl->gbl_status = status; 1922 gbl->gbl_addr = addr; 1923 gbl->gbl_misc = misc; 1924 1925 /* 1926 * For polled observation, if the count of deferred status 1927 * clears updated in the clear_mc() is nonzero and the 1928 * MCi_STATUS has not changed, the last wakeup has produced 1929 * the ereport of the error. Therefore, clear the status in 1930 * this wakeup to avoid duplicate ereport. 1931 */ 1932 pgbl = &pgcl->gcl_data[i]; 1933 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) && 1934 pgbl->gbl_clrdefcnt != 0) { 1935 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) { 1936 gbl->gbl_status = 0; 1937 (void) cmi_hdl_wrmsr(hdl, 1938 IA32_MSR_MC(i, STATUS), 0ULL); 1939 } 1940 } 1941 } 1942 1943 if (gcpu_mca_stack_flag) 1944 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1945 else 1946 gcl->gcl_stackdepth = 0; 1947 1948 /* 1949 * Decide our disposition for this error or errors, and submit for 1950 * logging and subsequent diagnosis. 1951 */ 1952 if (nerr != 0) { 1953 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1954 1955 willpanic = (ismc && cmi_mce_response(rp, disp) == 0); 1956 1957 if (!willpanic) 1958 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl); 1959 } else { 1960 disp = 0; 1961 if (mcesp) { 1962 mcesp->mce_nerr = mcesp->mce_disp = 0; 1963 } 1964 } 1965 1966 /* 1967 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1968 * If a second #MC had occured before now the system would have 1969 * reset. We can only do thise once gcpu_mca_process has copied 1970 * the logout structure. 1971 */ 1972 if (ismc && mcg_status & MCG_STATUS_MCIP) 1973 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1974 1975 /* 1976 * At this point we have read and logged all telemetry that is visible 1977 * under the MCA. On architectures for which the NorthBridge is 1978 * on-chip this may include NB-observed errors, but where the NB 1979 * is off chip it may have been the source of the #MC request and 1980 * so we must call into the memory-controller driver to give it 1981 * a chance to log errors. 1982 */ 1983 if (ismc) { 1984 cmi_mc_logout(hdl, 1, willpanic); 1985 } 1986 } 1987 1988 #ifndef __xpv 1989 int gcpu_mca_trap_vomit_summary = 0; 1990 1991 /* 1992 * On a native machine check exception we come here from mcetrap via 1993 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others 1994 * cpus of the chip, so it is possible that another cpu on this chip could 1995 * initiate a poll while we're in the #mc handler; it is also possible that 1996 * this trap has occured during a poll on this cpu. So we must acquire 1997 * the chip-wide poll lock, but be careful to avoid deadlock. 1998 * 1999 * The 'data' pointer cannot be NULL due to init order. 2000 */ 2001 uint64_t 2002 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp) 2003 { 2004 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2005 kmutex_t *poll_lock = NULL; 2006 gcpu_mce_status_t mce; 2007 uint64_t mcg_status; 2008 int tooklock = 0; 2009 2010 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 2011 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP)) 2012 return (0); 2013 2014 /* 2015 * Synchronize with any poller from another core that may happen 2016 * to share access to one or more of the MCA banks. 2017 */ 2018 if (gcpu->gcpu_shared != NULL) 2019 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock; 2020 2021 if (poll_lock != NULL && !mutex_owned(poll_lock)) { 2022 /* 2023 * The lock is not owned by the thread we have 2024 * interrupted. Spin for this adaptive lock. 2025 */ 2026 while (!mutex_tryenter(poll_lock)) { 2027 while (mutex_owner(poll_lock) != NULL) 2028 ; 2029 } 2030 tooklock = 1; 2031 } 2032 2033 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR); 2034 2035 if (tooklock) 2036 mutex_exit(poll_lock); 2037 2038 /* 2039 * gcpu_mca_trap_vomit_summary may be set for debug assistance. 2040 */ 2041 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) { 2042 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, " 2043 "%u PCC (%u ok), " 2044 "%u UC (%d ok, %u poisoned), " 2045 "%u forcefatal, %u ignored", 2046 mce.mce_nerr, (u_longlong_t)mce.mce_disp, 2047 mce.mce_npcc, mce.mce_npcc_ok, 2048 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned, 2049 mce.mce_forcefatal, mce.mce_ignored); 2050 } 2051 2052 return (mce.mce_disp); 2053 } 2054 #endif 2055 2056 /*ARGSUSED*/ 2057 void 2058 gcpu_faulted_enter(cmi_hdl_t hdl) 2059 { 2060 /* Nothing to do here */ 2061 } 2062 2063 /*ARGSUSED*/ 2064 void 2065 gcpu_faulted_exit(cmi_hdl_t hdl) 2066 { 2067 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2068 2069 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING; 2070 } 2071 2072 /* 2073 * Write the requested values to the indicated MSRs. Having no knowledge 2074 * of the model-specific requirements for writing to these model-specific 2075 * registers, we will only blindly write to those MSRs if the 'force' 2076 * argument is nonzero. That option should only be used in prototyping 2077 * and debugging. 2078 */ 2079 /*ARGSUSED*/ 2080 cmi_errno_t 2081 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 2082 int force) 2083 { 2084 int i, errs = 0; 2085 2086 for (i = 0; i < nregs; i++) { 2087 uint_t msr = regs[i].cmr_msrnum; 2088 uint64_t val = regs[i].cmr_msrval; 2089 2090 if (cms_present(hdl)) { 2091 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 2092 errs++; 2093 } else if (force) { 2094 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 2095 } else { 2096 errs++; 2097 } 2098 } 2099 2100 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 2101 } 2102 2103 /* deconfigure gcpu_mca_init() */ 2104 void 2105 gcpu_mca_fini(cmi_hdl_t hdl) 2106 { 2107 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 2108 gcpu_mca_t *mca = &gcpu->gcpu_mca; 2109 int i; 2110 2111 /* 2112 * CPU startup code only calls cmi_mca_init if x86_featureset indicates 2113 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier 2114 * processors, which have their own more primitive way of doing 2115 * machine checks, will not have cmi_mca_init called since their 2116 * CPUID information will not indicate both MCA and MCE features. 2117 */ 2118 if (!is_x86_feature(x86_featureset, X86FSET_MCA)) 2119 return; 2120 #ifndef __xpv 2121 /* 2122 * disable machine check in CR4 2123 */ 2124 cmi_ntv_hwdisable_mce(hdl); 2125 #endif 2126 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); 2127 gcpu_mca_poll_fini(hdl); 2128 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 2129 2130 /* 2131 * free resources allocated during init 2132 */ 2133 if (mca->gcpu_bank_cmci != NULL) { 2134 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) * 2135 mca->gcpu_mca_nbanks); 2136 } 2137 2138 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { 2139 if (mca->gcpu_mca_logout[i] != NULL) { 2140 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz); 2141 } 2142 } 2143 2144 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) { 2145 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg, 2146 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks); 2147 } 2148 } 2149