1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2018, Joyent, Inc.
25 * Copyright 2022 Oxide Computer Co.
26 */
27 /*
28 * Copyright (c) 2010, Intel Corporation.
29 * All rights reserved.
30 */
31
32 #include <sys/mca_x86.h>
33 #include <sys/cpu_module_impl.h>
34 #include <sys/cpu_module_ms.h>
35 #include <sys/cmn_err.h>
36 #include <sys/cpuvar.h>
37 #include <sys/pghw.h>
38 #include <sys/x86_archext.h>
39 #include <sys/sysmacros.h>
40 #include <sys/regset.h>
41 #include <sys/privregs.h>
42 #include <sys/systm.h>
43 #include <sys/types.h>
44 #include <sys/log.h>
45 #include <sys/psw.h>
46 #include <sys/fm/protocol.h>
47 #include <sys/fm/util.h>
48 #include <sys/errorq.h>
49 #include <sys/mca_x86.h>
50 #include <sys/fm/cpu/GMCA.h>
51 #include <sys/fm/smb/fmsmb.h>
52 #include <sys/sysevent.h>
53 #include <sys/ontrap.h>
54 #include <sys/smp_impldefs.h>
55
56 #include "gcpu.h"
57
58 extern int x86gentopo_legacy; /* x86 generic topology support */
59
60 static uint_t gcpu_force_addr_in_payload = 0;
61
62 /*
63 * Clear to log telemetry found at initialization. While processor docs
64 * say you should process this telemetry on all but Intel family 0x6
65 * there are way too many exceptions and we want to avoid bogus
66 * diagnoses.
67 */
68 int gcpu_suppress_log_on_init = 1;
69
70 /*
71 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
72 * error logout time. The stack will be included in the ereport if the
73 * error type selects stack inclusion, or in all cases if
74 * gcpu_mca_stack_ereport_include is nonzero.
75 */
76 int gcpu_mca_stack_flag = 0;
77 int gcpu_mca_stack_ereport_include = 0;
78
79 /*
80 * The number of times to re-read MCA telemetry to try to obtain a
81 * consistent snapshot if we find it to be changing under our feet.
82 */
83 int gcpu_mca_telemetry_retries = 5;
84
85 #ifndef __xpv
86 int gcpu_mca_cmci_throttling_threshold = 10;
87 int gcpu_mca_cmci_reenable_threshold = 1000;
88
89 /*
90 * This is used to determine whether or not we have registered the CMCI CPU
91 * setup function. This is protected by cpu_lock.
92 */
93 static boolean_t gcpu_mca_cpu_registered = B_FALSE;
94 #endif
95
96 static gcpu_error_disp_t gcpu_errtypes[] = {
97
98 /*
99 * Unclassified
100 */
101 {
102 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
103 NULL,
104 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
105 MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
106 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
107 },
108
109 /*
110 * Microcode ROM Parity Error
111 */
112 {
113 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
114 NULL,
115 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
116 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
117 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
118 },
119
120 /*
121 * External - BINIT# from another processor during power-on config
122 */
123 {
124 FM_EREPORT_CPU_GENERIC_EXTERNAL,
125 NULL,
126 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
127 MCAX86_SIMPLE_EXTERNAL_MASKON,
128 MCAX86_SIMPLE_EXTERNAL_MASKOFF
129 },
130
131 /*
132 * Functional redundancy check master/slave error
133 */
134 {
135 FM_EREPORT_CPU_GENERIC_FRC,
136 NULL,
137 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
138 MCAX86_SIMPLE_FRC_MASKON,
139 MCAX86_SIMPLE_FRC_MASKOFF
140 },
141
142 /*
143 * Internal parity error
144 */
145 {
146 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
147 NULL,
148 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
149 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
150 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
151 },
152
153
154 /*
155 * Internal timer error
156 */
157 {
158 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
159 NULL,
160 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
161 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
162 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
163 },
164
165 /*
166 * Internal unclassified
167 */
168 {
169 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
170 NULL,
171 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
172 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
173 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
174 },
175
176 /*
177 * Compound error codes - generic memory hierarchy
178 */
179 {
180 FM_EREPORT_CPU_GENERIC_GENMEMHIER,
181 NULL,
182 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
183 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
184 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
185 },
186
187 /*
188 * Compound error codes - TLB errors
189 */
190 {
191 FM_EREPORT_CPU_GENERIC_TLB,
192 "%1$s" "TLB" "%2$s" "_ERR",
193 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
194 MCAX86_COMPOUND_TLB_MASKON,
195 MCAX86_COMPOUND_TLB_MASKOFF
196 },
197
198 /*
199 * Compound error codes - memory hierarchy
200 */
201 {
202 FM_EREPORT_CPU_GENERIC_MEMHIER,
203 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
204 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
205 MCAX86_COMPOUND_MEMHIER_MASKON,
206 MCAX86_COMPOUND_MEMHIER_MASKOFF
207 },
208
209 /*
210 * Compound error codes - bus and interconnect errors
211 */
212 {
213 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
214 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
215 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
216 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
217 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
218 },
219 /*
220 * Compound error codes - memory controller errors
221 */
222 {
223 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
224 "MC" "_" "%8$s" "_" "%9$s" "_ERR",
225 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
226 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
227 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
228 },
229 };
230
231 static gcpu_error_disp_t gcpu_unknown = {
232 FM_EREPORT_CPU_GENERIC_UNKNOWN,
233 "UNKNOWN",
234 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
235 0,
236 0
237 };
238
239 static errorq_t *gcpu_mca_queue;
240 static kmutex_t gcpu_mca_queue_lock;
241
242 #ifdef __xpv
243 static int isxpv = 1;
244 #else
245 static int isxpv = 0;
246 #endif
247
248 static const gcpu_error_disp_t *
gcpu_disp_match(uint16_t code)249 gcpu_disp_match(uint16_t code)
250 {
251 const gcpu_error_disp_t *ged = gcpu_errtypes;
252 int i;
253
254 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
255 i++, ged++) {
256 uint16_t on = ged->ged_errcode_mask_on;
257 uint16_t off = ged->ged_errcode_mask_off;
258
259 if ((code & on) == on && (code & off) == 0)
260 return (ged);
261 }
262
263 return (NULL);
264 }
265
/*
 * Isolate the bits of code selected by mask and move them down by shift
 * bit positions.
 */
static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	uint16_t field = code & mask;

	return (field >> shift);
}

/*
 * Extract the named error-code field using its MCAX86_ERRCODE_<name>_MASK
 * and MCAX86_ERRCODE_<name>_SHIFT definitions.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)
275
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"

/*
 * One row of a field-value-to-mnemonic table.  The tables that follow map
 * the TT, LL, RRRR, PP, II, T, CCCC and MMM error-code field values to
 * compound error name mnemonics and to ereport class name components.
 */
struct gcpu_mnexp {
	const char *mne_compound;	/* for expanding compound errname */
	const char *mne_ereport;	/* for expanding ereport class */
};
288
289 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
290 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */
291 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */
292 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */
293 { GCPU_MNEMONIC_UNDEF, "" }
294 };
295
296 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
297 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */
298 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */
299 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */
300 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */
301 };
302
303 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
304 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */
305 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */
306 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */
307 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */
308 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */
309 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */
310 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */
311 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */
312 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */
313 };
314
315 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
316 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */
317 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */
318 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */
319 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */
320 };
321
322 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
323 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */
324 { GCPU_MNEMONIC_RESVD, "" },
325 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */
326 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */
327 };
328
329 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */
330 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */
331 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */
332 };
333
334 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
335 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */
336 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */
337 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */
338 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */
339 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */
340 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */
341 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */
342 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */
343 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */
344 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */
345 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */
346 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */
347 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */
348 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */
349 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */
350 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */
351 };
352
353 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
354 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */
355 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */
356 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */
357 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */
358 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
359 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
360 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
361 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */
362 };
363
364 enum gcpu_mn_namespace {
365 GCPU_MN_NAMESPACE_COMPOUND,
366 GCPU_MN_NAMESPACE_EREPORT
367 };
368
369 static const char *
gcpu_mnemonic(const struct gcpu_mnexp * tbl,size_t tbl_sz,uint16_t val,enum gcpu_mn_namespace nspace)370 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
371 enum gcpu_mn_namespace nspace)
372 {
373 if (val >= tbl_sz || val > 0xff)
374 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */
375
376 switch (nspace) {
377 case GCPU_MN_NAMESPACE_COMPOUND:
378 return (tbl[val].mne_compound);
379 /*NOTREACHED*/
380
381 case GCPU_MN_NAMESPACE_EREPORT:
382 return (tbl[val].mne_ereport);
383 /*NOTREACHED*/
384
385 default:
386 return (GCPU_MNEMONIC_UNDEF);
387 /*NOTREACHED*/
388 }
389 }
390
391 /*
392 * The ereport class leaf component is either a simple string with no
393 * format specifiers, or a string with one or more embedded %n$s specifiers -
394 * positional selection for string arguments. The kernel snprintf does
395 * not support %n$ (and teaching it to do so is too big a headache) so
396 * we will expand this restricted format string ourselves.
397 */
398
399 #define GCPU_CLASS_VARCOMPS 9
400
401 #define GCPU_MNEMONIC(code, name, nspace) \
402 gcpu_mnemonic(gcpu_##name##_mnemonics, \
403 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
404 BIT_STRIP(code, name), nspace)
405
406 static void
gcpu_mn_fmt(const char * fmt,char * buf,size_t buflen,uint64_t status,enum gcpu_mn_namespace nspace)407 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
408 enum gcpu_mn_namespace nspace)
409 {
410 uint16_t code = MCAX86_ERRCODE(status);
411 const char *mn[GCPU_CLASS_VARCOMPS];
412 char *p = buf; /* current position in buf */
413 char *q = buf + buflen; /* pointer past last char in buf */
414 int which, expfmtchar, error;
415 char c;
416
417 mn[0] = GCPU_MNEMONIC(code, TT, nspace);
418 mn[1] = GCPU_MNEMONIC(code, LL, nspace);
419 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
420 mn[3] = GCPU_MNEMONIC(code, PP, nspace);
421 mn[4] = GCPU_MNEMONIC(code, II, nspace);
422 mn[5] = GCPU_MNEMONIC(code, T, nspace);
423 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
424 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
425 mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
426
427 while (p < q - 1 && (c = *fmt++) != '\0') {
428 if (c != '%') {
429 /* not the beginning of a format specifier - copy */
430 *p++ = c;
431 continue;
432 }
433
434 error = 0;
435 which = -1;
436 expfmtchar = -1;
437
438 nextfmt:
439 if ((c = *fmt++) == '\0')
440 break; /* early termination of fmt specifier */
441
442 switch (c) {
443 case '1':
444 case '2':
445 case '3':
446 case '4':
447 case '5':
448 case '6':
449 case '7':
450 case '8':
451 case '9':
452 if (which != -1) { /* allow only one positional digit */
453 error++;
454 break;
455 }
456 which = c - '1';
457 goto nextfmt;
458 /*NOTREACHED*/
459
460 case '$':
461 if (which == -1) { /* no position specified */
462 error++;
463 break;
464 }
465 expfmtchar = 's';
466 goto nextfmt;
467 /*NOTREACHED*/
468
469 case 's':
470 if (expfmtchar != 's') {
471 error++;
472 break;
473 }
474 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
475 mn[which]);
476 p += strlen(p);
477 break;
478
479 default:
480 error++;
481 break;
482 }
483
484 if (error)
485 break;
486 }
487
488 *p = '\0'; /* NUL termination */
489 }
490
491 static void
gcpu_erpt_clsfmt(const char * fmt,char * buf,size_t buflen,uint64_t status,const char * cpuclass,const char * leafclass)492 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
493 const char *cpuclass, const char *leafclass)
494 {
495 char *p = buf; /* current position in buf */
496 char *q = buf + buflen; /* pointer past last char in buf */
497
498 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
499 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
500
501 p += strlen(p);
502 if (p >= q)
503 return;
504
505 if (leafclass == NULL) {
506 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
507 GCPU_MN_NAMESPACE_EREPORT);
508 } else {
509 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
510 leafclass);
511 }
512 }
513
514 /*
515 * Create an "hc" scheme FMRI identifying the given cpu with
516 * motherboard/chip/core/strand instance numbers.
517 */
518 static nvlist_t *
gcpu_fmri_create(cmi_hdl_t hdl,nv_alloc_t * nva)519 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
520 {
521 nvlist_t *nvl, *fmri;
522
523 if ((nvl = fm_nvlist_create(nva)) == NULL)
524 return (NULL);
525
526 if (!x86gentopo_legacy) {
527 fmri = cmi_hdl_smb_bboard(hdl);
528 if (fmri == NULL)
529 return (NULL);
530
531 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
532 NULL, NULL, fmri, 3,
533 "chip", cmi_hdl_smb_chipid(hdl),
534 "core", cmi_hdl_coreid(hdl),
535 "strand", cmi_hdl_strandid(hdl));
536 } else {
537 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
538 "motherboard", 0,
539 "chip", cmi_hdl_chipid(hdl),
540 "core", cmi_hdl_coreid(hdl),
541 "strand", cmi_hdl_strandid(hdl));
542 }
543
544 return (nvl);
545 }
546
547 int gcpu_bleat_count_thresh = 5;
548 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
549
550 /*
551 * Called when we are unable to propogate a logout structure onto an
552 * errorq for subsequent ereport preparation and logging etc. The caller
553 * should usually only decide to call this for severe errors - those we
554 * suspect we may need to panic for.
555 */
556 static void
gcpu_bleat(cmi_hdl_t hdl,gcpu_logout_t * gcl)557 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
558 {
559 hrtime_t now = gethrtime_waitfree();
560 static hrtime_t gcpu_last_bleat;
561 gcpu_bank_logout_t *gbl;
562 static int bleatcount;
563 int i;
564
565 /*
566 * Throttle spamming of the console. The first gcpu_bleat_count_thresh
567 * can come as fast as we like, but once we've spammed that many
568 * to the console we require a minimum interval to pass before
569 * any more complaints.
570 */
571 if (++bleatcount > gcpu_bleat_count_thresh) {
572 if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
573 return;
574 else
575 bleatcount = 0;
576 }
577 gcpu_last_bleat = now;
578
579 cmn_err(CE_WARN,
580 "Machine-Check Errors unlogged on chip %d core %d strand %d, "
581 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
582 cmi_hdl_strandid(hdl));
583 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
584 (u_longlong_t)gcl->gcl_mcg_status);
585 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
586 uint64_t status = gbl->gbl_status;
587
588 if (!(status & MSR_MC_STATUS_VAL))
589 continue;
590
591 /* Force ADDRV for AMD Family 0xf and above */
592 if (gcpu_force_addr_in_payload)
593 status = status | MSR_MC_STATUS_ADDRV;
594
595 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
596 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
597 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
598 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
599 i, IA32_MSR_MC(i, STATUS),
600 (u_longlong_t)gbl->gbl_status,
601 (u_longlong_t)gbl->gbl_addr,
602 (u_longlong_t)gbl->gbl_misc);
603 break;
604
605 case MSR_MC_STATUS_ADDRV:
606 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
607 "STAT 0x%016llx ADDR 0x%016llx",
608 i, IA32_MSR_MC(i, STATUS),
609 (u_longlong_t)gbl->gbl_status,
610 (u_longlong_t)gbl->gbl_addr);
611 break;
612
613 case MSR_MC_STATUS_MISCV:
614 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
615 "STAT 0x%016llx MISC 0x%016llx",
616 i, IA32_MSR_MC(i, STATUS),
617 (u_longlong_t)gbl->gbl_status,
618 (u_longlong_t)gbl->gbl_misc);
619 break;
620
621 default:
622 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
623 "STAT 0x%016llx",
624 i, IA32_MSR_MC(i, STATUS),
625 (u_longlong_t)gbl->gbl_status);
626 break;
627
628 }
629 }
630 }
631
632 #define _GCPU_BSTATUS(status, what) \
633 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
634 (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
635
636 static void
gcpu_ereport_add_logout(nvlist_t * ereport,const gcpu_logout_t * gcl,uint_t bankno,const gcpu_error_disp_t * ged,uint16_t code)637 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
638 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
639 {
640 uint64_t members = ged ? ged->ged_ereport_members :
641 FM_EREPORT_PAYLOAD_FLAGS_COMMON;
642 uint64_t mcg = gcl->gcl_mcg_status;
643 int mcip = mcg & MCG_STATUS_MCIP;
644 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
645 uint64_t bstat = gbl->gbl_status;
646
647 /*
648 * Include the compound error name if requested and if this
649 * is a compound error type.
650 */
651 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
652 ged->ged_compound_fmt != NULL) {
653 char buf[FM_MAX_CLASS];
654
655 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
656 GCPU_MN_NAMESPACE_COMPOUND);
657 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
658 DATA_TYPE_STRING, buf, NULL);
659 }
660
661 /*
662 * Include disposition information for this error
663 */
664 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
665 gbl->gbl_disp != 0) {
666 int i, empty = 1;
667 char buf[128];
668 char *p = buf, *q = buf + 128;
669 static struct _gcpu_disp_name {
670 uint64_t dv;
671 const char *dn;
672 } disp_names[] = {
673 { CMI_ERRDISP_CURCTXBAD,
674 "processor_context_corrupt" },
675 { CMI_ERRDISP_RIPV_INVALID,
676 "return_ip_invalid" },
677 { CMI_ERRDISP_UC_UNCONSTRAINED,
678 "unconstrained" },
679 { CMI_ERRDISP_FORCEFATAL,
680 "forcefatal" },
681 { CMI_ERRDISP_IGNORED,
682 "ignored" },
683 { CMI_ERRDISP_PCC_CLEARED,
684 "corrupt_context_cleared" },
685 { CMI_ERRDISP_UC_CLEARED,
686 "uncorrected_data_cleared" },
687 { CMI_ERRDISP_POISONED,
688 "poisoned" },
689 { CMI_ERRDISP_INCONSISTENT,
690 "telemetry_unstable" },
691 };
692
693 for (i = 0; i < sizeof (disp_names) /
694 sizeof (struct _gcpu_disp_name); i++) {
695 if ((gbl->gbl_disp & disp_names[i].dv) == 0)
696 continue;
697
698 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
699 "%s%s", empty ? "" : ",", disp_names[i].dn);
700 p += strlen(p);
701 empty = 0;
702 }
703
704 if (p != buf)
705 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
706 DATA_TYPE_STRING, buf, NULL);
707 }
708
709 /*
710 * If MCG_STATUS is included add that and an indication of whether
711 * this ereport was the result of a machine check or poll.
712 */
713 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
714 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
715 DATA_TYPE_UINT64, mcg, NULL);
716
717 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
718 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
719 }
720
721 /*
722 * If an instruction pointer is to be included add one provided
723 * MCG_STATUS indicated it is valid; meaningless for polled events.
724 */
725 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
726 mcg & MCG_STATUS_EIPV) {
727 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
728 DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
729 }
730
731 /*
732 * Add an indication of whether the trap occured during privileged code.
733 */
734 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
735 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
736 DATA_TYPE_BOOLEAN_VALUE,
737 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
738 }
739
740 /*
741 * If requested, add the index of the MCA bank. This indicates the
742 * n'th bank of 4 MCA registers, and does not necessarily correspond
743 * to MCi_* - use the bank offset to correlate
744 */
745 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
746 fm_payload_set(ereport,
747 /* Bank number */
748 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
749 /* Offset of MCi_CTL */
750 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
751 IA32_MSR_MC(bankno, CTL),
752 NULL);
753 }
754
755 /*
756 * Add MCi_STATUS if requested, and decode it.
757 */
758 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
759 const char *tbes[] = {
760 "No tracking", /* 00 */
761 "Green - below threshold", /* 01 */
762 "Yellow - above threshold", /* 10 */
763 "Reserved" /* 11 */
764 };
765
766 fm_payload_set(ereport,
767 /* Bank MCi_STATUS */
768 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
769 /* Overflow? */
770 _GCPU_BSTATUS(bstat, OVER),
771 /* Uncorrected? */
772 _GCPU_BSTATUS(bstat, UC),
773 /* Enabled? */
774 _GCPU_BSTATUS(bstat, EN),
775 /* Processor context corrupt? */
776 _GCPU_BSTATUS(bstat, PCC),
777 /* Error code */
778 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
779 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
780 /* Model-specific error code */
781 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
782 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
783 NULL);
784
785 /*
786 * If MCG_CAP.TES_P indicates that that thresholding info
787 * is present in the architural component of the bank status
788 * then include threshold information for this bank.
789 */
790 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
791 fm_payload_set(ereport,
792 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
793 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
794 NULL);
795 }
796 }
797
798 /*
799 * Add MCi_ADDR info if requested and valid. We force addition of
800 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
801 * to aid in analysis of ereports, for WatchDog errors.
802 */
803 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
804 ((bstat & MSR_MC_STATUS_ADDRV) ||
805 gcpu_force_addr_in_payload)) {
806 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
807 DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
808 }
809
810 /*
811 * MCi_MISC if requested and MCi_STATUS.MISCV).
812 */
813 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
814 bstat & MSR_MC_STATUS_MISCV) {
815 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
816 DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
817 }
818
819 }
820
821 /*
822 * Construct and post an ereport based on the logout information from a
823 * single MCA bank. We are not necessarily running on the cpu that
824 * detected the error.
825 */
826 static void
gcpu_ereport_post(const gcpu_logout_t * gcl,int bankidx,const gcpu_error_disp_t * ged,cms_cookie_t mscookie,uint64_t status)827 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
828 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
829 {
830 gcpu_data_t *gcpu = gcl->gcl_gcpu;
831 cmi_hdl_t hdl = gcpu->gcpu_hdl;
832 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
833 const char *cpuclass = NULL, *leafclass = NULL;
834 uint16_t code = MCAX86_ERRCODE(status);
835 errorq_elem_t *eqep, *scr_eqep;
836 nvlist_t *ereport, *detector;
837 char buf[FM_MAX_CLASS];
838 const char *classfmt;
839 nv_alloc_t *nva;
840
841 if (panicstr) {
842 if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
843 return;
844 ereport = errorq_elem_nvl(ereport_errorq, eqep);
845
846 /*
847 * Allocate another element for scratch space, but fallback
848 * to the one we have if that fails. We'd like to use the
849 * additional scratch space for nvlist construction.
850 */
851 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
852 nva = errorq_elem_nva(ereport_errorq, scr_eqep);
853 else
854 nva = errorq_elem_nva(ereport_errorq, eqep);
855 } else {
856 ereport = fm_nvlist_create(NULL);
857 nva = NULL;
858 eqep = NULL;
859 scr_eqep = NULL;
860 }
861
862 if (ereport == NULL)
863 return;
864
865 /*
866 * Common payload data required by the protocol:
867 * - ereport class
868 * - detector
869 * - ENA
870 */
871
872 /*
873 * Ereport class - call into model-specific support to allow it to
874 * provide a cpu class or leaf class, otherwise calculate our own.
875 */
876 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
877 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
878 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
879 leafclass);
880
881 /*
882 * The detector FMRI.
883 */
884 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
885 nva)) == NULL)
886 detector = gcpu_fmri_create(hdl, nva);
887
888 /*
889 * Should we define a new ENA format 3?? for chip/core/strand?
890 * It will be better when virtualized.
891 */
892 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
893 fm_ena_generate_cpu(gcl->gcl_timestamp,
894 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
895 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
896
897 if (panicstr) {
898 fm_nvlist_destroy(detector, FM_NVA_RETAIN);
899 nv_alloc_reset(nva);
900 } else {
901 fm_nvlist_destroy(detector, FM_NVA_FREE);
902 }
903
904 /*
905 * Add the architectural ereport class-specific payload data.
906 */
907 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
908
909 /*
910 * Allow model-specific code to add ereport members.
911 */
912 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
913 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
914
915 /*
916 * Include stack if options is turned on and either selected in
917 * the payload member bitmask or inclusion is forced.
918 */
919 if (gcpu_mca_stack_flag &&
920 (cms_ereport_includestack(hdl, mscookie) ==
921 B_TRUE || gcpu_mca_stack_ereport_include)) {
922 fm_payload_stack_add(ereport, gcl->gcl_stack,
923 gcl->gcl_stackdepth);
924 }
925
926 /*
927 * If injection has taken place anytime in the past then note this
928 * on the ereport.
929 */
930 if (cmi_inj_tainted() == B_TRUE) {
931 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
932 B_TRUE, NULL);
933 }
934
935 /*
936 * Post ereport.
937 */
938 if (panicstr) {
939 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
940 if (scr_eqep)
941 errorq_cancel(ereport_errorq, scr_eqep);
942 } else {
943 (void) fm_ereport_post(ereport, EVCH_TRYHARD);
944 fm_nvlist_destroy(ereport, FM_NVA_FREE);
945 }
946
947 }
948
949 /*ARGSUSED*/
950 void
gcpu_mca_drain(void * ignored,const void * data,const errorq_elem_t * eqe)951 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
952 {
953 const gcpu_logout_t *gcl = data;
954 const gcpu_bank_logout_t *gbl;
955 int ismc;
956 int i;
957
958 ismc = gcl->ismc;
959 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
960 const gcpu_error_disp_t *gened;
961 cms_cookie_t mscookie;
962
963 if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
964 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
965 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
966
967 /*
968 * Perform a match based on IA32 MCA architectural
969 * components alone.
970 */
971 gened = gcpu_disp_match(code); /* may be NULL */
972
973 /*
974 * Now see if an model-specific match can be made.
975 */
976 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
977 i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
978 gcl->gcl_ms_logout);
979
980 /*
981 * Prepare and dispatch an ereport for logging and
982 * diagnosis.
983 */
984 gcpu_ereport_post(gcl, i, gened, mscookie,
985 gbl->gbl_status);
986 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
987 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
988 /*
989 * Telemetry kept changing as we tried to read
990 * it. Force an unknown ereport leafclass but
991 * keep the telemetry unchanged for logging.
992 */
993 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
994 gbl->gbl_status);
995 }
996 }
997 }
998
999 static size_t gcpu_mca_queue_datasz = 0;
1000
1001 /*
1002 * The following code is ready to make a weak attempt at growing the
1003 * errorq structure size. Since it is not foolproof (we don't know
1004 * who may already be producing to the outgoing errorq) our caller
1005 * instead assures that we'll always be called with no greater data
1006 * size than on our first call.
1007 */
1008 static void
gcpu_errorq_init(size_t datasz)1009 gcpu_errorq_init(size_t datasz)
1010 {
1011 int slots;
1012
1013 mutex_enter(&gcpu_mca_queue_lock);
1014
1015 if (gcpu_mca_queue_datasz >= datasz) {
1016 mutex_exit(&gcpu_mca_queue_lock);
1017 return;
1018 }
1019
1020 membar_producer();
1021 if (gcpu_mca_queue) {
1022 gcpu_mca_queue_datasz = 0;
1023 errorq_destroy(gcpu_mca_queue);
1024 }
1025
1026 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1027 slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1028
1029 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1030 NULL, slots, datasz, 1, ERRORQ_VITAL);
1031
1032 if (gcpu_mca_queue != NULL)
1033 gcpu_mca_queue_datasz = datasz;
1034
1035 mutex_exit(&gcpu_mca_queue_lock);
1036 }
1037
1038 /*
1039 * Perform MCA initialization as described in section 14.6 of Intel 64
1040 * and IA-32 Architectures Software Developer's Manual Volume 3A.
1041 */
1042
1043 static uint_t global_nbanks;
1044
1045 #ifndef __xpv
1046 /*ARGSUSED*/
1047 int
gcpu_cmci_cpu_setup(cpu_setup_t what,int cpuid,void * arg)1048 gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
1049 {
1050 /*
1051 * In general, we'd expect that in a multi-socket configuration, either
1052 * all CPUs would support CMCI or none of them would. Unfortunately,
1053 * that may not be the case in the wild. While we'd rather check the
1054 * handle's enablement state here, that itself is a bit complicated. We
1055 * don't have a guarantee in a heterogenous situation that the CPU in
1056 * question is using the generic CPU module or not, even though we've
1057 * been registered. As such, we allow the interrupt to be registered and
1058 * written to the local apic anyways. We won't have a CMCI interrupt
1059 * generated anyways because the MCA banks will not be programmed as
1060 * such for that CPU by the polling thread.
1061 */
1062 switch (what) {
1063 case CPU_ON:
1064 psm_cmci_setup(cpuid, B_TRUE);
1065 break;
1066 case CPU_OFF:
1067 psm_cmci_setup(cpuid, B_FALSE);
1068 break;
1069 default:
1070 break;
1071 }
1072
1073 return (0);
1074 }
1075
1076 void
gcpu_mca_cmci_enable(cmi_hdl_t hdl)1077 gcpu_mca_cmci_enable(cmi_hdl_t hdl)
1078 {
1079 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1080 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1081
1082 /*
1083 * If this CPU doesn't support CMCI, don't do anything.
1084 */
1085 if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0)
1086 return;
1087
1088 /*
1089 * If we don't have support from the PSM module, then there's nothing we
1090 * can do. Note that this changes as we start up the system. The only
1091 * case where it may be mistakenly NULL is for the boot CPU. The boot
1092 * CPU will have this taken care of for it in gcpu_post_startup(), once
1093 * we know for certain whether or not the PSM module supports CMCI.
1094 */
1095 if (psm_cmci_setup == NULL) {
1096 return;
1097 }
1098
1099 mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE;
1100 if (MUTEX_HELD(&cpu_lock)) {
1101 if (!gcpu_mca_cpu_registered) {
1102 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1103 gcpu_mca_cpu_registered = B_TRUE;
1104 }
1105 } else {
1106 mutex_enter(&cpu_lock);
1107 if (!gcpu_mca_cpu_registered) {
1108 register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1109 gcpu_mca_cpu_registered = B_TRUE;
1110 }
1111 mutex_exit(&cpu_lock);
1112 }
1113
1114 /*
1115 * Call the PSM op to make sure that we initialize things on
1116 * this CPU.
1117 */
1118 psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE);
1119 }
1120 #endif /* !__xpv */
1121
/*
 * Initialize machine-check architecture support for the CPU behind 'hdl'.
 *
 * In order, this routine: reads MCG_CAP and stashes the inherited (BIOS)
 * global configuration; allocates the per-CPU logout areas and the errorq;
 * then, under the chip-shared gcpus_cfglock, decides which banks to
 * initialize, probes each bank for CMCI support, logs any telemetry already
 * present, programs MCi_CTL/MCi_STATUS, lets the model-specific module do
 * its own initialization, and clears MCG_STATUS / programs MCG_CTL.
 * Finally, outside the lock, it enables the machine-check exception.
 *
 * Returns silently (leaving MCA uninitialized for this CPU) if there is no
 * gcpu data, MCG_CAP cannot be read, the bank count is zero, or the bank
 * count exceeds global_nbanks.
 */
void
gcpu_mca_init(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	uint64_t cap;
	uint_t vendor = cmi_hdl_vendor(hdl);
	uint_t family = cmi_hdl_family(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int mcg_ctl_present;
	uint_t nbanks;
	uint32_t ctl_skip_mask = 0;
	uint32_t status_skip_mask = 0;
	size_t mslsz;
	int i;
#ifndef __xpv
	int mcg_ctl2_present;
	uint32_t cmci_capable = 0;
#endif
	if (gcpu == NULL)
		return;

	/*
	 * We add MCi_ADDR always for AMD Family 0xf and above.
	 *
	 * NOTE(review): the test below looks only at the family, not the
	 * vendor, so it also fires for non-AMD parts with family >= 0xf.
	 * Confirm whether a vendor check is intended.
	 */
	if (family >= 0xf)
		gcpu_force_addr_in_payload = 1;

	/*
	 * Protect from some silly /etc/system settings.
	 */
	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
		gcpu_mca_telemetry_retries = 5;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
		return;

	/*
	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
	 * processors, which have their own more primitive way of doing
	 * machine checks, will not have cmi_mca_init called since their
	 * CPUID information will not indicate both MCA and MCE features.
	 */
	ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));

	/*
	 * Determine whether the IA32_MCG_CTL register is present.  If it
	 * is we will enable all features by writing -1 to it towards
	 * the end of this initialization;  if it is absent then volume 3A
	 * says we must nonetheless continue to initialize the individual
	 * banks.
	 */
	mcg_ctl_present = cap & MCG_CAP_CTL_P;
#ifndef __xpv
	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
#endif

	/*
	 * We squirrel values away for inspection/debugging.
	 */
	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
	if (mcg_ctl_present)
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);

	/*
	 * Determine the number of error-reporting banks implemented.
	 */
	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;

	if (nbanks != 0 && global_nbanks == 0)
		global_nbanks = nbanks;	/* no race - BSP will get here first */

	/*
	 * If someone is hiding the number of banks (perhaps we are fully
	 * virtualized?) or if this processor has more banks than the
	 * first to set global_nbanks then bail.  The latter requirement
	 * is because we need to size our errorq data structure and we
	 * don't want to have to grow the errorq (destroy and recreate)
	 * which may just lose some telemetry.
	 */
	if (nbanks == 0 || nbanks > global_nbanks)
		return;

	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);

	/*
	 * Calculate the size we need to allocate for a gcpu_logout_t
	 * with a gcl_data array big enough for all banks of this cpu.
	 * Add any space requested by the model-specific logout support.
	 */
	mslsz = cms_logout_size(hdl);
	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		gcpu_logout_t *gcl;

		mca->gcpu_mca_logout[i] = gcl =
		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
		gcl->gcl_gcpu = gcpu;
		gcl->gcl_nbanks = nbanks;
		/*
		 * The model-specific logout area, if any, lives directly
		 * after the last per-bank logout entry.
		 */
		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
		    (char *)(&gcl->gcl_data[0]) + nbanks *
		    sizeof (gcpu_bank_logout_t);

	}

#ifdef __xpv
	gcpu_xpv_mca_init(nbanks);
#endif

	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;

#ifndef __xpv
	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
	    KM_SLEEP);
#endif

	/*
	 * Create our errorq to transport the logout structures.  This
	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
	 */
	gcpu_errorq_init(mca->gcpu_mca_lgsz);

	/*
	 * Not knowing which, if any, banks are shared between cores we
	 * assure serialization of MCA bank initialization by each cpu
	 * on the chip.  On chip architectures in which some banks are
	 * shared this will mean the shared resource is initialized more
	 * than once - we're simply aiming to avoid simultaneous MSR writes
	 * to the shared resource.
	 *
	 * Even with these precautions, some platforms may yield a GP fault
	 * if a core other than a designated master tries to write anything
	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
	 * those writes under on_trap protection.
	 */
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Initialize poller data, but don't start polling yet.
	 */
	gcpu_mca_poll_init(hdl);

	/*
	 * Work out which MCA banks we will initialize.  In MCA logout
	 * code we will only read those banks which we initialize here.
	 */
	for (i = 0; i < nbanks; i++) {
		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);

		if (!cms_present(hdl)) {
			/*
			 * Model-specific support is not present, try to use
			 * sane defaults.
			 *
			 * On AMD family 6 processors, reports about spurious
			 * machine checks indicate that bank 0 should be
			 * skipped.
			 *
			 * On Intel family 6 processors, the documentation tells
			 * us not to write to MC0_CTL.
			 *
			 */
			if (i == 0 && family == 6) {
				switch (vendor) {
				case X86_VENDOR_AMD:
					skipstatus = B_TRUE;
					/*FALLTHRU*/
				case X86_VENDOR_Intel:
					skipctl = B_TRUE;
					break;
				}
			}
		}

		/* Remember the per-bank decisions for the write loop below. */
		ctl_skip_mask |= skipctl << i;
		status_skip_mask |= skipstatus << i;

		if (skipctl && skipstatus)
			continue;

		/*
		 * Record which MCA banks were enabled, from the point of view
		 * of the whole chip (if some cores share a bank we must be
		 * sure either can logout from it).
		 */
		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);

#ifndef __xpv
		/*
		 * Check CMCI capability: a bank is considered capable if
		 * the EN bit of MCi_CTL2 reads back as set after we set it.
		 * A bank whose EN bit is already set is left untouched.
		 */
		if (mcg_ctl2_present) {
			uint64_t ctl2;
			uint32_t cap = 0;
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			if (ctl2 & MSR_MC_CTL2_EN)
				continue;
			ctl2 |= MSR_MC_CTL2_EN;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			mca->gcpu_bank_cmci[i].cmci_cap = cap =
			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
			if (cap)
				cmci_capable ++;
			/*
			 * Set threshold to 1 while clearing the EN field, to
			 * avoid a CMCI being triggered before the APIC LVT
			 * entry is initialized.
			 */
			ctl2 = (ctl2 & (~MSR_MC_CTL2_EN)) | 1;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);

			/*
			 * Initialize the CMCI-related counters.
			 */
			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
			mca->gcpu_bank_cmci[i].drtcmci = 0;
			mca->gcpu_bank_cmci[i].ncmci = 0;
		}
#endif
	}

#ifndef __xpv
	/* At least one bank supports CMCI: flag the CPU and try to enable. */
	if (cmci_capable) {
		mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE;
		gcpu_mca_cmci_enable(hdl);
	}
#endif

#ifndef __xpv
	/*
	 * Log any valid telemetry lurking in the MCA banks, but do not
	 * clear the status registers.  Ignore the disposition returned -
	 * we have already panicked or reset for any nasty errors found here.
	 *
	 * Intel vol 3A says that we should not do this on family 0x6,
	 * and that for any extended family the BIOS clears things
	 * on power-on reset so you'll only potentially find valid telemetry
	 * on warm reset (we do it for both - on power-on reset we should
	 * just see zeroes).
	 *
	 * AMD docs since K7 say we should process anything we find here.
	 */
	if (!gcpu_suppress_log_on_init &&
	    ((vendor == X86_VENDOR_Intel && family >= 0xf) ||
	    vendor == X86_VENDOR_AMD ||
	    vendor == X86_VENDOR_HYGON))
		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
		    GCPU_MPT_WHAT_POKE_ERR);

	/*
	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
	 * model-specific module the power of veto.
	 */
	for (i = 0; i < nbanks; i++) {
		struct gcpu_bios_bankcfg *bcfgp =
		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;

		/*
		 * Stash inherited bank MCA state, even for banks we will
		 * not initialize ourselves.  Do not read the MISC register
		 * unconditionally - on some processors that will #GP on
		 * banks that do not implement the MISC register (would be
		 * caught by on_trap, anyway).
		 */
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
		    &bcfgp->bios_bank_ctl);

		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &bcfgp->bios_bank_status);

		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
		    gcpu_force_addr_in_payload) {
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
			    &bcfgp->bios_bank_addr);
		}

		/*
		 * In some old BIOS the status value after boot can indicate
		 * MISCV when there is actually no MISC register for
		 * that bank.  The following read could therefore
		 * aggravate a general protection fault.  This should be
		 * caught by on_trap, but the #GP fault handler is busted
		 * and can suffer a double fault even before we get to
		 * trap() to check for on_trap protection.  Until that
		 * issue is fixed we remove the one access that we know
		 * can cause a #GP.
		 *
		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
		 *	    &bcfgp->bios_bank_misc);
		 */
		bcfgp->bios_bank_misc = 0;

		if (!(ctl_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
			    cms_bankctl_val(hdl, i, -1ULL));
		}

		if (!(status_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
			    cms_bankstatus_val(hdl, i, 0ULL));
		}
	}
#endif
	/*
	 * Now let the model-specific support perform further initialization
	 * of non-architectural features.
	 */
	cms_mca_init(hdl, nbanks);

#ifndef __xpv
	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
	membar_producer();

	/* enable all machine-check features */
	if (mcg_ctl_present)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
		    cms_mcgctl_val(hdl, nbanks, -1ULL));
#endif

	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);

#ifndef __xpv
	/* enable machine-check exception in CR4 */
	cmi_hdl_enable_mce(hdl);
#endif
}
1452
1453 static uint64_t
gcpu_mca_process(cmi_hdl_t hdl,struct regs * rp,int nerr,gcpu_data_t * gcpu,gcpu_logout_t * gcl,int ismc,gcpu_mce_status_t * mcesp)1454 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1455 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1456 {
1457 int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1458 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1459 int nbanks = mca->gcpu_mca_nbanks;
1460 gcpu_mce_status_t mce;
1461 gcpu_bank_logout_t *gbl;
1462 uint64_t disp = 0;
1463 int i;
1464
1465 if (mcesp == NULL)
1466 mcesp = &mce;
1467
1468 mcesp->mce_nerr = nerr;
1469
1470 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1471 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1472 mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1473
1474 /*
1475 * If this a machine check then if the return instruction pointer
1476 * is not valid the current context is lost.
1477 */
1478 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1479 disp |= CMI_ERRDISP_RIPV_INVALID;
1480 gcl->ismc = ismc;
1481
1482 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1483 uint64_t mcistatus = gbl->gbl_status;
1484 uint32_t ms_scope;
1485 int pcc, uc;
1486 int poisoned;
1487
1488 if (!(mcistatus & MSR_MC_STATUS_VAL))
1489 continue;
1490
1491 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1492 continue;
1493
1494 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1495 uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1496 mcesp->mce_npcc += pcc;
1497 mcesp->mce_nuc += uc;
1498
1499 ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1500 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1501
1502 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1503 pcc = 0;
1504 mcesp->mce_npcc_ok++;
1505 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1506 }
1507
1508 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1509 uc = 0;
1510 mcesp->mce_nuc_ok++;
1511 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1512 }
1513
1514 if (uc) {
1515 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1516 if (poisoned) {
1517 mcesp->mce_nuc_poisoned++;
1518 gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1519 }
1520 }
1521
1522 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1523 /*
1524 * We're not being instructed to ignore the error,
1525 * so apply our standard disposition logic to it.
1526 */
1527 if (uc && !poisoned) {
1528 unconstrained++;
1529 gbl->gbl_disp |= disp |
1530 CMI_ERRDISP_UC_UNCONSTRAINED;
1531 }
1532
1533 if (pcc && ismc) {
1534 curctxbad++;
1535 gbl->gbl_disp |= disp |
1536 CMI_ERRDISP_CURCTXBAD;
1537 }
1538
1539 /*
1540 * Even if the above may not indicate that the error
1541 * is terminal, model-specific support may insist
1542 * that we treat it as such. Such errors wil be
1543 * fatal even if discovered via poll.
1544 */
1545 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1546 forcefatal++;
1547 mcesp->mce_forcefatal++;
1548 gbl->gbl_disp |= disp |
1549 CMI_ERRDISP_FORCEFATAL;
1550 }
1551 } else {
1552 mcesp->mce_ignored++;
1553 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1554 }
1555 }
1556
1557 if (unconstrained > 0)
1558 disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1559
1560 if (curctxbad > 0)
1561 disp |= CMI_ERRDISP_CURCTXBAD;
1562
1563 if (forcefatal > 0)
1564 disp |= CMI_ERRDISP_FORCEFATAL;
1565
1566 if (gcpu_mca_queue != NULL) {
1567 int how;
1568
1569 if (ismc) {
1570 how = cmi_mce_response(rp, disp) ?
1571 ERRORQ_ASYNC : /* no panic, so arrange drain */
1572 ERRORQ_SYNC; /* panic flow will drain */
1573 } else {
1574 how = (disp & CMI_ERRDISP_FORCEFATAL &&
1575 cmi_panic_on_ue()) ?
1576 ERRORQ_SYNC : /* poller will panic */
1577 ERRORQ_ASYNC; /* no panic */
1578 }
1579
1580 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1581 } else if (disp != 0) {
1582 gcpu_bleat(hdl, gcl);
1583 }
1584
1585 mcesp->mce_disp = disp;
1586
1587 return (disp);
1588 }
1589
1590 /*
1591 * Gather error telemetry from our source, and then submit it for
1592 * processing.
1593 */
1594
/*
 * True if an MCi_STATUS value has the EN bit set together with at least
 * one of the UC or PCC bits, i.e. it looks like an error that may produce
 * a machine check.
 */
#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

/* True if two MCi_STATUS values are equal when the OVER bit is ignored. */
#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))

/* Number of times clear_mc() began deferring a polled status clear. */
static uint32_t gcpu_deferrred_polled_clears;
1602
1603 #ifndef __xpv
1604 static void
gcpu_cmci_logout(cmi_hdl_t hdl,int bank,gcpu_mca_cmci_t * bank_cmci_p,uint64_t status,int what)1605 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1606 uint64_t status, int what)
1607 {
1608 uint64_t ctl2;
1609
1610 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1611 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1612 !(status & MSR_MC_STATUS_CEC_MASK)))) {
1613
1614 if (!(bank_cmci_p->cmci_enabled)) {
1615 /*
1616 * when cmci is disabled, and the bank has no error or
1617 * no corrected error for
1618 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1619 * turn on this bank's cmci.
1620 */
1621
1622 bank_cmci_p->drtcmci ++;
1623
1624 if (bank_cmci_p->drtcmci >=
1625 gcpu_mca_cmci_reenable_threshold) {
1626
1627 /* turn on cmci */
1628
1629 (void) cmi_hdl_rdmsr(hdl,
1630 IA32_MSR_MC_CTL2(bank), &ctl2);
1631 ctl2 |= MSR_MC_CTL2_EN;
1632 (void) cmi_hdl_wrmsr(hdl,
1633 IA32_MSR_MC_CTL2(bank), ctl2);
1634
1635 /* reset counter and set flag */
1636 bank_cmci_p->drtcmci = 0;
1637 bank_cmci_p->cmci_enabled = 1;
1638 }
1639 } else {
1640 /*
1641 * when cmci is enabled,if is in cyclic poll and the
1642 * bank has no error or no corrected error, reset ncmci
1643 * counter
1644 */
1645 bank_cmci_p->ncmci = 0;
1646 }
1647 }
1648 }
1649
1650 static void
gcpu_cmci_throttle(cmi_hdl_t hdl,int bank,gcpu_mca_cmci_t * bank_cmci_p,int what)1651 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1652 int what)
1653 {
1654 uint64_t ctl2 = 0;
1655
1656 /*
1657 * if cmci of this bank occurred beyond
1658 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1659 * turn off this bank's CMCI;
1660 */
1661 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1662
1663 /* if it is cmci trap, increase the count */
1664 bank_cmci_p->ncmci++;
1665
1666 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1667
1668 /* turn off cmci */
1669
1670 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1671 &ctl2);
1672 ctl2 &= ~MSR_MC_CTL2_EN;
1673 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1674 ctl2);
1675
1676 /* clear the flag and count */
1677
1678 bank_cmci_p->cmci_enabled = 0;
1679 bank_cmci_p->ncmci = 0;
1680 }
1681 }
1682 }
1683 #endif
1684
1685 static void
clear_mc(int first,int last,int ismc,boolean_t clrstatus,cmi_hdl_t hdl,gcpu_logout_t * gcl,gcpu_logout_t * pgcl)1686 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1687 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1688 {
1689 int i;
1690 gcpu_bank_logout_t *gbl, *pgbl;
1691 uint64_t status;
1692
1693 if (first < 0 || last < 0)
1694 return;
1695
1696 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1697 status = gbl->gbl_status;
1698 if (status == 0)
1699 continue;
1700 if (clrstatus == B_FALSE)
1701 goto serialize;
1702
1703 /*
1704 * For i86xpv we always clear status in order to invalidate
1705 * the interposed telemetry.
1706 *
1707 * For native machine checks we always clear status here. For
1708 * native polls we must be a little more cautious since there
1709 * is an outside chance that we may clear telemetry from a
1710 * shared MCA bank on which a sibling core is machine checking.
1711 *
1712 * For polled observations of errors that look like they may
1713 * produce a machine check (UC/PCC and ENabled, although these
1714 * do not guarantee a machine check on error occurence)
1715 * we will not clear the status at this wakeup unless
1716 * we saw the same status at the previous poll. We will
1717 * always process and log the current observations - it
1718 * is only the clearing of MCi_STATUS which may be
1719 * deferred until the next wakeup.
1720 */
1721 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1722 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1723 goto serialize;
1724 }
1725
1726 /*
1727 * We have a polled observation of a machine check
1728 * candidate. If we saw essentially the same status at the
1729 * last poll then clear the status now since this appears
1730 * not to be a #MC candidate after all. If we see quite
1731 * different status now then do not clear, but reconsider at
1732 * the next poll. In no actual machine check clears
1733 * the status in the interim then the status should not
1734 * keep changing forever (meaning we'd never clear it)
1735 * since before long we'll simply have latched the highest-
1736 * priority error and set the OVerflow bit. Nonetheless
1737 * we count how many times we defer clearing and after
1738 * a while insist on clearing the status.
1739 */
1740 pgbl = &pgcl->gcl_data[i];
1741 if (pgbl->gbl_clrdefcnt != 0) {
1742 /* We deferred clear on this bank at last wakeup */
1743 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1744 pgbl->gbl_clrdefcnt > 5) {
1745 /*
1746 * Status is unchanged so clear it now and,
1747 * since we have already logged this info,
1748 * avoid logging it again.
1749 */
1750 gbl->gbl_status = 0;
1751 (void) cmi_hdl_wrmsr(hdl,
1752 IA32_MSR_MC(i, STATUS), 0ULL);
1753 } else {
1754 /* Record deferral for next wakeup */
1755 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1756 }
1757 } else {
1758 /* Record initial deferral for next wakeup */
1759 gbl->gbl_clrdefcnt = 1;
1760 gcpu_deferrred_polled_clears++;
1761 }
1762
1763 serialize:
1764 {
1765 #ifdef __xpv
1766 ;
1767 #else
1768 /*
1769 * Intel Vol 3A says to execute a serializing
1770 * instruction here, ie CPUID. Well WRMSR is also
1771 * defined to be serializing, so the status clear above
1772 * should suffice. To be a good citizen, and since
1773 * some clears are deferred, we'll execute a CPUID
1774 * instruction here.
1775 */
1776 struct cpuid_regs tmp;
1777 (void) __cpuid_insn(&tmp);
1778 #endif
1779 }
1780 }
1781 }
1782
/*
 * Capture MCA telemetry for the CPU behind 'hdl' into a logout area and
 * submit it for processing.
 *
 *   rp        - trap registers; non-NULL means we were called for a machine
 *               check exception, NULL means a poll.
 *   bankmask  - on a poll, only banks whose bit is set are examined.
 *   mcesp     - optional summary of what was found (may be NULL).
 *   clrstatus - passed to clear_mc(); B_FALSE leaves MCi_STATUS untouched
 *               there.
 *   what      - reason for the logout, used for CMCI bookkeeping.
 *
 * A machine check uses the dedicated exception logout area; polls alternate
 * between two poller areas so that the previous poll's observations remain
 * available for comparison.
 */
/*ARGSUSED5*/
void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
    gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_bank_logout_t *gbl, *pgbl;
	gcpu_logout_t *gcl, *pgcl;
	int ismc = (rp != NULL);
	int ispoll = !ismc;
	int i, nerr = 0;
	cmi_errno_t err;
	uint64_t mcg_status;
	uint64_t disp;
	uint64_t cap;
	int first = -1;
	int last = -1;
	int willpanic = 0;

	/* Without MCG_STATUS and MCG_CAP there is nothing we can log. */
	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
	    CMI_SUCCESS) {
		if (mcesp != NULL)
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		return;
	}

	if (ismc) {
		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
		pgcl = NULL;
	} else {
		int pidx = mca->gcpu_mca_nextpoll_idx;
		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;

		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
	}

	gcl->gcl_timestamp = gethrtime_waitfree();
	gcl->gcl_mcg_status = mcg_status;
	gcl->gcl_ip = rp ? rp->r_pc : 0;

	/*
	 * NOTE(review): the flag name suggests a privileged context, yet it
	 * is set when USERMODE() is true - confirm the intended polarity.
	 */
	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
	if (cap & MCG_CAP_TES_P)
		gcl->gcl_flags |= GCPU_GCL_F_TES_P;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t status, status2, addr, misc;
		int retries = gcpu_mca_telemetry_retries;

		/* Start each bank's entry from a clean slate. */
		gbl->gbl_status = 0;
		gbl->gbl_disp = 0;
		gbl->gbl_clrdefcnt = 0;

		/*
		 * Only logout from MCA banks we have initialized from at
		 * least one core.  If a core shares an MCA bank with another
		 * but perhaps lost the race to initialize it, then it must
		 * still be allowed to logout from the shared bank.
		 */
		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
			continue;

		/*
		 * On a poll look only at the banks we've been asked to check.
		 */
		if (rp == NULL && !(bankmask & 1 << i))
			continue;


		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
		    CMI_SUCCESS)
			continue;

#ifndef __xpv
		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
#endif

retry:
		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		/* First and last bank that have valid status */
		if (first < 0)
			first = i;
		last = i;

		addr = -1;
		misc = 0;

		if ((status & MSR_MC_STATUS_ADDRV) ||
		    gcpu_force_addr_in_payload)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);

		if (status & MSR_MC_STATUS_MISCV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);

#ifndef __xpv
		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
#endif

		/*
		 * Allow the model-specific code to extract bank telemetry.
		 */
		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);

		/*
		 * Not all cpu models assure us that the status/address/misc
		 * data will not change during the above sequence of MSR reads,
		 * or that it can only change by the addition of the OVerflow
		 * bit to the status register.  If the status has changed
		 * other than in the overflow bit then we attempt to reread
		 * for a consistent snapshot, but eventually give up and
		 * go with what we've got.  We only perform this check
		 * for a poll - a further #MC during a #MC will reset, and
		 * polled errors should not overwrite higher-priority
		 * trapping errors (but could set the overflow bit).
		 */
		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &status2)) == CMI_SUCCESS) {
			if (!STATUS_EQV(status, status2)) {
				if (retries-- > 0) {
					status = status2;
					goto retry;
				} else {
					gbl->gbl_disp |=
					    CMI_ERRDISP_INCONSISTENT;
				}
			}
		} else if (ispoll && err != CMI_SUCCESS) {
			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
		}

		nerr++;
		gbl->gbl_status = status;
		gbl->gbl_addr = addr;
		gbl->gbl_misc = misc;

		/*
		 * For polled observation, if the count of deferred status
		 * clears updated in the clear_mc() is nonzero and the
		 * MCi_STATUS has not changed, the last wakeup has produced
		 * the ereport of the error.  Therefore, clear the status in
		 * this wakeup to avoid duplicate ereport.
		 *
		 * NOTE(review): pgcl is NULL on the machine-check path, so
		 * pgbl is only meaningful (and only dereferenced) when
		 * ispoll is true.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
		    pgbl->gbl_clrdefcnt != 0) {
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
				gbl->gbl_status = 0;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			}
		}
	}

	if (gcpu_mca_stack_flag)
		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
	else
		gcl->gcl_stackdepth = 0;

	/*
	 * Decide our disposition for this error or errors, and submit for
	 * logging and subsequent diagnosis.
	 */
	if (nerr != 0) {
		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);

		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);

		/* If we are about to panic, leave the bank status intact. */
		if (!willpanic)
			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
	} else {
		disp = 0;
		if (mcesp) {
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		}
	}

	/*
	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
	 * If a second #MC had occurred before now the system would have
	 * reset.  We can only do this once gcpu_mca_process has copied
	 * the logout structure.
	 */
	if (ismc && mcg_status & MCG_STATUS_MCIP)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);

	/*
	 * At this point we have read and logged all telemetry that is visible
	 * under the MCA.  On architectures for which the NorthBridge is
	 * on-chip this may include NB-observed errors, but where the NB
	 * is off chip it may have been the source of the #MC request and
	 * so we must call into the memory-controller driver to give it
	 * a chance to log errors.
	 */
	if (ismc) {
		cmi_mc_logout(hdl, 1, willpanic);
	}
}
1987
1988 #ifndef __xpv
1989 int gcpu_mca_trap_vomit_summary = 0;
1990
1991 /*
1992 * On a native machine check exception we come here from mcetrap via
1993 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others
1994 * cpus of the chip, so it is possible that another cpu on this chip could
1995 * initiate a poll while we're in the #mc handler; it is also possible that
1996 * this trap has occured during a poll on this cpu. So we must acquire
1997 * the chip-wide poll lock, but be careful to avoid deadlock.
1998 *
1999 * The 'data' pointer cannot be NULL due to init order.
2000 */
2001 uint64_t
gcpu_mca_trap(cmi_hdl_t hdl,struct regs * rp)2002 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
2003 {
2004 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2005 kmutex_t *poll_lock = NULL;
2006 gcpu_mce_status_t mce;
2007 uint64_t mcg_status;
2008 int tooklock = 0;
2009
2010 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
2011 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
2012 return (0);
2013
2014 /*
2015 * Synchronize with any poller from another core that may happen
2016 * to share access to one or more of the MCA banks.
2017 */
2018 if (gcpu->gcpu_shared != NULL)
2019 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
2020
2021 if (poll_lock != NULL && !mutex_owned(poll_lock)) {
2022 /*
2023 * The lock is not owned by the thread we have
2024 * interrupted. Spin for this adaptive lock.
2025 */
2026 while (!mutex_tryenter(poll_lock)) {
2027 while (mutex_owner(poll_lock) != NULL)
2028 ;
2029 }
2030 tooklock = 1;
2031 }
2032
2033 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
2034
2035 if (tooklock)
2036 mutex_exit(poll_lock);
2037
2038 /*
2039 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
2040 */
2041 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
2042 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
2043 "%u PCC (%u ok), "
2044 "%u UC (%d ok, %u poisoned), "
2045 "%u forcefatal, %u ignored",
2046 mce.mce_nerr, (u_longlong_t)mce.mce_disp,
2047 mce.mce_npcc, mce.mce_npcc_ok,
2048 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
2049 mce.mce_forcefatal, mce.mce_ignored);
2050 }
2051
2052 return (mce.mce_disp);
2053 }
2054 #endif
2055
/*
 * Hook for a CPU entering the faulted state.  The generic CPU module has
 * no work to do here; 'hdl' is unused.
 */
/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
	/* Nothing to do here */
}
2062
2063 /*ARGSUSED*/
2064 void
gcpu_faulted_exit(cmi_hdl_t hdl)2065 gcpu_faulted_exit(cmi_hdl_t hdl)
2066 {
2067 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2068
2069 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
2070 }
2071
2072 /*
2073 * Write the requested values to the indicated MSRs. Having no knowledge
2074 * of the model-specific requirements for writing to these model-specific
2075 * registers, we will only blindly write to those MSRs if the 'force'
2076 * argument is nonzero. That option should only be used in prototyping
2077 * and debugging.
2078 */
2079 /*ARGSUSED*/
2080 cmi_errno_t
gcpu_msrinject(cmi_hdl_t hdl,cmi_mca_regs_t * regs,uint_t nregs,int force)2081 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
2082 int force)
2083 {
2084 int i, errs = 0;
2085
2086 for (i = 0; i < nregs; i++) {
2087 uint_t msr = regs[i].cmr_msrnum;
2088 uint64_t val = regs[i].cmr_msrval;
2089
2090 if (cms_present(hdl)) {
2091 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
2092 errs++;
2093 } else if (force) {
2094 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
2095 } else {
2096 errs++;
2097 }
2098 }
2099
2100 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
2101 }
2102
2103 /* deconfigure gcpu_mca_init() */
2104 void
gcpu_mca_fini(cmi_hdl_t hdl)2105 gcpu_mca_fini(cmi_hdl_t hdl)
2106 {
2107 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2108 gcpu_mca_t *mca = &gcpu->gcpu_mca;
2109 int i;
2110
2111 /*
2112 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
2113 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier
2114 * processors, which have their own more primitive way of doing
2115 * machine checks, will not have cmi_mca_init called since their
2116 * CPUID information will not indicate both MCA and MCE features.
2117 */
2118 if (!is_x86_feature(x86_featureset, X86FSET_MCA))
2119 return;
2120 #ifndef __xpv
2121 /*
2122 * disable machine check in CR4
2123 */
2124 cmi_ntv_hwdisable_mce(hdl);
2125 #endif
2126 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
2127 gcpu_mca_poll_fini(hdl);
2128 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
2129
2130 /*
2131 * free resources allocated during init
2132 */
2133 if (mca->gcpu_bank_cmci != NULL) {
2134 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2135 mca->gcpu_mca_nbanks);
2136 }
2137
2138 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2139 if (mca->gcpu_mca_logout[i] != NULL) {
2140 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2141 }
2142 }
2143
2144 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2145 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2146 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
2147 }
2148 }
2149