xref: /illumos-gate/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c (revision 89b2a9fbeabf42fa54594df0e5927bcc50a07cc9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/mca_x86.h>
28 #include <sys/cpu_module_impl.h>
29 #include <sys/cpu_module_ms.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/pghw.h>
33 #include <sys/x86_archext.h>
34 #include <sys/sysmacros.h>
35 #include <sys/regset.h>
36 #include <sys/privregs.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/log.h>
40 #include <sys/psw.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/fm/util.h>
43 #include <sys/errorq.h>
44 #include <sys/mca_x86.h>
45 #include <sys/fm/cpu/GMCA.h>
46 #include <sys/fm/smb/fmsmb.h>
47 #include <sys/sysevent.h>
48 #include <sys/ontrap.h>
49 
50 #include "gcpu.h"
51 
extern int x86gentopo_legacy;	/* x86 generic topology support */

/*
 * Clear to log telemetry found at initialization.  While processor docs
 * say you should process this telemetry on all but Intel family 0x6
 * there are way too many exceptions and we want to avoid bogus
 * diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
/*
 * CMCI (Corrected Machine Check Interrupt) storm-handling thresholds.
 * NOTE(review): exact units (error counts vs. poll cycles) are defined by
 * the consumers of these tunables elsewhere in the module - confirm there.
 * Not built under the xVM hypervisor, which owns CMCI handling.
 */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
#endif
81 
/*
 * Table mapping IA32 architectural MCA error codes to ereport class names
 * and ereport payload member bitmasks.  An entry matches a status error
 * code when every bit of ged_errcode_mask_on is set in the code and no
 * bit of ged_errcode_mask_off is set (see gcpu_disp_match).  Compound
 * entries carry a restricted positional format string ("%n$s" only) that
 * gcpu_mn_fmt expands with the TT/LL/RRRR/PP/II/T/CCCC/MMM subfield
 * mnemonics to form the compound error name.
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal parity error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
	},


	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
	/*
	 * Compound error codes - memory controller errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
	},
};
216 
/*
 * Fallback disposition used when no architectural match is found or when
 * telemetry was inconsistent across re-reads; yields a generic "UNKNOWN"
 * ereport leafclass with the common payload members.
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

/* Shared errorq onto which per-cpu MCA logouts are produced. */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;	/* serializes queue (re)creation */

#ifdef __xpv
static int isxpv = 1;	/* nonzero when built for the xVM hypervisor */
#else
static int isxpv = 0;
#endif
233 
234 static const gcpu_error_disp_t *
235 gcpu_disp_match(uint16_t code)
236 {
237 	const gcpu_error_disp_t *ged = gcpu_errtypes;
238 	int i;
239 
240 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
241 	    i++, ged++) {
242 		uint16_t on = ged->ged_errcode_mask_on;
243 		uint16_t off = ged->ged_errcode_mask_off;
244 
245 		if ((code & on) == on && (code & off) == 0)
246 			return (ged);
247 	}
248 
249 	return (NULL);
250 }
251 
/*
 * Extract a subfield from an MCA error code: mask it out, shift it down
 * to bit 0, and narrow the result to 8 bits (every architectural
 * subfield fits in 8 bits once shifted).
 *
 * The narrowing cast must be applied AFTER the shift.  Casting the
 * masked value first truncated any field located above bit 7 - e.g.
 * PP (bits 9:10) and T (bit 8) - to zero before shifting, so those
 * fields always stripped to 0.
 */
static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((uint8_t)((code & mask) >> shift));
}
257 
/*
 * Convenience wrapper around bit_strip() using the MCAX86_ERRCODE_<name>
 * mask/shift constant pairs from <sys/mca_x86.h>.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/* Mnemonics used for subfield values with no defined or reserved meaning. */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
264 
265 /*
266  * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
267  * mnemonics and to ereport class name components.
268  */
269 
270 struct gcpu_mnexp {
271 	const char *mne_compound;	/* used in expanding compound errname */
272 	const char *mne_ereport;	/* used in expanding ereport class */
273 };
274 
275 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
276 	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
277 	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
278 	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
279 	{ GCPU_MNEMONIC_UNDEF, "" }
280 };
281 
282 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
283 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
284 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
285 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
286 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
287 };
288 
/* Memory transaction request type (RRRR). */
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
};

/* Bus participation (PP): source, responder, observer or generic. */
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
};

/* Memory or I/O (II); value 1 is architecturally reserved. */
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
};

/* Timeout (T) bit. */
static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
};

/* Memory-controller channel number (CCCC); 0xf means channel unspecified. */
static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
};

/* Memory-controller error type (MMM). */
static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
};
349 
/*
 * Namespace selector for gcpu_mnemonic(): expand a subfield value into
 * either the compound error-name mnemonic or the ereport class component.
 */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,
	GCPU_MN_NAMESPACE_EREPORT
};
354 
355 static const char *
356 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
357     enum gcpu_mn_namespace nspace)
358 {
359 	if (val >= tbl_sz)
360 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
361 
362 	switch (nspace) {
363 	case GCPU_MN_NAMESPACE_COMPOUND:
364 		return (tbl[val].mne_compound);
365 		/*NOTREACHED*/
366 
367 	case GCPU_MN_NAMESPACE_EREPORT:
368 		return (tbl[val].mne_ereport);
369 		/*NOTREACHED*/
370 
371 	default:
372 		return (GCPU_MNEMONIC_UNDEF);
373 		/*NOTREACHED*/
374 	}
375 }
376 
377 /*
378  * The ereport class leaf component is either a simple string with no
379  * format specifiers, or a string with one or more embedded %n$s specifiers -
380  * positional selection for string arguments.  The kernel snprintf does
381  * not support %n$ (and teaching it to do so is too big a headache) so
382  * we will expand this restricted format string ourselves.
383  */
384 
385 #define	GCPU_CLASS_VARCOMPS	9
386 
387 #define	GCPU_MNEMONIC(code, name, nspace) \
388 	gcpu_mnemonic(gcpu_##name##_mnemonics, \
389 	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
390 	BIT_STRIP(code, name), nspace)
391 
/*
 * Expand a restricted positional format string (only "%n$s" specifiers
 * are supported) into buf, substituting mnemonics derived from the MCA
 * status value.  Positional arguments are:
 *	%1$s TT    %2$s LL   %3$s RRRR  %4$s PP  %5$s II
 *	%6$s T     %7$s "_uc" when MCi_STATUS.UC is set (else "")
 *	%8$s CCCC  %9$s MMM
 * Output is always NUL-terminated; expansion stops when buf is full or
 * at the first malformed specifier.
 */
static void
gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
    enum gcpu_mn_namespace nspace)
{
	uint16_t code = MCAX86_ERRCODE(status);
	const char *mn[GCPU_CLASS_VARCOMPS];
	char *p = buf;			/* current position in buf */
	char *q = buf + buflen;		/* pointer past last char in buf */
	int which, expfmtchar, error;
	char c;

	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
	mn[4] = GCPU_MNEMONIC(code, II, nspace);
	mn[5] = GCPU_MNEMONIC(code, T, nspace);
	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);

	while (p < q - 1 && (c = *fmt++) != '\0') {
		if (c != '%') {
			/* not the beginning of a format specifier - copy */
			*p++ = c;
			continue;
		}

		error = 0;
		which = -1;		/* positional argument, 0-based */
		expfmtchar = -1;	/* conversion char we now expect */

nextfmt:
		if ((c = *fmt++) == '\0')
			break;	/* early termination of fmt specifier */

		switch (c) {
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			if (which != -1) { /* allow only one positional digit */
				error++;
				break;
			}
			which = c - '1';
			goto nextfmt;
			/*NOTREACHED*/

		case '$':
			if (which == -1) { /* no position specified */
				error++;
				break;
			}
			expfmtchar = 's';
			goto nextfmt;
			/*NOTREACHED*/

		case 's':
			if (expfmtchar != 's') {
				error++;
				break;
			}
			/* substitute the selected mnemonic */
			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
			    mn[which]);
			p += strlen(p);
			break;

		default:
			error++;
			break;
		}

		if (error)
			break;
	}

	*p = '\0';	/* NUL termination */
}
476 
477 static void
478 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
479     const char *cpuclass, const char *leafclass)
480 {
481 	char *p = buf;			/* current position in buf */
482 	char *q = buf + buflen;		/* pointer past last char in buf */
483 
484 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
485 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
486 
487 	p += strlen(p);
488 	if (p >= q)
489 		return;
490 
491 	if (leafclass == NULL) {
492 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
493 		    GCPU_MN_NAMESPACE_EREPORT);
494 	} else {
495 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
496 		    leafclass);
497 	}
498 }
499 
500 /*
501  * Create an "hc" scheme FMRI identifying the given cpu with
502  * motherboard/chip/core/strand instance numbers.
503  */
504 static nvlist_t *
505 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
506 {
507 	nvlist_t *nvl, *fmri;
508 
509 	if ((nvl = fm_nvlist_create(nva)) == NULL)
510 		return (NULL);
511 
512 	if (!x86gentopo_legacy) {
513 		fmri = cmi_hdl_smb_bboard(hdl);
514 		if (fmri == NULL)
515 			return (NULL);
516 
517 		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
518 		    NULL, NULL, fmri, 3,
519 		    "chip", cmi_hdl_smb_chipid(hdl),
520 		    "core", cmi_hdl_coreid(hdl),
521 		    "strand", cmi_hdl_strandid(hdl));
522 	} else {
523 		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
524 		    "motherboard", 0,
525 		    "chip", cmi_hdl_chipid(hdl),
526 		    "core", cmi_hdl_coreid(hdl),
527 		    "strand", cmi_hdl_strandid(hdl));
528 	}
529 
530 	return (nvl);
531 }
532 
/*
 * Console-spam throttling for gcpu_bleat(): allow this many back-to-back
 * complaints, then require gcpu_bleat_min_interval (nanoseconds) to pass
 * between further bursts.
 */
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;

/*
 * Called when we are unable to propogate a logout structure onto an
 * errorq for subsequent ereport preparation and logging etc.  The caller
 * should usually only decide to call this for severe errors - those we
 * suspect we may need to panic for.
 *
 * Dumps the raw MCG_STATUS and per-bank STATUS/ADDR/MISC telemetry to
 * the console for any bank with valid status.
 *
 * NOTE(review): bleatcount and gcpu_last_bleat are function-local statics
 * updated without synchronization; concurrent callers can race, but the
 * only consequence is imprecise throttling.
 */
static void
gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
{
	hrtime_t now  = gethrtime_waitfree();
	static hrtime_t gcpu_last_bleat;
	gcpu_bank_logout_t *gbl;
	static int bleatcount;
	int i;

	/*
	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
	 * can come as fast as we like, but once we've spammed that many
	 * to the console we require a minimum interval to pass before
	 * any more complaints.
	 */
	if (++bleatcount > gcpu_bleat_count_thresh) {
		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
			return;
		else
			bleatcount = 0;
	}
	gcpu_last_bleat = now;

	cmn_err(CE_WARN,
	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
	    cmi_hdl_strandid(hdl));
	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
	    (u_longlong_t)gcl->gcl_mcg_status);
	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
		uint64_t status = gbl->gbl_status;

		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		/* Print ADDR/MISC only when their valid bits are set. */
		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_addr,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		case MSR_MC_STATUS_ADDRV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_addr);
			break;

		case MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		default:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status);
			break;

		}
	}
}
613 
/*
 * Expand to a (name, type, value) payload triple reporting whether the
 * given MCi_STATUS bit is set; for use in fm_payload_set() vararg lists.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE

/*
 * Add the architectural (model-independent) payload members to an ereport
 * under construction, as selected by the matched error type's member
 * bitmask (or the common set when no match was made): compound error
 * name, disposition flags, MCG_STATUS, instruction pointer, privilege
 * indication, bank number/MSR offset, and the bank's STATUS/ADDR/MISC
 * telemetry.
 */
static void
gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
    uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
{
	uint64_t members = ged ? ged->ged_ereport_members :
	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
	uint64_t mcg = gcl->gcl_mcg_status;
	int mcip = mcg & MCG_STATUS_MCIP;	/* machine check in progress */
	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
	uint64_t bstat = gbl->gbl_status;

	/*
	 * Include the compound error name if requested and if this
	 * is a compound error type.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
	    ged->ged_compound_fmt != NULL) {
		char buf[FM_MAX_CLASS];

		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
		    GCPU_MN_NAMESPACE_COMPOUND);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
		    DATA_TYPE_STRING, buf, NULL);
	}

	/*
	 * Include disposition information for this error, rendered as a
	 * comma-separated list of the disposition flag names that are set.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
	    gbl->gbl_disp != 0) {
		int i, empty = 1;
		char buf[128];
		char *p = buf, *q = buf + 128;
		static struct _gcpu_disp_name {
			uint64_t dv;
			const char *dn;
		} disp_names[] = {
			{ CMI_ERRDISP_CURCTXBAD,
			    "processor_context_corrupt" },
			{ CMI_ERRDISP_RIPV_INVALID,
			    "return_ip_invalid" },
			{ CMI_ERRDISP_UC_UNCONSTRAINED,
			    "unconstrained" },
			{ CMI_ERRDISP_FORCEFATAL,
			    "forcefatal" },
			{ CMI_ERRDISP_IGNORED,
			    "ignored" },
			{ CMI_ERRDISP_PCC_CLEARED,
			    "corrupt_context_cleared" },
			{ CMI_ERRDISP_UC_CLEARED,
			    "uncorrected_data_cleared" },
			{ CMI_ERRDISP_POISONED,
			    "poisoned" },
			{ CMI_ERRDISP_INCONSISTENT,
			    "telemetry_unstable" },
		};

		for (i = 0; i < sizeof (disp_names) /
		    sizeof (struct _gcpu_disp_name); i++) {
			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
				continue;

			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
			    "%s%s", empty ? "" : ",", disp_names[i].dn);
			p += strlen(p);
			empty = 0;
		}

		if (p != buf)
			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
			    DATA_TYPE_STRING, buf, NULL);
	}

	/*
	 * If MCG_STATUS is included add that and an indication of whether
	 * this ereport was the result of a machine check or poll.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
		    DATA_TYPE_UINT64, mcg, NULL);

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
	}

	/*
	 * If an instruction pointer is to be included add one provided
	 * MCG_STATUS indicated it is valid; meaningless for polled events.
	 */
	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
	    mcg & MCG_STATUS_EIPV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
	}

	/*
	 * Add an indication of whether the trap occured during privileged code.
	 */
	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
		    DATA_TYPE_BOOLEAN_VALUE,
		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
	}

	/*
	 * If requested, add the index of the MCA bank.  This indicates the
	 * n'th bank of 4 MCA registers, and does not necessarily correspond
	 * to MCi_* - use the bank offset to correlate
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
		fm_payload_set(ereport,
		    /* Bank number */
		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
		    /* Offset of MCi_CTL */
		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
		    IA32_MSR_MC(bankno, CTL),
		    NULL);
	}

	/*
	 * Add MCi_STATUS if requested, and decode it.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
		/* Threshold-based error status strings, indexed by TBES. */
		const char *tbes[] = {
			"No tracking",			/* 00 */
			"Green - below threshold",	/* 01 */
			"Yellow - above threshold",	/* 10 */
			"Reserved"			/* 11 */
		};

		fm_payload_set(ereport,
		    /* Bank MCi_STATUS */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
		    /* Overflow? */
		    _GCPU_BSTATUS(bstat, OVER),
		    /* Uncorrected? */
		    _GCPU_BSTATUS(bstat, UC),
		    /* Enabled? */
		    _GCPU_BSTATUS(bstat, EN),
		    /* Processor context corrupt? */
		    _GCPU_BSTATUS(bstat, PCC),
		    /* Error code */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
		    /* Model-specific error code */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
		    NULL);

		/*
		 * If MCG_CAP.TES_P indicates that that thresholding info
		 * is present in the architural component of the bank status
		 * then include threshold information for this bank.
		 */
		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
			    NULL);
		}
	}

	/*
	 * MCi_ADDR info if requested and valid.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
	    bstat & MSR_MC_STATUS_ADDRV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
	}

	/*
	 * MCi_MISC if requested and MCi_STATUS.MISCV).
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
	    bstat & MSR_MC_STATUS_MISCV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
	}

}
799 
800 /*
801  * Construct and post an ereport based on the logout information from a
802  * single MCA bank.  We are not necessarily running on the cpu that
803  * detected the error.
804  */
805 static void
806 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
807     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
808 {
809 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
810 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
811 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
812 	const char *cpuclass = NULL, *leafclass = NULL;
813 	uint16_t code = MCAX86_ERRCODE(status);
814 	errorq_elem_t *eqep, *scr_eqep;
815 	nvlist_t *ereport, *detector;
816 	char buf[FM_MAX_CLASS];
817 	const char *classfmt;
818 	nv_alloc_t *nva;
819 
820 	if (panicstr) {
821 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
822 			return;
823 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
824 
825 		/*
826 		 * Allocate another element for scratch space, but fallback
827 		 * to the one we have if that fails.  We'd like to use the
828 		 * additional scratch space for nvlist construction.
829 		 */
830 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
831 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
832 		else
833 			nva = errorq_elem_nva(ereport_errorq, eqep);
834 	} else {
835 		ereport = fm_nvlist_create(NULL);
836 		nva = NULL;
837 	}
838 
839 	if (ereport == NULL)
840 		return;
841 
842 	/*
843 	 * Common payload data required by the protocol:
844 	 *	- ereport class
845 	 *	- detector
846 	 *	- ENA
847 	 */
848 
849 	/*
850 	 * Ereport class - call into model-specific support to allow it to
851 	 * provide a cpu class or leaf class, otherwise calculate our own.
852 	 */
853 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
854 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
855 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
856 	    leafclass);
857 
858 	/*
859 	 * The detector FMRI.
860 	 */
861 	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
862 	    nva)) == NULL)
863 		detector = gcpu_fmri_create(hdl, nva);
864 
865 	/*
866 	 * Should we define a new ENA format 3?? for chip/core/strand?
867 	 * It will be better when virtualized.
868 	 */
869 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
870 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
871 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
872 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
873 
874 	if (panicstr) {
875 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
876 		nv_alloc_reset(nva);
877 	} else {
878 		fm_nvlist_destroy(detector, FM_NVA_FREE);
879 	}
880 
881 	/*
882 	 * Add the architectural ereport class-specific payload data.
883 	 */
884 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
885 
886 	/*
887 	 * Allow model-specific code to add ereport members.
888 	 */
889 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
890 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
891 
892 	/*
893 	 * Include stack if options is turned on and either selected in
894 	 * the payload member bitmask or inclusion is forced.
895 	 */
896 	if (gcpu_mca_stack_flag &&
897 	    (cms_ereport_includestack(hdl, mscookie) ==
898 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
899 		fm_payload_stack_add(ereport, gcl->gcl_stack,
900 		    gcl->gcl_stackdepth);
901 	}
902 
903 	/*
904 	 * If injection has taken place anytime in the past then note this
905 	 * on the ereport.
906 	 */
907 	if (cmi_inj_tainted() == B_TRUE) {
908 		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
909 		    B_TRUE, NULL);
910 	}
911 
912 	/*
913 	 * Post ereport.
914 	 */
915 	if (panicstr) {
916 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
917 		if (scr_eqep)
918 			errorq_cancel(ereport_errorq, scr_eqep);
919 	} else {
920 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
921 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
922 	}
923 
924 }
925 
926 /*ARGSUSED*/
927 void
928 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
929 {
930 	const gcpu_logout_t *gcl = data;
931 	const gcpu_bank_logout_t *gbl;
932 	int i;
933 
934 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
935 		const gcpu_error_disp_t *gened;
936 		cms_cookie_t mscookie;
937 
938 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
939 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
940 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
941 
942 			/*
943 			 * Perform a match based on IA32 MCA architectural
944 			 * components alone.
945 			 */
946 			gened = gcpu_disp_match(code); /* may be NULL */
947 
948 			/*
949 			 * Now see if an model-specific match can be made.
950 			 */
951 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
952 			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
953 			    gcl->gcl_ms_logout);
954 
955 			/*
956 			 * Prepare and dispatch an ereport for logging and
957 			 * diagnosis.
958 			 */
959 			gcpu_ereport_post(gcl, i, gened, mscookie,
960 			    gbl->gbl_status);
961 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
962 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
963 			/*
964 			 * Telemetry kept changing as we tried to read
965 			 * it.  Force an unknown ereport leafclass but
966 			 * keep the telemetry unchanged for logging.
967 			 */
968 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
969 			    gbl->gbl_status);
970 		}
971 	}
972 }
973 
/* Payload size the current gcpu_mca_queue was created with (0 = none). */
static size_t gcpu_mca_queue_datasz = 0;

/*
 * The following code is ready to make a weak attempt at growing the
 * errorq structure size.  Since it is not foolproof (we don't know
 * who may already be producing to the outgoing errorq) our caller
 * instead assures that we'll always be called with no greater data
 * size than on our first call.
 */
static void
gcpu_errorq_init(size_t datasz)
{
	int slots;

	mutex_enter(&gcpu_mca_queue_lock);

	/* Existing queue already accommodates this payload size. */
	if (gcpu_mca_queue_datasz >= datasz) {
		mutex_exit(&gcpu_mca_queue_lock);
		return;
	}

	/*
	 * NOTE(review): destroying a live queue here is best-effort only -
	 * see the block comment above for why producers may still exist.
	 */
	membar_producer();
	if (gcpu_mca_queue) {
		gcpu_mca_queue_datasz = 0;
		errorq_destroy(gcpu_mca_queue);
	}

	/* Scale slot count with cpu count, clamped to fixed bounds. */
	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);

	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
	    NULL, slots, datasz, 1, ERRORQ_VITAL);

	if (gcpu_mca_queue != NULL)
		gcpu_mca_queue_datasz = datasz;

	mutex_exit(&gcpu_mca_queue_lock);
}
1012 
1013 /*
1014  * Perform MCA initialization as described in section 14.6 of Intel 64
1015  * and IA-32 Architectures Software Developer's Manual Volume 3A.
1016  */
1017 
/*
 * Bank count observed by the first cpu (the BSP) to run gcpu_mca_init();
 * later cpus reporting more banks than this bail out so the errorq entry
 * size never needs to grow.
 */
static uint_t global_nbanks;
1019 
void
gcpu_mca_init(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	uint64_t cap;
	uint_t vendor = cmi_hdl_vendor(hdl);
	uint_t family = cmi_hdl_family(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int mcg_ctl_present;
	uint_t nbanks;
	uint32_t ctl_skip_mask = 0;
	uint32_t status_skip_mask = 0;
	size_t mslsz;
	int i;
#ifndef __xpv
	int mcg_ctl2_present;
	uint32_t cmci_capable = 0;
#endif

	if (gcpu == NULL)
		return;

	/*
	 * Protect from some silly /etc/system settings.
	 */
	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
		gcpu_mca_telemetry_retries = 5;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
		return;

	/*
	 * CPU startup code only calls cmi_mca_init if x86_feature indicates
	 * both MCA and MCE support (i.e., X86_MCA).  P5, K6, and earlier
	 * processors, which have their own, more primitive way of doing
	 * machine checks, will not have cmi_mca_init called since their
	 * CPUID information will not indicate both MCA and MCE features.
	 */
	ASSERT(x86_feature & X86_MCA);

	/*
	 * Determine whether the IA32_MCG_CTL register is present.  If it
	 * is we will enable all features by writing -1 to it towards
	 * the end of this initialization;  if it is absent then volume 3A
	 * says we must nonetheless continue to initialize the individual
	 * banks.
	 */
	mcg_ctl_present = cap & MCG_CAP_CTL_P;
#ifndef __xpv
	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
#endif

	/*
	 * We squirrel values away for inspection/debugging.
	 */
	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
	if (mcg_ctl_present)
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);

	/*
	 * Determine the number of error-reporting banks implemented.
	 */
	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;

	if (nbanks != 0 && global_nbanks == 0)
		global_nbanks = nbanks;	/* no race - BSP will get here first */

	/*
	 * If someone is hiding the number of banks (perhaps we are fully
	 * virtualized?) or if this processor has more banks than the
	 * first to set global_nbanks then bail.  The latter requirement
	 * is because we need to size our errorq data structure and we
	 * don't want to have to grow the errorq (destroy and recreate)
	 * which may just lose some telemetry.
	 */
	if (nbanks == 0 || nbanks > global_nbanks)
		return;

	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);

	/*
	 * Calculate the size we need to allocate for a gcpu_logout_t
	 * with a gcl_data array big enough for all banks of this cpu.
	 * Add any space requested by the model-specific logout support.
	 */
	mslsz = cms_logout_size(hdl);
	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		gcpu_logout_t *gcl;

		mca->gcpu_mca_logout[i] = gcl =
		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
		gcl->gcl_gcpu = gcpu;
		gcl->gcl_nbanks = nbanks;
		/* ms logout area lives immediately after the bank array */
		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
		    (char *)(&gcl->gcl_data[0]) + nbanks *
		    sizeof (gcpu_bank_logout_t);

	}

#ifdef __xpv
	gcpu_xpv_mca_init(nbanks);
#endif

	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;

#ifndef __xpv
	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
	    KM_SLEEP);
#endif

	/*
	 * Create our errorq to transport the logout structures.  This
	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
	 */
	gcpu_errorq_init(mca->gcpu_mca_lgsz);

	/*
	 * Not knowing which, if any, banks are shared between cores we
	 * assure serialization of MCA bank initialization by each cpu
	 * on the chip.  On chip architectures in which some banks are
	 * shared this will mean the shared resource is initialized more
	 * than once - we're simply aiming to avoid simultaneous MSR writes
	 * to the shared resource.
	 *
	 * Even with these precautions, some platforms may yield a GP fault
	 * if a core other than a designated master tries to write anything
	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
	 * those writes under on_trap protection.
	 */
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Initialize poller data, but don't start polling yet.
	 */
	gcpu_mca_poll_init(hdl);

	/*
	 * Work out which MCA banks we will initialize.  In MCA logout
	 * code we will only read those banks which we initialize here.
	 */
	for (i = 0; i < nbanks; i++) {
		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);

		if (!cms_present(hdl)) {
			/*
			 * Model-specific support is not present, try to use
			 * sane defaults.
			 *
			 * On AMD family 6 processors, reports about spurious
			 * machine checks indicate that bank 0 should be
			 * skipped.
			 *
			 * On Intel family 6 processors, the documentation tells
			 * us not to write to MC0_CTL.
			 *
			 */
			if (i == 0 && family == 6) {
				switch (vendor) {
				case X86_VENDOR_AMD:
					skipstatus = B_TRUE;
					/*FALLTHRU*/
				case X86_VENDOR_Intel:
					skipctl = B_TRUE;
					break;
				}
			}
		}

		ctl_skip_mask |= skipctl << i;
		status_skip_mask |= skipstatus << i;

		if (skipctl && skipstatus)
			continue;

		/*
		 * Record which MCA banks were enabled, from the point of view
		 * of the whole chip (if some cores share a bank we must be
		 * sure either can logout from it).
		 */
		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);

#ifndef __xpv
		/*
		 * check CMCI capability
		 *
		 * NOTE(review): the cmi_hdl_rdmsr return values below are
		 * deliberately ignored; on a failed read ctl2 is left
		 * indeterminate/unchanged - confirm this is benign on all
		 * supported models.
		 */
		if (mcg_ctl2_present) {
			uint64_t ctl2;
			uint32_t cap = 0;
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			/*
			 * If CMCI is already enabled on this bank, leave it
			 * alone; note cmci_cap remains 0 for such banks.
			 */
			if (ctl2 & MSR_MC_CTL2_EN)
				continue;
			ctl2 |= MSR_MC_CTL2_EN;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			mca->gcpu_bank_cmci[i].cmci_cap = cap =
			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
			if (cap)
				cmci_capable ++;
			/*
			 * Set threshold to 1 while unsetting the en field, to
			 * avoid CMCI triggered before APIC LVT entry init.
			 * (& binds tighter than |, so this evaluates as
			 * (ctl2 & ~MSR_MC_CTL2_EN) | 1.)
			 */
			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);

			/*
			 * init cmci related count
			 */
			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
			mca->gcpu_bank_cmci[i].drtcmci = 0;
			mca->gcpu_bank_cmci[i].ncmci = 0;
		}
#endif
	}

#ifndef __xpv
	if (cmci_capable)
		cmi_enable_cmci = 1;
#endif

#ifndef __xpv
	/*
	 * Log any valid telemetry lurking in the MCA banks, but do not
	 * clear the status registers.  Ignore the disposition returned -
	 * we have already paniced or reset for any nasty errors found here.
	 *
	 * Intel vol 3A says that we should not do this on family 0x6,
	 * and that for any extended family the BIOS clears things
	 * on power-on reset so you'll only potentially find valid telemetry
	 * on warm reset (we do it for both - on power-on reset we should
	 * just see zeroes).
	 *
	 * AMD docs since K7 say we should process anything we find here.
	 */
	if (!gcpu_suppress_log_on_init &&
	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
	    vendor == X86_VENDOR_AMD))
		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
		    GCPU_MPT_WHAT_POKE_ERR);

	/*
	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
	 * model-specific module the power of veto.
	 */
	for (i = 0; i < nbanks; i++) {
		struct gcpu_bios_bankcfg *bcfgp =
		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;

		/*
		 * Stash inherited bank MCA state, even for banks we will
		 * not initialize ourselves.  Do not read the MISC register
		 * unconditionally - on some processors that will #GP on
		 * banks that do not implement the MISC register (would be
		 * caught by on_trap, anyway).
		 */
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
		    &bcfgp->bios_bank_ctl);

		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &bcfgp->bios_bank_status);

		if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
			    &bcfgp->bios_bank_addr);

		/*
		 * In some old BIOS the status value after boot can indicate
		 * MISCV when there is actually no MISC register for
		 * that bank.  The following read could therefore
		 * aggravate a general protection fault.  This should be
		 * caught by on_trap, but the #GP fault handler is busted
		 * and can suffer a double fault even before we get to
		 * trap() to check for on_trap protection.  Until that
		 * issue is fixed we remove the one access that we know
		 * can cause a #GP.
		 *
		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
		 *	    &bcfgp->bios_bank_misc);
		 */
		bcfgp->bios_bank_misc = 0;

		if (!(ctl_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
			    cms_bankctl_val(hdl, i, -1ULL));
		}

		if (!(status_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
			    cms_bankstatus_val(hdl, i, 0ULL));
		}
	}
#endif
	/*
	 * Now let the model-specific support perform further initialization
	 * of non-architectural features.
	 */
	cms_mca_init(hdl, nbanks);

#ifndef __xpv
	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
	membar_producer();

	/* enable all machine-check features */
	if (mcg_ctl_present)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
		    cms_mcgctl_val(hdl, nbanks, -1ULL));
#endif

	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);

#ifndef __xpv
	/* enable machine-check exception in CR4 */
	cmi_hdl_enable_mce(hdl);
#endif
}
1342 
/*
 * Examine the valid bank telemetry captured in gcl and decide an overall
 * disposition: count PCC/UC errors, allow the model-specific layer
 * (cms_error_action) to soften or harden the per-bank verdicts, then
 * dispatch the logout onto gcpu_mca_queue for logging (or gcpu_bleat if
 * no queue exists).  Per-bank summary counts are returned through mcesp
 * (a local struct is used if the caller passed NULL); the accumulated
 * CMI_ERRDISP_* bits are the return value.
 */
static uint64_t
gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
    gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
{
	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_mce_status_t mce;
	gcpu_bank_logout_t *gbl;
	uint64_t disp = 0;
	int i;

	if (mcesp == NULL)
		mcesp = &mce;

	mcesp->mce_nerr = nerr;

	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;

	/*
	 * If this a machine check then if the return instruction pointer
	 * is not valid the current context is lost.
	 */
	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
		disp |= CMI_ERRDISP_RIPV_INVALID;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t mcistatus = gbl->gbl_status;
		uint32_t ms_scope;
		int pcc, uc;
		int poisoned;

		if (!(mcistatus & MSR_MC_STATUS_VAL))
			continue;

		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
			continue;

		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
		mcesp->mce_npcc += pcc;
		mcesp->mce_nuc += uc;

		/* Let model-specific support widen or narrow the verdict. */
		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);

		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
			pcc = 0;
			mcesp->mce_npcc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
		}

		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
			uc = 0;
			mcesp->mce_nuc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
		}

		/*
		 * poisoned is assigned only when uc is nonzero; every later
		 * read is behind a short-circuiting "uc &&", so it is never
		 * read uninitialized.
		 */
		if (uc) {
			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
			if (poisoned) {
				mcesp->mce_nuc_poisoned++;
				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
			}
		}

		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
			/*
			 * We're not being instructed to ignore the error,
			 * so apply our standard disposition logic to it.
			 */
			if (uc && !poisoned) {
				unconstrained++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_UC_UNCONSTRAINED;
			}

			if (pcc && ismc) {
				curctxbad++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_CURCTXBAD;
			}

			/*
			 * Even if the above may not indicate that the error
			 * is terminal, model-specific support may insist
			 * that we treat it as such.  Such errors will be
			 * fatal even if discovered via poll.
			 */
			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
				forcefatal++;
				mcesp->mce_forcefatal++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_FORCEFATAL;
			}
		} else {
			mcesp->mce_ignored++;
			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
		}
	}

	if (unconstrained > 0)
		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;

	if (curctxbad > 0)
		disp |= CMI_ERRDISP_CURCTXBAD;

	if (forcefatal > 0)
		disp |= CMI_ERRDISP_FORCEFATAL;

	/*
	 * Dispatch for logging; choose sync vs async based on whether the
	 * system is about to panic (panic flow drains the queue itself).
	 */
	if (gcpu_mca_queue != NULL) {
		int how;

		if (ismc) {
			how = cmi_mce_response(rp, disp) ?
			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
			    ERRORQ_SYNC;	/* panic flow will drain */
		} else {
			how = (disp & CMI_ERRDISP_FORCEFATAL &&
			    cmi_panic_on_ue()) ?
			    ERRORQ_SYNC :	/* poller will panic */
			    ERRORQ_ASYNC;	/* no panic */
		}

		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
	} else if (disp != 0) {
		gcpu_bleat(hdl, gcl);
	}

	mcesp->mce_disp = disp;

	return (disp);
}
1478 
1479 /*
1480  * Gather error telemetry from our source, and then submit it for
1481  * processing.
1482  */
1483 
/* Enabled and UC or PCC: this status could produce a #MC on occurrence. */
#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

/* Status values equivalent modulo the OVerflow bit. */
#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))

/* Count of polled status clears deferred by clear_mc() (sic: "deferrred"). */
static uint32_t gcpu_deferrred_polled_clears;
1491 
1492 #ifndef __xpv
1493 static void
1494 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1495     uint64_t status, int what)
1496 {
1497 	uint64_t ctl2;
1498 
1499 	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1500 	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1501 	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
1502 
1503 		if (!(bank_cmci_p->cmci_enabled)) {
1504 			/*
1505 			 * when cmci is disabled, and the bank has no error or
1506 			 * no corrected error for
1507 			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1508 			 * turn on this bank's cmci.
1509 			 */
1510 
1511 			bank_cmci_p->drtcmci ++;
1512 
1513 			if (bank_cmci_p->drtcmci >=
1514 			    gcpu_mca_cmci_reenable_threshold) {
1515 
1516 				/* turn on cmci */
1517 
1518 				(void) cmi_hdl_rdmsr(hdl,
1519 				    IA32_MSR_MC_CTL2(bank), &ctl2);
1520 				ctl2 |= MSR_MC_CTL2_EN;
1521 				(void) cmi_hdl_wrmsr(hdl,
1522 				    IA32_MSR_MC_CTL2(bank), ctl2);
1523 
1524 				/* reset counter and set flag */
1525 				bank_cmci_p->drtcmci = 0;
1526 				bank_cmci_p->cmci_enabled = 1;
1527 			}
1528 		} else {
1529 			/*
1530 			 * when cmci is enabled,if is in cyclic poll and the
1531 			 * bank has no error or no corrected error, reset ncmci
1532 			 * counter
1533 			 */
1534 			bank_cmci_p->ncmci = 0;
1535 		}
1536 	}
1537 }
1538 
1539 static void
1540 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1541     int what)
1542 {
1543 	uint64_t ctl2 = 0;
1544 
1545 	/*
1546 	 * if cmci of this bank occurred beyond
1547 	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1548 	 * turn off this bank's CMCI;
1549 	 */
1550 	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1551 
1552 		/* if it is cmci trap, increase the count */
1553 		bank_cmci_p->ncmci++;
1554 
1555 		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1556 
1557 			/* turn off cmci */
1558 
1559 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1560 			    &ctl2);
1561 			ctl2 &= ~MSR_MC_CTL2_EN;
1562 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1563 			    ctl2);
1564 
1565 			/* clear the flag and count */
1566 
1567 			bank_cmci_p->cmci_enabled = 0;
1568 			bank_cmci_p->ncmci = 0;
1569 		}
1570 	}
1571 }
1572 #endif
1573 
/*
 * Clear MCi_STATUS for banks [first, last] of the current logout gcl,
 * executing a serializing instruction after each bank as Intel Vol 3A
 * recommends.  Native polled observations of #MC candidates are not
 * cleared immediately; the clear is deferred and reconsidered at the
 * next wakeup using the previous logout pgcl (see comments below).
 */
static void
clear_mc(int first, int last, int ismc, boolean_t clrstatus,
    cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
{
	int i;
	gcpu_bank_logout_t *gbl, *pgbl;
	uint64_t status;

	if (first < 0 || last < 0)
		return;

	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
		status = gbl->gbl_status;
		if (status == 0)
			continue;
		if (clrstatus == B_FALSE)
			goto serialize;

		/*
		 * For i86xpv we always clear status in order to invalidate
		 * the interposed telemetry.
		 *
		 * For native machine checks we always clear status here.  For
		 * native polls we must be a little more cautious since there
		 * is an outside chance that we may clear telemetry from a
		 * shared MCA bank on which a sibling core is machine checking.
		 *
		 * For polled observations of errors that look like they may
		 * produce a machine check (UC/PCC and ENabled, although these
		 * do not guarantee a machine check on error occurence)
		 * we will not clear the status at this wakeup unless
		 * we saw the same status at the previous poll.	 We will
		 * always process and log the current observations - it
		 * is only the clearing of MCi_STATUS which may be
		 * deferred until the next wakeup.
		 */
		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
			goto serialize;
		}

		/*
		 * We have a polled observation of a machine check
		 * candidate.  If we saw essentially the same status at the
		 * last poll then clear the status now since this appears
		 * not to be a #MC candidate after all.	 If we see quite
		 * different status now then do not clear, but reconsider at
		 * the next poll.  If no actual machine check clears
		 * the status in the interim then the status should not
		 * keep changing forever (meaning we'd never clear it)
		 * since before long we'll simply have latched the highest-
		 * priority error and set the OVerflow bit.  Nonetheless
		 * we count how many times we defer clearing and after
		 * a while insist on clearing the status.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (pgbl->gbl_clrdefcnt != 0) {
			/* We deferred clear on this bank at last wakeup */
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
			    pgbl->gbl_clrdefcnt > 5) {
				/*
				 * Status is unchanged so clear it now and,
				 * since we have already logged this info,
				 * avoid logging it again.
				 */
				gbl->gbl_status = 0;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			} else {
				/* Record deferral for next wakeup */
				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
			}
		} else {
			/* Record initial deferral for next wakeup */
			gbl->gbl_clrdefcnt = 1;
			gcpu_deferrred_polled_clears++;
		}

serialize:
		{
#ifdef __xpv
			;
#else
			/*
			 * Intel Vol 3A says to execute a serializing
			 * instruction here, ie CPUID.	Well WRMSR is also
			 * defined to be serializing, so the status clear above
			 * should suffice.  To be a good citizen, and since
			 * some clears are deferred, we'll execute a CPUID
			 * instruction here.
			 */
			struct cpuid_regs tmp;
			(void) __cpuid_insn(&tmp);
#endif
		}
	}
}
1671 
/*
 * Capture MCG_STATUS and the STATUS/ADDR/MISC of every active bank into
 * a logout structure, then submit the result to gcpu_mca_process() for
 * disposition and clear bank status via clear_mc() unless we are about
 * to panic.  rp != NULL means we are in #MC context; rp == NULL means a
 * poll of the banks selected by bankmask.  On a poll, status is re-read
 * until a stable snapshot is seen (up to gcpu_mca_telemetry_retries
 * attempts) before giving up and marking CMI_ERRDISP_INCONSISTENT.
 */
/*ARGSUSED5*/
void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
    gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_bank_logout_t *gbl, *pgbl;
	gcpu_logout_t *gcl, *pgcl;
	int ismc = (rp != NULL);
	int ispoll = !ismc;
	int i, nerr = 0;
	cmi_errno_t err;
	uint64_t mcg_status;
	uint64_t disp;
	uint64_t cap;
	int first = -1;
	int last = -1;
	int willpanic = 0;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
	    CMI_SUCCESS) {
		if (mcesp != NULL)
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		return;
	}

	/*
	 * #MC uses the dedicated exception logout; polls ping-pong between
	 * two logout buffers so the previous poll's snapshot (pgcl) is
	 * available for the deferred-clear logic.  NOTE(review): pgcl is
	 * assigned only on the poll path; on the #MC path it is passed
	 * uninitialized to clear_mc(), which never dereferences it when
	 * ismc - confirm intentional.
	 */
	if (ismc) {
		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
	} else {
		int pidx = mca->gcpu_mca_nextpoll_idx;
		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;

		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
	}

	gcl->gcl_timestamp = gethrtime_waitfree();
	gcl->gcl_mcg_status = mcg_status;
	gcl->gcl_ip = rp ? rp->r_pc : 0;

	/*
	 * NOTE(review): GCPU_GCL_F_PRIV is set when USERMODE() is true -
	 * verify the intended polarity against the ereport consumers.
	 */
	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
	if (cap & MCG_CAP_TES_P)
		gcl->gcl_flags |= GCPU_GCL_F_TES_P;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t status, status2, addr, misc;
		int retries = gcpu_mca_telemetry_retries;

		gbl->gbl_status = 0;
		gbl->gbl_disp = 0;
		gbl->gbl_clrdefcnt = 0;

		/*
		 * Only logout from MCA banks we have initialized from at
		 * least one core.  If a core shares an MCA bank with another
		 * but perhaps lost the race to initialize it, then it must
		 * still be allowed to logout from the shared bank.
		 */
		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
			continue;

		/*
		 * On a poll look only at the banks we've been asked to check.
		 */
		if (rp == NULL && !(bankmask & 1 << i))
			continue;


		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
		    CMI_SUCCESS)
			continue;

#ifndef __xpv
		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
#endif

retry:
		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		/* First and last bank that have valid status */
		if (first < 0)
			first = i;
		last = i;

		addr = -1;
		misc = 0;

		if (status & MSR_MC_STATUS_ADDRV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);

		if (status & MSR_MC_STATUS_MISCV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);

#ifndef __xpv
		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
#endif

		/*
		 * Allow the model-specific code to extract bank telemetry.
		 */
		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);

		/*
		 * Not all cpu models assure us that the status/address/misc
		 * data will not change during the above sequence of MSR reads,
		 * or that it can only change by the addition of the OVerflow
		 * bit to the status register.  If the status has changed
		 * other than in the overflow bit then we attempt to reread
		 * for a consistent snapshot, but eventually give up and
		 * go with what we've got.  We only perform this check
		 * for a poll - a further #MC during a #MC will reset, and
		 * polled errors should not overwrite higher-priority
		 * trapping errors (but could set the overflow bit).
		 */
		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &status2)) == CMI_SUCCESS) {
			if (!STATUS_EQV(status, status2)) {
				if (retries-- > 0) {
					status = status2;
					goto retry;
				} else {
					gbl->gbl_disp |=
					    CMI_ERRDISP_INCONSISTENT;
				}
			}
		} else if (ispoll && err != CMI_SUCCESS) {
			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
		}

		nerr++;
		gbl->gbl_status = status;
		gbl->gbl_addr = addr;
		gbl->gbl_misc = misc;

		/*
		 * For polled observation, if the count of deferred status
		 * clears updated in the clear_mc() is nonzero and the
		 * MCi_STATUS has not changed, the last wakeup has produced
		 * the ereport of the error. Therefore, clear the status in
		 * this wakeup to avoid duplicate ereport.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
		    pgbl->gbl_clrdefcnt != 0) {
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
				gbl->gbl_status = 0;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			}
		}
	}

	if (gcpu_mca_stack_flag)
		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
	else
		gcl->gcl_stackdepth = 0;

	/*
	 * Decide our disposition for this error or errors, and submit for
	 * logging and subsequent diagnosis.
	 */
	if (nerr != 0) {
		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);

		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);

		if (!willpanic)
			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
	} else {
		disp = 0;
		if (mcesp) {
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		}
	}

	/*
	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
	 * If a second #MC had occurred before now the system would have
	 * reset.  We can only do this once gcpu_mca_process has copied
	 * the logout structure.
	 */
	if (ismc && mcg_status & MCG_STATUS_MCIP)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);

	/*
	 * At this point we have read and logged all telemetry that is visible
	 * under the MCA.  On architectures for which the NorthBridge is
	 * on-chip this may include NB-observed errors, but where the NB
	 * is off chip it may have been the source of the #MC request and
	 * so we must call into the memory-controller driver to give it
	 * a chance to log errors.
	 */
	if (ismc) {
		cmi_mc_logout(hdl, 1, willpanic);
	}
}
1874 
1875 #ifndef __xpv
/*
 * Debug knob: when nonzero, gcpu_mca_trap() cmn_err()s a one-line
 * summary of each machine check it handles.
 */
int gcpu_mca_trap_vomit_summary = 0;
1877 
1878 /*
1879  * On a native machine check exception we come here from mcetrap via
 * cmi_mca_trap.  A machine check on one cpu of a chip does not trap other
 * cpus of the chip, so it is possible that another cpu on this chip could
 * initiate a poll while we're in the #mc handler;  it is also possible that
 * this trap has occurred during a poll on this cpu.  So we must acquire
1884  * the chip-wide poll lock, but be careful to avoid deadlock.
1885  *
1886  * The 'data' pointer cannot be NULL due to init order.
1887  */
1888 uint64_t
1889 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1890 {
1891 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1892 	kmutex_t *poll_lock = NULL;
1893 	gcpu_mce_status_t mce;
1894 	uint64_t mcg_status;
1895 	int tooklock = 0;
1896 
1897 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1898 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1899 		return (0);
1900 
1901 	/*
1902 	 * Synchronize with any poller from another core that may happen
1903 	 * to share access to one or more of the MCA banks.
1904 	 */
1905 	if (gcpu->gcpu_shared != NULL)
1906 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1907 
1908 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1909 		/*
1910 		 * The lock is not owned by the thread we have
1911 		 * interrupted.  Spin for this adaptive lock.
1912 		 */
1913 		while (!mutex_tryenter(poll_lock)) {
1914 			while (mutex_owner(poll_lock) != NULL)
1915 				;
1916 		}
1917 		tooklock = 1;
1918 	}
1919 
1920 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
1921 
1922 	if (tooklock)
1923 		mutex_exit(poll_lock);
1924 
1925 	/*
1926 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1927 	 */
1928 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1929 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1930 		    "%u PCC (%u ok), "
1931 		    "%u UC (%d ok, %u poisoned), "
1932 		    "%u forcefatal, %u ignored",
1933 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1934 		    mce.mce_npcc, mce.mce_npcc_ok,
1935 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1936 		    mce.mce_forcefatal, mce.mce_ignored);
1937 	}
1938 
1939 	return (mce.mce_disp);
1940 }
1941 #endif
1942 
/*
 * Entry point invoked when a cpu transitions to the faulted state;
 * the generic module keeps no state that needs changing on entry.
 */
/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
	/* Nothing to do here */
}
1949 
1950 /*ARGSUSED*/
1951 void
1952 gcpu_faulted_exit(cmi_hdl_t hdl)
1953 {
1954 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1955 
1956 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1957 }
1958 
1959 /*
1960  * Write the requested values to the indicated MSRs.  Having no knowledge
1961  * of the model-specific requirements for writing to these model-specific
1962  * registers, we will only blindly write to those MSRs if the 'force'
1963  * argument is nonzero.  That option should only be used in prototyping
1964  * and debugging.
1965  */
1966 /*ARGSUSED*/
1967 cmi_errno_t
1968 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1969     int force)
1970 {
1971 	int i, errs = 0;
1972 
1973 	for (i = 0; i < nregs; i++) {
1974 		uint_t msr = regs[i].cmr_msrnum;
1975 		uint64_t val = regs[i].cmr_msrval;
1976 
1977 		if (cms_present(hdl)) {
1978 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1979 				errs++;
1980 		} else if (force) {
1981 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1982 		} else {
1983 			errs++;
1984 		}
1985 	}
1986 
1987 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1988 }
1989