xref: /illumos-gate/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c (revision 75eba5b6d79ed4d2ce3daf7b2806306b6b69a938)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright (c) 2010, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/mca_x86.h>
31 #include <sys/cpu_module_impl.h>
32 #include <sys/cpu_module_ms.h>
33 #include <sys/cmn_err.h>
34 #include <sys/cpuvar.h>
35 #include <sys/pghw.h>
36 #include <sys/x86_archext.h>
37 #include <sys/sysmacros.h>
38 #include <sys/regset.h>
39 #include <sys/privregs.h>
40 #include <sys/systm.h>
41 #include <sys/types.h>
42 #include <sys/log.h>
43 #include <sys/psw.h>
44 #include <sys/fm/protocol.h>
45 #include <sys/fm/util.h>
46 #include <sys/errorq.h>
47 #include <sys/mca_x86.h>
48 #include <sys/fm/cpu/GMCA.h>
49 #include <sys/fm/smb/fmsmb.h>
50 #include <sys/sysevent.h>
51 #include <sys/ontrap.h>
52 
53 #include "gcpu.h"
54 
/*
 * When zero, gcpu_fmri_create() builds the cpu FMRI on top of the SMBIOS
 * base-board FMRI; when nonzero it uses the fixed
 * motherboard/chip/core/strand form.
 */
55 extern int x86gentopo_legacy;	/* x86 generic topology support */
56 
/*
 * Nonzero once gcpu_mca_init() has seen an AMD family 0xf (rev B) or later
 * chip.  When set, MCi_ADDR is reported even if MCi_STATUS.ADDRV is clear.
 */
57 static uint_t gcpu_force_addr_in_payload = 0;
58 
59 /*
60  * Clear to log telemetry found at initialization.  While processor docs
61  * say you should process this telemetry on all but Intel family 0x6
62  * there are way too many exceptions and we want to avoid bogus
63  * diagnoses.
64  */
65 int gcpu_suppress_log_on_init = 1;
66 
67 /*
68  * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
69  * error logout time.  The stack will be included in the ereport if the
70  * error type selects stack inclusion, or in all cases if
71  * gcpu_mca_stack_ereport_include is nonzero.
72  */
73 int gcpu_mca_stack_flag = 0;
74 int gcpu_mca_stack_ereport_include = 0;
75 
76 /*
77  * The number of times to re-read MCA telemetry to try to obtain a
78  * consistent snapshot if we find it to be changing under our feet.
 * gcpu_mca_init() resets this to 5 if it has been tuned outside 0..100.
79  */
80 int gcpu_mca_telemetry_retries = 5;
81 
/*
 * CMCI throttling thresholds; not built for __xpv.
 * NOTE(review): the consumers of these two values are outside this excerpt,
 * so their units are not documented here - confirm at the point of use.
 */
82 #ifndef __xpv
83 int gcpu_mca_cmci_throttling_threshold = 10;
84 int gcpu_mca_cmci_reenable_threshold = 1000;
85 #endif
86 
87 static gcpu_error_disp_t gcpu_errtypes[] = {
88 
89 	/*
90 	 * Unclassified
91 	 */
92 	{
93 		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
94 		NULL,
95 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
96 		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
97 		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
98 	},
99 
100 	/*
101 	 * Microcode ROM Parity Error
102 	 */
103 	{
104 		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
105 		NULL,
106 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
107 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
108 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
109 	},
110 
111 	/*
112 	 * External - BINIT# from another processor during power-on config
113 	 */
114 	{
115 		FM_EREPORT_CPU_GENERIC_EXTERNAL,
116 		NULL,
117 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
118 		MCAX86_SIMPLE_EXTERNAL_MASKON,
119 		MCAX86_SIMPLE_EXTERNAL_MASKOFF
120 	},
121 
122 	/*
123 	 * Functional redundancy check master/slave error
124 	 */
125 	{
126 		FM_EREPORT_CPU_GENERIC_FRC,
127 		NULL,
128 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
129 		MCAX86_SIMPLE_FRC_MASKON,
130 		MCAX86_SIMPLE_FRC_MASKOFF
131 	},
132 
133 	/*
134 	 * Internal parity error
135 	 */
136 	{
137 		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
138 		NULL,
139 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
140 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
141 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
142 	},
143 
144 
145 	/*
146 	 * Internal timer error
147 	 */
148 	{
149 		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
150 		NULL,
151 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
152 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
153 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
154 	},
155 
156 	/*
157 	 * Internal unclassified
158 	 */
159 	{
160 		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
161 		NULL,
162 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
163 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
164 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
165 	},
166 
167 	/*
168 	 * Compound error codes - generic memory hierarchy
169 	 */
170 	{
171 		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
172 		NULL,
173 		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
174 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
175 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
176 	},
177 
178 	/*
179 	 * Compound error codes - TLB errors
180 	 */
181 	{
182 		FM_EREPORT_CPU_GENERIC_TLB,
183 		"%1$s" "TLB" "%2$s" "_ERR",
184 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
185 		MCAX86_COMPOUND_TLB_MASKON,
186 		MCAX86_COMPOUND_TLB_MASKOFF
187 	},
188 
189 	/*
190 	 * Compound error codes - memory hierarchy
191 	 */
192 	{
193 		FM_EREPORT_CPU_GENERIC_MEMHIER,
194 		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
195 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
196 		MCAX86_COMPOUND_MEMHIER_MASKON,
197 		MCAX86_COMPOUND_MEMHIER_MASKOFF
198 	},
199 
200 	/*
201 	 * Compound error codes - bus and interconnect errors
202 	 */
203 	{
204 		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
205 		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
206 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
207 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
208 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
209 	},
210 	/*
211 	 * Compound error codes - memory controller errors
212 	 */
213 	{
214 		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
215 		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
216 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
217 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
218 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
219 	},
220 };
221 
222 static gcpu_error_disp_t gcpu_unknown = {
223 	FM_EREPORT_CPU_GENERIC_UNKNOWN,
224 	"UNKNOWN",
225 	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
226 	0,
227 	0
228 };
229 
/*
 * The errorq whose elements are drained by gcpu_mca_drain().  It is created
 * (and, when a larger element size is requested, destroyed and re-created)
 * by gcpu_errorq_init() while holding gcpu_mca_queue_lock.
 */
230 static errorq_t *gcpu_mca_queue;
231 static kmutex_t gcpu_mca_queue_lock;
232 
/* Compile-time flag: 1 when built with __xpv defined, otherwise 0. */
233 #ifdef __xpv
234 static int isxpv = 1;
235 #else
236 static int isxpv = 0;
237 #endif
238 
239 static const gcpu_error_disp_t *
240 gcpu_disp_match(uint16_t code)
241 {
242 	const gcpu_error_disp_t *ged = gcpu_errtypes;
243 	int i;
244 
245 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
246 	    i++, ged++) {
247 		uint16_t on = ged->ged_errcode_mask_on;
248 		uint16_t off = ged->ged_errcode_mask_off;
249 
250 		if ((code & on) == on && (code & off) == 0)
251 			return (ged);
252 	}
253 
254 	return (NULL);
255 }
256 
/*
 * Extract a bit-field from an error code: keep only the bits selected by
 * mask, then shift the result right by shift bits.
 */
static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	uint16_t field = code & mask;

	return (field >> shift);
}

/*
 * Extract the sub-field called "name" from an error code using the
 * MCAX86_ERRCODE_<name>_MASK and MCAX86_ERRCODE_<name>_SHIFT constants.
 */
#define	BIT_STRIP(code, name)				\
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK,	\
	MCAX86_ERRCODE_##name##_SHIFT)
266 
/*
 * Placeholder mnemonics.  GCPU_MNEMONIC_UNDEF is what gcpu_mnemonic()
 * returns for a value that lies beyond the end of a table;
 * GCPU_MNEMONIC_RESVD fills table slots whose encoding is reserved.
 */
267 #define	GCPU_MNEMONIC_UNDEF	"undefined"
268 #define	GCPU_MNEMONIC_RESVD	"reserved"
269 
270 /*
271  * Mappings of TT, LL, RRRR, PP, II, T, CCCC and MMM values to compound
272  * error name mnemonics and to ereport class name components.  Each table
 * is indexed directly by the value of the corresponding error code field.
273  */
274 
275 struct gcpu_mnexp {
276 	const char *mne_compound;	/* used in expanding compound errname */
277 	const char *mne_ereport;	/* used in expanding ereport class */
278 };
279 
280 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
281 	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
282 	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
283 	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
284 	{ GCPU_MNEMONIC_UNDEF, "" }
285 };
286 
287 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
288 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
289 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
290 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
291 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
292 };
293 
294 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
295 	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
296 	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
297 	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
298 	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
299 	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
300 	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
301 	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
302 	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
303 	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
304 };
305 
306 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
307 	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
308 	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
309 	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
310 	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
311 };
312 
313 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
314 	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
315 	{ GCPU_MNEMONIC_RESVD, "" },
316 	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
317 	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
318 };
319 
320 static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
321 	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
322 	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
323 };
324 
325 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
326 	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
327 	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
328 	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
329 	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
330 	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
331 	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
332 	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
333 	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
334 	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
335 	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
336 	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
337 	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
338 	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
339 	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
340 	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
341 	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
342 };
343 
344 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
345 	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
346 	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
347 	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
348 	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
349 	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
350 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
351 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
352 	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
353 };
354 
355 enum gcpu_mn_namespace {
356 	GCPU_MN_NAMESPACE_COMPOUND,
357 	GCPU_MN_NAMESPACE_EREPORT
358 };
359 
360 static const char *
361 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
362     enum gcpu_mn_namespace nspace)
363 {
364 	if (val >= tbl_sz || val > 0xff)
365 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
366 
367 	switch (nspace) {
368 	case GCPU_MN_NAMESPACE_COMPOUND:
369 		return (tbl[val].mne_compound);
370 		/*NOTREACHED*/
371 
372 	case GCPU_MN_NAMESPACE_EREPORT:
373 		return (tbl[val].mne_ereport);
374 		/*NOTREACHED*/
375 
376 	default:
377 		return (GCPU_MNEMONIC_UNDEF);
378 		/*NOTREACHED*/
379 	}
380 }
381 
382 /*
383  * The ereport class leaf component is either a simple string with no
384  * format specifiers, or a string with one or more embedded %n$s specifiers -
385  * positional selection for string arguments.  The kernel snprintf does
386  * not support %n$ (and teaching it to do so is too big a headache) so
387  * we will expand this restricted format string ourselves.
388  */
389 
390 #define	GCPU_CLASS_VARCOMPS	9
391 
392 #define	GCPU_MNEMONIC(code, name, nspace) \
393 	gcpu_mnemonic(gcpu_##name##_mnemonics, \
394 	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
395 	BIT_STRIP(code, name), nspace)
396 
397 static void
398 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
399     enum gcpu_mn_namespace nspace)
400 {
401 	uint16_t code = MCAX86_ERRCODE(status);
402 	const char *mn[GCPU_CLASS_VARCOMPS];
403 	char *p = buf;			/* current position in buf */
404 	char *q = buf + buflen;		/* pointer past last char in buf */
405 	int which, expfmtchar, error;
406 	char c;
407 
408 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
409 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
410 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
411 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
412 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
413 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
414 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
415 	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
416 	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
417 
418 	while (p < q - 1 && (c = *fmt++) != '\0') {
419 		if (c != '%') {
420 			/* not the beginning of a format specifier - copy */
421 			*p++ = c;
422 			continue;
423 		}
424 
425 		error = 0;
426 		which = -1;
427 		expfmtchar = -1;
428 
429 nextfmt:
430 		if ((c = *fmt++) == '\0')
431 			break;	/* early termination of fmt specifier */
432 
433 		switch (c) {
434 		case '1':
435 		case '2':
436 		case '3':
437 		case '4':
438 		case '5':
439 		case '6':
440 		case '7':
441 		case '8':
442 		case '9':
443 			if (which != -1) { /* allow only one positional digit */
444 				error++;
445 				break;
446 			}
447 			which = c - '1';
448 			goto nextfmt;
449 			/*NOTREACHED*/
450 
451 		case '$':
452 			if (which == -1) { /* no position specified */
453 				error++;
454 				break;
455 			}
456 			expfmtchar = 's';
457 			goto nextfmt;
458 			/*NOTREACHED*/
459 
460 		case 's':
461 			if (expfmtchar != 's') {
462 				error++;
463 				break;
464 			}
465 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
466 			    mn[which]);
467 			p += strlen(p);
468 			break;
469 
470 		default:
471 			error++;
472 			break;
473 		}
474 
475 		if (error)
476 			break;
477 	}
478 
479 	*p = '\0';	/* NUL termination */
480 }
481 
482 static void
483 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
484     const char *cpuclass, const char *leafclass)
485 {
486 	char *p = buf;			/* current position in buf */
487 	char *q = buf + buflen;		/* pointer past last char in buf */
488 
489 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
490 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
491 
492 	p += strlen(p);
493 	if (p >= q)
494 		return;
495 
496 	if (leafclass == NULL) {
497 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
498 		    GCPU_MN_NAMESPACE_EREPORT);
499 	} else {
500 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
501 		    leafclass);
502 	}
503 }
504 
505 /*
506  * Create an "hc" scheme FMRI identifying the given cpu with
507  * motherboard/chip/core/strand instance numbers.
508  */
509 static nvlist_t *
510 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
511 {
512 	nvlist_t *nvl, *fmri;
513 
514 	if ((nvl = fm_nvlist_create(nva)) == NULL)
515 		return (NULL);
516 
517 	if (!x86gentopo_legacy) {
518 		fmri = cmi_hdl_smb_bboard(hdl);
519 		if (fmri == NULL)
520 			return (NULL);
521 
522 		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
523 		    NULL, NULL, fmri, 3,
524 		    "chip", cmi_hdl_smb_chipid(hdl),
525 		    "core", cmi_hdl_coreid(hdl),
526 		    "strand", cmi_hdl_strandid(hdl));
527 	} else {
528 		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
529 		    "motherboard", 0,
530 		    "chip", cmi_hdl_chipid(hdl),
531 		    "core", cmi_hdl_coreid(hdl),
532 		    "strand", cmi_hdl_strandid(hdl));
533 	}
534 
535 	return (nvl);
536 }
537 
538 int gcpu_bleat_count_thresh = 5;
539 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
540 
541 /*
542  * Called when we are unable to propogate a logout structure onto an
543  * errorq for subsequent ereport preparation and logging etc.  The caller
544  * should usually only decide to call this for severe errors - those we
545  * suspect we may need to panic for.
546  */
547 static void
548 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
549 {
550 	hrtime_t now  = gethrtime_waitfree();
551 	static hrtime_t gcpu_last_bleat;
552 	gcpu_bank_logout_t *gbl;
553 	static int bleatcount;
554 	int i;
555 
556 	/*
557 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
558 	 * can come as fast as we like, but once we've spammed that many
559 	 * to the console we require a minimum interval to pass before
560 	 * any more complaints.
561 	 */
562 	if (++bleatcount > gcpu_bleat_count_thresh) {
563 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
564 			return;
565 		else
566 			bleatcount = 0;
567 	}
568 	gcpu_last_bleat = now;
569 
570 	cmn_err(CE_WARN,
571 	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
572 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
573 	    cmi_hdl_strandid(hdl));
574 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
575 	    (u_longlong_t)gcl->gcl_mcg_status);
576 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
577 		uint64_t status = gbl->gbl_status;
578 
579 		if (!(status & MSR_MC_STATUS_VAL))
580 			continue;
581 
582 		/* Force ADDRV for AMD Family 0xf and above */
583 		if (gcpu_force_addr_in_payload)
584 			status = status | MSR_MC_STATUS_ADDRV;
585 
586 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
587 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
588 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
589 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
590 			    i, IA32_MSR_MC(i, STATUS),
591 			    (u_longlong_t)gbl->gbl_status,
592 			    (u_longlong_t)gbl->gbl_addr,
593 			    (u_longlong_t)gbl->gbl_misc);
594 			break;
595 
596 		case MSR_MC_STATUS_ADDRV:
597 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
598 			    "STAT 0x%016llx ADDR 0x%016llx",
599 			    i, IA32_MSR_MC(i, STATUS),
600 			    (u_longlong_t)gbl->gbl_status,
601 			    (u_longlong_t)gbl->gbl_addr);
602 			break;
603 
604 		case MSR_MC_STATUS_MISCV:
605 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
606 			    "STAT 0x%016llx MISC 0x%016llx",
607 			    i, IA32_MSR_MC(i, STATUS),
608 			    (u_longlong_t)gbl->gbl_status,
609 			    (u_longlong_t)gbl->gbl_misc);
610 			break;
611 
612 		default:
613 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
614 			    "STAT 0x%016llx",
615 			    i, IA32_MSR_MC(i, STATUS),
616 			    (u_longlong_t)gbl->gbl_status);
617 			break;
618 
619 		}
620 	}
621 }
622 
623 #define	_GCPU_BSTATUS(status, what) \
624 	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
625 	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
626 
627 static void
628 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
629     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
630 {
631 	uint64_t members = ged ? ged->ged_ereport_members :
632 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
633 	uint64_t mcg = gcl->gcl_mcg_status;
634 	int mcip = mcg & MCG_STATUS_MCIP;
635 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
636 	uint64_t bstat = gbl->gbl_status;
637 
638 	/*
639 	 * Include the compound error name if requested and if this
640 	 * is a compound error type.
641 	 */
642 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
643 	    ged->ged_compound_fmt != NULL) {
644 		char buf[FM_MAX_CLASS];
645 
646 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
647 		    GCPU_MN_NAMESPACE_COMPOUND);
648 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
649 		    DATA_TYPE_STRING, buf, NULL);
650 	}
651 
652 	/*
653 	 * Include disposition information for this error
654 	 */
655 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
656 	    gbl->gbl_disp != 0) {
657 		int i, empty = 1;
658 		char buf[128];
659 		char *p = buf, *q = buf + 128;
660 		static struct _gcpu_disp_name {
661 			uint64_t dv;
662 			const char *dn;
663 		} disp_names[] = {
664 			{ CMI_ERRDISP_CURCTXBAD,
665 			    "processor_context_corrupt" },
666 			{ CMI_ERRDISP_RIPV_INVALID,
667 			    "return_ip_invalid" },
668 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
669 			    "unconstrained" },
670 			{ CMI_ERRDISP_FORCEFATAL,
671 			    "forcefatal" },
672 			{ CMI_ERRDISP_IGNORED,
673 			    "ignored" },
674 			{ CMI_ERRDISP_PCC_CLEARED,
675 			    "corrupt_context_cleared" },
676 			{ CMI_ERRDISP_UC_CLEARED,
677 			    "uncorrected_data_cleared" },
678 			{ CMI_ERRDISP_POISONED,
679 			    "poisoned" },
680 			{ CMI_ERRDISP_INCONSISTENT,
681 			    "telemetry_unstable" },
682 		};
683 
684 		for (i = 0; i < sizeof (disp_names) /
685 		    sizeof (struct _gcpu_disp_name); i++) {
686 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
687 				continue;
688 
689 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
690 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
691 			p += strlen(p);
692 			empty = 0;
693 		}
694 
695 		if (p != buf)
696 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
697 			    DATA_TYPE_STRING, buf, NULL);
698 	}
699 
700 	/*
701 	 * If MCG_STATUS is included add that and an indication of whether
702 	 * this ereport was the result of a machine check or poll.
703 	 */
704 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
705 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
706 		    DATA_TYPE_UINT64, mcg, NULL);
707 
708 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
709 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
710 	}
711 
712 	/*
713 	 * If an instruction pointer is to be included add one provided
714 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
715 	 */
716 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
717 	    mcg & MCG_STATUS_EIPV) {
718 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
719 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
720 	}
721 
722 	/*
723 	 * Add an indication of whether the trap occured during privileged code.
724 	 */
725 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
726 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
727 		    DATA_TYPE_BOOLEAN_VALUE,
728 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
729 	}
730 
731 	/*
732 	 * If requested, add the index of the MCA bank.  This indicates the
733 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
734 	 * to MCi_* - use the bank offset to correlate
735 	 */
736 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
737 		fm_payload_set(ereport,
738 		    /* Bank number */
739 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
740 		    /* Offset of MCi_CTL */
741 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
742 		    IA32_MSR_MC(bankno, CTL),
743 		    NULL);
744 	}
745 
746 	/*
747 	 * Add MCi_STATUS if requested, and decode it.
748 	 */
749 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
750 		const char *tbes[] = {
751 			"No tracking",			/* 00 */
752 			"Green - below threshold",	/* 01 */
753 			"Yellow - above threshold",	/* 10 */
754 			"Reserved"			/* 11 */
755 		};
756 
757 		fm_payload_set(ereport,
758 		    /* Bank MCi_STATUS */
759 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
760 		    /* Overflow? */
761 		    _GCPU_BSTATUS(bstat, OVER),
762 		    /* Uncorrected? */
763 		    _GCPU_BSTATUS(bstat, UC),
764 		    /* Enabled? */
765 		    _GCPU_BSTATUS(bstat, EN),
766 		    /* Processor context corrupt? */
767 		    _GCPU_BSTATUS(bstat, PCC),
768 		    /* Error code */
769 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
770 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
771 		    /* Model-specific error code */
772 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
773 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
774 		    NULL);
775 
776 		/*
777 		 * If MCG_CAP.TES_P indicates that that thresholding info
778 		 * is present in the architural component of the bank status
779 		 * then include threshold information for this bank.
780 		 */
781 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
782 			fm_payload_set(ereport,
783 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
784 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
785 			    NULL);
786 		}
787 	}
788 
789 	/*
790 	 * Add MCi_ADDR info if requested and valid. We force addition of
791 	 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
792 	 * to aid in analysis of ereports, for WatchDog errors.
793 	 */
794 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
795 	    ((bstat & MSR_MC_STATUS_ADDRV) ||
796 	    gcpu_force_addr_in_payload)) {
797 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
798 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
799 	}
800 
801 	/*
802 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
803 	 */
804 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
805 	    bstat & MSR_MC_STATUS_MISCV) {
806 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
807 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
808 	}
809 
810 }
811 
812 /*
813  * Construct and post an ereport based on the logout information from a
814  * single MCA bank.  We are not necessarily running on the cpu that
815  * detected the error.
816  */
817 static void
818 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
819     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
820 {
821 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
822 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
823 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
824 	const char *cpuclass = NULL, *leafclass = NULL;
825 	uint16_t code = MCAX86_ERRCODE(status);
826 	errorq_elem_t *eqep, *scr_eqep;
827 	nvlist_t *ereport, *detector;
828 	char buf[FM_MAX_CLASS];
829 	const char *classfmt;
830 	nv_alloc_t *nva;
831 
832 	if (panicstr) {
833 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
834 			return;
835 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
836 
837 		/*
838 		 * Allocate another element for scratch space, but fallback
839 		 * to the one we have if that fails.  We'd like to use the
840 		 * additional scratch space for nvlist construction.
841 		 */
842 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
843 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
844 		else
845 			nva = errorq_elem_nva(ereport_errorq, eqep);
846 	} else {
847 		ereport = fm_nvlist_create(NULL);
848 		nva = NULL;
849 	}
850 
851 	if (ereport == NULL)
852 		return;
853 
854 	/*
855 	 * Common payload data required by the protocol:
856 	 *	- ereport class
857 	 *	- detector
858 	 *	- ENA
859 	 */
860 
861 	/*
862 	 * Ereport class - call into model-specific support to allow it to
863 	 * provide a cpu class or leaf class, otherwise calculate our own.
864 	 */
865 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
866 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
867 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
868 	    leafclass);
869 
870 	/*
871 	 * The detector FMRI.
872 	 */
873 	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
874 	    nva)) == NULL)
875 		detector = gcpu_fmri_create(hdl, nva);
876 
877 	/*
878 	 * Should we define a new ENA format 3?? for chip/core/strand?
879 	 * It will be better when virtualized.
880 	 */
881 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
882 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
883 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
884 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
885 
886 	if (panicstr) {
887 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
888 		nv_alloc_reset(nva);
889 	} else {
890 		fm_nvlist_destroy(detector, FM_NVA_FREE);
891 	}
892 
893 	/*
894 	 * Add the architectural ereport class-specific payload data.
895 	 */
896 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
897 
898 	/*
899 	 * Allow model-specific code to add ereport members.
900 	 */
901 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
902 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
903 
904 	/*
905 	 * Include stack if options is turned on and either selected in
906 	 * the payload member bitmask or inclusion is forced.
907 	 */
908 	if (gcpu_mca_stack_flag &&
909 	    (cms_ereport_includestack(hdl, mscookie) ==
910 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
911 		fm_payload_stack_add(ereport, gcl->gcl_stack,
912 		    gcl->gcl_stackdepth);
913 	}
914 
915 	/*
916 	 * If injection has taken place anytime in the past then note this
917 	 * on the ereport.
918 	 */
919 	if (cmi_inj_tainted() == B_TRUE) {
920 		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
921 		    B_TRUE, NULL);
922 	}
923 
924 	/*
925 	 * Post ereport.
926 	 */
927 	if (panicstr) {
928 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
929 		if (scr_eqep)
930 			errorq_cancel(ereport_errorq, scr_eqep);
931 	} else {
932 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
933 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
934 	}
935 
936 }
937 
938 /*ARGSUSED*/
939 void
940 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
941 {
942 	const gcpu_logout_t *gcl = data;
943 	const gcpu_bank_logout_t *gbl;
944 	int ismc;
945 	int i;
946 
947 	ismc = gcl->ismc;
948 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
949 		const gcpu_error_disp_t *gened;
950 		cms_cookie_t mscookie;
951 
952 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
953 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
954 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
955 
956 			/*
957 			 * Perform a match based on IA32 MCA architectural
958 			 * components alone.
959 			 */
960 			gened = gcpu_disp_match(code); /* may be NULL */
961 
962 			/*
963 			 * Now see if an model-specific match can be made.
964 			 */
965 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
966 			    i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
967 			    gcl->gcl_ms_logout);
968 
969 			/*
970 			 * Prepare and dispatch an ereport for logging and
971 			 * diagnosis.
972 			 */
973 			gcpu_ereport_post(gcl, i, gened, mscookie,
974 			    gbl->gbl_status);
975 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
976 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
977 			/*
978 			 * Telemetry kept changing as we tried to read
979 			 * it.  Force an unknown ereport leafclass but
980 			 * keep the telemetry unchanged for logging.
981 			 */
982 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
983 			    gbl->gbl_status);
984 		}
985 	}
986 }
987 
988 static size_t gcpu_mca_queue_datasz = 0;
989 
990 /*
991  * The following code is ready to make a weak attempt at growing the
992  * errorq structure size.  Since it is not foolproof (we don't know
993  * who may already be producing to the outgoing errorq) our caller
994  * instead assures that we'll always be called with no greater data
995  * size than on our first call.
996  */
997 static void
998 gcpu_errorq_init(size_t datasz)
999 {
1000 	int slots;
1001 
1002 	mutex_enter(&gcpu_mca_queue_lock);
1003 
1004 	if (gcpu_mca_queue_datasz >= datasz) {
1005 		mutex_exit(&gcpu_mca_queue_lock);
1006 		return;
1007 	}
1008 
1009 	membar_producer();
1010 	if (gcpu_mca_queue) {
1011 		gcpu_mca_queue_datasz = 0;
1012 		errorq_destroy(gcpu_mca_queue);
1013 	}
1014 
1015 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1016 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1017 
1018 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1019 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
1020 
1021 	if (gcpu_mca_queue != NULL)
1022 		gcpu_mca_queue_datasz = datasz;
1023 
1024 	mutex_exit(&gcpu_mca_queue_lock);
1025 }
1026 
1027 /*
1028  * Perform MCA initialization as described in section 14.6 of Intel 64
1029  * and IA-32 Architectures Software Developer's Manual Volume 3A.
1030  */
1031 
/*
 * Bank count of the first cpu to pass through gcpu_mca_init() with a
 * nonzero count.  A later cpu reporting more banks than this is left
 * uninitialized (see the nbanks check in gcpu_mca_init()).
 */
1032 static uint_t global_nbanks;
1033 
/*
 * gcpu_mca_init - set up machine-check support for the cpu behind 'hdl'.
 *
 * Returns without doing anything further if the handle has no gcpu data,
 * if IA32_MCG_CAP cannot be read, or if the reported bank count is zero
 * or larger than global_nbanks.
 *
 * Otherwise, in order:
 *  - allocates the saved-BIOS-config array, GCPU_MCA_LOGOUT_NUM logout
 *    areas and (native only) the per-bank CMCI state;
 *  - creates the shared errorq via gcpu_errorq_init();
 *  - under gcpus_cfglock: initializes the poller, decides per bank
 *    whether MCi_CTL / MCi_STATUS writes are skipped, records active
 *    banks, and (native only) probes CMCI capability, optionally logs
 *    telemetry already present, stashes the inherited bank state, writes
 *    MCi_CTL / MCi_STATUS, calls cms_mca_init(), clears MCG_STATUS and
 *    writes MCG_CTL if present;
 *  - (native only) finally enables machine-check exceptions.
 */
1034 void
1035 gcpu_mca_init(cmi_hdl_t hdl)
1036 {
1037 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1038 	uint64_t cap;
1039 	uint_t vendor = cmi_hdl_vendor(hdl);
1040 	uint_t family = cmi_hdl_family(hdl);
1041 	uint_t rev = cmi_hdl_chiprev(hdl);
1042 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1043 	int mcg_ctl_present;
1044 	uint_t nbanks;
1045 	uint32_t ctl_skip_mask = 0;
1046 	uint32_t status_skip_mask = 0;
1047 	size_t mslsz;
1048 	int i;
1049 #ifndef __xpv
1050 	int mcg_ctl2_present;
1051 	uint32_t cmci_capable = 0;
1052 #endif
1053 	if (gcpu == NULL)
1054 		return;
1055 
1056 	/* We add MCi_ADDR always for AMD Family 0xf and above */
1057 	if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
1058 		gcpu_force_addr_in_payload = 1;
1059 
1060 	/*
1061 	 * Protect from some silly /etc/system settings.
1062 	 */
1063 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1064 		gcpu_mca_telemetry_retries = 5;
1065 
1066 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1067 		return;
1068 
1069 	/*
1070 	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1071 	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
1072 	 * processors, which have their own more primitive way of doing
1073 	 * machine checks, will not have cmi_mca_init called since their
1074 	 * CPUID information will not indicate both MCA and MCE features.
1075 	 */
1076 	ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));
1077 
1078 	/*
1079 	 * Determine whether the IA32_MCG_CTL register is present.  If it
1080 	 * is we will enable all features by writing -1 to it towards
1081 	 * the end of this initialization;  if it is absent then volume 3A
1082 	 * says we must nonetheless continue to initialize the individual
1083 	 * banks.
1084 	 */
1085 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
1086 #ifndef __xpv
1087 	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1088 #endif
1089 
1090 	/*
1091 	 * We squirrel values away for inspection/debugging.
1092 	 */
1093 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1094 	if (mcg_ctl_present)
1095 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1096 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1097 
1098 	/*
1099 	 * Determine the number of error-reporting banks implemented.
1100 	 */
1101 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1102 
1103 	if (nbanks != 0 && global_nbanks == 0)
1104 		global_nbanks = nbanks;	/* no race - BSP will get here first */
1105 
1106 	/*
1107 	 * If someone is hiding the number of banks (perhaps we are fully
1108 	 * virtualized?) or if this processor has more banks than the
1109 	 * first to set global_nbanks then bail.  The latter requirement
1110 	 * is because we need to size our errorq data structure and we
1111 	 * don't want to have to grow the errorq (destroy and recreate)
1112 	 * which may just lose some telemetry.
1113 	 */
1114 	if (nbanks == 0 || nbanks > global_nbanks)
1115 		return;
1116 
1117 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1118 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1119 
1120 	/*
1121 	 * Calculate the size we need to allocate for a gcpu_logout_t
1122 	 * with a gcl_data array big enough for all banks of this cpu.
1123 	 * Add any space requested by the model-specific logout support.
1124 	 */
	/*
	 * NOTE(review): the (nbanks - 1) term presumably accounts for one
	 * gcpu_bank_logout_t already being part of sizeof (gcpu_logout_t) -
	 * confirm against the structure definition in gcpu.h.
	 */
1125 	mslsz = cms_logout_size(hdl);
1126 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1127 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1128 
1129 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1130 		gcpu_logout_t *gcl;
1131 
1132 		mca->gcpu_mca_logout[i] = gcl =
1133 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1134 		gcl->gcl_gcpu = gcpu;
1135 		gcl->gcl_nbanks = nbanks;
		/*
		 * The model-specific logout area, when one was requested,
		 * starts immediately after the nbanks bank entries.
		 */
1136 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1137 		    (char *)(&gcl->gcl_data[0]) + nbanks *
1138 		    sizeof (gcpu_bank_logout_t);
1139 
1140 	}
1141 
1142 #ifdef __xpv
1143 	gcpu_xpv_mca_init(nbanks);
1144 #endif
1145 
1146 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1147 
1148 #ifndef __xpv
1149 	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1150 	    KM_SLEEP);
1151 #endif
1152 
1153 	/*
1154 	 * Create our errorq to transport the logout structures.  This
1155 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1156 	 */
1157 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1158 
1159 	/*
1160 	 * Not knowing which, if any, banks are shared between cores we
1161 	 * assure serialization of MCA bank initialization by each cpu
1162 	 * on the chip.  On chip architectures in which some banks are
1163 	 * shared this will mean the shared resource is initialized more
1164 	 * than once - we're simply aiming to avoid simultaneous MSR writes
1165 	 * to the shared resource.
1166 	 *
1167 	 * Even with these precautions, some platforms may yield a GP fault
1168 	 * if a core other than a designated master tries to write anything
1169 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1170 	 * those writes under on_trap protection.
1171 	 */
1172 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1173 
1174 	/*
1175 	 * Initialize poller data, but don't start polling yet.
1176 	 */
1177 	gcpu_mca_poll_init(hdl);
1178 
1179 	/*
1180 	 * Work out which MCA banks we will initialize.  In MCA logout
1181 	 * code we will only read those banks which we initialize here.
1182 	 */
1183 	for (i = 0; i < nbanks; i++) {
1184 		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1185 		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1186 
1187 		if (!cms_present(hdl)) {
1188 			/*
1189 			 * Model-specific support is not present, try to use
1190 			 * sane defaults.
1191 			 *
1192 			 * On AMD family 6 processors, reports about spurious
1193 			 * machine checks indicate that bank 0 should be
1194 			 * skipped.
1195 			 *
1196 			 * On Intel family 6 processors, the documentation tells
1197 			 * us not to write to MC0_CTL.
1198 			 *
1199 			 */
1200 			if (i == 0 && family == 6) {
1201 				switch (vendor) {
1202 				case X86_VENDOR_AMD:
1203 					skipstatus = B_TRUE;
1204 					/*FALLTHRU*/
1205 				case X86_VENDOR_Intel:
1206 					skipctl = B_TRUE;
1207 					break;
1208 				}
1209 			}
1210 		}
1211 
		/*
		 * Remember the per-bank decisions; they are consulted by
		 * the MCi_CTL / MCi_STATUS write loop further down.
		 */
1212 		ctl_skip_mask |= skipctl << i;
1213 		status_skip_mask |= skipstatus << i;
1214 
		/* A bank we write neither register of is not marked active. */
1215 		if (skipctl && skipstatus)
1216 			continue;
1217 
1218 		/*
1219 		 * Record which MCA banks were enabled, from the point of view
1220 		 * of the whole chip (if some cores share a bank we must be
1221 		 * sure either can logout from it).
1222 		 */
1223 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1224 
1225 #ifndef __xpv
1226 		/*
1227 		 * check CMCI capability
1228 		 */
1229 		if (mcg_ctl2_present) {
1230 			uint64_t ctl2;
1231 			uint32_t cap = 0;
1232 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			/*
			 * EN is already set for this bank: leave its CMCI
			 * state as found and move on to the next bank.
			 */
1233 			if (ctl2 & MSR_MC_CTL2_EN)
1234 				continue;
			/*
			 * Probe: try to set EN and read the register back.
			 * The bank is recorded as CMCI capable only if the
			 * bit reads back as set.
			 */
1235 			ctl2 |= MSR_MC_CTL2_EN;
1236 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1237 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1238 			mca->gcpu_bank_cmci[i].cmci_cap = cap =
1239 			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1240 			if (cap)
1241 				cmci_capable ++;
1242 			/*
1243 			 * Set threshold to 1 while unset the en field, to avoid
1244 			 * CMCI triggered before APIC LVT entry init.
1245 			 */
1246 			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
1247 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1248 
1249 			/*
1250 			 * init cmci related count
1251 			 */
1252 			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1253 			mca->gcpu_bank_cmci[i].drtcmci = 0;
1254 			mca->gcpu_bank_cmci[i].ncmci = 0;
1255 		}
1256 #endif
1257 	}
1258 
1259 #ifndef __xpv
	/* At least one bank accepted the EN bit: flag CMCI as usable. */
1260 	if (cmci_capable)
1261 		cmi_enable_cmci = 1;
1262 #endif
1263 
1264 #ifndef __xpv
1265 	/*
1266 	 * Log any valid telemetry lurking in the MCA banks, but do not
1267 	 * clear the status registers.  Ignore the disposition returned -
1268 	 * we have already panicked or reset for any nasty errors found here.
1269 	 *
1270 	 * Intel vol 3A says that we should not do this on family 0x6,
1271 	 * and that for any extended family the BIOS clears things
1272 	 * on power-on reset so you'll only potentially find valid telemetry
1273 	 * on warm reset (we do it for both - on power-on reset we should
1274 	 * just see zeroes).
1275 	 *
1276 	 * AMD docs since K7 say we should process anything we find here.
1277 	 */
	/*
	 * The logout is made with no trap registers (so it takes the poll
	 * path), with every bank selected, and with status clearing off.
	 */
1278 	if (!gcpu_suppress_log_on_init &&
1279 	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
1280 	    vendor == X86_VENDOR_AMD))
1281 		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1282 		    GCPU_MPT_WHAT_POKE_ERR);
1283 
1284 	/*
1285 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1286 	 * model-specific module the power of veto.
1287 	 */
1288 	for (i = 0; i < nbanks; i++) {
1289 		struct gcpu_bios_bankcfg *bcfgp =
1290 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1291 
1292 		/*
1293 		 * Stash inherited bank MCA state, even for banks we will
1294 		 * not initialize ourselves.  Do not read the MISC register
1295 		 * unconditionally - on some processors that will #GP on
1296 		 * banks that do not implement the MISC register (would be
1297 		 * caught by on_trap, anyway).
1298 		 */
1299 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1300 		    &bcfgp->bios_bank_ctl);
1301 
1302 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1303 		    &bcfgp->bios_bank_status);
1304 
1305 		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
1306 		    gcpu_force_addr_in_payload) {
1307 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1308 			    &bcfgp->bios_bank_addr);
1309 		}
1310 
1311 		/*
1312 		 * In some old BIOS the status value after boot can indicate
1313 		 * MISCV when there is actually no MISC register for
1314 		 * that bank.  The following read could therefore
1315 		 * aggravate a general protection fault.  This should be
1316 		 * caught by on_trap, but the #GP fault handler is busted
1317 		 * and can suffer a double fault even before we get to
1318 		 * trap() to check for on_trap protection.  Until that
1319 		 * issue is fixed we remove the one access that we know
1320 		 * can cause a #GP.
1321 		 *
1322 		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1323 		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1324 		 *	    &bcfgp->bios_bank_misc);
1325 		 */
1326 		bcfgp->bios_bank_misc = 0;
1327 
		/*
		 * The values written are whatever the model-specific
		 * support returns when offered all-ones for MCi_CTL and
		 * zero for MCi_STATUS.
		 */
1328 		if (!(ctl_skip_mask & (1 << i))) {
1329 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1330 			    cms_bankctl_val(hdl, i, -1ULL));
1331 		}
1332 
1333 		if (!(status_skip_mask & (1 << i))) {
1334 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1335 			    cms_bankstatus_val(hdl, i, 0ULL));
1336 		}
1337 	}
1338 #endif
1339 	/*
1340 	 * Now let the model-specific support perform further initialization
1341 	 * of non-architectural features.
1342 	 */
1343 	cms_mca_init(hdl, nbanks);
1344 
1345 #ifndef __xpv
1346 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1347 	membar_producer();
1348 
1349 	/* enable all machine-check features */
1350 	if (mcg_ctl_present)
1351 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1352 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1353 #endif
1354 
1355 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1356 
1357 #ifndef __xpv
1358 	/* enable machine-check exception in CR4 */
1359 	cmi_hdl_enable_mce(hdl);
1360 #endif
1361 }
1362 
1363 static uint64_t
1364 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1365     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1366 {
1367 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1368 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1369 	int nbanks = mca->gcpu_mca_nbanks;
1370 	gcpu_mce_status_t mce;
1371 	gcpu_bank_logout_t *gbl;
1372 	uint64_t disp = 0;
1373 	int i;
1374 
1375 	if (mcesp == NULL)
1376 		mcesp = &mce;
1377 
1378 	mcesp->mce_nerr = nerr;
1379 
1380 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1381 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1382 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1383 
1384 	/*
1385 	 * If this a machine check then if the return instruction pointer
1386 	 * is not valid the current context is lost.
1387 	 */
1388 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1389 		disp |= CMI_ERRDISP_RIPV_INVALID;
1390 	gcl->ismc = ismc;
1391 
1392 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1393 		uint64_t mcistatus = gbl->gbl_status;
1394 		uint32_t ms_scope;
1395 		int pcc, uc;
1396 		int poisoned;
1397 
1398 		if (!(mcistatus & MSR_MC_STATUS_VAL))
1399 			continue;
1400 
1401 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1402 			continue;
1403 
1404 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1405 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1406 		mcesp->mce_npcc += pcc;
1407 		mcesp->mce_nuc += uc;
1408 
1409 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1410 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1411 
1412 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1413 			pcc = 0;
1414 			mcesp->mce_npcc_ok++;
1415 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1416 		}
1417 
1418 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1419 			uc = 0;
1420 			mcesp->mce_nuc_ok++;
1421 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1422 		}
1423 
1424 		if (uc) {
1425 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1426 			if (poisoned) {
1427 				mcesp->mce_nuc_poisoned++;
1428 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1429 			}
1430 		}
1431 
1432 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1433 			/*
1434 			 * We're not being instructed to ignore the error,
1435 			 * so apply our standard disposition logic to it.
1436 			 */
1437 			if (uc && !poisoned) {
1438 				unconstrained++;
1439 				gbl->gbl_disp |= disp |
1440 				    CMI_ERRDISP_UC_UNCONSTRAINED;
1441 			}
1442 
1443 			if (pcc && ismc) {
1444 				curctxbad++;
1445 				gbl->gbl_disp |= disp |
1446 				    CMI_ERRDISP_CURCTXBAD;
1447 			}
1448 
1449 			/*
1450 			 * Even if the above may not indicate that the error
1451 			 * is terminal, model-specific support may insist
1452 			 * that we treat it as such.  Such errors wil be
1453 			 * fatal even if discovered via poll.
1454 			 */
1455 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1456 				forcefatal++;
1457 				mcesp->mce_forcefatal++;
1458 				gbl->gbl_disp |= disp |
1459 				    CMI_ERRDISP_FORCEFATAL;
1460 			}
1461 		} else {
1462 			mcesp->mce_ignored++;
1463 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1464 		}
1465 	}
1466 
1467 	if (unconstrained > 0)
1468 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1469 
1470 	if (curctxbad > 0)
1471 		disp |= CMI_ERRDISP_CURCTXBAD;
1472 
1473 	if (forcefatal > 0)
1474 		disp |= CMI_ERRDISP_FORCEFATAL;
1475 
1476 	if (gcpu_mca_queue != NULL) {
1477 		int how;
1478 
1479 		if (ismc) {
1480 			how = cmi_mce_response(rp, disp) ?
1481 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1482 			    ERRORQ_SYNC;	/* panic flow will drain */
1483 		} else {
1484 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1485 			    cmi_panic_on_ue()) ?
1486 			    ERRORQ_SYNC :	/* poller will panic */
1487 			    ERRORQ_ASYNC;	/* no panic */
1488 		}
1489 
1490 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1491 	} else if (disp != 0) {
1492 		gcpu_bleat(hdl, gcl);
1493 	}
1494 
1495 	mcesp->mce_disp = disp;
1496 
1497 	return (disp);
1498 }
1499 
1500 /*
1501  * Gather error telemetry from our source, and then submit it for
1502  * processing.
1503  */
1504 
1505 #define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
1506 	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)
1507 
1508 #define	STATUS_EQV(s1, s2) \
1509 	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))
1510 
1511 static uint32_t gcpu_deferrred_polled_clears;
1512 
1513 #ifndef __xpv
1514 static void
1515 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1516     uint64_t status, int what)
1517 {
1518 	uint64_t ctl2;
1519 
1520 	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1521 	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1522 	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
1523 
1524 		if (!(bank_cmci_p->cmci_enabled)) {
1525 			/*
1526 			 * when cmci is disabled, and the bank has no error or
1527 			 * no corrected error for
1528 			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1529 			 * turn on this bank's cmci.
1530 			 */
1531 
1532 			bank_cmci_p->drtcmci ++;
1533 
1534 			if (bank_cmci_p->drtcmci >=
1535 			    gcpu_mca_cmci_reenable_threshold) {
1536 
1537 				/* turn on cmci */
1538 
1539 				(void) cmi_hdl_rdmsr(hdl,
1540 				    IA32_MSR_MC_CTL2(bank), &ctl2);
1541 				ctl2 |= MSR_MC_CTL2_EN;
1542 				(void) cmi_hdl_wrmsr(hdl,
1543 				    IA32_MSR_MC_CTL2(bank), ctl2);
1544 
1545 				/* reset counter and set flag */
1546 				bank_cmci_p->drtcmci = 0;
1547 				bank_cmci_p->cmci_enabled = 1;
1548 			}
1549 		} else {
1550 			/*
1551 			 * when cmci is enabled,if is in cyclic poll and the
1552 			 * bank has no error or no corrected error, reset ncmci
1553 			 * counter
1554 			 */
1555 			bank_cmci_p->ncmci = 0;
1556 		}
1557 	}
1558 }
1559 
1560 static void
1561 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1562     int what)
1563 {
1564 	uint64_t ctl2 = 0;
1565 
1566 	/*
1567 	 * if cmci of this bank occurred beyond
1568 	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1569 	 * turn off this bank's CMCI;
1570 	 */
1571 	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1572 
1573 		/* if it is cmci trap, increase the count */
1574 		bank_cmci_p->ncmci++;
1575 
1576 		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1577 
1578 			/* turn off cmci */
1579 
1580 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1581 			    &ctl2);
1582 			ctl2 &= ~MSR_MC_CTL2_EN;
1583 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1584 			    ctl2);
1585 
1586 			/* clear the flag and count */
1587 
1588 			bank_cmci_p->cmci_enabled = 0;
1589 			bank_cmci_p->ncmci = 0;
1590 		}
1591 	}
1592 }
1593 #endif
1594 
1595 static void
1596 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1597     cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1598 {
1599 	int i;
1600 	gcpu_bank_logout_t *gbl, *pgbl;
1601 	uint64_t status;
1602 
1603 	if (first < 0 || last < 0)
1604 		return;
1605 
1606 	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1607 		status = gbl->gbl_status;
1608 		if (status == 0)
1609 			continue;
1610 		if (clrstatus == B_FALSE)
1611 			goto serialize;
1612 
1613 		/*
1614 		 * For i86xpv we always clear status in order to invalidate
1615 		 * the interposed telemetry.
1616 		 *
1617 		 * For native machine checks we always clear status here.  For
1618 		 * native polls we must be a little more cautious since there
1619 		 * is an outside chance that we may clear telemetry from a
1620 		 * shared MCA bank on which a sibling core is machine checking.
1621 		 *
1622 		 * For polled observations of errors that look like they may
1623 		 * produce a machine check (UC/PCC and ENabled, although these
1624 		 * do not guarantee a machine check on error occurence)
1625 		 * we will not clear the status at this wakeup unless
1626 		 * we saw the same status at the previous poll.	 We will
1627 		 * always process and log the current observations - it
1628 		 * is only the clearing of MCi_STATUS which may be
1629 		 * deferred until the next wakeup.
1630 		 */
1631 		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1632 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1633 			goto serialize;
1634 		}
1635 
1636 		/*
1637 		 * We have a polled observation of a machine check
1638 		 * candidate.  If we saw essentially the same status at the
1639 		 * last poll then clear the status now since this appears
1640 		 * not to be a #MC candidate after all.	 If we see quite
1641 		 * different status now then do not clear, but reconsider at
1642 		 * the next poll.  In no actual machine check clears
1643 		 * the status in the interim then the status should not
1644 		 * keep changing forever (meaning we'd never clear it)
1645 		 * since before long we'll simply have latched the highest-
1646 		 * priority error and set the OVerflow bit.  Nonetheless
1647 		 * we count how many times we defer clearing and after
1648 		 * a while insist on clearing the status.
1649 		 */
1650 		pgbl = &pgcl->gcl_data[i];
1651 		if (pgbl->gbl_clrdefcnt != 0) {
1652 			/* We deferred clear on this bank at last wakeup */
1653 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1654 			    pgbl->gbl_clrdefcnt > 5) {
1655 				/*
1656 				 * Status is unchanged so clear it now and,
1657 				 * since we have already logged this info,
1658 				 * avoid logging it again.
1659 				 */
1660 				gbl->gbl_status = 0;
1661 				(void) cmi_hdl_wrmsr(hdl,
1662 				    IA32_MSR_MC(i, STATUS), 0ULL);
1663 			} else {
1664 				/* Record deferral for next wakeup */
1665 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1666 			}
1667 		} else {
1668 			/* Record initial deferral for next wakeup */
1669 			gbl->gbl_clrdefcnt = 1;
1670 			gcpu_deferrred_polled_clears++;
1671 		}
1672 
1673 serialize:
1674 		{
1675 #ifdef __xpv
1676 			;
1677 #else
1678 			/*
1679 			 * Intel Vol 3A says to execute a serializing
1680 			 * instruction here, ie CPUID.	Well WRMSR is also
1681 			 * defined to be serializing, so the status clear above
1682 			 * should suffice.  To be a good citizen, and since
1683 			 * some clears are deferred, we'll execute a CPUID
1684 			 * instruction here.
1685 			 */
1686 			struct cpuid_regs tmp;
1687 			(void) __cpuid_insn(&tmp);
1688 #endif
1689 		}
1690 	}
1691 }
1692 
1693 /*ARGSUSED5*/
1694 void
1695 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1696     gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1697 {
1698 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1699 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1700 	int nbanks = mca->gcpu_mca_nbanks;
1701 	gcpu_bank_logout_t *gbl, *pgbl;
1702 	gcpu_logout_t *gcl, *pgcl;
1703 	int ismc = (rp != NULL);
1704 	int ispoll = !ismc;
1705 	int i, nerr = 0;
1706 	cmi_errno_t err;
1707 	uint64_t mcg_status;
1708 	uint64_t disp;
1709 	uint64_t cap;
1710 	int first = -1;
1711 	int last = -1;
1712 	int willpanic = 0;
1713 
1714 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1715 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1716 	    CMI_SUCCESS) {
1717 		if (mcesp != NULL)
1718 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1719 		return;
1720 	}
1721 
1722 	if (ismc) {
1723 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1724 	} else {
1725 		int pidx = mca->gcpu_mca_nextpoll_idx;
1726 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1727 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1728 
1729 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1730 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1731 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1732 	}
1733 
1734 	gcl->gcl_timestamp = gethrtime_waitfree();
1735 	gcl->gcl_mcg_status = mcg_status;
1736 	gcl->gcl_ip = rp ? rp->r_pc : 0;
1737 
1738 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1739 	if (cap & MCG_CAP_TES_P)
1740 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1741 
1742 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1743 		uint64_t status, status2, addr, misc;
1744 		int retries = gcpu_mca_telemetry_retries;
1745 
1746 		gbl->gbl_status = 0;
1747 		gbl->gbl_disp = 0;
1748 		gbl->gbl_clrdefcnt = 0;
1749 
1750 		/*
1751 		 * Only logout from MCA banks we have initialized from at
1752 		 * least one core.  If a core shares an MCA bank with another
1753 		 * but perhaps lost the race to initialize it, then it must
1754 		 * still be allowed to logout from the shared bank.
1755 		 */
1756 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1757 			continue;
1758 
1759 		/*
1760 		 * On a poll look only at the banks we've been asked to check.
1761 		 */
1762 		if (rp == NULL && !(bankmask & 1 << i))
1763 			continue;
1764 
1765 
1766 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1767 		    CMI_SUCCESS)
1768 			continue;
1769 
1770 #ifndef __xpv
1771 		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1772 #endif
1773 
1774 retry:
1775 		if (!(status & MSR_MC_STATUS_VAL))
1776 			continue;
1777 
1778 		/* First and last bank that have valid status */
1779 		if (first < 0)
1780 			first = i;
1781 		last = i;
1782 
1783 		addr = -1;
1784 		misc = 0;
1785 
1786 		if ((status & MSR_MC_STATUS_ADDRV) ||
1787 		    gcpu_force_addr_in_payload)
1788 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1789 
1790 		if (status & MSR_MC_STATUS_MISCV)
1791 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1792 
1793 #ifndef __xpv
1794 		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1795 #endif
1796 
1797 		/*
1798 		 * Allow the model-specific code to extract bank telemetry.
1799 		 */
1800 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1801 
1802 		/*
1803 		 * Not all cpu models assure us that the status/address/misc
1804 		 * data will not change during the above sequence of MSR reads,
1805 		 * or that it can only change by the addition of the OVerflow
1806 		 * bit to the status register.  If the status has changed
1807 		 * other than in the overflow bit then we attempt to reread
1808 		 * for a consistent snapshot, but eventually give up and
1809 		 * go with what we've got.  We only perform this check
1810 		 * for a poll - a further #MC during a #MC will reset, and
1811 		 * polled errors should not overwrite higher-priority
1812 		 * trapping errors (but could set the overflow bit).
1813 		 */
1814 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1815 		    &status2)) == CMI_SUCCESS) {
1816 			if (!STATUS_EQV(status, status2)) {
1817 				if (retries-- > 0) {
1818 					status = status2;
1819 					goto retry;
1820 				} else {
1821 					gbl->gbl_disp |=
1822 					    CMI_ERRDISP_INCONSISTENT;
1823 				}
1824 			}
1825 		} else if (ispoll && err != CMI_SUCCESS) {
1826 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1827 		}
1828 
1829 		nerr++;
1830 		gbl->gbl_status = status;
1831 		gbl->gbl_addr = addr;
1832 		gbl->gbl_misc = misc;
1833 
1834 		/*
1835 		 * For polled observation, if the count of deferred status
1836 		 * clears updated in the clear_mc() is nonzero and the
1837 		 * MCi_STATUS has not changed, the last wakeup has produced
1838 		 * the ereport of the error. Therefore, clear the status in
1839 		 * this wakeup to avoid duplicate ereport.
1840 		 */
1841 		pgbl = &pgcl->gcl_data[i];
1842 		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1843 		    pgbl->gbl_clrdefcnt != 0) {
1844 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1845 				gbl->gbl_status = 0;
1846 				(void) cmi_hdl_wrmsr(hdl,
1847 				    IA32_MSR_MC(i, STATUS), 0ULL);
1848 			}
1849 		}
1850 	}
1851 
1852 	if (gcpu_mca_stack_flag)
1853 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1854 	else
1855 		gcl->gcl_stackdepth = 0;
1856 
1857 	/*
1858 	 * Decide our disposition for this error or errors, and submit for
1859 	 * logging and subsequent diagnosis.
1860 	 */
1861 	if (nerr != 0) {
1862 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1863 
1864 		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1865 
1866 		if (!willpanic)
1867 			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1868 	} else {
1869 		disp = 0;
1870 		if (mcesp) {
1871 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1872 		}
1873 	}
1874 
1875 	/*
1876 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1877 	 * If a second #MC had occured before now the system would have
1878 	 * reset.  We can only do thise once gcpu_mca_process has copied
1879 	 * the logout structure.
1880 	 */
1881 	if (ismc && mcg_status & MCG_STATUS_MCIP)
1882 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1883 
1884 	/*
1885 	 * At this point we have read and logged all telemetry that is visible
1886 	 * under the MCA.  On architectures for which the NorthBridge is
1887 	 * on-chip this may include NB-observed errors, but where the NB
1888 	 * is off chip it may have been the source of the #MC request and
1889 	 * so we must call into the memory-controller driver to give it
1890 	 * a chance to log errors.
1891 	 */
1892 	if (ismc) {
1893 		cmi_mc_logout(hdl, 1, willpanic);
1894 	}
1895 }
1896 
1897 #ifndef __xpv
1898 int gcpu_mca_trap_vomit_summary = 0;
1899 
1900 /*
1901  * On a native machine check exception we come here from mcetrap via
1902  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1903  * cpus of the chip, so it is possible that another cpu on this chip could
1904  * initiate a poll while we're in the #mc handler;  it is also possible that
1905  * this trap has occured during a poll on this cpu.  So we must acquire
1906  * the chip-wide poll lock, but be careful to avoid deadlock.
1907  *
1908  * The 'data' pointer cannot be NULL due to init order.
1909  */
1910 uint64_t
1911 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1912 {
1913 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1914 	kmutex_t *poll_lock = NULL;
1915 	gcpu_mce_status_t mce;
1916 	uint64_t mcg_status;
1917 	int tooklock = 0;
1918 
1919 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1920 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1921 		return (0);
1922 
1923 	/*
1924 	 * Synchronize with any poller from another core that may happen
1925 	 * to share access to one or more of the MCA banks.
1926 	 */
1927 	if (gcpu->gcpu_shared != NULL)
1928 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1929 
1930 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1931 		/*
1932 		 * The lock is not owned by the thread we have
1933 		 * interrupted.  Spin for this adaptive lock.
1934 		 */
1935 		while (!mutex_tryenter(poll_lock)) {
1936 			while (mutex_owner(poll_lock) != NULL)
1937 				;
1938 		}
1939 		tooklock = 1;
1940 	}
1941 
1942 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
1943 
1944 	if (tooklock)
1945 		mutex_exit(poll_lock);
1946 
1947 	/*
1948 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1949 	 */
1950 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1951 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1952 		    "%u PCC (%u ok), "
1953 		    "%u UC (%d ok, %u poisoned), "
1954 		    "%u forcefatal, %u ignored",
1955 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1956 		    mce.mce_npcc, mce.mce_npcc_ok,
1957 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1958 		    mce.mce_forcefatal, mce.mce_ignored);
1959 	}
1960 
1961 	return (mce.mce_disp);
1962 }
1963 #endif
1964 
/*
 * Faulted-enter hook of the generic cpu module; deliberately a no-op.
 * The handle argument is unused.
 * NOTE(review): presumably invoked when the cpu is being marked faulted -
 * confirm against the caller of this entry point.
 */
1965 /*ARGSUSED*/
1966 void
1967 gcpu_faulted_enter(cmi_hdl_t hdl)
1968 {
1969 	/* Nothing to do here */
1970 }
1971 
1972 /*ARGSUSED*/
1973 void
1974 gcpu_faulted_exit(cmi_hdl_t hdl)
1975 {
1976 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1977 
1978 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1979 }
1980 
1981 /*
1982  * Write the requested values to the indicated MSRs.  Having no knowledge
1983  * of the model-specific requirements for writing to these model-specific
1984  * registers, we will only blindly write to those MSRs if the 'force'
1985  * argument is nonzero.  That option should only be used in prototyping
1986  * and debugging.
1987  */
1988 /*ARGSUSED*/
1989 cmi_errno_t
1990 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1991     int force)
1992 {
1993 	int i, errs = 0;
1994 
1995 	for (i = 0; i < nregs; i++) {
1996 		uint_t msr = regs[i].cmr_msrnum;
1997 		uint64_t val = regs[i].cmr_msrval;
1998 
1999 		if (cms_present(hdl)) {
2000 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
2001 				errs++;
2002 		} else if (force) {
2003 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
2004 		} else {
2005 			errs++;
2006 		}
2007 	}
2008 
2009 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
2010 }
2011 
2012 /* deconfigure gcpu_mca_init() */
2013 void
2014 gcpu_mca_fini(cmi_hdl_t hdl)
2015 {
2016 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2017 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
2018 	int i;
2019 
2020 	/*
2021 	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
2022 	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
2023 	 * processors, which have their own more primitive way of doing
2024 	 * machine checks, will not have cmi_mca_init called since their
2025 	 * CPUID information will not indicate both MCA and MCE features.
2026 	 */
2027 	if (!is_x86_feature(x86_featureset, X86FSET_MCA))
2028 		return;
2029 #ifndef __xpv
2030 	/*
2031 	 * disable machine check in CR4
2032 	 */
2033 	cmi_ntv_hwdisable_mce(hdl);
2034 #endif
2035 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
2036 	gcpu_mca_poll_fini(hdl);
2037 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
2038 
2039 	/*
2040 	 * free resources allocated during init
2041 	 */
2042 	if (mca->gcpu_bank_cmci != NULL) {
2043 		kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2044 		    mca->gcpu_mca_nbanks);
2045 	}
2046 
2047 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2048 		if (mca->gcpu_mca_logout[i] != NULL) {
2049 			kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2050 		}
2051 	}
2052 
2053 	if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2054 		kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2055 		    sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
2056 	}
2057 }
2058