xref: /illumos-gate/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/mca_x86.h>
28 #include <sys/cpu_module_impl.h>
29 #include <sys/cpu_module_ms.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/pghw.h>
33 #include <sys/x86_archext.h>
34 #include <sys/sysmacros.h>
35 #include <sys/regset.h>
36 #include <sys/privregs.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/log.h>
40 #include <sys/psw.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/fm/util.h>
43 #include <sys/errorq.h>
44 #include <sys/mca_x86.h>
45 #include <sys/fm/cpu/GMCA.h>
46 #include <sys/sysevent.h>
47 #include <sys/ontrap.h>
48 
49 #include "gcpu.h"
50 
/*
 * Set to zero to log telemetry found at initialization.  Processor
 * documentation says this telemetry should be processed on everything
 * except Intel family 0x6, but there are far too many exceptions to that
 * rule and we want to avoid bogus diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * Debug aid: gcpu_mca_stack_flag enables capture of a stack trace at
 * error logout time.  The stack is added to the ereport if the error
 * type selects stack inclusion, or in every case if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * How many times MCA telemetry is re-read in an attempt to obtain a
 * consistent snapshot when it is found to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
/* CMCI tunables; only defined when not building for __xpv. */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
#endif
78 
79 static gcpu_error_disp_t gcpu_errtypes[] = {
80 
81 	/*
82 	 * Unclassified
83 	 */
84 	{
85 		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
86 		NULL,
87 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
88 		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
89 		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
90 	},
91 
92 	/*
93 	 * Microcode ROM Parity Error
94 	 */
95 	{
96 		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
97 		NULL,
98 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
99 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
100 		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
101 	},
102 
103 	/*
104 	 * External - BINIT# from another processor during power-on config
105 	 */
106 	{
107 		FM_EREPORT_CPU_GENERIC_EXTERNAL,
108 		NULL,
109 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
110 		MCAX86_SIMPLE_EXTERNAL_MASKON,
111 		MCAX86_SIMPLE_EXTERNAL_MASKOFF
112 	},
113 
114 	/*
115 	 * Functional redundancy check master/slave error
116 	 */
117 	{
118 		FM_EREPORT_CPU_GENERIC_FRC,
119 		NULL,
120 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
121 		MCAX86_SIMPLE_FRC_MASKON,
122 		MCAX86_SIMPLE_FRC_MASKOFF
123 	},
124 
125 	/*
126 	 * Internal parity error
127 	 */
128 	{
129 		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
130 		NULL,
131 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
132 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
133 		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
134 	},
135 
136 
137 	/*
138 	 * Internal timer error
139 	 */
140 	{
141 		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
142 		NULL,
143 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
144 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
145 		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
146 	},
147 
148 	/*
149 	 * Internal unclassified
150 	 */
151 	{
152 		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
153 		NULL,
154 		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
155 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
156 		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
157 	},
158 
159 	/*
160 	 * Compound error codes - generic memory hierarchy
161 	 */
162 	{
163 		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
164 		NULL,
165 		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
166 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
167 		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
168 	},
169 
170 	/*
171 	 * Compound error codes - TLB errors
172 	 */
173 	{
174 		FM_EREPORT_CPU_GENERIC_TLB,
175 		"%1$s" "TLB" "%2$s" "_ERR",
176 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
177 		MCAX86_COMPOUND_TLB_MASKON,
178 		MCAX86_COMPOUND_TLB_MASKOFF
179 	},
180 
181 	/*
182 	 * Compound error codes - memory hierarchy
183 	 */
184 	{
185 		FM_EREPORT_CPU_GENERIC_MEMHIER,
186 		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
187 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
188 		MCAX86_COMPOUND_MEMHIER_MASKON,
189 		MCAX86_COMPOUND_MEMHIER_MASKOFF
190 	},
191 
192 	/*
193 	 * Compound error codes - bus and interconnect errors
194 	 */
195 	{
196 		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
197 		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
198 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
199 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
200 		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
201 	},
202 	/*
203 	 * Compound error codes - memory controller errors
204 	 */
205 	{
206 		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
207 		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
208 		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
209 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
210 		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
211 	},
212 };
213 
214 static gcpu_error_disp_t gcpu_unknown = {
215 	FM_EREPORT_CPU_GENERIC_UNKNOWN,
216 	"UNKNOWN",
217 	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
218 	0,
219 	0
220 };
221 
222 static errorq_t *gcpu_mca_queue;
223 static kmutex_t gcpu_mca_queue_lock;
224 
/* Nonzero exactly when this file is compiled with __xpv defined. */
#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif
230 
231 static const gcpu_error_disp_t *
232 gcpu_disp_match(uint16_t code)
233 {
234 	const gcpu_error_disp_t *ged = gcpu_errtypes;
235 	int i;
236 
237 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
238 	    i++, ged++) {
239 		uint16_t on = ged->ged_errcode_mask_on;
240 		uint16_t off = ged->ged_errcode_mask_off;
241 
242 		if ((code & on) == on && (code & off) == 0)
243 			return (ged);
244 	}
245 
246 	return (NULL);
247 }
248 
/*
 * Extract a bit-field from an MCA error code.
 *
 * code		the 16-bit error code
 * mask		mask selecting the field's bits in place
 * shift	bit position of the field's least significant bit
 *
 * Returns the field value, right-justified.
 *
 * The shift must happen before narrowing to uint8_t.  A cast binds more
 * tightly than '>>', so casting (code & mask) first would discard every
 * bit above bit 7 and any field located there would always read as zero.
 */
static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((uint8_t)((code & mask) >> shift));
}
254 
/*
 * Extract field 'name' from an MCA error code using that field's
 * MCAX86_ERRCODE_<name>_MASK and MCAX86_ERRCODE_<name>_SHIFT definitions.
 */
#define	BIT_STRIP(code, name) \
	bit_strip((code), MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/* Placeholder mnemonics for undefined and reserved field values. */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
261 
/*
 * The tables below map TT, LL, RRRR, PP, II, T, CCCC and MMM field values
 * to two kinds of text: the mnemonic used when expanding a compound error
 * name, and the component used when expanding an ereport class name.
 */

struct gcpu_mnexp {
	const char *mne_compound;	/* text for the compound error name */
	const char *mne_ereport;	/* text for the ereport class */
};
271 
272 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
273 	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
274 	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
275 	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
276 	{ GCPU_MNEMONIC_UNDEF, "" }
277 };
278 
279 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
280 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
281 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
282 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
283 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
284 };
285 
286 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
287 	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
288 	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
289 	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
290 	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
291 	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
292 	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
293 	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
294 	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
295 	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
296 };
297 
298 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
299 	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
300 	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
301 	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
302 	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
303 };
304 
305 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
306 	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
307 	{ GCPU_MNEMONIC_RESVD, "" },
308 	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
309 	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
310 };
311 
312 static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
313 	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
314 	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
315 };
316 
317 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
318 	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
319 	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
320 	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
321 	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
322 	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
323 	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
324 	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
325 	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
326 	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
327 	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
328 	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
329 	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
330 	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
331 	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
332 	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
333 	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
334 };
335 
336 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
337 	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
338 	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
339 	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
340 	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
341 	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
342 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
343 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
344 	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
345 };
346 
/* Selects which member of a struct gcpu_mnexp entry a lookup returns. */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,	/* mne_compound */
	GCPU_MN_NAMESPACE_EREPORT	/* mne_ereport */
};
351 
352 static const char *
353 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
354     enum gcpu_mn_namespace nspace)
355 {
356 	if (val >= tbl_sz)
357 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
358 
359 	switch (nspace) {
360 	case GCPU_MN_NAMESPACE_COMPOUND:
361 		return (tbl[val].mne_compound);
362 		/*NOTREACHED*/
363 
364 	case GCPU_MN_NAMESPACE_EREPORT:
365 		return (tbl[val].mne_ereport);
366 		/*NOTREACHED*/
367 
368 	default:
369 		return (GCPU_MNEMONIC_UNDEF);
370 		/*NOTREACHED*/
371 	}
372 }
373 
/*
 * An ereport class leaf component is either a plain string with no format
 * specifiers, or a string with one or more embedded %n$s specifiers that
 * select string arguments by position.  The kernel snprintf does not
 * support %n$ (and teaching it to do so is too big a headache), so this
 * restricted format is expanded by hand.
 */

/* Number of positional string components a format may select from. */
#define	GCPU_CLASS_VARCOMPS	9

/*
 * Decode field 'name' of error code 'code' and look it up in the
 * corresponding gcpu_<name>_mnemonics table for namespace 'nspace'.
 */
#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / \
	sizeof (gcpu_##name##_mnemonics[0]), \
	BIT_STRIP(code, name), nspace)
388 
389 static void
390 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
391     enum gcpu_mn_namespace nspace)
392 {
393 	uint16_t code = MCAX86_ERRCODE(status);
394 	const char *mn[GCPU_CLASS_VARCOMPS];
395 	char *p = buf;			/* current position in buf */
396 	char *q = buf + buflen;		/* pointer past last char in buf */
397 	int which, expfmtchar, error;
398 	char c;
399 
400 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
401 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
402 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
403 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
404 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
405 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
406 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
407 	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
408 	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
409 
410 	while (p < q - 1 && (c = *fmt++) != '\0') {
411 		if (c != '%') {
412 			/* not the beginning of a format specifier - copy */
413 			*p++ = c;
414 			continue;
415 		}
416 
417 		error = 0;
418 		which = -1;
419 		expfmtchar = -1;
420 
421 nextfmt:
422 		if ((c = *fmt++) == '\0')
423 			break;	/* early termination of fmt specifier */
424 
425 		switch (c) {
426 		case '1':
427 		case '2':
428 		case '3':
429 		case '4':
430 		case '5':
431 		case '6':
432 		case '7':
433 		case '8':
434 		case '9':
435 			if (which != -1) { /* allow only one positional digit */
436 				error++;
437 				break;
438 			}
439 			which = c - '1';
440 			goto nextfmt;
441 			/*NOTREACHED*/
442 
443 		case '$':
444 			if (which == -1) { /* no position specified */
445 				error++;
446 				break;
447 			}
448 			expfmtchar = 's';
449 			goto nextfmt;
450 			/*NOTREACHED*/
451 
452 		case 's':
453 			if (expfmtchar != 's') {
454 				error++;
455 				break;
456 			}
457 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
458 			    mn[which]);
459 			p += strlen(p);
460 			break;
461 
462 		default:
463 			error++;
464 			break;
465 		}
466 
467 		if (error)
468 			break;
469 	}
470 
471 	*p = '\0';	/* NUL termination */
472 }
473 
474 static void
475 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
476     const char *cpuclass, const char *leafclass)
477 {
478 	char *p = buf;			/* current position in buf */
479 	char *q = buf + buflen;		/* pointer past last char in buf */
480 
481 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
482 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
483 
484 	p += strlen(p);
485 	if (p >= q)
486 		return;
487 
488 	if (leafclass == NULL) {
489 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
490 		    GCPU_MN_NAMESPACE_EREPORT);
491 	} else {
492 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
493 		    leafclass);
494 	}
495 }
496 
497 /*
498  * Create an "hc" scheme FMRI identifying the given cpu with
499  * motherboard/chip/core/strand instance numbers.
500  */
501 static nvlist_t *
502 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
503 {
504 	nvlist_t *nvl;
505 
506 	if ((nvl = fm_nvlist_create(nva)) == NULL)
507 		return (NULL);
508 
509 	fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
510 	    "motherboard", 0,
511 	    "chip", cmi_hdl_chipid(hdl),
512 	    "core", cmi_hdl_coreid(hdl),
513 	    "strand", cmi_hdl_strandid(hdl));
514 
515 	return (nvl);
516 }
517 
518 int gcpu_bleat_count_thresh = 5;
519 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
520 
521 /*
522  * Called when we are unable to propogate a logout structure onto an
523  * errorq for subsequent ereport preparation and logging etc.  The caller
524  * should usually only decide to call this for severe errors - those we
525  * suspect we may need to panic for.
526  */
527 static void
528 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
529 {
530 	hrtime_t now  = gethrtime_waitfree();
531 	static hrtime_t gcpu_last_bleat;
532 	gcpu_bank_logout_t *gbl;
533 	static int bleatcount;
534 	int i;
535 
536 	/*
537 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
538 	 * can come as fast as we like, but once we've spammed that many
539 	 * to the console we require a minimum interval to pass before
540 	 * any more complaints.
541 	 */
542 	if (++bleatcount > gcpu_bleat_count_thresh) {
543 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
544 			return;
545 		else
546 			bleatcount = 0;
547 	}
548 	gcpu_last_bleat = now;
549 
550 	cmn_err(CE_WARN,
551 	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
552 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
553 	    cmi_hdl_strandid(hdl));
554 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
555 	    (u_longlong_t)gcl->gcl_mcg_status);
556 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
557 		uint64_t status = gbl->gbl_status;
558 
559 		if (!(status & MSR_MC_STATUS_VAL))
560 			continue;
561 
562 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
563 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
564 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
565 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
566 			    i, IA32_MSR_MC(i, STATUS),
567 			    (u_longlong_t)status,
568 			    (u_longlong_t)gbl->gbl_addr,
569 			    (u_longlong_t)gbl->gbl_misc);
570 			break;
571 
572 		case MSR_MC_STATUS_ADDRV:
573 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
574 			    "STAT 0x%016llx ADDR 0x%016llx",
575 			    i, IA32_MSR_MC(i, STATUS),
576 			    (u_longlong_t)status,
577 			    (u_longlong_t)gbl->gbl_addr);
578 			break;
579 
580 		case MSR_MC_STATUS_MISCV:
581 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
582 			    "STAT 0x%016llx MISC 0x%016llx",
583 			    i, IA32_MSR_MC(i, STATUS),
584 			    (u_longlong_t)status,
585 			    (u_longlong_t)gbl->gbl_misc);
586 			break;
587 
588 		default:
589 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
590 			    "STAT 0x%016llx",
591 			    i, IA32_MSR_MC(i, STATUS),
592 			    (u_longlong_t)status);
593 			break;
594 
595 		}
596 	}
597 }
598 
/*
 * Expands to the three fm_payload_set() arguments (name, type, value)
 * for the boolean ereport member reflecting bit 'what' of a bank status.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(((status) & MSR_MC_STATUS_##what) ? B_TRUE : B_FALSE)
602 
603 static void
604 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
605     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
606 {
607 	uint64_t members = ged ? ged->ged_ereport_members :
608 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
609 	uint64_t mcg = gcl->gcl_mcg_status;
610 	int mcip = mcg & MCG_STATUS_MCIP;
611 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
612 	uint64_t bstat = gbl->gbl_status;
613 
614 	/*
615 	 * Include the compound error name if requested and if this
616 	 * is a compound error type.
617 	 */
618 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
619 	    ged->ged_compound_fmt != NULL) {
620 		char buf[FM_MAX_CLASS];
621 
622 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
623 		    GCPU_MN_NAMESPACE_COMPOUND);
624 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
625 		    DATA_TYPE_STRING, buf, NULL);
626 	}
627 
628 	/*
629 	 * Include disposition information for this error
630 	 */
631 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
632 	    gbl->gbl_disp != 0) {
633 		int i, empty = 1;
634 		char buf[128];
635 		char *p = buf, *q = buf + 128;
636 		static struct _gcpu_disp_name {
637 			uint64_t dv;
638 			const char *dn;
639 		} disp_names[] = {
640 			{ CMI_ERRDISP_CURCTXBAD,
641 			    "processor_context_corrupt" },
642 			{ CMI_ERRDISP_RIPV_INVALID,
643 			    "return_ip_invalid" },
644 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
645 			    "unconstrained" },
646 			{ CMI_ERRDISP_FORCEFATAL,
647 			    "forcefatal" },
648 			{ CMI_ERRDISP_IGNORED,
649 			    "ignored" },
650 			{ CMI_ERRDISP_PCC_CLEARED,
651 			    "corrupt_context_cleared" },
652 			{ CMI_ERRDISP_UC_CLEARED,
653 			    "uncorrected_data_cleared" },
654 			{ CMI_ERRDISP_POISONED,
655 			    "poisoned" },
656 			{ CMI_ERRDISP_INCONSISTENT,
657 			    "telemetry_unstable" },
658 		};
659 
660 		for (i = 0; i < sizeof (disp_names) /
661 		    sizeof (struct _gcpu_disp_name); i++) {
662 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
663 				continue;
664 
665 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
666 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
667 			p += strlen(p);
668 			empty = 0;
669 		}
670 
671 		if (p != buf)
672 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
673 			    DATA_TYPE_STRING, buf, NULL);
674 	}
675 
676 	/*
677 	 * If MCG_STATUS is included add that and an indication of whether
678 	 * this ereport was the result of a machine check or poll.
679 	 */
680 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
681 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
682 		    DATA_TYPE_UINT64, mcg, NULL);
683 
684 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
685 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
686 	}
687 
688 	/*
689 	 * If an instruction pointer is to be included add one provided
690 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
691 	 */
692 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
693 	    mcg & MCG_STATUS_EIPV) {
694 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
695 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
696 	}
697 
698 	/*
699 	 * Add an indication of whether the trap occured during privileged code.
700 	 */
701 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
702 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
703 		    DATA_TYPE_BOOLEAN_VALUE,
704 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
705 	}
706 
707 	/*
708 	 * If requested, add the index of the MCA bank.  This indicates the
709 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
710 	 * to MCi_* - use the bank offset to correlate
711 	 */
712 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
713 		fm_payload_set(ereport,
714 		    /* Bank number */
715 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
716 		    /* Offset of MCi_CTL */
717 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
718 		    IA32_MSR_MC(bankno, CTL),
719 		    NULL);
720 	}
721 
722 	/*
723 	 * Add MCi_STATUS if requested, and decode it.
724 	 */
725 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
726 		const char *tbes[] = {
727 			"No tracking",			/* 00 */
728 			"Green - below threshold",	/* 01 */
729 			"Yellow - above threshold",	/* 10 */
730 			"Reserved"			/* 11 */
731 		};
732 
733 		fm_payload_set(ereport,
734 		    /* Bank MCi_STATUS */
735 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
736 		    /* Overflow? */
737 		    _GCPU_BSTATUS(bstat, OVER),
738 		    /* Uncorrected? */
739 		    _GCPU_BSTATUS(bstat, UC),
740 		    /* Enabled? */
741 		    _GCPU_BSTATUS(bstat, EN),
742 		    /* Processor context corrupt? */
743 		    _GCPU_BSTATUS(bstat, PCC),
744 		    /* Error code */
745 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
746 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
747 		    /* Model-specific error code */
748 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
749 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
750 		    NULL);
751 
752 		/*
753 		 * If MCG_CAP.TES_P indicates that that thresholding info
754 		 * is present in the architural component of the bank status
755 		 * then include threshold information for this bank.
756 		 */
757 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
758 			fm_payload_set(ereport,
759 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
760 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
761 			    NULL);
762 		}
763 	}
764 
765 	/*
766 	 * MCi_ADDR info if requested and valid.
767 	 */
768 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
769 	    bstat & MSR_MC_STATUS_ADDRV) {
770 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
771 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
772 	}
773 
774 	/*
775 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
776 	 */
777 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
778 	    bstat & MSR_MC_STATUS_MISCV) {
779 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
780 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
781 	}
782 
783 }
784 
785 /*
786  * Construct and post an ereport based on the logout information from a
787  * single MCA bank.  We are not necessarily running on the cpu that
788  * detected the error.
789  */
790 static void
791 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
792     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
793 {
794 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
795 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
796 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
797 	const char *cpuclass = NULL, *leafclass = NULL;
798 	uint16_t code = MCAX86_ERRCODE(status);
799 	errorq_elem_t *eqep, *scr_eqep;
800 	nvlist_t *ereport, *detector;
801 	char buf[FM_MAX_CLASS];
802 	const char *classfmt;
803 	nv_alloc_t *nva;
804 
805 	if (panicstr) {
806 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
807 			return;
808 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
809 
810 		/*
811 		 * Allocate another element for scratch space, but fallback
812 		 * to the one we have if that fails.  We'd like to use the
813 		 * additional scratch space for nvlist construction.
814 		 */
815 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
816 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
817 		else
818 			nva = errorq_elem_nva(ereport_errorq, eqep);
819 	} else {
820 		ereport = fm_nvlist_create(NULL);
821 		nva = NULL;
822 	}
823 
824 	if (ereport == NULL)
825 		return;
826 
827 	/*
828 	 * Common payload data required by the protocol:
829 	 *	- ereport class
830 	 *	- detector
831 	 *	- ENA
832 	 */
833 
834 	/*
835 	 * Ereport class - call into model-specific support to allow it to
836 	 * provide a cpu class or leaf class, otherwise calculate our own.
837 	 */
838 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
839 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
840 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
841 	    leafclass);
842 
843 	/*
844 	 * The detector FMRI.
845 	 */
846 	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
847 	    nva)) == NULL)
848 		detector = gcpu_fmri_create(hdl, nva);
849 
850 	/*
851 	 * Should we define a new ENA format 3?? for chip/core/strand?
852 	 * It will be better when virtualized.
853 	 */
854 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
855 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
856 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
857 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
858 
859 	if (panicstr) {
860 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
861 		nv_alloc_reset(nva);
862 	} else {
863 		fm_nvlist_destroy(detector, FM_NVA_FREE);
864 	}
865 
866 	/*
867 	 * Add the architectural ereport class-specific payload data.
868 	 */
869 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
870 
871 	/*
872 	 * Allow model-specific code to add ereport members.
873 	 */
874 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
875 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
876 
877 	/*
878 	 * Include stack if options is turned on and either selected in
879 	 * the payload member bitmask or inclusion is forced.
880 	 */
881 	if (gcpu_mca_stack_flag &&
882 	    (cms_ereport_includestack(hdl, mscookie) ==
883 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
884 		fm_payload_stack_add(ereport, gcl->gcl_stack,
885 		    gcl->gcl_stackdepth);
886 	}
887 
888 	/*
889 	 * If injection has taken place anytime in the past then note this
890 	 * on the ereport.
891 	 */
892 	if (cmi_inj_tainted() == B_TRUE) {
893 		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
894 		    B_TRUE, NULL);
895 	}
896 
897 	/*
898 	 * Post ereport.
899 	 */
900 	if (panicstr) {
901 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
902 		if (scr_eqep)
903 			errorq_cancel(ereport_errorq, scr_eqep);
904 	} else {
905 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
906 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
907 	}
908 
909 }
910 
911 /*ARGSUSED*/
912 void
913 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
914 {
915 	const gcpu_logout_t *gcl = data;
916 	const gcpu_bank_logout_t *gbl;
917 	int i;
918 
919 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
920 		const gcpu_error_disp_t *gened;
921 		cms_cookie_t mscookie;
922 
923 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
924 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
925 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
926 
927 			/*
928 			 * Perform a match based on IA32 MCA architectural
929 			 * components alone.
930 			 */
931 			gened = gcpu_disp_match(code); /* may be NULL */
932 
933 			/*
934 			 * Now see if an model-specific match can be made.
935 			 */
936 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
937 			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
938 			    gcl->gcl_ms_logout);
939 
940 			/*
941 			 * Prepare and dispatch an ereport for logging and
942 			 * diagnosis.
943 			 */
944 			gcpu_ereport_post(gcl, i, gened, mscookie,
945 			    gbl->gbl_status);
946 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
947 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
948 			/*
949 			 * Telemetry kept changing as we tried to read
950 			 * it.  Force an unknown ereport leafclass but
951 			 * keep the telemetry unchanged for logging.
952 			 */
953 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
954 			    gbl->gbl_status);
955 		}
956 	}
957 }
958 
959 static size_t gcpu_mca_queue_datasz = 0;
960 
961 /*
962  * The following code is ready to make a weak attempt at growing the
963  * errorq structure size.  Since it is not foolproof (we don't know
964  * who may already be producing to the outgoing errorq) our caller
965  * instead assures that we'll always be called with no greater data
966  * size than on our first call.
967  */
968 static void
969 gcpu_errorq_init(size_t datasz)
970 {
971 	int slots;
972 
973 	mutex_enter(&gcpu_mca_queue_lock);
974 
975 	if (gcpu_mca_queue_datasz >= datasz) {
976 		mutex_exit(&gcpu_mca_queue_lock);
977 		return;
978 	}
979 
980 	membar_producer();
981 	if (gcpu_mca_queue) {
982 		gcpu_mca_queue_datasz = 0;
983 		errorq_destroy(gcpu_mca_queue);
984 	}
985 
986 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
987 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
988 
989 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
990 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
991 
992 	if (gcpu_mca_queue != NULL)
993 		gcpu_mca_queue_datasz = datasz;
994 
995 	mutex_exit(&gcpu_mca_queue_lock);
996 }
997 
998 /*
999  * Perform MCA initialization as described in section 14.6 of Intel 64
1000  * and IA-32 Architectures Software Developer's Manual Volume 3A.
1001  */
1002 
1003 static uint_t global_nbanks;
1004 
1005 void
1006 gcpu_mca_init(cmi_hdl_t hdl)
1007 {
1008 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1009 	uint64_t cap;
1010 	uint_t vendor = cmi_hdl_vendor(hdl);
1011 	uint_t family = cmi_hdl_family(hdl);
1012 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1013 	int mcg_ctl_present;
1014 	uint_t nbanks;
1015 	uint32_t ctl_skip_mask = 0;
1016 	uint32_t status_skip_mask = 0;
1017 	size_t mslsz;
1018 	int i;
1019 #ifndef __xpv
1020 	int mcg_ctl2_present;
1021 	uint32_t cmci_capable = 0;
1022 #endif
1023 
1024 	if (gcpu == NULL)
1025 		return;
1026 
1027 	/*
1028 	 * Protect from some silly /etc/system settings.
1029 	 */
1030 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1031 		gcpu_mca_telemetry_retries = 5;
1032 
1033 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1034 		return;
1035 
1036 	/*
1037 	 * CPU startup code only calls cmi_mca_init if x86_feature indicates
1038 	 * both MCA and MCE support (i.e., X86_MCA).  P5, K6, and earlier
1039 	 * processors, which have their own * more primitive way of doing
1040 	 * machine checks, will not have cmi_mca_init called since their
1041 	 * CPUID information will not indicate both MCA and MCE features.
1042 	 */
1043 	ASSERT(x86_feature & X86_MCA);
1044 
1045 	/*
1046 	 * Determine whether the IA32_MCG_CTL register is present.  If it
1047 	 * is we will enable all features by writing -1 to it towards
1048 	 * the end of this initialization;  if it is absent then volume 3A
1049 	 * says we must nonetheless continue to initialize the individual
1050 	 * banks.
1051 	 */
1052 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
1053 #ifndef __xpv
1054 	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1055 #endif
1056 
1057 	/*
1058 	 * We squirell values away for inspection/debugging.
1059 	 */
1060 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1061 	if (mcg_ctl_present)
1062 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1063 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1064 
1065 	/*
1066 	 * Determine the number of error-reporting banks implemented.
1067 	 */
1068 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1069 
1070 	if (nbanks != 0 && global_nbanks == 0)
1071 		global_nbanks = nbanks;	/* no race - BSP will get here first */
1072 
1073 	/*
1074 	 * If someone is hiding the number of banks (perhaps we are fully
1075 	 * virtualized?) or if this processor has more banks than the
1076 	 * first to set global_nbanks then bail.  The latter requirement
1077 	 * is because we need to size our errorq data structure and we
1078 	 * don't want to have to grow the errorq (destroy and recreate)
1079 	 * which may just lose some telemetry.
1080 	 */
1081 	if (nbanks == 0 || nbanks > global_nbanks)
1082 		return;
1083 
1084 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1085 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1086 
1087 	/*
1088 	 * Calculate the size we need to allocate for a gcpu_logout_t
1089 	 * with a gcl_data array big enough for all banks of this cpu.
1090 	 * Add any space requested by the model-specific logout support.
1091 	 */
1092 	mslsz = cms_logout_size(hdl);
1093 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1094 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1095 
1096 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1097 		gcpu_logout_t *gcl;
1098 
1099 		mca->gcpu_mca_logout[i] = gcl =
1100 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1101 		gcl->gcl_gcpu = gcpu;
1102 		gcl->gcl_nbanks = nbanks;
1103 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1104 		    (char *)(&gcl->gcl_data[0]) + nbanks *
1105 		    sizeof (gcpu_bank_logout_t);
1106 
1107 	}
1108 
1109 #ifdef __xpv
1110 	gcpu_xpv_mca_init(nbanks);
1111 #endif
1112 
1113 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1114 
1115 #ifndef __xpv
1116 	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1117 	    KM_SLEEP);
1118 #endif
1119 
1120 	/*
1121 	 * Create our errorq to transport the logout structures.  This
1122 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1123 	 */
1124 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1125 
1126 	/*
1127 	 * Not knowing which, if any, banks are shared between cores we
1128 	 * assure serialization of MCA bank initialization by each cpu
1129 	 * on the chip.  On chip architectures in which some banks are
1130 	 * shared this will mean the shared resource is initialized more
1131 	 * than once - we're simply aiming to avoid simultaneous MSR writes
1132 	 * to the shared resource.
1133 	 *
1134 	 * Even with these precautions, some platforms may yield a GP fault
1135 	 * if a core other than a designated master tries to write anything
1136 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1137 	 * those writes under on_trap protection.
1138 	 */
1139 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1140 
1141 	/*
1142 	 * Initialize poller data, but don't start polling yet.
1143 	 */
1144 	gcpu_mca_poll_init(hdl);
1145 
1146 	/*
1147 	 * Work out which MCA banks we will initialize.  In MCA logout
1148 	 * code we will only read those banks which we initialize here.
1149 	 */
1150 	for (i = 0; i < nbanks; i++) {
1151 		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1152 		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1153 
1154 		if (!cms_present(hdl)) {
1155 			/*
1156 			 * Model-specific support is not present, try to use
1157 			 * sane defaults.
1158 			 *
1159 			 * On AMD family 6 processors, reports about spurious
1160 			 * machine checks indicate that bank 0 should be
1161 			 * skipped.
1162 			 *
1163 			 * On Intel family 6 processors, the documentation tells
1164 			 * us not to write to MC0_CTL.
1165 			 *
1166 			 */
1167 			if (i == 0 && family == 6) {
1168 				switch (vendor) {
1169 				case X86_VENDOR_AMD:
1170 					skipstatus = B_TRUE;
1171 					/*FALLTHRU*/
1172 				case X86_VENDOR_Intel:
1173 					skipctl = B_TRUE;
1174 					break;
1175 				}
1176 			}
1177 		}
1178 
1179 		ctl_skip_mask |= skipctl << i;
1180 		status_skip_mask |= skipstatus << i;
1181 
1182 		if (skipctl && skipstatus)
1183 			continue;
1184 
1185 		/*
1186 		 * Record which MCA banks were enabled, from the point of view
1187 		 * of the whole chip (if some cores share a bank we must be
1188 		 * sure either can logout from it).
1189 		 */
1190 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1191 
1192 #ifndef __xpv
1193 		/*
1194 		 * check CMCI capability
1195 		 */
1196 		if (mcg_ctl2_present) {
1197 			uint64_t ctl2;
1198 			uint32_t cap = 0;
1199 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1200 			if (ctl2 & MSR_MC_CTL2_EN)
1201 				continue;
1202 			ctl2 |= MSR_MC_CTL2_EN;
1203 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1204 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1205 			mca->gcpu_bank_cmci[i].cmci_cap = cap =
1206 			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1207 			if (cap)
1208 				cmci_capable ++;
1209 			/*
1210 			 * Set threshold to 1 while unset the en field, to avoid
1211 			 * CMCI trigged before APIC LVT entry init.
1212 			 */
1213 			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
1214 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1215 
1216 			/*
1217 			 * init cmci related count
1218 			 */
1219 			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1220 			mca->gcpu_bank_cmci[i].drtcmci = 0;
1221 			mca->gcpu_bank_cmci[i].ncmci = 0;
1222 		}
1223 #endif
1224 	}
1225 
1226 #ifndef __xpv
1227 	if (cmci_capable)
1228 		cmi_enable_cmci = 1;
1229 #endif
1230 
1231 #ifndef __xpv
1232 	/*
1233 	 * Log any valid telemetry lurking in the MCA banks, but do not
1234 	 * clear the status registers.  Ignore the disposition returned -
1235 	 * we have already paniced or reset for any nasty errors found here.
1236 	 *
1237 	 * Intel vol 3A says that we should not do this on family 0x6,
1238 	 * and that for any extended family the BIOS clears things
1239 	 * on power-on reset so you'll only potentially find valid telemetry
1240 	 * on warm reset (we do it for both - on power-on reset we should
1241 	 * just see zeroes).
1242 	 *
1243 	 * AMD docs since K7 say we should process anything we find here.
1244 	 */
1245 	if (!gcpu_suppress_log_on_init &&
1246 	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
1247 	    vendor == X86_VENDOR_AMD))
1248 		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1249 		    GCPU_MPT_WHAT_POKE_ERR);
1250 
1251 	/*
1252 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1253 	 * model-specific module the power of veto.
1254 	 */
1255 	for (i = 0; i < nbanks; i++) {
1256 		struct gcpu_bios_bankcfg *bcfgp =
1257 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1258 
1259 		/*
1260 		 * Stash inherited bank MCA state, even for banks we will
1261 		 * not initialize ourselves.  Do not read the MISC register
1262 		 * unconditionally - on some processors that will #GP on
1263 		 * banks that do not implement the MISC register (would be
1264 		 * caught by on_trap, anyway).
1265 		 */
1266 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1267 		    &bcfgp->bios_bank_ctl);
1268 
1269 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1270 		    &bcfgp->bios_bank_status);
1271 
1272 		if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
1273 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1274 			    &bcfgp->bios_bank_addr);
1275 
1276 		/*
1277 		 * In some old BIOS the status value after boot can indicate
1278 		 * MISCV when there is actually no MISC register for
1279 		 * that bank.  The following read could therefore
1280 		 * aggravate a general protection fault.  This should be
1281 		 * caught by on_trap, but the #GP fault handler is busted
1282 		 * and can suffer a double fault even before we get to
1283 		 * trap() to check for on_trap protection.  Until that
1284 		 * issue is fixed we remove the one access that we know
1285 		 * can cause a #GP.
1286 		 *
1287 		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1288 		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1289 		 *	    &bcfgp->bios_bank_misc);
1290 		 */
1291 		bcfgp->bios_bank_misc = 0;
1292 
1293 		if (!(ctl_skip_mask & (1 << i))) {
1294 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1295 			    cms_bankctl_val(hdl, i, -1ULL));
1296 		}
1297 
1298 		if (!(status_skip_mask & (1 << i))) {
1299 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1300 			    cms_bankstatus_val(hdl, i, 0ULL));
1301 		}
1302 	}
1303 #endif
1304 	/*
1305 	 * Now let the model-specific support perform further initialization
1306 	 * of non-architectural features.
1307 	 */
1308 	cms_mca_init(hdl, nbanks);
1309 
1310 #ifndef __xpv
1311 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1312 	membar_producer();
1313 
1314 	/* enable all machine-check features */
1315 	if (mcg_ctl_present)
1316 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1317 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1318 #endif
1319 
1320 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1321 
1322 #ifndef __xpv
1323 	/* enable machine-check exception in CR4 */
1324 	cmi_hdl_enable_mce(hdl);
1325 #endif
1326 }
1327 
1328 static uint64_t
1329 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1330     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1331 {
1332 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1333 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1334 	int nbanks = mca->gcpu_mca_nbanks;
1335 	gcpu_mce_status_t mce;
1336 	gcpu_bank_logout_t *gbl;
1337 	uint64_t disp = 0;
1338 	int i;
1339 
1340 	if (mcesp == NULL)
1341 		mcesp = &mce;
1342 
1343 	mcesp->mce_nerr = nerr;
1344 
1345 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1346 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1347 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1348 
1349 	/*
1350 	 * If this a machine check then if the return instruction pointer
1351 	 * is not valid the current context is lost.
1352 	 */
1353 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1354 		disp |= CMI_ERRDISP_RIPV_INVALID;
1355 
1356 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1357 		uint64_t mcistatus = gbl->gbl_status;
1358 		uint32_t ms_scope;
1359 		int pcc, uc;
1360 		int poisoned;
1361 
1362 		if (!(mcistatus & MSR_MC_STATUS_VAL))
1363 			continue;
1364 
1365 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1366 			continue;
1367 
1368 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1369 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1370 		mcesp->mce_npcc += pcc;
1371 		mcesp->mce_nuc += uc;
1372 
1373 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1374 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1375 
1376 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1377 			pcc = 0;
1378 			mcesp->mce_npcc_ok++;
1379 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1380 		}
1381 
1382 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1383 			uc = 0;
1384 			mcesp->mce_nuc_ok++;
1385 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1386 		}
1387 
1388 		if (uc) {
1389 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1390 			if (poisoned) {
1391 				mcesp->mce_nuc_poisoned++;
1392 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1393 			}
1394 		}
1395 
1396 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1397 			/*
1398 			 * We're not being instructed to ignore the error,
1399 			 * so apply our standard disposition logic to it.
1400 			 */
1401 			if (uc && !poisoned) {
1402 				unconstrained++;
1403 				gbl->gbl_disp |= disp |
1404 				    CMI_ERRDISP_UC_UNCONSTRAINED;
1405 			}
1406 
1407 			if (pcc && ismc) {
1408 				curctxbad++;
1409 				gbl->gbl_disp |= disp |
1410 				    CMI_ERRDISP_CURCTXBAD;
1411 			}
1412 
1413 			/*
1414 			 * Even if the above may not indicate that the error
1415 			 * is terminal, model-specific support may insist
1416 			 * that we treat it as such.  Such errors wil be
1417 			 * fatal even if discovered via poll.
1418 			 */
1419 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1420 				forcefatal++;
1421 				mcesp->mce_forcefatal++;
1422 				gbl->gbl_disp |= disp |
1423 				    CMI_ERRDISP_FORCEFATAL;
1424 			}
1425 		} else {
1426 			mcesp->mce_ignored++;
1427 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1428 		}
1429 	}
1430 
1431 	if (unconstrained > 0)
1432 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1433 
1434 	if (curctxbad > 0)
1435 		disp |= CMI_ERRDISP_CURCTXBAD;
1436 
1437 	if (forcefatal > 0)
1438 		disp |= CMI_ERRDISP_FORCEFATAL;
1439 
1440 	if (gcpu_mca_queue != NULL) {
1441 		int how;
1442 
1443 		if (ismc) {
1444 			how = cmi_mce_response(rp, disp) ?
1445 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1446 			    ERRORQ_SYNC;	/* panic flow will drain */
1447 		} else {
1448 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1449 			    cmi_panic_on_ue()) ?
1450 			    ERRORQ_SYNC :	/* poller will panic */
1451 			    ERRORQ_ASYNC;	/* no panic */
1452 		}
1453 
1454 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1455 	} else if (disp != 0) {
1456 		gcpu_bleat(hdl, gcl);
1457 	}
1458 
1459 	mcesp->mce_disp = disp;
1460 
1461 	return (disp);
1462 }
1463 
1464 /*
1465  * Gather error telemetry from our source, and then submit it for
1466  * processing.
1467  */
1468 
/*
 * True if 'status' describes an error that looks like it could raise a
 * machine check: reporting is ENabled and the error is UC and/or PCC.
 */
#define	IS_MCE_CANDIDATE(status) \
	((((status) & MSR_MC_STATUS_EN) != 0) && \
	(((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0))

/*
 * True if two MCi_STATUS values agree in every bit other than OVerflow.
 */
#define	STATUS_EQV(s1, s2) \
	((((s1) ^ (s2)) & ~MSR_MC_STATUS_OVER) == 0)

/* Number of times a polled status clear was initially deferred */
static uint32_t gcpu_deferrred_polled_clears;
1476 
1477 #ifndef __xpv
1478 static void
1479 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1480     uint64_t status, int what)
1481 {
1482 	uint64_t ctl2;
1483 
1484 	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1485 	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1486 	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
1487 
1488 		if (!(bank_cmci_p->cmci_enabled)) {
1489 			/*
1490 			 * when cmci is disabled, and the bank has no error or
1491 			 * no corrected error for
1492 			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1493 			 * turn on this bank's cmci.
1494 			 */
1495 
1496 			bank_cmci_p->drtcmci ++;
1497 
1498 			if (bank_cmci_p->drtcmci >=
1499 			    gcpu_mca_cmci_reenable_threshold) {
1500 
1501 				/* turn on cmci */
1502 
1503 				(void) cmi_hdl_rdmsr(hdl,
1504 				    IA32_MSR_MC_CTL2(bank), &ctl2);
1505 				ctl2 |= MSR_MC_CTL2_EN;
1506 				(void) cmi_hdl_wrmsr(hdl,
1507 				    IA32_MSR_MC_CTL2(bank), ctl2);
1508 
1509 				/* reset counter and set flag */
1510 				bank_cmci_p->drtcmci = 0;
1511 				bank_cmci_p->cmci_enabled = 1;
1512 			}
1513 		} else {
1514 			/*
1515 			 * when cmci is enabled,if is in cyclic poll and the
1516 			 * bank has no error or no corrected error, reset ncmci
1517 			 * counter
1518 			 */
1519 			bank_cmci_p->ncmci = 0;
1520 		}
1521 	}
1522 }
1523 
1524 static void
1525 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1526     int what)
1527 {
1528 	uint64_t ctl2 = 0;
1529 
1530 	/*
1531 	 * if cmci of this bank occurred beyond
1532 	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1533 	 * turn off this bank's CMCI;
1534 	 */
1535 	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1536 
1537 		/* if it is cmci trap, increase the count */
1538 		bank_cmci_p->ncmci++;
1539 
1540 		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1541 
1542 			/* turn off cmci */
1543 
1544 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1545 			    &ctl2);
1546 			ctl2 &= ~MSR_MC_CTL2_EN;
1547 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1548 			    ctl2);
1549 
1550 			/* clear the flag and count */
1551 
1552 			bank_cmci_p->cmci_enabled = 0;
1553 			bank_cmci_p->ncmci = 0;
1554 		}
1555 	}
1556 }
1557 #endif
1558 
1559 static void
1560 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1561     cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1562 {
1563 	int i;
1564 	gcpu_bank_logout_t *gbl, *pgbl;
1565 	uint64_t status;
1566 
1567 	if (first < 0 || last < 0)
1568 		return;
1569 
1570 	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1571 		status = gbl->gbl_status;
1572 		if (status == 0)
1573 			continue;
1574 		if (clrstatus == B_FALSE)
1575 			goto serialize;
1576 
1577 		/*
1578 		 * For i86xpv we always clear status in order to invalidate
1579 		 * the interposed telemetry.
1580 		 *
1581 		 * For native machine checks we always clear status here.  For
1582 		 * native polls we must be a little more cautious since there
1583 		 * is an outside chance that we may clear telemetry from a
1584 		 * shared MCA bank on which a sibling core is machine checking.
1585 		 *
1586 		 * For polled observations of errors that look like they may
1587 		 * produce a machine check (UC/PCC and ENabled, although these
1588 		 * do not guarantee a machine check on error occurence)
1589 		 * we will not clear the status at this wakeup unless
1590 		 * we saw the same status at the previous poll.	 We will
1591 		 * always process and log the current observations - it
1592 		 * is only the clearing of MCi_STATUS which may be
1593 		 * deferred until the next wakeup.
1594 		 */
1595 		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1596 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1597 			goto serialize;
1598 		}
1599 
1600 		/*
1601 		 * We have a polled observation of a machine check
1602 		 * candidate.  If we saw essentially the same status at the
1603 		 * last poll then clear the status now since this appears
1604 		 * not to be a #MC candidate after all.	 If we see quite
1605 		 * different status now then do not clear, but reconsider at
1606 		 * the next poll.  In no actual machine check clears
1607 		 * the status in the interim then the status should not
1608 		 * keep changing forever (meaning we'd never clear it)
1609 		 * since before long we'll simply have latched the highest-
1610 		 * priority error and set the OVerflow bit.  Nonetheless
1611 		 * we count how many times we defer clearing and after
1612 		 * a while insist on clearing the status.
1613 		 */
1614 		pgbl = &pgcl->gcl_data[i];
1615 		if (pgbl->gbl_clrdefcnt != 0) {
1616 			/* We deferred clear on this bank at last wakeup */
1617 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1618 			    pgbl->gbl_clrdefcnt > 5) {
1619 				/*
1620 				 * Status is unchanged so clear it now and,
1621 				 * since we have already logged this info,
1622 				 * avoid logging it again.
1623 				 */
1624 				gbl->gbl_status = 0;
1625 				(void) cmi_hdl_wrmsr(hdl,
1626 				    IA32_MSR_MC(i, STATUS), 0ULL);
1627 			} else {
1628 				/* Record deferral for next wakeup */
1629 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1630 			}
1631 		} else {
1632 			/* Record initial deferral for next wakeup */
1633 			gbl->gbl_clrdefcnt = 1;
1634 			gcpu_deferrred_polled_clears++;
1635 		}
1636 
1637 serialize:
1638 		{
1639 #ifdef __xpv
1640 			;
1641 #else
1642 			/*
1643 			 * Intel Vol 3A says to execute a serializing
1644 			 * instruction here, ie CPUID.	Well WRMSR is also
1645 			 * defined to be serializing, so the status clear above
1646 			 * should suffice.  To be a good citizen, and since
1647 			 * some clears are deferred, we'll execute a CPUID
1648 			 * instruction here.
1649 			 */
1650 			struct cpuid_regs tmp;
1651 			(void) __cpuid_insn(&tmp);
1652 #endif
1653 		}
1654 	}
1655 }
1656 
1657 /*ARGSUSED5*/
1658 void
1659 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1660     gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1661 {
1662 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1663 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1664 	int nbanks = mca->gcpu_mca_nbanks;
1665 	gcpu_bank_logout_t *gbl, *pgbl;
1666 	gcpu_logout_t *gcl, *pgcl;
1667 	int ismc = (rp != NULL);
1668 	int ispoll = !ismc;
1669 	int i, nerr = 0;
1670 	cmi_errno_t err;
1671 	uint64_t mcg_status;
1672 	uint64_t disp;
1673 	uint64_t cap;
1674 	int first = -1;
1675 	int last = -1;
1676 	int willpanic = 0;
1677 
1678 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1679 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1680 	    CMI_SUCCESS) {
1681 		if (mcesp != NULL)
1682 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1683 		return;
1684 	}
1685 
1686 	if (ismc) {
1687 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1688 	} else {
1689 		int pidx = mca->gcpu_mca_nextpoll_idx;
1690 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1691 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1692 
1693 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1694 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1695 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1696 	}
1697 
1698 	gcl->gcl_timestamp = gethrtime_waitfree();
1699 	gcl->gcl_mcg_status = mcg_status;
1700 	gcl->gcl_ip = rp ? rp->r_pc : 0;
1701 
1702 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1703 	if (cap & MCG_CAP_TES_P)
1704 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1705 
1706 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1707 		uint64_t status, status2, addr, misc;
1708 		int retries = gcpu_mca_telemetry_retries;
1709 
1710 		gbl->gbl_status = 0;
1711 		gbl->gbl_disp = 0;
1712 		gbl->gbl_clrdefcnt = 0;
1713 
1714 		/*
1715 		 * Only logout from MCA banks we have initialized from at
1716 		 * least one core.  If a core shares an MCA bank with another
1717 		 * but perhaps lost the race to initialize it, then it must
1718 		 * still be allowed to logout from the shared bank.
1719 		 */
1720 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1721 			continue;
1722 
1723 		/*
1724 		 * On a poll look only at the banks we've been asked to check.
1725 		 */
1726 		if (rp == NULL && !(bankmask & 1 << i))
1727 			continue;
1728 
1729 
1730 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1731 		    CMI_SUCCESS)
1732 			continue;
1733 
1734 #ifndef __xpv
1735 		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1736 #endif
1737 
1738 retry:
1739 		if (!(status & MSR_MC_STATUS_VAL))
1740 			continue;
1741 
1742 		/* First and last bank that have valid status */
1743 		if (first < 0)
1744 			first = i;
1745 		last = i;
1746 
1747 		addr = -1;
1748 		misc = 0;
1749 
1750 		if (status & MSR_MC_STATUS_ADDRV)
1751 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1752 
1753 		if (status & MSR_MC_STATUS_MISCV)
1754 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1755 
1756 #ifndef __xpv
1757 		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1758 #endif
1759 
1760 		/*
1761 		 * Allow the model-specific code to extract bank telemetry.
1762 		 */
1763 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1764 
1765 		/*
1766 		 * Not all cpu models assure us that the status/address/misc
1767 		 * data will not change during the above sequence of MSR reads,
1768 		 * or that it can only change by the addition of the OVerflow
1769 		 * bit to the status register.  If the status has changed
1770 		 * other than in the overflow bit then we attempt to reread
1771 		 * for a consistent snapshot, but eventually give up and
1772 		 * go with what we've got.  We only perform this check
1773 		 * for a poll - a further #MC during a #MC will reset, and
1774 		 * polled errors should not overwrite higher-priority
1775 		 * trapping errors (but could set the overflow bit).
1776 		 */
1777 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1778 		    &status2)) == CMI_SUCCESS) {
1779 			if (!STATUS_EQV(status, status2)) {
1780 				if (retries-- > 0) {
1781 					status = status2;
1782 					goto retry;
1783 				} else {
1784 					gbl->gbl_disp |=
1785 					    CMI_ERRDISP_INCONSISTENT;
1786 				}
1787 			}
1788 		} else if (ispoll && err != CMI_SUCCESS) {
1789 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1790 		}
1791 
1792 		nerr++;
1793 		gbl->gbl_status = status;
1794 		gbl->gbl_addr = addr;
1795 		gbl->gbl_misc = misc;
1796 
1797 		/*
1798 		 * For polled observation, if the count of deferred status
1799 		 * clears updated in the clear_mc() is nonzero and the
1800 		 * MCi_STATUS has not changed, the last wakeup has produced
1801 		 * the ereport of the error. Therefore, clear the status in
1802 		 * this wakeup to avoid duplicate ereport.
1803 		 */
1804 		pgbl = &pgcl->gcl_data[i];
1805 		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1806 		    pgbl->gbl_clrdefcnt != 0) {
1807 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1808 				gbl->gbl_status = 0;
1809 				(void) cmi_hdl_wrmsr(hdl,
1810 				    IA32_MSR_MC(i, STATUS), 0ULL);
1811 			}
1812 		}
1813 	}
1814 
1815 	if (gcpu_mca_stack_flag)
1816 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1817 	else
1818 		gcl->gcl_stackdepth = 0;
1819 
1820 	/*
1821 	 * Decide our disposition for this error or errors, and submit for
1822 	 * logging and subsequent diagnosis.
1823 	 */
1824 	if (nerr != 0) {
1825 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1826 
1827 		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1828 
1829 		if (!willpanic)
1830 			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1831 	} else {
1832 		disp = 0;
1833 		if (mcesp) {
1834 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1835 		}
1836 	}
1837 
1838 	/*
1839 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1840 	 * If a second #MC had occured before now the system would have
1841 	 * reset.  We can only do thise once gcpu_mca_process has copied
1842 	 * the logout structure.
1843 	 */
1844 	if (ismc && mcg_status & MCG_STATUS_MCIP)
1845 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1846 
1847 	/*
1848 	 * At this point we have read and logged all telemetry that is visible
1849 	 * under the MCA.  On architectures for which the NorthBridge is
1850 	 * on-chip this may include NB-observed errors, but where the NB
1851 	 * is off chip it may have been the source of the #MC request and
1852 	 * so we must call into the memory-controller driver to give it
1853 	 * a chance to log errors.
1854 	 */
1855 	if (ismc) {
1856 		cmi_mc_logout(hdl, 1, willpanic);
1857 	}
1858 }
1859 
1860 #ifndef __xpv
1861 int gcpu_mca_trap_vomit_summary = 0;
1862 
1863 /*
1864  * On a native machine check exception we come here from mcetrap via
1865  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1866  * cpus of the chip, so it is possible that another cpu on this chip could
1867  * initiate a poll while we're in the #mc handler;  it is also possible that
1868  * this trap has occured during a poll on this cpu.  So we must acquire
1869  * the chip-wide poll lock, but be careful to avoid deadlock.
1870  *
1871  * The 'data' pointer cannot be NULL due to init order.
1872  */
1873 uint64_t
1874 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1875 {
1876 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1877 	kmutex_t *poll_lock = NULL;
1878 	gcpu_mce_status_t mce;
1879 	uint64_t mcg_status;
1880 	int tooklock = 0;
1881 
1882 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1883 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1884 		return (0);
1885 
1886 	/*
1887 	 * Synchronize with any poller from another core that may happen
1888 	 * to share access to one or more of the MCA banks.
1889 	 */
1890 	if (gcpu->gcpu_shared != NULL)
1891 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1892 
1893 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1894 		/*
1895 		 * The lock is not owned by the thread we have
1896 		 * interrupted.  Spin for this adaptive lock.
1897 		 */
1898 		while (!mutex_tryenter(poll_lock)) {
1899 			while (mutex_owner(poll_lock) != NULL)
1900 				;
1901 		}
1902 		tooklock = 1;
1903 	}
1904 
1905 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
1906 
1907 	if (tooklock)
1908 		mutex_exit(poll_lock);
1909 
1910 	/*
1911 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1912 	 */
1913 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1914 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1915 		    "%u PCC (%u ok), "
1916 		    "%u UC (%d ok, %u poisoned), "
1917 		    "%u forcefatal, %u ignored",
1918 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1919 		    mce.mce_npcc, mce.mce_npcc_ok,
1920 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1921 		    mce.mce_forcefatal, mce.mce_ignored);
1922 	}
1923 
1924 	return (mce.mce_disp);
1925 }
1926 #endif
1927 
1928 /*ARGSUSED*/
1929 void
1930 gcpu_faulted_enter(cmi_hdl_t hdl)
1931 {
1932 	/* Nothing to do here */
1933 }
1934 
1935 /*ARGSUSED*/
1936 void
1937 gcpu_faulted_exit(cmi_hdl_t hdl)
1938 {
1939 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1940 
1941 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1942 }
1943 
1944 /*
1945  * Write the requested values to the indicated MSRs.  Having no knowledge
1946  * of the model-specific requirements for writing to these model-specific
1947  * registers, we will only blindly write to those MSRs if the 'force'
1948  * argument is nonzero.  That option should only be used in prototyping
1949  * and debugging.
1950  */
1951 /*ARGSUSED*/
1952 cmi_errno_t
1953 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1954     int force)
1955 {
1956 	int i, errs = 0;
1957 
1958 	for (i = 0; i < nregs; i++) {
1959 		uint_t msr = regs[i].cmr_msrnum;
1960 		uint64_t val = regs[i].cmr_msrval;
1961 
1962 		if (cms_present(hdl)) {
1963 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1964 				errs++;
1965 		} else if (force) {
1966 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1967 		} else {
1968 			errs++;
1969 		}
1970 	}
1971 
1972 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1973 }
1974