xref: /illumos-gate/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c (revision dea9f5e6a4938723acec9624b3aa3f680f2f5c9f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2018, Joyent, Inc.
25  */
26 /*
27  * Copyright (c) 2010, Intel Corporation.
28  * All rights reserved.
29  */
30 
31 #include <sys/mca_x86.h>
32 #include <sys/cpu_module_impl.h>
33 #include <sys/cpu_module_ms.h>
34 #include <sys/cmn_err.h>
35 #include <sys/cpuvar.h>
36 #include <sys/pghw.h>
37 #include <sys/x86_archext.h>
38 #include <sys/sysmacros.h>
39 #include <sys/regset.h>
40 #include <sys/privregs.h>
41 #include <sys/systm.h>
42 #include <sys/types.h>
43 #include <sys/log.h>
44 #include <sys/psw.h>
45 #include <sys/fm/protocol.h>
46 #include <sys/fm/util.h>
47 #include <sys/errorq.h>
48 #include <sys/mca_x86.h>
49 #include <sys/fm/cpu/GMCA.h>
50 #include <sys/fm/smb/fmsmb.h>
51 #include <sys/sysevent.h>
52 #include <sys/ontrap.h>
53 #include <sys/smp_impldefs.h>
54 
55 #include "gcpu.h"
56 
extern int x86gentopo_legacy;	/* x86 generic topology support */

/*
 * When nonzero, MCi_ADDR is reported even if MCi_STATUS.ADDRV is clear:
 * gcpu_bleat() treats ADDRV as set when choosing its console format and
 * gcpu_ereport_add_logout() adds the address payload member regardless.
 */
static uint_t gcpu_force_addr_in_payload = 0;

/*
 * Clear to log telemetry found at initialization.  While processor docs
 * say you should process this telemetry on all but Intel family 0x6
 * there are way too many exceptions and we want to avoid bogus
 * diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
/*
 * CMCI throttling tunables (not built for __xpv).
 * NOTE(review): presumably event-count thresholds for disabling and
 * re-enabling CMCI - confirm against the CMCI handling code.
 */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;

/*
 * This is used to determine whether or not we have registered the CMCI CPU
 * setup function. This is protected by cpu_lock.
 */
static boolean_t gcpu_mca_cpu_registered = B_FALSE;
#endif
94 
/*
 * Table of architectural error-code dispositions.  gcpu_disp_match() walks
 * it from the first entry and returns the first one whose mask pair fits
 * the error code, so entry order is significant.
 *
 * Each initializer supplies, as used elsewhere in this file: the ereport
 * class leaf format (ged_class_fmt), the compound error name format or
 * NULL (ged_compound_fmt), the payload member flags (ged_ereport_members)
 * and the MASKON/MASKOFF pair tested by gcpu_disp_match().
 * NOTE(review): positional order is assumed to follow gcpu_error_disp_t
 * as declared in gcpu.h - confirm there before reordering fields.
 *
 * Positional specifiers (%n$s) in the format strings are expanded by
 * gcpu_mn_fmt().
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal parity error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
	},


	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 * (compound name built from the TT and LL mnemonics)
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 * (compound name built from the TT, LL and RRRR mnemonics)
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 * (compound name built from the LL, PP, RRRR, II and T mnemonics)
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
	/*
	 * Compound error codes - memory controller errors
	 * (compound name built from the CCCC and MMM mnemonics)
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
	},
};
229 
/*
 * Disposition that gcpu_mca_drain() substitutes for a bank whose telemetry
 * was flagged CMI_ERRDISP_INCONSISTENT.  It is not a member of
 * gcpu_errtypes[], so gcpu_disp_match() never returns it.
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

/*
 * The errorq created by gcpu_errorq_init() (drained by gcpu_mca_drain()),
 * and the lock that gcpu_errorq_init() holds while (re)creating it.
 */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

/* isxpv is 1 when this file is compiled with __xpv defined, otherwise 0. */
#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif
246 
247 static const gcpu_error_disp_t *
248 gcpu_disp_match(uint16_t code)
249 {
250 	const gcpu_error_disp_t *ged = gcpu_errtypes;
251 	int i;
252 
253 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
254 	    i++, ged++) {
255 		uint16_t on = ged->ged_errcode_mask_on;
256 		uint16_t off = ged->ged_errcode_mask_off;
257 
258 		if ((code & on) == on && (code & off) == 0)
259 			return (ged);
260 	}
261 
262 	return (NULL);
263 }
264 
/*
 * Isolate a bit field of an error code: keep only the bits selected by
 * 'mask', then shift the result right by 'shift' bit positions.
 */
static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	uint16_t field = code & mask;

	field >>= shift;

	return (field);
}

/*
 * Extract the sub-field called 'name' from 'code' using the matching
 * MCAX86_ERRCODE_<name>_MASK and MCAX86_ERRCODE_<name>_SHIFT definitions.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)
274 
/*
 * Placeholder mnemonics: "undefined" is what gcpu_mnemonic() returns for a
 * value with no table entry; "reserved" marks table slots whose encoding is
 * reserved.
 */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"

/*
 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
 * mnemonics and to ereport class name components.
 */

/*
 * One mnemonic table entry.  Tables of these are indexed by the raw value
 * of an error code sub-field; gcpu_mnemonic() picks one of the two strings
 * according to the requested namespace.
 */
struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};
287 
288 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
289 	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
290 	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
291 	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
292 	{ GCPU_MNEMONIC_UNDEF, "" }
293 };
294 
295 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
296 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
297 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
298 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
299 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
300 };
301 
302 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
303 	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
304 	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
305 	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
306 	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
307 	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
308 	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
309 	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
310 	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
311 	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
312 };
313 
314 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
315 	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
316 	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
317 	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
318 	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
319 };
320 
321 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
322 	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
323 	{ GCPU_MNEMONIC_RESVD, "" },
324 	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
325 	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
326 };
327 
328 static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
329 	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
330 	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
331 };
332 
333 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
334 	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
335 	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
336 	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
337 	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
338 	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
339 	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
340 	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
341 	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
342 	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
343 	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
344 	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
345 	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
346 	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
347 	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
348 	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
349 	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
350 };
351 
352 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
353 	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
354 	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
355 	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
356 	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
357 	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
358 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
359 	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
360 	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
361 };
362 
363 enum gcpu_mn_namespace {
364 	GCPU_MN_NAMESPACE_COMPOUND,
365 	GCPU_MN_NAMESPACE_EREPORT
366 };
367 
368 static const char *
369 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
370     enum gcpu_mn_namespace nspace)
371 {
372 	if (val >= tbl_sz || val > 0xff)
373 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
374 
375 	switch (nspace) {
376 	case GCPU_MN_NAMESPACE_COMPOUND:
377 		return (tbl[val].mne_compound);
378 		/*NOTREACHED*/
379 
380 	case GCPU_MN_NAMESPACE_EREPORT:
381 		return (tbl[val].mne_ereport);
382 		/*NOTREACHED*/
383 
384 	default:
385 		return (GCPU_MNEMONIC_UNDEF);
386 		/*NOTREACHED*/
387 	}
388 }
389 
390 /*
391  * The ereport class leaf component is either a simple string with no
392  * format specifiers, or a string with one or more embedded %n$s specifiers -
393  * positional selection for string arguments.  The kernel snprintf does
394  * not support %n$ (and teaching it to do so is too big a headache) so
395  * we will expand this restricted format string ourselves.
396  */
397 
398 #define	GCPU_CLASS_VARCOMPS	9
399 
400 #define	GCPU_MNEMONIC(code, name, nspace) \
401 	gcpu_mnemonic(gcpu_##name##_mnemonics, \
402 	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
403 	BIT_STRIP(code, name), nspace)
404 
405 static void
406 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
407     enum gcpu_mn_namespace nspace)
408 {
409 	uint16_t code = MCAX86_ERRCODE(status);
410 	const char *mn[GCPU_CLASS_VARCOMPS];
411 	char *p = buf;			/* current position in buf */
412 	char *q = buf + buflen;		/* pointer past last char in buf */
413 	int which, expfmtchar, error;
414 	char c;
415 
416 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
417 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
418 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
419 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
420 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
421 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
422 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
423 	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
424 	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
425 
426 	while (p < q - 1 && (c = *fmt++) != '\0') {
427 		if (c != '%') {
428 			/* not the beginning of a format specifier - copy */
429 			*p++ = c;
430 			continue;
431 		}
432 
433 		error = 0;
434 		which = -1;
435 		expfmtchar = -1;
436 
437 nextfmt:
438 		if ((c = *fmt++) == '\0')
439 			break;	/* early termination of fmt specifier */
440 
441 		switch (c) {
442 		case '1':
443 		case '2':
444 		case '3':
445 		case '4':
446 		case '5':
447 		case '6':
448 		case '7':
449 		case '8':
450 		case '9':
451 			if (which != -1) { /* allow only one positional digit */
452 				error++;
453 				break;
454 			}
455 			which = c - '1';
456 			goto nextfmt;
457 			/*NOTREACHED*/
458 
459 		case '$':
460 			if (which == -1) { /* no position specified */
461 				error++;
462 				break;
463 			}
464 			expfmtchar = 's';
465 			goto nextfmt;
466 			/*NOTREACHED*/
467 
468 		case 's':
469 			if (expfmtchar != 's') {
470 				error++;
471 				break;
472 			}
473 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
474 			    mn[which]);
475 			p += strlen(p);
476 			break;
477 
478 		default:
479 			error++;
480 			break;
481 		}
482 
483 		if (error)
484 			break;
485 	}
486 
487 	*p = '\0';	/* NUL termination */
488 }
489 
490 static void
491 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
492     const char *cpuclass, const char *leafclass)
493 {
494 	char *p = buf;			/* current position in buf */
495 	char *q = buf + buflen;		/* pointer past last char in buf */
496 
497 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
498 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
499 
500 	p += strlen(p);
501 	if (p >= q)
502 		return;
503 
504 	if (leafclass == NULL) {
505 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
506 		    GCPU_MN_NAMESPACE_EREPORT);
507 	} else {
508 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
509 		    leafclass);
510 	}
511 }
512 
513 /*
514  * Create an "hc" scheme FMRI identifying the given cpu with
515  * motherboard/chip/core/strand instance numbers.
516  */
517 static nvlist_t *
518 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
519 {
520 	nvlist_t *nvl, *fmri;
521 
522 	if ((nvl = fm_nvlist_create(nva)) == NULL)
523 		return (NULL);
524 
525 	if (!x86gentopo_legacy) {
526 		fmri = cmi_hdl_smb_bboard(hdl);
527 		if (fmri == NULL)
528 			return (NULL);
529 
530 		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
531 		    NULL, NULL, fmri, 3,
532 		    "chip", cmi_hdl_smb_chipid(hdl),
533 		    "core", cmi_hdl_coreid(hdl),
534 		    "strand", cmi_hdl_strandid(hdl));
535 	} else {
536 		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
537 		    "motherboard", 0,
538 		    "chip", cmi_hdl_chipid(hdl),
539 		    "core", cmi_hdl_coreid(hdl),
540 		    "strand", cmi_hdl_strandid(hdl));
541 	}
542 
543 	return (nvl);
544 }
545 
546 int gcpu_bleat_count_thresh = 5;
547 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
548 
549 /*
550  * Called when we are unable to propogate a logout structure onto an
551  * errorq for subsequent ereport preparation and logging etc.  The caller
552  * should usually only decide to call this for severe errors - those we
553  * suspect we may need to panic for.
554  */
555 static void
556 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
557 {
558 	hrtime_t now  = gethrtime_waitfree();
559 	static hrtime_t gcpu_last_bleat;
560 	gcpu_bank_logout_t *gbl;
561 	static int bleatcount;
562 	int i;
563 
564 	/*
565 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
566 	 * can come as fast as we like, but once we've spammed that many
567 	 * to the console we require a minimum interval to pass before
568 	 * any more complaints.
569 	 */
570 	if (++bleatcount > gcpu_bleat_count_thresh) {
571 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
572 			return;
573 		else
574 			bleatcount = 0;
575 	}
576 	gcpu_last_bleat = now;
577 
578 	cmn_err(CE_WARN,
579 	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
580 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
581 	    cmi_hdl_strandid(hdl));
582 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
583 	    (u_longlong_t)gcl->gcl_mcg_status);
584 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
585 		uint64_t status = gbl->gbl_status;
586 
587 		if (!(status & MSR_MC_STATUS_VAL))
588 			continue;
589 
590 		/* Force ADDRV for AMD Family 0xf and above */
591 		if (gcpu_force_addr_in_payload)
592 			status = status | MSR_MC_STATUS_ADDRV;
593 
594 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
595 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
596 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
597 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
598 			    i, IA32_MSR_MC(i, STATUS),
599 			    (u_longlong_t)gbl->gbl_status,
600 			    (u_longlong_t)gbl->gbl_addr,
601 			    (u_longlong_t)gbl->gbl_misc);
602 			break;
603 
604 		case MSR_MC_STATUS_ADDRV:
605 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
606 			    "STAT 0x%016llx ADDR 0x%016llx",
607 			    i, IA32_MSR_MC(i, STATUS),
608 			    (u_longlong_t)gbl->gbl_status,
609 			    (u_longlong_t)gbl->gbl_addr);
610 			break;
611 
612 		case MSR_MC_STATUS_MISCV:
613 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
614 			    "STAT 0x%016llx MISC 0x%016llx",
615 			    i, IA32_MSR_MC(i, STATUS),
616 			    (u_longlong_t)gbl->gbl_status,
617 			    (u_longlong_t)gbl->gbl_misc);
618 			break;
619 
620 		default:
621 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
622 			    "STAT 0x%016llx",
623 			    i, IA32_MSR_MC(i, STATUS),
624 			    (u_longlong_t)gbl->gbl_status);
625 			break;
626 
627 		}
628 	}
629 }
630 
631 #define	_GCPU_BSTATUS(status, what) \
632 	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
633 	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
634 
635 static void
636 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
637     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
638 {
639 	uint64_t members = ged ? ged->ged_ereport_members :
640 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
641 	uint64_t mcg = gcl->gcl_mcg_status;
642 	int mcip = mcg & MCG_STATUS_MCIP;
643 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
644 	uint64_t bstat = gbl->gbl_status;
645 
646 	/*
647 	 * Include the compound error name if requested and if this
648 	 * is a compound error type.
649 	 */
650 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
651 	    ged->ged_compound_fmt != NULL) {
652 		char buf[FM_MAX_CLASS];
653 
654 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
655 		    GCPU_MN_NAMESPACE_COMPOUND);
656 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
657 		    DATA_TYPE_STRING, buf, NULL);
658 	}
659 
660 	/*
661 	 * Include disposition information for this error
662 	 */
663 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
664 	    gbl->gbl_disp != 0) {
665 		int i, empty = 1;
666 		char buf[128];
667 		char *p = buf, *q = buf + 128;
668 		static struct _gcpu_disp_name {
669 			uint64_t dv;
670 			const char *dn;
671 		} disp_names[] = {
672 			{ CMI_ERRDISP_CURCTXBAD,
673 			    "processor_context_corrupt" },
674 			{ CMI_ERRDISP_RIPV_INVALID,
675 			    "return_ip_invalid" },
676 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
677 			    "unconstrained" },
678 			{ CMI_ERRDISP_FORCEFATAL,
679 			    "forcefatal" },
680 			{ CMI_ERRDISP_IGNORED,
681 			    "ignored" },
682 			{ CMI_ERRDISP_PCC_CLEARED,
683 			    "corrupt_context_cleared" },
684 			{ CMI_ERRDISP_UC_CLEARED,
685 			    "uncorrected_data_cleared" },
686 			{ CMI_ERRDISP_POISONED,
687 			    "poisoned" },
688 			{ CMI_ERRDISP_INCONSISTENT,
689 			    "telemetry_unstable" },
690 		};
691 
692 		for (i = 0; i < sizeof (disp_names) /
693 		    sizeof (struct _gcpu_disp_name); i++) {
694 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
695 				continue;
696 
697 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
698 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
699 			p += strlen(p);
700 			empty = 0;
701 		}
702 
703 		if (p != buf)
704 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
705 			    DATA_TYPE_STRING, buf, NULL);
706 	}
707 
708 	/*
709 	 * If MCG_STATUS is included add that and an indication of whether
710 	 * this ereport was the result of a machine check or poll.
711 	 */
712 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
713 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
714 		    DATA_TYPE_UINT64, mcg, NULL);
715 
716 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
717 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
718 	}
719 
720 	/*
721 	 * If an instruction pointer is to be included add one provided
722 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
723 	 */
724 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
725 	    mcg & MCG_STATUS_EIPV) {
726 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
727 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
728 	}
729 
730 	/*
731 	 * Add an indication of whether the trap occured during privileged code.
732 	 */
733 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
734 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
735 		    DATA_TYPE_BOOLEAN_VALUE,
736 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
737 	}
738 
739 	/*
740 	 * If requested, add the index of the MCA bank.  This indicates the
741 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
742 	 * to MCi_* - use the bank offset to correlate
743 	 */
744 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
745 		fm_payload_set(ereport,
746 		    /* Bank number */
747 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
748 		    /* Offset of MCi_CTL */
749 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
750 		    IA32_MSR_MC(bankno, CTL),
751 		    NULL);
752 	}
753 
754 	/*
755 	 * Add MCi_STATUS if requested, and decode it.
756 	 */
757 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
758 		const char *tbes[] = {
759 			"No tracking",			/* 00 */
760 			"Green - below threshold",	/* 01 */
761 			"Yellow - above threshold",	/* 10 */
762 			"Reserved"			/* 11 */
763 		};
764 
765 		fm_payload_set(ereport,
766 		    /* Bank MCi_STATUS */
767 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
768 		    /* Overflow? */
769 		    _GCPU_BSTATUS(bstat, OVER),
770 		    /* Uncorrected? */
771 		    _GCPU_BSTATUS(bstat, UC),
772 		    /* Enabled? */
773 		    _GCPU_BSTATUS(bstat, EN),
774 		    /* Processor context corrupt? */
775 		    _GCPU_BSTATUS(bstat, PCC),
776 		    /* Error code */
777 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
778 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
779 		    /* Model-specific error code */
780 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
781 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
782 		    NULL);
783 
784 		/*
785 		 * If MCG_CAP.TES_P indicates that that thresholding info
786 		 * is present in the architural component of the bank status
787 		 * then include threshold information for this bank.
788 		 */
789 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
790 			fm_payload_set(ereport,
791 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
792 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
793 			    NULL);
794 		}
795 	}
796 
797 	/*
798 	 * Add MCi_ADDR info if requested and valid. We force addition of
799 	 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
800 	 * to aid in analysis of ereports, for WatchDog errors.
801 	 */
802 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
803 	    ((bstat & MSR_MC_STATUS_ADDRV) ||
804 	    gcpu_force_addr_in_payload)) {
805 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
806 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
807 	}
808 
809 	/*
810 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
811 	 */
812 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
813 	    bstat & MSR_MC_STATUS_MISCV) {
814 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
815 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
816 	}
817 
818 }
819 
820 /*
821  * Construct and post an ereport based on the logout information from a
822  * single MCA bank.  We are not necessarily running on the cpu that
823  * detected the error.
824  */
825 static void
826 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
827     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
828 {
829 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
830 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
831 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
832 	const char *cpuclass = NULL, *leafclass = NULL;
833 	uint16_t code = MCAX86_ERRCODE(status);
834 	errorq_elem_t *eqep, *scr_eqep;
835 	nvlist_t *ereport, *detector;
836 	char buf[FM_MAX_CLASS];
837 	const char *classfmt;
838 	nv_alloc_t *nva;
839 
840 	if (panicstr) {
841 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
842 			return;
843 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
844 
845 		/*
846 		 * Allocate another element for scratch space, but fallback
847 		 * to the one we have if that fails.  We'd like to use the
848 		 * additional scratch space for nvlist construction.
849 		 */
850 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
851 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
852 		else
853 			nva = errorq_elem_nva(ereport_errorq, eqep);
854 	} else {
855 		ereport = fm_nvlist_create(NULL);
856 		nva = NULL;
857 		eqep = NULL;
858 		scr_eqep = NULL;
859 	}
860 
861 	if (ereport == NULL)
862 		return;
863 
864 	/*
865 	 * Common payload data required by the protocol:
866 	 *	- ereport class
867 	 *	- detector
868 	 *	- ENA
869 	 */
870 
871 	/*
872 	 * Ereport class - call into model-specific support to allow it to
873 	 * provide a cpu class or leaf class, otherwise calculate our own.
874 	 */
875 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
876 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
877 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
878 	    leafclass);
879 
880 	/*
881 	 * The detector FMRI.
882 	 */
883 	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
884 	    nva)) == NULL)
885 		detector = gcpu_fmri_create(hdl, nva);
886 
887 	/*
888 	 * Should we define a new ENA format 3?? for chip/core/strand?
889 	 * It will be better when virtualized.
890 	 */
891 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
892 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
893 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
894 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
895 
896 	if (panicstr) {
897 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
898 		nv_alloc_reset(nva);
899 	} else {
900 		fm_nvlist_destroy(detector, FM_NVA_FREE);
901 	}
902 
903 	/*
904 	 * Add the architectural ereport class-specific payload data.
905 	 */
906 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
907 
908 	/*
909 	 * Allow model-specific code to add ereport members.
910 	 */
911 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
912 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
913 
914 	/*
915 	 * Include stack if options is turned on and either selected in
916 	 * the payload member bitmask or inclusion is forced.
917 	 */
918 	if (gcpu_mca_stack_flag &&
919 	    (cms_ereport_includestack(hdl, mscookie) ==
920 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
921 		fm_payload_stack_add(ereport, gcl->gcl_stack,
922 		    gcl->gcl_stackdepth);
923 	}
924 
925 	/*
926 	 * If injection has taken place anytime in the past then note this
927 	 * on the ereport.
928 	 */
929 	if (cmi_inj_tainted() == B_TRUE) {
930 		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
931 		    B_TRUE, NULL);
932 	}
933 
934 	/*
935 	 * Post ereport.
936 	 */
937 	if (panicstr) {
938 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
939 		if (scr_eqep)
940 			errorq_cancel(ereport_errorq, scr_eqep);
941 	} else {
942 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
943 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
944 	}
945 
946 }
947 
948 /*ARGSUSED*/
949 void
950 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
951 {
952 	const gcpu_logout_t *gcl = data;
953 	const gcpu_bank_logout_t *gbl;
954 	int ismc;
955 	int i;
956 
957 	ismc = gcl->ismc;
958 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
959 		const gcpu_error_disp_t *gened;
960 		cms_cookie_t mscookie;
961 
962 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
963 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
964 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
965 
966 			/*
967 			 * Perform a match based on IA32 MCA architectural
968 			 * components alone.
969 			 */
970 			gened = gcpu_disp_match(code); /* may be NULL */
971 
972 			/*
973 			 * Now see if an model-specific match can be made.
974 			 */
975 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
976 			    i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
977 			    gcl->gcl_ms_logout);
978 
979 			/*
980 			 * Prepare and dispatch an ereport for logging and
981 			 * diagnosis.
982 			 */
983 			gcpu_ereport_post(gcl, i, gened, mscookie,
984 			    gbl->gbl_status);
985 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
986 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
987 			/*
988 			 * Telemetry kept changing as we tried to read
989 			 * it.  Force an unknown ereport leafclass but
990 			 * keep the telemetry unchanged for logging.
991 			 */
992 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
993 			    gbl->gbl_status);
994 		}
995 	}
996 }
997 
998 static size_t gcpu_mca_queue_datasz = 0;
999 
1000 /*
1001  * The following code is ready to make a weak attempt at growing the
1002  * errorq structure size.  Since it is not foolproof (we don't know
1003  * who may already be producing to the outgoing errorq) our caller
1004  * instead assures that we'll always be called with no greater data
1005  * size than on our first call.
1006  */
1007 static void
1008 gcpu_errorq_init(size_t datasz)
1009 {
1010 	int slots;
1011 
1012 	mutex_enter(&gcpu_mca_queue_lock);
1013 
1014 	if (gcpu_mca_queue_datasz >= datasz) {
1015 		mutex_exit(&gcpu_mca_queue_lock);
1016 		return;
1017 	}
1018 
1019 	membar_producer();
1020 	if (gcpu_mca_queue) {
1021 		gcpu_mca_queue_datasz = 0;
1022 		errorq_destroy(gcpu_mca_queue);
1023 	}
1024 
1025 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1026 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1027 
1028 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1029 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
1030 
1031 	if (gcpu_mca_queue != NULL)
1032 		gcpu_mca_queue_datasz = datasz;
1033 
1034 	mutex_exit(&gcpu_mca_queue_lock);
1035 }
1036 
1037 /*
1038  * Perform MCA initialization as described in section 14.6 of Intel 64
1039  * and IA-32 Architectures Software Developer's Manual Volume 3A.
1040  */
1041 
/*
 * Bank count of the first cpu to run gcpu_mca_init() with a nonzero
 * number of banks.  A cpu that later reports more banks than this bails
 * out of gcpu_mca_init() without initializing its MCA state.
 */
static uint_t global_nbanks;
1043 
1044 #ifndef __xpv
1045 /*ARGSUSED*/
1046 int
1047 gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
1048 {
1049 	/*
1050 	 * In general, we'd expect that in a multi-socket configuration, either
1051 	 * all CPUs would support CMCI or none of them would.  Unfortunately,
1052 	 * that may not be the case in the wild.  While we'd rather check the
1053 	 * handle's enablement state here, that itself is a bit complicated. We
1054 	 * don't have a guarantee in a heterogenous situation that the CPU in
1055 	 * question is using the generic CPU module or not, even though we've
1056 	 * been registered. As such, we allow the interrupt to be registered and
1057 	 * written to the local apic anyways. We won't have a CMCI interrupt
1058 	 * generated anyways because the MCA banks will not be programmed as
1059 	 * such for that CPU by the polling thread.
1060 	 */
1061 	switch (what) {
1062 	case CPU_ON:
1063 		psm_cmci_setup(cpuid, B_TRUE);
1064 		break;
1065 	case CPU_OFF:
1066 		psm_cmci_setup(cpuid, B_FALSE);
1067 		break;
1068 	default:
1069 		break;
1070 	}
1071 
1072 	return (0);
1073 }
1074 
1075 void
1076 gcpu_mca_cmci_enable(cmi_hdl_t hdl)
1077 {
1078 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1079 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1080 
1081 	/*
1082 	 * If this CPU doesn't support CMCI, don't do anything.
1083 	 */
1084 	if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0)
1085 		return;
1086 
1087 	/*
1088 	 * If we don't have support from the PSM module, then there's nothing we
1089 	 * can do. Note that this changes as we start up the system. The only
1090 	 * case where it may be mistakenly NULL is for the boot CPU. The boot
1091 	 * CPU will have this taken care of for it in gcpu_post_startup(), once
1092 	 * we know for certain whether or not the PSM module supports CMCI.
1093 	 */
1094 	if (psm_cmci_setup == NULL) {
1095 		return;
1096 	}
1097 
1098 	mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE;
1099 	if (MUTEX_HELD(&cpu_lock)) {
1100 		if (!gcpu_mca_cpu_registered) {
1101 			register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1102 			gcpu_mca_cpu_registered = B_TRUE;
1103 		}
1104 	} else {
1105 		mutex_enter(&cpu_lock);
1106 		if (!gcpu_mca_cpu_registered) {
1107 			register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1108 			gcpu_mca_cpu_registered = B_TRUE;
1109 		}
1110 		mutex_exit(&cpu_lock);
1111 	}
1112 
1113 	/*
1114 	 * Call the PSM op to make sure that we initialize things on
1115 	 * this CPU.
1116 	 */
1117 	psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE);
1118 }
1119 #endif	/* !__xpv */
1120 
1121 void
1122 gcpu_mca_init(cmi_hdl_t hdl)
1123 {
1124 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1125 	uint64_t cap;
1126 	uint_t vendor = cmi_hdl_vendor(hdl);
1127 	uint_t family = cmi_hdl_family(hdl);
1128 	uint_t rev = cmi_hdl_chiprev(hdl);
1129 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1130 	int mcg_ctl_present;
1131 	uint_t nbanks;
1132 	uint32_t ctl_skip_mask = 0;
1133 	uint32_t status_skip_mask = 0;
1134 	size_t mslsz;
1135 	int i;
1136 #ifndef __xpv
1137 	int mcg_ctl2_present;
1138 	uint32_t cmci_capable = 0;
1139 #endif
1140 	if (gcpu == NULL)
1141 		return;
1142 
1143 	/* We add MCi_ADDR always for AMD Family 0xf and above */
1144 	if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
1145 		gcpu_force_addr_in_payload = 1;
1146 
1147 	/*
1148 	 * Protect from some silly /etc/system settings.
1149 	 */
1150 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1151 		gcpu_mca_telemetry_retries = 5;
1152 
1153 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1154 		return;
1155 
1156 	/*
1157 	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1158 	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
1159 	 * processors, which have their own more primitive way of doing
1160 	 * machine checks, will not have cmi_mca_init called since their
1161 	 * CPUID information will not indicate both MCA and MCE features.
1162 	 */
1163 	ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));
1164 
1165 	/*
1166 	 * Determine whether the IA32_MCG_CTL register is present.  If it
1167 	 * is we will enable all features by writing -1 to it towards
1168 	 * the end of this initialization;  if it is absent then volume 3A
1169 	 * says we must nonetheless continue to initialize the individual
1170 	 * banks.
1171 	 */
1172 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
1173 #ifndef __xpv
1174 	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1175 #endif
1176 
1177 	/*
1178 	 * We squirell values away for inspection/debugging.
1179 	 */
1180 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1181 	if (mcg_ctl_present)
1182 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1183 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1184 
1185 	/*
1186 	 * Determine the number of error-reporting banks implemented.
1187 	 */
1188 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1189 
1190 	if (nbanks != 0 && global_nbanks == 0)
1191 		global_nbanks = nbanks;	/* no race - BSP will get here first */
1192 
1193 	/*
1194 	 * If someone is hiding the number of banks (perhaps we are fully
1195 	 * virtualized?) or if this processor has more banks than the
1196 	 * first to set global_nbanks then bail.  The latter requirement
1197 	 * is because we need to size our errorq data structure and we
1198 	 * don't want to have to grow the errorq (destroy and recreate)
1199 	 * which may just lose some telemetry.
1200 	 */
1201 	if (nbanks == 0 || nbanks > global_nbanks)
1202 		return;
1203 
1204 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1205 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1206 
1207 	/*
1208 	 * Calculate the size we need to allocate for a gcpu_logout_t
1209 	 * with a gcl_data array big enough for all banks of this cpu.
1210 	 * Add any space requested by the model-specific logout support.
1211 	 */
1212 	mslsz = cms_logout_size(hdl);
1213 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1214 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1215 
1216 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1217 		gcpu_logout_t *gcl;
1218 
1219 		mca->gcpu_mca_logout[i] = gcl =
1220 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1221 		gcl->gcl_gcpu = gcpu;
1222 		gcl->gcl_nbanks = nbanks;
1223 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1224 		    (char *)(&gcl->gcl_data[0]) + nbanks *
1225 		    sizeof (gcpu_bank_logout_t);
1226 
1227 	}
1228 
1229 #ifdef __xpv
1230 	gcpu_xpv_mca_init(nbanks);
1231 #endif
1232 
1233 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1234 
1235 #ifndef __xpv
1236 	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1237 	    KM_SLEEP);
1238 #endif
1239 
1240 	/*
1241 	 * Create our errorq to transport the logout structures.  This
1242 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1243 	 */
1244 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1245 
1246 	/*
1247 	 * Not knowing which, if any, banks are shared between cores we
1248 	 * assure serialization of MCA bank initialization by each cpu
1249 	 * on the chip.  On chip architectures in which some banks are
1250 	 * shared this will mean the shared resource is initialized more
1251 	 * than once - we're simply aiming to avoid simultaneous MSR writes
1252 	 * to the shared resource.
1253 	 *
1254 	 * Even with these precautions, some platforms may yield a GP fault
1255 	 * if a core other than a designated master tries to write anything
1256 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1257 	 * those writes under on_trap protection.
1258 	 */
1259 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1260 
1261 	/*
1262 	 * Initialize poller data, but don't start polling yet.
1263 	 */
1264 	gcpu_mca_poll_init(hdl);
1265 
1266 	/*
1267 	 * Work out which MCA banks we will initialize.  In MCA logout
1268 	 * code we will only read those banks which we initialize here.
1269 	 */
1270 	for (i = 0; i < nbanks; i++) {
1271 		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1272 		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1273 
1274 		if (!cms_present(hdl)) {
1275 			/*
1276 			 * Model-specific support is not present, try to use
1277 			 * sane defaults.
1278 			 *
1279 			 * On AMD family 6 processors, reports about spurious
1280 			 * machine checks indicate that bank 0 should be
1281 			 * skipped.
1282 			 *
1283 			 * On Intel family 6 processors, the documentation tells
1284 			 * us not to write to MC0_CTL.
1285 			 *
1286 			 */
1287 			if (i == 0 && family == 6) {
1288 				switch (vendor) {
1289 				case X86_VENDOR_AMD:
1290 					skipstatus = B_TRUE;
1291 					/*FALLTHRU*/
1292 				case X86_VENDOR_Intel:
1293 					skipctl = B_TRUE;
1294 					break;
1295 				}
1296 			}
1297 		}
1298 
1299 		ctl_skip_mask |= skipctl << i;
1300 		status_skip_mask |= skipstatus << i;
1301 
1302 		if (skipctl && skipstatus)
1303 			continue;
1304 
1305 		/*
1306 		 * Record which MCA banks were enabled, from the point of view
1307 		 * of the whole chip (if some cores share a bank we must be
1308 		 * sure either can logout from it).
1309 		 */
1310 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1311 
1312 #ifndef __xpv
1313 		/*
1314 		 * check CMCI capability
1315 		 */
1316 		if (mcg_ctl2_present) {
1317 			uint64_t ctl2;
1318 			uint32_t cap = 0;
1319 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1320 			if (ctl2 & MSR_MC_CTL2_EN)
1321 				continue;
1322 			ctl2 |= MSR_MC_CTL2_EN;
1323 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1324 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1325 			mca->gcpu_bank_cmci[i].cmci_cap = cap =
1326 			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1327 			if (cap)
1328 				cmci_capable ++;
1329 			/*
1330 			 * Set threshold to 1 while unset the en field, to avoid
1331 			 * CMCI trigged before APIC LVT entry init.
1332 			 */
1333 			ctl2 = (ctl2 & (~MSR_MC_CTL2_EN)) | 1;
1334 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1335 
1336 			/*
1337 			 * init cmci related count
1338 			 */
1339 			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1340 			mca->gcpu_bank_cmci[i].drtcmci = 0;
1341 			mca->gcpu_bank_cmci[i].ncmci = 0;
1342 		}
1343 #endif
1344 	}
1345 
1346 #ifndef __xpv
1347 	if (cmci_capable) {
1348 		mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE;
1349 		gcpu_mca_cmci_enable(hdl);
1350 	}
1351 #endif
1352 
1353 #ifndef __xpv
1354 	/*
1355 	 * Log any valid telemetry lurking in the MCA banks, but do not
1356 	 * clear the status registers.  Ignore the disposition returned -
1357 	 * we have already paniced or reset for any nasty errors found here.
1358 	 *
1359 	 * Intel vol 3A says that we should not do this on family 0x6,
1360 	 * and that for any extended family the BIOS clears things
1361 	 * on power-on reset so you'll only potentially find valid telemetry
1362 	 * on warm reset (we do it for both - on power-on reset we should
1363 	 * just see zeroes).
1364 	 *
1365 	 * AMD docs since K7 say we should process anything we find here.
1366 	 */
1367 	if (!gcpu_suppress_log_on_init &&
1368 	    ((vendor == X86_VENDOR_Intel && family >= 0xf) ||
1369 	    vendor == X86_VENDOR_AMD ||
1370 	    vendor == X86_VENDOR_HYGON))
1371 		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1372 		    GCPU_MPT_WHAT_POKE_ERR);
1373 
1374 	/*
1375 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1376 	 * model-specific module the power of veto.
1377 	 */
1378 	for (i = 0; i < nbanks; i++) {
1379 		struct gcpu_bios_bankcfg *bcfgp =
1380 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1381 
1382 		/*
1383 		 * Stash inherited bank MCA state, even for banks we will
1384 		 * not initialize ourselves.  Do not read the MISC register
1385 		 * unconditionally - on some processors that will #GP on
1386 		 * banks that do not implement the MISC register (would be
1387 		 * caught by on_trap, anyway).
1388 		 */
1389 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1390 		    &bcfgp->bios_bank_ctl);
1391 
1392 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1393 		    &bcfgp->bios_bank_status);
1394 
1395 		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
1396 		    gcpu_force_addr_in_payload) {
1397 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1398 			    &bcfgp->bios_bank_addr);
1399 		}
1400 
1401 		/*
1402 		 * In some old BIOS the status value after boot can indicate
1403 		 * MISCV when there is actually no MISC register for
1404 		 * that bank.  The following read could therefore
1405 		 * aggravate a general protection fault.  This should be
1406 		 * caught by on_trap, but the #GP fault handler is busted
1407 		 * and can suffer a double fault even before we get to
1408 		 * trap() to check for on_trap protection.  Until that
1409 		 * issue is fixed we remove the one access that we know
1410 		 * can cause a #GP.
1411 		 *
1412 		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1413 		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1414 		 *	    &bcfgp->bios_bank_misc);
1415 		 */
1416 		bcfgp->bios_bank_misc = 0;
1417 
1418 		if (!(ctl_skip_mask & (1 << i))) {
1419 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1420 			    cms_bankctl_val(hdl, i, -1ULL));
1421 		}
1422 
1423 		if (!(status_skip_mask & (1 << i))) {
1424 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1425 			    cms_bankstatus_val(hdl, i, 0ULL));
1426 		}
1427 	}
1428 #endif
1429 	/*
1430 	 * Now let the model-specific support perform further initialization
1431 	 * of non-architectural features.
1432 	 */
1433 	cms_mca_init(hdl, nbanks);
1434 
1435 #ifndef __xpv
1436 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1437 	membar_producer();
1438 
1439 	/* enable all machine-check features */
1440 	if (mcg_ctl_present)
1441 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1442 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1443 #endif
1444 
1445 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1446 
1447 #ifndef __xpv
1448 	/* enable machine-check exception in CR4 */
1449 	cmi_hdl_enable_mce(hdl);
1450 #endif
1451 }
1452 
1453 static uint64_t
1454 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1455     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1456 {
1457 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1458 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1459 	int nbanks = mca->gcpu_mca_nbanks;
1460 	gcpu_mce_status_t mce;
1461 	gcpu_bank_logout_t *gbl;
1462 	uint64_t disp = 0;
1463 	int i;
1464 
1465 	if (mcesp == NULL)
1466 		mcesp = &mce;
1467 
1468 	mcesp->mce_nerr = nerr;
1469 
1470 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1471 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1472 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1473 
1474 	/*
1475 	 * If this a machine check then if the return instruction pointer
1476 	 * is not valid the current context is lost.
1477 	 */
1478 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1479 		disp |= CMI_ERRDISP_RIPV_INVALID;
1480 	gcl->ismc = ismc;
1481 
1482 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1483 		uint64_t mcistatus = gbl->gbl_status;
1484 		uint32_t ms_scope;
1485 		int pcc, uc;
1486 		int poisoned;
1487 
1488 		if (!(mcistatus & MSR_MC_STATUS_VAL))
1489 			continue;
1490 
1491 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1492 			continue;
1493 
1494 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1495 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1496 		mcesp->mce_npcc += pcc;
1497 		mcesp->mce_nuc += uc;
1498 
1499 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1500 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1501 
1502 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1503 			pcc = 0;
1504 			mcesp->mce_npcc_ok++;
1505 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1506 		}
1507 
1508 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1509 			uc = 0;
1510 			mcesp->mce_nuc_ok++;
1511 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1512 		}
1513 
1514 		if (uc) {
1515 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1516 			if (poisoned) {
1517 				mcesp->mce_nuc_poisoned++;
1518 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1519 			}
1520 		}
1521 
1522 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1523 			/*
1524 			 * We're not being instructed to ignore the error,
1525 			 * so apply our standard disposition logic to it.
1526 			 */
1527 			if (uc && !poisoned) {
1528 				unconstrained++;
1529 				gbl->gbl_disp |= disp |
1530 				    CMI_ERRDISP_UC_UNCONSTRAINED;
1531 			}
1532 
1533 			if (pcc && ismc) {
1534 				curctxbad++;
1535 				gbl->gbl_disp |= disp |
1536 				    CMI_ERRDISP_CURCTXBAD;
1537 			}
1538 
1539 			/*
1540 			 * Even if the above may not indicate that the error
1541 			 * is terminal, model-specific support may insist
1542 			 * that we treat it as such.  Such errors wil be
1543 			 * fatal even if discovered via poll.
1544 			 */
1545 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1546 				forcefatal++;
1547 				mcesp->mce_forcefatal++;
1548 				gbl->gbl_disp |= disp |
1549 				    CMI_ERRDISP_FORCEFATAL;
1550 			}
1551 		} else {
1552 			mcesp->mce_ignored++;
1553 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1554 		}
1555 	}
1556 
1557 	if (unconstrained > 0)
1558 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1559 
1560 	if (curctxbad > 0)
1561 		disp |= CMI_ERRDISP_CURCTXBAD;
1562 
1563 	if (forcefatal > 0)
1564 		disp |= CMI_ERRDISP_FORCEFATAL;
1565 
1566 	if (gcpu_mca_queue != NULL) {
1567 		int how;
1568 
1569 		if (ismc) {
1570 			how = cmi_mce_response(rp, disp) ?
1571 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1572 			    ERRORQ_SYNC;	/* panic flow will drain */
1573 		} else {
1574 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1575 			    cmi_panic_on_ue()) ?
1576 			    ERRORQ_SYNC :	/* poller will panic */
1577 			    ERRORQ_ASYNC;	/* no panic */
1578 		}
1579 
1580 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1581 	} else if (disp != 0) {
1582 		gcpu_bleat(hdl, gcl);
1583 	}
1584 
1585 	mcesp->mce_disp = disp;
1586 
1587 	return (disp);
1588 }
1589 
1590 /*
1591  * Gather error telemetry from our source, and then submit it for
1592  * processing.
1593  */
1594 
/*
 * True if 'status' (an MCi_STATUS value) has the EN bit set together with
 * at least one of UC or PCC, i.e. it looks like an error that may raise
 * a machine check.
 */
#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

/*
 * True if two MCi_STATUS values are equal once the OVerflow bit has been
 * masked out of both.
 */
#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))

/*
 * Incremented by clear_mc() each time it records an initial deferral of
 * a polled MCi_STATUS clear.
 */
static uint32_t gcpu_deferrred_polled_clears;
1602 
1603 #ifndef __xpv
1604 static void
1605 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1606     uint64_t status, int what)
1607 {
1608 	uint64_t ctl2;
1609 
1610 	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1611 	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1612 	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
1613 
1614 		if (!(bank_cmci_p->cmci_enabled)) {
1615 			/*
1616 			 * when cmci is disabled, and the bank has no error or
1617 			 * no corrected error for
1618 			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1619 			 * turn on this bank's cmci.
1620 			 */
1621 
1622 			bank_cmci_p->drtcmci ++;
1623 
1624 			if (bank_cmci_p->drtcmci >=
1625 			    gcpu_mca_cmci_reenable_threshold) {
1626 
1627 				/* turn on cmci */
1628 
1629 				(void) cmi_hdl_rdmsr(hdl,
1630 				    IA32_MSR_MC_CTL2(bank), &ctl2);
1631 				ctl2 |= MSR_MC_CTL2_EN;
1632 				(void) cmi_hdl_wrmsr(hdl,
1633 				    IA32_MSR_MC_CTL2(bank), ctl2);
1634 
1635 				/* reset counter and set flag */
1636 				bank_cmci_p->drtcmci = 0;
1637 				bank_cmci_p->cmci_enabled = 1;
1638 			}
1639 		} else {
1640 			/*
1641 			 * when cmci is enabled,if is in cyclic poll and the
1642 			 * bank has no error or no corrected error, reset ncmci
1643 			 * counter
1644 			 */
1645 			bank_cmci_p->ncmci = 0;
1646 		}
1647 	}
1648 }
1649 
1650 static void
1651 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1652     int what)
1653 {
1654 	uint64_t ctl2 = 0;
1655 
1656 	/*
1657 	 * if cmci of this bank occurred beyond
1658 	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1659 	 * turn off this bank's CMCI;
1660 	 */
1661 	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1662 
1663 		/* if it is cmci trap, increase the count */
1664 		bank_cmci_p->ncmci++;
1665 
1666 		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1667 
1668 			/* turn off cmci */
1669 
1670 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1671 			    &ctl2);
1672 			ctl2 &= ~MSR_MC_CTL2_EN;
1673 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1674 			    ctl2);
1675 
1676 			/* clear the flag and count */
1677 
1678 			bank_cmci_p->cmci_enabled = 0;
1679 			bank_cmci_p->ncmci = 0;
1680 		}
1681 	}
1682 }
1683 #endif
1684 
1685 static void
1686 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1687     cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1688 {
1689 	int i;
1690 	gcpu_bank_logout_t *gbl, *pgbl;
1691 	uint64_t status;
1692 
1693 	if (first < 0 || last < 0)
1694 		return;
1695 
1696 	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1697 		status = gbl->gbl_status;
1698 		if (status == 0)
1699 			continue;
1700 		if (clrstatus == B_FALSE)
1701 			goto serialize;
1702 
1703 		/*
1704 		 * For i86xpv we always clear status in order to invalidate
1705 		 * the interposed telemetry.
1706 		 *
1707 		 * For native machine checks we always clear status here.  For
1708 		 * native polls we must be a little more cautious since there
1709 		 * is an outside chance that we may clear telemetry from a
1710 		 * shared MCA bank on which a sibling core is machine checking.
1711 		 *
1712 		 * For polled observations of errors that look like they may
1713 		 * produce a machine check (UC/PCC and ENabled, although these
1714 		 * do not guarantee a machine check on error occurence)
1715 		 * we will not clear the status at this wakeup unless
1716 		 * we saw the same status at the previous poll.	 We will
1717 		 * always process and log the current observations - it
1718 		 * is only the clearing of MCi_STATUS which may be
1719 		 * deferred until the next wakeup.
1720 		 */
1721 		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1722 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1723 			goto serialize;
1724 		}
1725 
1726 		/*
1727 		 * We have a polled observation of a machine check
1728 		 * candidate.  If we saw essentially the same status at the
1729 		 * last poll then clear the status now since this appears
1730 		 * not to be a #MC candidate after all.	 If we see quite
1731 		 * different status now then do not clear, but reconsider at
1732 		 * the next poll.  In no actual machine check clears
1733 		 * the status in the interim then the status should not
1734 		 * keep changing forever (meaning we'd never clear it)
1735 		 * since before long we'll simply have latched the highest-
1736 		 * priority error and set the OVerflow bit.  Nonetheless
1737 		 * we count how many times we defer clearing and after
1738 		 * a while insist on clearing the status.
1739 		 */
1740 		pgbl = &pgcl->gcl_data[i];
1741 		if (pgbl->gbl_clrdefcnt != 0) {
1742 			/* We deferred clear on this bank at last wakeup */
1743 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1744 			    pgbl->gbl_clrdefcnt > 5) {
1745 				/*
1746 				 * Status is unchanged so clear it now and,
1747 				 * since we have already logged this info,
1748 				 * avoid logging it again.
1749 				 */
1750 				gbl->gbl_status = 0;
1751 				(void) cmi_hdl_wrmsr(hdl,
1752 				    IA32_MSR_MC(i, STATUS), 0ULL);
1753 			} else {
1754 				/* Record deferral for next wakeup */
1755 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1756 			}
1757 		} else {
1758 			/* Record initial deferral for next wakeup */
1759 			gbl->gbl_clrdefcnt = 1;
1760 			gcpu_deferrred_polled_clears++;
1761 		}
1762 
1763 serialize:
1764 		{
1765 #ifdef __xpv
1766 			;
1767 #else
1768 			/*
1769 			 * Intel Vol 3A says to execute a serializing
1770 			 * instruction here, ie CPUID.	Well WRMSR is also
1771 			 * defined to be serializing, so the status clear above
1772 			 * should suffice.  To be a good citizen, and since
1773 			 * some clears are deferred, we'll execute a CPUID
1774 			 * instruction here.
1775 			 */
1776 			struct cpuid_regs tmp;
1777 			(void) __cpuid_insn(&tmp);
1778 #endif
1779 		}
1780 	}
1781 }
1782 
1783 /*ARGSUSED5*/
1784 void
1785 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1786     gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1787 {
1788 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1789 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1790 	int nbanks = mca->gcpu_mca_nbanks;
1791 	gcpu_bank_logout_t *gbl, *pgbl;
1792 	gcpu_logout_t *gcl, *pgcl;
1793 	int ismc = (rp != NULL);
1794 	int ispoll = !ismc;
1795 	int i, nerr = 0;
1796 	cmi_errno_t err;
1797 	uint64_t mcg_status;
1798 	uint64_t disp;
1799 	uint64_t cap;
1800 	int first = -1;
1801 	int last = -1;
1802 	int willpanic = 0;
1803 
1804 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1805 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1806 	    CMI_SUCCESS) {
1807 		if (mcesp != NULL)
1808 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1809 		return;
1810 	}
1811 
1812 	if (ismc) {
1813 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1814 		pgcl = NULL;
1815 	} else {
1816 		int pidx = mca->gcpu_mca_nextpoll_idx;
1817 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1818 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1819 
1820 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1821 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1822 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1823 	}
1824 
1825 	gcl->gcl_timestamp = gethrtime_waitfree();
1826 	gcl->gcl_mcg_status = mcg_status;
1827 	gcl->gcl_ip = rp ? rp->r_pc : 0;
1828 
1829 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1830 	if (cap & MCG_CAP_TES_P)
1831 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1832 
1833 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1834 		uint64_t status, status2, addr, misc;
1835 		int retries = gcpu_mca_telemetry_retries;
1836 
1837 		gbl->gbl_status = 0;
1838 		gbl->gbl_disp = 0;
1839 		gbl->gbl_clrdefcnt = 0;
1840 
1841 		/*
1842 		 * Only logout from MCA banks we have initialized from at
1843 		 * least one core.  If a core shares an MCA bank with another
1844 		 * but perhaps lost the race to initialize it, then it must
1845 		 * still be allowed to logout from the shared bank.
1846 		 */
1847 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1848 			continue;
1849 
1850 		/*
1851 		 * On a poll look only at the banks we've been asked to check.
1852 		 */
1853 		if (rp == NULL && !(bankmask & 1 << i))
1854 			continue;
1855 
1856 
1857 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1858 		    CMI_SUCCESS)
1859 			continue;
1860 
1861 #ifndef __xpv
1862 		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1863 #endif
1864 
1865 retry:
1866 		if (!(status & MSR_MC_STATUS_VAL))
1867 			continue;
1868 
1869 		/* First and last bank that have valid status */
1870 		if (first < 0)
1871 			first = i;
1872 		last = i;
1873 
1874 		addr = -1;
1875 		misc = 0;
1876 
1877 		if ((status & MSR_MC_STATUS_ADDRV) ||
1878 		    gcpu_force_addr_in_payload)
1879 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1880 
1881 		if (status & MSR_MC_STATUS_MISCV)
1882 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1883 
1884 #ifndef __xpv
1885 		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1886 #endif
1887 
1888 		/*
1889 		 * Allow the model-specific code to extract bank telemetry.
1890 		 */
1891 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1892 
1893 		/*
1894 		 * Not all cpu models assure us that the status/address/misc
1895 		 * data will not change during the above sequence of MSR reads,
1896 		 * or that it can only change by the addition of the OVerflow
1897 		 * bit to the status register.  If the status has changed
1898 		 * other than in the overflow bit then we attempt to reread
1899 		 * for a consistent snapshot, but eventually give up and
1900 		 * go with what we've got.  We only perform this check
1901 		 * for a poll - a further #MC during a #MC will reset, and
1902 		 * polled errors should not overwrite higher-priority
1903 		 * trapping errors (but could set the overflow bit).
1904 		 */
1905 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1906 		    &status2)) == CMI_SUCCESS) {
1907 			if (!STATUS_EQV(status, status2)) {
1908 				if (retries-- > 0) {
1909 					status = status2;
1910 					goto retry;
1911 				} else {
1912 					gbl->gbl_disp |=
1913 					    CMI_ERRDISP_INCONSISTENT;
1914 				}
1915 			}
1916 		} else if (ispoll && err != CMI_SUCCESS) {
1917 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1918 		}
1919 
1920 		nerr++;
1921 		gbl->gbl_status = status;
1922 		gbl->gbl_addr = addr;
1923 		gbl->gbl_misc = misc;
1924 
1925 		/*
1926 		 * For polled observation, if the count of deferred status
1927 		 * clears updated in the clear_mc() is nonzero and the
1928 		 * MCi_STATUS has not changed, the last wakeup has produced
1929 		 * the ereport of the error. Therefore, clear the status in
1930 		 * this wakeup to avoid duplicate ereport.
1931 		 */
1932 		pgbl = &pgcl->gcl_data[i];
1933 		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1934 		    pgbl->gbl_clrdefcnt != 0) {
1935 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1936 				gbl->gbl_status = 0;
1937 				(void) cmi_hdl_wrmsr(hdl,
1938 				    IA32_MSR_MC(i, STATUS), 0ULL);
1939 			}
1940 		}
1941 	}
1942 
1943 	if (gcpu_mca_stack_flag)
1944 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1945 	else
1946 		gcl->gcl_stackdepth = 0;
1947 
1948 	/*
1949 	 * Decide our disposition for this error or errors, and submit for
1950 	 * logging and subsequent diagnosis.
1951 	 */
1952 	if (nerr != 0) {
1953 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1954 
1955 		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1956 
1957 		if (!willpanic)
1958 			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1959 	} else {
1960 		disp = 0;
1961 		if (mcesp) {
1962 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1963 		}
1964 	}
1965 
1966 	/*
1967 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1968 	 * If a second #MC had occured before now the system would have
1969 	 * reset.  We can only do thise once gcpu_mca_process has copied
1970 	 * the logout structure.
1971 	 */
1972 	if (ismc && mcg_status & MCG_STATUS_MCIP)
1973 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1974 
1975 	/*
1976 	 * At this point we have read and logged all telemetry that is visible
1977 	 * under the MCA.  On architectures for which the NorthBridge is
1978 	 * on-chip this may include NB-observed errors, but where the NB
1979 	 * is off chip it may have been the source of the #MC request and
1980 	 * so we must call into the memory-controller driver to give it
1981 	 * a chance to log errors.
1982 	 */
1983 	if (ismc) {
1984 		cmi_mc_logout(hdl, 1, willpanic);
1985 	}
1986 }
1987 
1988 #ifndef __xpv
1989 int gcpu_mca_trap_vomit_summary = 0;
1990 
1991 /*
1992  * On a native machine check exception we come here from mcetrap via
1993  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1994  * cpus of the chip, so it is possible that another cpu on this chip could
1995  * initiate a poll while we're in the #mc handler;  it is also possible that
1996  * this trap has occured during a poll on this cpu.  So we must acquire
1997  * the chip-wide poll lock, but be careful to avoid deadlock.
1998  *
1999  * The 'data' pointer cannot be NULL due to init order.
2000  */
2001 uint64_t
2002 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
2003 {
2004 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2005 	kmutex_t *poll_lock = NULL;
2006 	gcpu_mce_status_t mce;
2007 	uint64_t mcg_status;
2008 	int tooklock = 0;
2009 
2010 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
2011 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
2012 		return (0);
2013 
2014 	/*
2015 	 * Synchronize with any poller from another core that may happen
2016 	 * to share access to one or more of the MCA banks.
2017 	 */
2018 	if (gcpu->gcpu_shared != NULL)
2019 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
2020 
2021 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
2022 		/*
2023 		 * The lock is not owned by the thread we have
2024 		 * interrupted.  Spin for this adaptive lock.
2025 		 */
2026 		while (!mutex_tryenter(poll_lock)) {
2027 			while (mutex_owner(poll_lock) != NULL)
2028 				;
2029 		}
2030 		tooklock = 1;
2031 	}
2032 
2033 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
2034 
2035 	if (tooklock)
2036 		mutex_exit(poll_lock);
2037 
2038 	/*
2039 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
2040 	 */
2041 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
2042 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
2043 		    "%u PCC (%u ok), "
2044 		    "%u UC (%d ok, %u poisoned), "
2045 		    "%u forcefatal, %u ignored",
2046 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
2047 		    mce.mce_npcc, mce.mce_npcc_ok,
2048 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
2049 		    mce.mce_forcefatal, mce.mce_ignored);
2050 	}
2051 
2052 	return (mce.mce_disp);
2053 }
2054 #endif
2055 
2056 /*ARGSUSED*/
2057 void
2058 gcpu_faulted_enter(cmi_hdl_t hdl)
2059 {
2060 	/* Nothing to do here */
2061 }
2062 
2063 /*ARGSUSED*/
2064 void
2065 gcpu_faulted_exit(cmi_hdl_t hdl)
2066 {
2067 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2068 
2069 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
2070 }
2071 
2072 /*
2073  * Write the requested values to the indicated MSRs.  Having no knowledge
2074  * of the model-specific requirements for writing to these model-specific
2075  * registers, we will only blindly write to those MSRs if the 'force'
2076  * argument is nonzero.  That option should only be used in prototyping
2077  * and debugging.
2078  */
2079 /*ARGSUSED*/
2080 cmi_errno_t
2081 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
2082     int force)
2083 {
2084 	int i, errs = 0;
2085 
2086 	for (i = 0; i < nregs; i++) {
2087 		uint_t msr = regs[i].cmr_msrnum;
2088 		uint64_t val = regs[i].cmr_msrval;
2089 
2090 		if (cms_present(hdl)) {
2091 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
2092 				errs++;
2093 		} else if (force) {
2094 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
2095 		} else {
2096 			errs++;
2097 		}
2098 	}
2099 
2100 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
2101 }
2102 
2103 /* deconfigure gcpu_mca_init() */
2104 void
2105 gcpu_mca_fini(cmi_hdl_t hdl)
2106 {
2107 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2108 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
2109 	int i;
2110 
2111 	/*
2112 	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
2113 	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
2114 	 * processors, which have their own more primitive way of doing
2115 	 * machine checks, will not have cmi_mca_init called since their
2116 	 * CPUID information will not indicate both MCA and MCE features.
2117 	 */
2118 	if (!is_x86_feature(x86_featureset, X86FSET_MCA))
2119 		return;
2120 #ifndef __xpv
2121 	/*
2122 	 * disable machine check in CR4
2123 	 */
2124 	cmi_ntv_hwdisable_mce(hdl);
2125 #endif
2126 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
2127 	gcpu_mca_poll_fini(hdl);
2128 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
2129 
2130 	/*
2131 	 * free resources allocated during init
2132 	 */
2133 	if (mca->gcpu_bank_cmci != NULL) {
2134 		kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2135 		    mca->gcpu_mca_nbanks);
2136 	}
2137 
2138 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2139 		if (mca->gcpu_mca_logout[i] != NULL) {
2140 			kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2141 		}
2142 	}
2143 
2144 	if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2145 		kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2146 		    sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
2147 	}
2148 }
2149