/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

#include <sys/mca_x86.h>
#include <sys/cpu_module_impl.h>
#include <sys/cpu_module_ms.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/pghw.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/log.h>
#include <sys/psw.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/errorq.h>
#include <sys/fm/cpu/GMCA.h>
#include <sys/fm/smb/fmsmb.h>
#include <sys/sysevent.h>
#include <sys/ontrap.h>

#include "gcpu.h"

extern int x86gentopo_legacy;	/* x86 generic topology support */

static uint_t gcpu_force_addr_in_payload = 0;

/*
 * Set to zero to log telemetry found at initialization.  While the
 * processor docs say you should process this telemetry on all but Intel
 * family 0x6, there are too many exceptions and we want to avoid bogus
 * diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
#endif

static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal parity error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
	},

	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},

	/*
	 * Compound error codes - memory controller errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
	},
};

static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif

static const gcpu_error_disp_t *
gcpu_disp_match(uint16_t code)
{
	const gcpu_error_disp_t *ged = gcpu_errtypes;
	int i;

	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
	    i++, ged++) {
		uint16_t on = ged->ged_errcode_mask_on;
		uint16_t off = ged->ged_errcode_mask_off;

		if ((code & on) == on && (code & off) == 0)
			return (ged);
	}

	return (NULL);
}

static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((code & mask) >> shift);
}

#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)
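
/*
 * Worked example (illustrative, assuming the usual architectural bit
 * positions: RRRR in bits 7:4, TT in bits 3:2, LL in bits 1:0): a compound
 * memory hierarchy error code of 0x0134 (binary 0000 0001 0011 0100) has
 * RRRR = 0011 (data read), TT = 01 (data) and LL = 00 (level 0), so
 * BIT_STRIP(0x0134, RRRR) yields 3, which indexes "DRD" in the mnemonic
 * tables below.
 */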

#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"

/*
 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
 * mnemonics and to ereport class name components.
 */

struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};

static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },	/* INSTR */
	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },	/* DATA */
	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },		/* GEN */
	{ GCPU_MNEMONIC_UNDEF, "" }
};

static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
	{ "L0", FM_EREPORT_CPU_GENERIC_LL_L0 },		/* L0 */
	{ "L1", FM_EREPORT_CPU_GENERIC_LL_L1 },		/* L1 */
	{ "L2", FM_EREPORT_CPU_GENERIC_LL_L2 },		/* L2 */
	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }		/* LG */
};

static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },	/* ERR */
	{ "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD },	/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },	/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },	/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },	/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },	/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },	/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },	/* SNOOP */
};

static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },	/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },	/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },	/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }		/* GEN */
};

static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },		/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },		/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }		/* GEN */
};

static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }	/* TIMEOUT */
};

static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
};

static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ */
	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE */
	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */
	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
	{ GCPU_MNEMONIC_RESVD, "" },			/* RESERVED */
	{ GCPU_MNEMONIC_RESVD, "" },			/* RESERVED */
	{ GCPU_MNEMONIC_RESVD, "" }			/* RESERVED */
};

enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,
	GCPU_MN_NAMESPACE_EREPORT
};

static const char *
gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
    enum gcpu_mn_namespace nspace)
{
	if (val >= tbl_sz || val > 0xff)
		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */

	switch (nspace) {
	case GCPU_MN_NAMESPACE_COMPOUND:
		return (tbl[val].mne_compound);
		/*NOTREACHED*/

	case GCPU_MN_NAMESPACE_EREPORT:
		return (tbl[val].mne_ereport);
		/*NOTREACHED*/

	default:
		return (GCPU_MNEMONIC_UNDEF);
		/*NOTREACHED*/
	}
}

/*
 * The ereport class leaf component is either a simple string with no
 * format specifiers, or a string with one or more embedded %n$s specifiers -
 * positional selection for string arguments.  The kernel snprintf does
 * not support %n$ (and teaching it to do so is too big a headache) so
 * we will expand this restricted format string ourselves.
 */
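
/*
 * For example (an illustrative sketch, not an exhaustive decode): given a
 * memory hierarchy error with TT = data, LL = L0 and RRRR = DRD, the format
 * "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR" from gcpu_errtypes expands to the
 * compound error name "DCACHEL0_DRD_ERR", with positions 1, 2 and 3 bound to
 * the TT, LL and RRRR mnemonics by gcpu_mn_fmt() below.
 */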

#define	GCPU_CLASS_VARCOMPS	9

#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
	BIT_STRIP(code, name), nspace)

static void
gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
    enum gcpu_mn_namespace nspace)
{
	uint16_t code = MCAX86_ERRCODE(status);
	const char *mn[GCPU_CLASS_VARCOMPS];
	char *p = buf;			/* current position in buf */
	char *q = buf + buflen;		/* pointer past last char in buf */
	int which, expfmtchar, error;
	char c;

	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
	mn[4] = GCPU_MNEMONIC(code, II, nspace);
	mn[5] = GCPU_MNEMONIC(code, T, nspace);
	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);

	while (p < q - 1 && (c = *fmt++) != '\0') {
		if (c != '%') {
			/* not the beginning of a format specifier - copy */
			*p++ = c;
			continue;
		}

		error = 0;
		which = -1;
		expfmtchar = -1;

nextfmt:
		if ((c = *fmt++) == '\0')
			break;	/* early termination of fmt specifier */

		switch (c) {
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
			if (which != -1) { /* allow only one positional digit */
				error++;
				break;
			}
			which = c - '1';
			goto nextfmt;
			/*NOTREACHED*/

		case '$':
			if (which == -1) { /* no position specified */
				error++;
				break;
			}
			expfmtchar = 's';
			goto nextfmt;
			/*NOTREACHED*/

		case 's':
			if (expfmtchar != 's') {
				error++;
				break;
			}
			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
			    mn[which]);
			p += strlen(p);
			break;

		default:
			error++;
			break;
		}

		if (error)
			break;
	}

	*p = '\0';	/* NUL termination */
}

static void
gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
    const char *cpuclass, const char *leafclass)
{
	char *p = buf;			/* current position in buf */
	char *q = buf + buflen;		/* pointer past last char in buf */

	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);

	p += strlen(p);
	if (p >= q)
		return;

	if (leafclass == NULL) {
		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
		    GCPU_MN_NAMESPACE_EREPORT);
	} else {
		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
		    leafclass);
	}
}

/*
 * Create an "hc" scheme FMRI identifying the given cpu with
 * motherboard/chip/core/strand instance numbers.
 */
static nvlist_t *
gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
{
	nvlist_t *nvl, *fmri;

	if ((nvl = fm_nvlist_create(nva)) == NULL)
		return (NULL);

	if (!x86gentopo_legacy) {
		fmri = cmi_hdl_smb_bboard(hdl);
		if (fmri == NULL)
			return (NULL);

		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
		    NULL, NULL, fmri, 3,
		    "chip", cmi_hdl_smb_chipid(hdl),
		    "core", cmi_hdl_coreid(hdl),
		    "strand", cmi_hdl_strandid(hdl));
	} else {
		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
		    "motherboard", 0,
		    "chip", cmi_hdl_chipid(hdl),
		    "core", cmi_hdl_coreid(hdl),
		    "strand", cmi_hdl_strandid(hdl));
	}

	return (nvl);
}
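
/*
 * A minimal sketch of the detector FMRI produced by the legacy branch above
 * (instance numbers illustrative):
 *
 *	hc:///motherboard=0/chip=1/core=0/strand=1
 *
 * In the non-legacy case the motherboard component is instead derived from
 * the SMBIOS baseboard nvlist returned by cmi_hdl_smb_bboard().
 */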

int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
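
/*
 * With the defaults above, the first 5 complaints are printed as fast as
 * they arrive; after that at most one complaint is printed every 10 seconds
 * (the interval is in nanoseconds, for comparison against
 * gethrtime_waitfree()).
 */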

/*
 * Called when we are unable to propagate a logout structure onto an
 * errorq for subsequent ereport preparation and logging etc.  The caller
 * should usually only decide to call this for severe errors - those we
 * suspect we may need to panic for.
 */
static void
gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
{
	hrtime_t now = gethrtime_waitfree();
	static hrtime_t gcpu_last_bleat;
	gcpu_bank_logout_t *gbl;
	static int bleatcount;
	int i;

	/*
	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
	 * can come as fast as we like, but once we've spammed that many
	 * to the console we require a minimum interval to pass before
	 * any more complaints.
	 */
	if (++bleatcount > gcpu_bleat_count_thresh) {
		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
			return;
		else
			bleatcount = 0;
	}
	gcpu_last_bleat = now;

	cmn_err(CE_WARN,
	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
	    cmi_hdl_strandid(hdl));
	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
	    (u_longlong_t)gcl->gcl_mcg_status);
	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
		uint64_t status = gbl->gbl_status;

		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		/* Force ADDRV for AMD Family 0xf and above */
		if (gcpu_force_addr_in_payload)
			status = status | MSR_MC_STATUS_ADDRV;

		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)gbl->gbl_status,
			    (u_longlong_t)gbl->gbl_addr,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		case MSR_MC_STATUS_ADDRV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)gbl->gbl_status,
			    (u_longlong_t)gbl->gbl_addr);
			break;

		case MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)gbl->gbl_status,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		default:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)gbl->gbl_status);
			break;
		}
	}
}

#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
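
/*
 * For example, _GCPU_BSTATUS(bstat, UC) expands to the argument triple
 *
 *	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_UC, DATA_TYPE_BOOLEAN_VALUE,
 *	(bstat) & MSR_MC_STATUS_UC ? B_TRUE : B_FALSE
 *
 * ready for splicing into a fm_payload_set() argument list, as done in
 * gcpu_ereport_add_logout() below.
 */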

static void
gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
    uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
{
	uint64_t members = ged ? ged->ged_ereport_members :
	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
	uint64_t mcg = gcl->gcl_mcg_status;
	int mcip = mcg & MCG_STATUS_MCIP;
	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
	uint64_t bstat = gbl->gbl_status;

	/*
	 * Include the compound error name if requested and if this
	 * is a compound error type.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
	    ged->ged_compound_fmt != NULL) {
		char buf[FM_MAX_CLASS];

		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
		    GCPU_MN_NAMESPACE_COMPOUND);
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
		    DATA_TYPE_STRING, buf, NULL);
	}

	/*
	 * Include disposition information for this error
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
	    gbl->gbl_disp != 0) {
		int i, empty = 1;
		char buf[128];
		char *p = buf, *q = buf + 128;
		static struct _gcpu_disp_name {
			uint64_t dv;
			const char *dn;
		} disp_names[] = {
			{ CMI_ERRDISP_CURCTXBAD,
			    "processor_context_corrupt" },
			{ CMI_ERRDISP_RIPV_INVALID,
			    "return_ip_invalid" },
			{ CMI_ERRDISP_UC_UNCONSTRAINED,
			    "unconstrained" },
			{ CMI_ERRDISP_FORCEFATAL,
			    "forcefatal" },
			{ CMI_ERRDISP_IGNORED,
			    "ignored" },
			{ CMI_ERRDISP_PCC_CLEARED,
			    "corrupt_context_cleared" },
			{ CMI_ERRDISP_UC_CLEARED,
			    "uncorrected_data_cleared" },
			{ CMI_ERRDISP_POISONED,
			    "poisoned" },
			{ CMI_ERRDISP_INCONSISTENT,
			    "telemetry_unstable" },
		};

		for (i = 0; i < sizeof (disp_names) /
		    sizeof (struct _gcpu_disp_name); i++) {
			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
				continue;

			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
			    "%s%s", empty ? "" : ",", disp_names[i].dn);
			p += strlen(p);
			empty = 0;
		}

		if (p != buf)
			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
			    DATA_TYPE_STRING, buf, NULL);
	}

	/*
	 * If MCG_STATUS is included add that and an indication of whether
	 * this ereport was the result of a machine check or poll.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
		    DATA_TYPE_UINT64, mcg, NULL);

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
	}

	/*
	 * If an instruction pointer is to be included add one provided
	 * MCG_STATUS indicated it is valid; meaningless for polled events.
	 */
	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
	    mcg & MCG_STATUS_EIPV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
	}

	/*
	 * Add an indication of whether the trap occurred during privileged
	 * code.
	 */
	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
		    DATA_TYPE_BOOLEAN_VALUE,
		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
	}

	/*
	 * If requested, add the index of the MCA bank.  This indicates the
	 * n'th bank of 4 MCA registers, and does not necessarily correspond
	 * to MCi_* - use the bank MSR offset to correlate.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
		fm_payload_set(ereport,
		    /* Bank number */
		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
		    /* Offset of MCi_CTL */
		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
		    IA32_MSR_MC(bankno, CTL),
		    NULL);
	}
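
	/*
	 * The architectural bank MSRs are allocated four per bank starting
	 * at 0x400, so IA32_MSR_MC(n, CTL) is 0x400 + 4 * n and
	 * IA32_MSR_MC(n, STATUS) is 0x401 + 4 * n; bank 3, for example,
	 * has its CTL at 0x40c and its STATUS at 0x40d.
	 */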

	/*
	 * Add MCi_STATUS if requested, and decode it.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
		const char *tbes[] = {
			"No tracking",			/* 00 */
			"Green - below threshold",	/* 01 */
			"Yellow - above threshold",	/* 10 */
			"Reserved"			/* 11 */
		};

		fm_payload_set(ereport,
		    /* Bank MCi_STATUS */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
		    /* Overflow? */
		    _GCPU_BSTATUS(bstat, OVER),
		    /* Uncorrected? */
		    _GCPU_BSTATUS(bstat, UC),
		    /* Enabled? */
		    _GCPU_BSTATUS(bstat, EN),
		    /* Processor context corrupt? */
		    _GCPU_BSTATUS(bstat, PCC),
		    /* Error code */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
		    /* Model-specific error code */
		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
		    NULL);

		/*
		 * If MCG_CAP.TES_P indicates that thresholding info
		 * is present in the architectural component of the bank
		 * status then include threshold information for this bank.
		 */
		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
			    NULL);
		}
	}
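
	/*
	 * MCAX86_TBES_VALUE (used above) extracts the two-bit threshold-based
	 * error status field - bits 54:53 of MCi_STATUS on Intel processors,
	 * assuming the usual architectural layout - so a value of 2 selects
	 * "Yellow - above threshold" from tbes[].
	 */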

	/*
	 * Add MCi_ADDR info if requested and valid.  We force the addition
	 * of MCi_ADDR, even if it is not valid, on AMD family 0xf and above
	 * to aid in analysis of ereports for WatchDog errors.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
	    ((bstat & MSR_MC_STATUS_ADDRV) ||
	    gcpu_force_addr_in_payload)) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
	}

	/*
	 * Add MCi_MISC if requested and MCi_STATUS.MISCV is set.
	 */
	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
	    bstat & MSR_MC_STATUS_MISCV) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
	}
}

/*
 * Construct and post an ereport based on the logout information from a
 * single MCA bank.  We are not necessarily running on the cpu that
 * detected the error.
 */
static void
gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
    const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
{
	gcpu_data_t *gcpu = gcl->gcl_gcpu;
	cmi_hdl_t hdl = gcpu->gcpu_hdl;
	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
	const char *cpuclass = NULL, *leafclass = NULL;
	uint16_t code = MCAX86_ERRCODE(status);
	errorq_elem_t *eqep, *scr_eqep;
	nvlist_t *ereport, *detector;
	char buf[FM_MAX_CLASS];
	const char *classfmt;
	nv_alloc_t *nva;

	if (panicstr) {
		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
			return;
		ereport = errorq_elem_nvl(ereport_errorq, eqep);

		/*
		 * Allocate another element for scratch space, but fall back
		 * to the one we have if that fails.  We'd like to use the
		 * additional scratch space for nvlist construction.
		 */
		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
		else
			nva = errorq_elem_nva(ereport_errorq, eqep);
	} else {
		ereport = fm_nvlist_create(NULL);
		nva = NULL;
	}

	if (ereport == NULL)
		return;

	/*
	 * Common payload data required by the protocol:
	 *	- ereport class
	 *	- detector
	 *	- ENA
	 */

	/*
	 * Ereport class - call into model-specific support to allow it to
	 * provide a cpu class or leaf class, otherwise calculate our own.
	 */
	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
	classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
	    leafclass);

	/*
	 * The detector FMRI.
	 */
	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
	    nva)) == NULL)
		detector = gcpu_fmri_create(hdl, nva);

	/*
	 * Should we define a new ENA format 3 for chip/core/strand?
	 * It would be better when virtualized.
	 */
	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
	    fm_ena_generate_cpu(gcl->gcl_timestamp,
	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
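
	/*
	 * Worked example of the synthetic cpu id packed above (values
	 * illustrative): chip 1, core 2, strand 1 gives
	 * (1 << 6) | (2 << 3) | 1 = 0x51 - 3 bits for the strand, 3 for
	 * the core and the remaining bits for the chip.
	 */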

	if (panicstr) {
		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
		nv_alloc_reset(nva);
	} else {
		fm_nvlist_destroy(detector, FM_NVA_FREE);
	}

	/*
	 * Add the architectural ereport class-specific payload data.
	 */
	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);

	/*
	 * Allow model-specific code to add ereport members.
	 */
	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);

	/*
	 * Include the stack if the option is turned on and it is either
	 * selected in the payload member bitmask or inclusion is forced.
	 */
	if (gcpu_mca_stack_flag &&
	    (cms_ereport_includestack(hdl, mscookie) ==
	    B_TRUE || gcpu_mca_stack_ereport_include)) {
		fm_payload_stack_add(ereport, gcl->gcl_stack,
		    gcl->gcl_stackdepth);
	}

	/*
	 * If injection has taken place at any time in the past then note
	 * this on the ereport.
	 */
	if (cmi_inj_tainted() == B_TRUE) {
		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
		    B_TRUE, NULL);
	}

	/*
	 * Post ereport.
	 */
	if (panicstr) {
		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
		if (scr_eqep)
			errorq_cancel(ereport_errorq, scr_eqep);
	} else {
		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
	}
}

/*ARGSUSED*/
void
gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
{
	const gcpu_logout_t *gcl = data;
	const gcpu_bank_logout_t *gbl;
	int ismc;
	int i;

	ismc = gcl->ismc;
	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
		const gcpu_error_disp_t *gened;
		cms_cookie_t mscookie;

		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);

			/*
			 * Perform a match based on IA32 MCA architectural
			 * components alone.
			 */
			gened = gcpu_disp_match(code);	/* may be NULL */

			/*
			 * Now see if a model-specific match can be made.
			 */
			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
			    i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
			    gcl->gcl_ms_logout);

			/*
			 * Prepare and dispatch an ereport for logging and
			 * diagnosis.
			 */
			gcpu_ereport_post(gcl, i, gened, mscookie,
			    gbl->gbl_status);
		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
			/*
			 * Telemetry kept changing as we tried to read
			 * it.  Force an unknown ereport leafclass but
			 * keep the telemetry unchanged for logging.
			 */
			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
			    gbl->gbl_status);
		}
	}
}

static size_t gcpu_mca_queue_datasz = 0;

/*
 * The following code is ready to make a weak attempt at growing the
 * errorq structure size.  Since it is not foolproof (we don't know
 * who may already be producing to the outgoing errorq) our caller
 * instead assures that we'll always be called with no greater data
 * size than on our first call.
 */
static void
gcpu_errorq_init(size_t datasz)
{
	int slots;

	mutex_enter(&gcpu_mca_queue_lock);

	if (gcpu_mca_queue_datasz >= datasz) {
		mutex_exit(&gcpu_mca_queue_lock);
		return;
	}

	membar_producer();
	if (gcpu_mca_queue) {
		gcpu_mca_queue_datasz = 0;
		errorq_destroy(gcpu_mca_queue);
	}

	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);

	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
	    NULL, slots, datasz, 1, ERRORQ_VITAL);

	if (gcpu_mca_queue != NULL)
		gcpu_mca_queue_datasz = datasz;

	mutex_exit(&gcpu_mca_queue_lock);
}

/*
 * Perform MCA initialization as described in section 14.6 of Intel 64
 * and IA-32 Architectures Software Developer's Manual Volume 3A.
 */

static uint_t global_nbanks;

void
gcpu_mca_init(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	uint64_t cap;
	uint_t vendor = cmi_hdl_vendor(hdl);
	uint_t family = cmi_hdl_family(hdl);
	uint_t rev = cmi_hdl_chiprev(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int mcg_ctl_present;
	uint_t nbanks;
	uint32_t ctl_skip_mask = 0;
	uint32_t status_skip_mask = 0;
	size_t mslsz;
	int i;
#ifndef __xpv
	int mcg_ctl2_present;
	uint32_t cmci_capable = 0;
#endif

	if (gcpu == NULL)
		return;

	/* We add MCi_ADDR always for AMD Family 0xf and above */
	if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
		gcpu_force_addr_in_payload = 1;

	/*
	 * Protect from some silly /etc/system settings.
	 */
	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
		gcpu_mca_telemetry_retries = 5;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
		return;

	/*
	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
	 * processors, which have their own more primitive way of doing
	 * machine checks, will not have cmi_mca_init called since their
	 * CPUID information will not indicate both MCA and MCE features.
	 */
	ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));

	/*
	 * Determine whether the IA32_MCG_CTL register is present.  If it
	 * is we will enable all features by writing -1 to it towards
	 * the end of this initialization;  if it is absent then volume 3A
	 * says we must nonetheless continue to initialize the individual
	 * banks.
	 */
	mcg_ctl_present = cap & MCG_CAP_CTL_P;
#ifndef __xpv
	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
#endif

	/*
	 * We squirrel values away for inspection/debugging.
	 */
	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
	if (mcg_ctl_present)
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);

	/*
	 * Determine the number of error-reporting banks implemented.
	 */
	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;

	if (nbanks != 0 && global_nbanks == 0)
		global_nbanks = nbanks;	/* no race - BSP will get here first */

	/*
	 * If someone is hiding the number of banks (perhaps we are fully
	 * virtualized?) or if this processor has more banks than the
	 * first to set global_nbanks then bail.  The latter requirement
	 * is because we need to size our errorq data structure and we
	 * don't want to have to grow the errorq (destroy and recreate)
	 * which may just lose some telemetry.
	 */
	if (nbanks == 0 || nbanks > global_nbanks)
		return;

	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);

	/*
	 * Calculate the size we need to allocate for a gcpu_logout_t
	 * with a gcl_data array big enough for all banks of this cpu.
	 * Add any space requested by the model-specific logout support.
	 */
	mslsz = cms_logout_size(hdl);
	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		gcpu_logout_t *gcl;

		mca->gcpu_mca_logout[i] = gcl =
		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
		gcl->gcl_gcpu = gcpu;
		gcl->gcl_nbanks = nbanks;
		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
		    (char *)(&gcl->gcl_data[0]) + nbanks *
		    sizeof (gcpu_bank_logout_t);
	}

#ifdef __xpv
	gcpu_xpv_mca_init(nbanks);
#endif

	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;

#ifndef __xpv
	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
	    KM_SLEEP);
#endif

	/*
	 * Create our errorq to transport the logout structures.  This
	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
	 */
	gcpu_errorq_init(mca->gcpu_mca_lgsz);

	/*
	 * Not knowing which, if any, banks are shared between cores we
	 * assure serialization of MCA bank initialization by each cpu
	 * on the chip.  On chip architectures in which some banks are
	 * shared this will mean the shared resource is initialized more
	 * than once - we're simply aiming to avoid simultaneous MSR writes
	 * to the shared resource.
	 *
	 * Even with these precautions, some platforms may yield a GP fault
	 * if a core other than a designated master tries to write anything
	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
	 * those writes under on_trap protection.
	 */
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Initialize poller data, but don't start polling yet.
	 */
	gcpu_mca_poll_init(hdl);

	/*
	 * Work out which MCA banks we will initialize.  In MCA logout
	 * code we will only read those banks which we initialize here.
	 */
	for (i = 0; i < nbanks; i++) {
		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);

		if (!cms_present(hdl)) {
			/*
			 * Model-specific support is not present, try to use
			 * sane defaults.
			 *
			 * On AMD family 6 processors, reports about spurious
			 * machine checks indicate that bank 0 should be
			 * skipped.
			 *
			 * On Intel family 6 processors, the documentation
			 * tells us not to write to MC0_CTL.
			 */
			if (i == 0 && family == 6) {
				switch (vendor) {
				case X86_VENDOR_AMD:
					skipstatus = B_TRUE;
					/*FALLTHRU*/
				case X86_VENDOR_Intel:
					skipctl = B_TRUE;
					break;
				}
			}
		}

		ctl_skip_mask |= skipctl << i;
		status_skip_mask |= skipstatus << i;

		if (skipctl && skipstatus)
			continue;

		/*
		 * Record which MCA banks were enabled, from the point of view
		 * of the whole chip (if some cores share a bank we must be
		 * sure either can logout from it).
		 */
		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);

#ifndef __xpv
		/*
		 * Check CMCI capability.
		 */
		if (mcg_ctl2_present) {
			uint64_t ctl2;
			uint32_t cap = 0;
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			if (ctl2 & MSR_MC_CTL2_EN)
				continue;
			ctl2 |= MSR_MC_CTL2_EN;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
			mca->gcpu_bank_cmci[i].cmci_cap = cap =
			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
			if (cap)
				cmci_capable++;
			/*
			 * Set the threshold to 1 while leaving the enable
			 * bit clear, to avoid a CMCI being triggered before
			 * the APIC LVT entry is initialized.
			 */
			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
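
			/*
			 * Layout note (assuming the architectural
			 * IA32_MCi_CTL2 format): the corrected error count
			 * threshold lives in the low bits and the CMCI
			 * enable in MSR_MC_CTL2_EN; since & binds tighter
			 * than |, the expression above parses as
			 * (ctl2 & ~MSR_MC_CTL2_EN) | 1 - enable cleared,
			 * threshold set to 1.
			 */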

			/*
			 * Initialize CMCI-related counters.
			 */
			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
			mca->gcpu_bank_cmci[i].drtcmci = 0;
			mca->gcpu_bank_cmci[i].ncmci = 0;
		}
#endif
	}

#ifndef __xpv
	if (cmci_capable)
		cmi_enable_cmci = 1;
#endif

#ifndef __xpv
	/*
	 * Log any valid telemetry lurking in the MCA banks, but do not
	 * clear the status registers.  Ignore the disposition returned -
	 * we have already paniced or reset for any nasty errors found here.
	 *
	 * Intel vol 3A says that we should not do this on family 0x6,
	 * and that for any extended family the BIOS clears things
	 * on power-on reset so you'll only potentially find valid telemetry
	 * on warm reset (we do it for both - on power-on reset we should
	 * just see zeroes).
	 *
	 * AMD docs since K7 say we should process anything we find here.
	 */
	if (!gcpu_suppress_log_on_init &&
	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
	    vendor == X86_VENDOR_AMD))
		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
		    GCPU_MPT_WHAT_POKE_ERR);

	/*
	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
	 * model-specific module the power of veto.
	 */
	for (i = 0; i < nbanks; i++) {
		struct gcpu_bios_bankcfg *bcfgp =
		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;

		/*
		 * Stash inherited bank MCA state, even for banks we will
		 * not initialize ourselves.  Do not read the MISC register
		 * unconditionally - on some processors that will #GP on
		 * banks that do not implement the MISC register (would be
		 * caught by on_trap, anyway).
		 */
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
		    &bcfgp->bios_bank_ctl);

		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &bcfgp->bios_bank_status);

		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
		    gcpu_force_addr_in_payload) {
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
			    &bcfgp->bios_bank_addr);
		}

		/*
		 * In some old BIOS the status value after boot can indicate
		 * MISCV when there is actually no MISC register for
		 * that bank.  The following read could therefore
		 * provoke a general protection fault.  This should be
		 * caught by on_trap, but the #GP fault handler is busted
		 * and can suffer a double fault even before we get to
		 * trap() to check for on_trap protection.  Until that
		 * issue is fixed we remove the one access that we know
		 * can cause a #GP.
		 *
		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
		 *	    &bcfgp->bios_bank_misc);
		 */
		bcfgp->bios_bank_misc = 0;

		if (!(ctl_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
			    cms_bankctl_val(hdl, i, -1ULL));
		}

		if (!(status_skip_mask & (1 << i))) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
			    cms_bankstatus_val(hdl, i, 0ULL));
		}
	}
#endif
	/*
	 * Now let the model-specific support perform further initialization
	 * of non-architectural features.
	 */
	cms_mca_init(hdl, nbanks);

#ifndef __xpv
	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
	membar_producer();

	/* enable all machine-check features */
	if (mcg_ctl_present)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
		    cms_mcgctl_val(hdl, nbanks, -1ULL));
#endif

	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);

#ifndef __xpv
	/* enable machine-check exception in CR4 */
	cmi_hdl_enable_mce(hdl);
#endif
}

static uint64_t
gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
    gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
{
	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_mce_status_t mce;
	gcpu_bank_logout_t *gbl;
	uint64_t disp = 0;
	int i;

	if (mcesp == NULL)
		mcesp = &mce;

	mcesp->mce_nerr = nerr;

	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;

	/*
	 * If this is a machine check and the return instruction pointer
	 * is not valid, then the current context is lost.
	 */
	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
		disp |= CMI_ERRDISP_RIPV_INVALID;
	gcl->ismc = ismc;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t mcistatus = gbl->gbl_status;
		uint32_t ms_scope;
		int pcc, uc;
		int poisoned;

		if (!(mcistatus & MSR_MC_STATUS_VAL))
			continue;

		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
			continue;

		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
		mcesp->mce_npcc += pcc;
		mcesp->mce_nuc += uc;

		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);

		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
			pcc = 0;
			mcesp->mce_npcc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
		}

		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
			uc = 0;
			mcesp->mce_nuc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
		}

		if (uc) {
			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
			if (poisoned) {
				mcesp->mce_nuc_poisoned++;
				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
			}
		}

		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
			/*
			 * We're not being instructed to ignore the error,
			 * so apply our standard disposition logic to it.
			 */
			if (uc && !poisoned) {
				unconstrained++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_UC_UNCONSTRAINED;
			}

			if (pcc && ismc) {
				curctxbad++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_CURCTXBAD;
			}

			/*
			 * Even if the above may not indicate that the error
			 * is terminal, model-specific support may insist
			 * that we treat it as such.  Such errors will be
			 * fatal even if discovered via poll.
			 */
			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
				forcefatal++;
				mcesp->mce_forcefatal++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_FORCEFATAL;
			}
		} else {
			mcesp->mce_ignored++;
			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
		}
	}

	if (unconstrained > 0)
		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;

	if (curctxbad > 0)
		disp |= CMI_ERRDISP_CURCTXBAD;

	if (forcefatal > 0)
		disp |= CMI_ERRDISP_FORCEFATAL;

	if (gcpu_mca_queue != NULL) {
		int how;

		if (ismc) {
			how = cmi_mce_response(rp, disp) ?
			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
			    ERRORQ_SYNC;	/* panic flow will drain */
		} else {
			how = (disp & CMI_ERRDISP_FORCEFATAL &&
			    cmi_panic_on_ue()) ?
			    ERRORQ_SYNC :	/* poller will panic */
			    ERRORQ_ASYNC;	/* no panic */
		}

		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
	} else if (disp != 0) {
		gcpu_bleat(hdl, gcl);
	}

	mcesp->mce_disp = disp;

	return (disp);
}

/*
 * Gather error telemetry from our source, and then submit it for
 * processing.
 */

#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))
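
/*
 * For example, a status with MSR_MC_STATUS_EN and MSR_MC_STATUS_UC set is an
 * #MC candidate, while an enabled corrected error (EN set, UC and PCC clear)
 * is not; and STATUS_EQV treats two observations that differ only in
 * MSR_MC_STATUS_OVER as the same error.
 */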

static uint32_t gcpu_deferred_polled_clears;

#ifndef __xpv
static void
gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
    uint64_t status, int what)
{
	uint64_t ctl2;

	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
	    !(status & MSR_MC_STATUS_CEC_MASK)))) {

		if (!(bank_cmci_p->cmci_enabled)) {
			/*
			 * When CMCI is disabled and the bank has shown no
			 * error (or no corrected error) for
			 * gcpu_mca_cmci_reenable_threshold consecutive
			 * polls, turn this bank's CMCI back on.
			 */
			bank_cmci_p->drtcmci++;

			if (bank_cmci_p->drtcmci >=
			    gcpu_mca_cmci_reenable_threshold) {

				/* turn on cmci */

				(void) cmi_hdl_rdmsr(hdl,
				    IA32_MSR_MC_CTL2(bank), &ctl2);
				ctl2 |= MSR_MC_CTL2_EN;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC_CTL2(bank), ctl2);

				/* reset counter and set flag */
				bank_cmci_p->drtcmci = 0;
				bank_cmci_p->cmci_enabled = 1;
			}
		} else {
			/*
			 * When CMCI is enabled, if this is a cyclic poll and
			 * the bank has no error (or no corrected error),
			 * reset the ncmci counter.
			 */
			bank_cmci_p->ncmci = 0;
		}
	}
}

static void
gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
    int what)
{
	uint64_t ctl2 = 0;

	/*
	 * If more than gcpu_mca_cmci_throttling_threshold CMCIs from this
	 * bank occurred between two polls, turn off this bank's CMCI.
	 */
	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {

		/* if it is a cmci trap, increase the count */
		bank_cmci_p->ncmci++;

		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {

			/* turn off cmci */

			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
			    &ctl2);
			ctl2 &= ~MSR_MC_CTL2_EN;
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
			    ctl2);

			/* clear the flag and count */

			bank_cmci_p->cmci_enabled = 0;
			bank_cmci_p->ncmci = 0;
		}
	}
}
#endif

static void
clear_mc(int first, int last, int ismc, boolean_t clrstatus,
    cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
{
	int i;
	gcpu_bank_logout_t *gbl, *pgbl;
	uint64_t status;

	if (first < 0 || last < 0)
		return;

	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
		status = gbl->gbl_status;
		if (status == 0)
			continue;
		if (clrstatus == B_FALSE)
			goto serialize;

		/*
		 * For i86xpv we always clear status in order to invalidate
		 * the interposed telemetry.
		 *
		 * For native machine checks we always clear status here.  For
		 * native polls we must be a little more cautious since there
		 * is an outside chance that we may clear telemetry from a
		 * shared MCA bank on which a sibling core is machine checking.
		 *
		 * For polled observations of errors that look like they may
		 * produce a machine check (UC/PCC and ENabled, although these
		 * do not guarantee a machine check on error occurrence)
		 * we will not clear the status at this wakeup unless
		 * we saw the same status at the previous poll.  We will
		 * always process and log the current observations - it
		 * is only the clearing of MCi_STATUS which may be
		 * deferred until the next wakeup.
		 */
		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
			goto serialize;
		}

		/*
		 * We have a polled observation of a machine check
		 * candidate.  If we saw essentially the same status at the
		 * last poll then clear the status now since this appears
		 * not to be a #MC candidate after all.  If we see quite
		 * different status now then do not clear, but reconsider at
		 * the next poll.  If no actual machine check clears
		 * the status in the interim then the status should not
		 * keep changing forever (meaning we'd never clear it)
		 * since before long we'll simply have latched the highest-
		 * priority error and set the OVerflow bit.  Nonetheless
		 * we count how many times we defer clearing and after
		 * a while insist on clearing the status.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (pgbl->gbl_clrdefcnt != 0) {
			/* We deferred clear on this bank at last wakeup */
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
			    pgbl->gbl_clrdefcnt > 5) {
				/*
				 * Status is unchanged so clear it now and,
				 * since we have already logged this info,
				 * avoid logging it again.
				 */
				gbl->gbl_status = 0;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			} else {
				/* Record deferral for next wakeup */
				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
			}
		} else {
			/* Record initial deferral for next wakeup */
			gbl->gbl_clrdefcnt = 1;
			gcpu_deferred_polled_clears++;
		}

serialize:
		{
#ifdef __xpv
			;
#else
			/*
			 * Intel Vol 3A says to execute a serializing
			 * instruction here, i.e., CPUID.  WRMSR is also
			 * defined to be serializing, so the status clear above
			 * should suffice.  To be a good citizen, and since
			 * some clears are deferred, we'll execute a CPUID
			 * instruction here.
			 */
			struct cpuid_regs tmp;
			(void) __cpuid_insn(&tmp);
#endif
		}
	}
}

/*ARGSUSED5*/
void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
    gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_bank_logout_t *gbl, *pgbl;
	gcpu_logout_t *gcl, *pgcl;
	int ismc = (rp != NULL);
	int ispoll = !ismc;
	int i, nerr = 0;
	cmi_errno_t err;
	uint64_t mcg_status;
	uint64_t disp;
	uint64_t cap;
	int first = -1;
	int last = -1;
	int willpanic = 0;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
	    CMI_SUCCESS) {
		if (mcesp != NULL)
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		return;
	}

	if (ismc) {
		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
	} else {
		int pidx = mca->gcpu_mca_nextpoll_idx;
		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;

		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
	}

	gcl->gcl_timestamp = gethrtime_waitfree();
	gcl->gcl_mcg_status = mcg_status;
	gcl->gcl_ip = rp ? rp->r_pc : 0;

	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
	if (cap & MCG_CAP_TES_P)
		gcl->gcl_flags |= GCPU_GCL_F_TES_P;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t status, status2, addr, misc;
		int retries = gcpu_mca_telemetry_retries;

		gbl->gbl_status = 0;
		gbl->gbl_disp = 0;
		gbl->gbl_clrdefcnt = 0;

		/*
		 * Only logout from MCA banks we have initialized from at
		 * least one core.  If a core shares an MCA bank with another
		 * but perhaps lost the race to initialize it, then it must
		 * still be allowed to logout from the shared bank.
		 */
		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
			continue;

		/*
		 * On a poll look only at the banks we've been asked to check.
		 */
		if (rp == NULL && !(bankmask & 1 << i))
			continue;

		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
		    CMI_SUCCESS)
			continue;

#ifndef __xpv
		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
#endif

retry:
		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		/* First and last bank that have valid status */
		if (first < 0)
			first = i;
		last = i;

		addr = -1;
		misc = 0;

		if ((status & MSR_MC_STATUS_ADDRV) ||
		    gcpu_force_addr_in_payload)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);

		if (status & MSR_MC_STATUS_MISCV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);

#ifndef __xpv
		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
#endif

		/*
		 * Allow the model-specific code to extract bank telemetry.
		 */
		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);

		/*
		 * Not all cpu models assure us that the status/address/misc
		 * data will not change during the above sequence of MSR reads,
		 * or that it can only change by the addition of the OVerflow
		 * bit to the status register.  If the status has changed
		 * other than in the overflow bit then we attempt to reread
		 * for a consistent snapshot, but eventually give up and
		 * go with what we've got.  We only perform this check
		 * for a poll - a further #MC during a #MC will reset, and
		 * polled errors should not overwrite higher-priority
		 * trapping errors (but could set the overflow bit).
		 */
		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &status2)) == CMI_SUCCESS) {
			if (!STATUS_EQV(status, status2)) {
				if (retries-- > 0) {
					status = status2;
					goto retry;
				} else {
					gbl->gbl_disp |=
					    CMI_ERRDISP_INCONSISTENT;
				}
			}
		} else if (ispoll && err != CMI_SUCCESS) {
			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
		}

		nerr++;
		gbl->gbl_status = status;
		gbl->gbl_addr = addr;
		gbl->gbl_misc = misc;

		/*
		 * For a polled observation, if the count of deferred status
		 * clears updated in clear_mc() is nonzero and MCi_STATUS
		 * has not changed, then the last wakeup already produced
		 * an ereport for this error.  Clear the status in this
		 * wakeup to avoid a duplicate ereport.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
		    pgbl->gbl_clrdefcnt != 0) {
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
				gbl->gbl_status = 0;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			}
		}
	}

	if (gcpu_mca_stack_flag)
		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
	else
		gcl->gcl_stackdepth = 0;

	/*
	 * Decide our disposition for this error or errors, and submit for
	 * logging and subsequent diagnosis.
	 */
	if (nerr != 0) {
		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);

		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);

		if (!willpanic)
			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
	} else {
		disp = 0;
		if (mcesp) {
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		}
	}

	/*
	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
	 * If a second #MC had occurred before now the system would have
	 * reset.  We can only do this once gcpu_mca_process has copied
	 * the logout structure.
	 */
	if (ismc && mcg_status & MCG_STATUS_MCIP)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);

	/*
	 * At this point we have read and logged all telemetry that is visible
	 * under the MCA.  On architectures for which the NorthBridge is
	 * on-chip this may include NB-observed errors, but where the NB
	 * is off chip it may have been the source of the #MC request and
	 * so we must call into the memory-controller driver to give it
	 * a chance to log errors.
	 */
	if (ismc) {
		cmi_mc_logout(hdl, 1, willpanic);
	}
}

#ifndef __xpv
int gcpu_mca_trap_vomit_summary = 0;

/*
 * On a native machine check exception we come here from mcetrap via
 * cmi_mca_trap.  A machine check on one cpu of a chip does not trap other
 * cpus of the chip, so it is possible that another cpu on this chip could
 * initiate a poll while we're in the #mc handler;  it is also possible that
 * this trap has occurred during a poll on this cpu.  So we must acquire
 * the chip-wide poll lock, but be careful to avoid deadlock.
 *
 * The 'data' pointer cannot be NULL due to init order.
 */
uint64_t
gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	kmutex_t *poll_lock = NULL;
	gcpu_mce_status_t mce;
	uint64_t mcg_status;
	int tooklock = 0;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
		return (0);

	/*
	 * Synchronize with any poller from another core that may happen
	 * to share access to one or more of the MCA banks.
	 */
	if (gcpu->gcpu_shared != NULL)
		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;

	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
		/*
		 * The lock is not owned by the thread we have
		 * interrupted.  Spin for this adaptive lock.
		 */
		while (!mutex_tryenter(poll_lock)) {
			while (mutex_owner(poll_lock) != NULL)
				;
		}
		tooklock = 1;
	}

	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);

	if (tooklock)
		mutex_exit(poll_lock);

	/*
	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
	 */
	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
		    "%u PCC (%u ok), "
		    "%u UC (%u ok, %u poisoned), "
		    "%u forcefatal, %u ignored",
		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
		    mce.mce_npcc, mce.mce_npcc_ok,
		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
		    mce.mce_forcefatal, mce.mce_ignored);
	}

	return (mce.mce_disp);
}
#endif

/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
	/* Nothing to do here */
}

/*ARGSUSED*/
void
gcpu_faulted_exit(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);

	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
}

/*
 * Write the requested values to the indicated MSRs.  Having no knowledge
 * of the model-specific requirements for writing to these model-specific
 * registers, we will only blindly write to those MSRs if the 'force'
 * argument is nonzero.  That option should only be used in prototyping
 * and debugging.
 */
/*ARGSUSED*/
cmi_errno_t
gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
    int force)
{
	int i, errs = 0;

	for (i = 0; i < nregs; i++) {
		uint_t msr = regs[i].cmr_msrnum;
		uint64_t val = regs[i].cmr_msrval;

		if (cms_present(hdl)) {
			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
				errs++;
		} else if (force) {
			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
		} else {
			errs++;
		}
	}

	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
}

/* Deconfigure the state set up by gcpu_mca_init() */
void
gcpu_mca_fini(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int i;

	/*
	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
	 * processors, which have their own more primitive way of doing
	 * machine checks, will not have cmi_mca_init called since their
	 * CPUID information will not indicate both MCA and MCE features.
	 */
	if (!is_x86_feature(x86_featureset, X86FSET_MCA))
		return;
#ifndef __xpv
	/*
	 * Disable the machine check exception in CR4.
	 */
	cmi_ntv_hwdisable_mce(hdl);
#endif
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
	gcpu_mca_poll_fini(hdl);
	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Free resources allocated during init.
	 */
	if (mca->gcpu_bank_cmci != NULL) {
		kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
		    mca->gcpu_mca_nbanks);
	}

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		if (mca->gcpu_mca_logout[i] != NULL) {
			kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
		}
	}

	if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
		kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
		    sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
	}
}
2058