1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009 Hudson River Trading LLC
5 * Written by: John H. Baldwin <jhb@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 /*
31 * Support for x86 machine check architecture.
32 */
33
34 #include <sys/cdefs.h>
35 #ifdef __amd64__
36 #define DEV_APIC
37 #else
38 #include "opt_apic.h"
39 #endif
40
41 #include <sys/param.h>
42 #include <sys/bus.h>
43 #include <sys/interrupt.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/sbuf.h>
50 #include <sys/sched.h>
51 #include <sys/smp.h>
52 #include <sys/sysctl.h>
53 #include <sys/syslog.h>
54 #include <sys/systm.h>
55 #include <sys/taskqueue.h>
56 #include <machine/intr_machdep.h>
57 #include <x86/apicvar.h>
58 #include <machine/cpu.h>
59 #include <machine/cputypes.h>
60 #include <x86/mca.h>
61 #include <machine/md_var.h>
62 #include <machine/specialreg.h>
63
64 /* Modes for mca_scan() */
65 enum scan_mode {
66 POLLED,
67 MCE,
68 CMCI,
69 };
70
71 #ifdef DEV_APIC
72 /*
73 * State maintained for each monitored MCx bank to control the
74 * corrected machine check interrupt threshold.
75 */
76 struct cmc_state {
77 int max_threshold;
78 time_t last_intr;
79 };
80
81 struct amd_et_state {
82 int cur_threshold;
83 time_t last_intr;
84 };
85 #endif
86
87 struct mca_internal {
88 struct mca_record rec;
89 STAILQ_ENTRY(mca_internal) link;
90 };
91
92 struct mca_enumerator_ops {
93 unsigned int (*ctl)(int);
94 unsigned int (*status)(int);
95 unsigned int (*addr)(int);
96 unsigned int (*misc)(int);
97 };
98
99 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
100
101 static volatile int mca_count; /* Number of records stored. */
102 static int mca_banks; /* Number of per-CPU register banks. */
103 static int mca_maxcount = -1; /* Limit on records stored. (-1 = unlimited) */
104
105 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
106 "Machine Check Architecture");
107
108 static int mca_enabled = 1;
109 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
110 "Administrative toggle for machine check support");
111
112 static int log_corrected = 1;
113 SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0,
114 "Log corrected errors to the console");
115
116 static int amd10h_L1TP = 1;
117 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
118 "Administrative toggle for logging of level one TLB parity (L1TP) errors");
119
120 static int intel6h_HSD131;
121 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
122 "Administrative toggle for logging of spurious corrected errors");
123
124 int workaround_erratum383;
125 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
126 &workaround_erratum383, 0,
127 "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
128
129 #ifdef DIAGNOSTIC
130 static uint64_t fake_status;
131 SYSCTL_U64(_hw_mca, OID_AUTO, fake_status, CTLFLAG_RW,
132 &fake_status, 0,
133 "Insert artificial MCA with given status (testing purpose only)");
134 static int fake_bank;
135 SYSCTL_INT(_hw_mca, OID_AUTO, fake_bank, CTLFLAG_RW,
136 &fake_bank, 0,
137 "Bank to use for artificial MCAs (testing purpose only)");
138 #endif
139
140 static bool mca_uselog = false;
141 SYSCTL_BOOL(_hw_mca, OID_AUTO, uselog, CTLFLAG_RWTUN, &mca_uselog, 0,
142 "Should the system send non-fatal machine check errors to the log "
143 "(instead of the console)?");
144
145 static STAILQ_HEAD(, mca_internal) mca_freelist;
146 static int mca_freecount;
147 static STAILQ_HEAD(, mca_internal) mca_records;
148 static STAILQ_HEAD(, mca_internal) mca_pending;
149 static int mca_ticks = 300;
150 static struct taskqueue *mca_tq;
151 static struct task mca_resize_task;
152 static struct task mca_postscan_task;
153 static struct timeout_task mca_scan_task;
154 static struct mtx mca_lock;
155 static bool mca_startup_done = false;
156
157 /* Static buffer to compose messages while in an interrupt context. */
158 static char mca_msg_buf[1024];
159 static struct mtx mca_msg_buf_lock;
160
161 /* Statistics on number of MCA events by type, updated with the mca_lock. */
162 static uint64_t mca_stats[MCA_T_COUNT];
163 SYSCTL_OPAQUE(_hw_mca, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_SKIP,
164 mca_stats, MCA_T_COUNT * sizeof(mca_stats[0]),
165 "S", "Array of MCA events by type");
166
167 /* Variables to track and control message rate limiting. */
168 static struct timeval mca_last_log_time;
169 static struct timeval mca_log_interval;
170 static int mca_log_skipped;
171
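/*
 * Sysctl handler for hw.mca.log_interval.  Exposes the tv_sec field of
 * mca_log_interval as an unsigned integer.
 */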
172 static int
173 sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS)
174 {
175 int error;
176 u_int val;
177
178 val = mca_log_interval.tv_sec;
179 error = sysctl_handle_int(oidp, &val, 0, req);
180 if (error != 0 || req->newptr == NULL)
181 return (error);
182 mca_log_interval.tv_sec = val;
183 return (0);
184 }
185 SYSCTL_PROC(_hw_mca, OID_AUTO, log_interval,
186 CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_log_interval, 0,
187 sysctl_mca_log_interval, "IU",
188 "Minimum number of seconds between logging correctable MCAs"
189 " (0 = no limit)");
190
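/*
 * Accessors returning the MSR number of the control, status, address and
 * misc registers for a given bank.  The default mca_msr_ops table below
 * uses the architectural IA32 layout; on processors with AMD Scalable MCA
 * the table is repointed to the SMCA register layout in _mca_init().
 */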
191 static unsigned int
192 mca_ia32_ctl_reg(int bank)
193 {
194 return (MSR_MC_CTL(bank));
195 }
196
197 static unsigned int
198 mca_ia32_status_reg(int bank)
199 {
200 return (MSR_MC_STATUS(bank));
201 }
202
203 static unsigned int
204 mca_ia32_addr_reg(int bank)
205 {
206 return (MSR_MC_ADDR(bank));
207 }
208
209 static unsigned int
210 mca_ia32_misc_reg(int bank)
211 {
212 return (MSR_MC_MISC(bank));
213 }
214
215 static unsigned int
216 mca_smca_ctl_reg(int bank)
217 {
218 return (MSR_SMCA_MC_CTL(bank));
219 }
220
221 static unsigned int
222 mca_smca_status_reg(int bank)
223 {
224 return (MSR_SMCA_MC_STATUS(bank));
225 }
226
227 static unsigned int
228 mca_smca_addr_reg(int bank)
229 {
230 return (MSR_SMCA_MC_ADDR(bank));
231 }
232
233 static unsigned int
234 mca_smca_misc_reg(int bank)
235 {
236 return (MSR_SMCA_MC_MISC(bank));
237 }
238
239 static struct mca_enumerator_ops mca_msr_ops = {
240 .ctl = mca_ia32_ctl_reg,
241 .status = mca_ia32_status_reg,
242 .addr = mca_ia32_addr_reg,
243 .misc = mca_ia32_misc_reg
244 };
245
246 #ifdef DEV_APIC
247 static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */
248 static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */
249 static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
250
251 static int amd_elvt = -1;
252
253 static inline bool
254 amd_thresholding_supported(void)
255 {
256 if (cpu_vendor_id != CPU_VENDOR_AMD &&
257 cpu_vendor_id != CPU_VENDOR_HYGON)
258 return (false);
259 /*
260 * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F).
261 *
262 * It begins to be documented in family 0x15 model 30 and family 0x16,
263 * but neither of these families documents the ScalableMca bit, which
264 * supposedly defines the presence of this feature on family 0x17.
265 */
266 if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16)
267 return (true);
268 if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
269 return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0);
270 return (false);
271 }
272 #endif
273
274 static inline bool
275 cmci_supported(uint64_t mcg_cap)
276 {
277 /*
278 * MCG_CAP_CMCI_P bit is reserved in AMD documentation. Until
279 * it is defined, do not use it to check for CMCI support.
280 */
281 if (cpu_vendor_id != CPU_VENDOR_INTEL)
282 return (false);
283 return ((mcg_cap & MCG_CAP_CMCI_P) != 0);
284 }
285
286 static inline bool
287 tes_supported(uint64_t mcg_cap)
288 {
289
290 /*
291 * MCG_CAP_TES_P bit is reserved in AMD documentation. Until
292 * it is defined, do not use it to check for TES support.
293 */
294 if (cpu_vendor_id != CPU_VENDOR_INTEL)
295 return (false);
296 return ((mcg_cap & MCG_CAP_TES_P) != 0);
297 }
298
299 static inline bool
300 ser_supported(uint64_t mcg_cap)
301 {
302
303 return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0);
304 }
305
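/*
 * Sysctl handler that only accepts positive integer values; used for
 * hw.mca.interval and hw.mca.cmc_throttle.
 */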
306 static int
307 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
308 {
309 int error, value;
310
311 value = *(int *)arg1;
312 error = sysctl_handle_int(oidp, &value, 0, req);
313 if (error || req->newptr == NULL)
314 return (error);
315 if (value <= 0)
316 return (EINVAL);
317 *(int *)arg1 = value;
318 return (0);
319 }
320
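/*
 * Sysctl handler for hw.mca.records.  The single name component is the
 * index of the stored record to copy out to userland.
 */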
321 static int
322 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
323 {
324 int *name = (int *)arg1;
325 u_int namelen = arg2;
326 struct mca_record record;
327 struct mca_internal *rec;
328 int i;
329
330 if (namelen != 1)
331 return (EINVAL);
332
333 if (name[0] < 0 || name[0] >= mca_count)
334 return (EINVAL);
335
336 mtx_lock_spin(&mca_lock);
337 if (name[0] >= mca_count) {
338 mtx_unlock_spin(&mca_lock);
339 return (EINVAL);
340 }
341 i = 0;
342 STAILQ_FOREACH(rec, &mca_records, link) {
343 if (i == name[0]) {
344 record = rec->rec;
345 break;
346 }
347 i++;
348 }
349 mtx_unlock_spin(&mca_lock);
350 return (SYSCTL_OUT(req, &record, sizeof(record)));
351 }
352
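/*
 * Helpers to decode fields of compound MCA error codes into short strings
 * for logging: transaction type, memory hierarchy level, request type,
 * memory controller transaction type and address mode.
 */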
353 static const char *
354 mca_error_ttype(uint16_t mca_error)
355 {
356
357 switch ((mca_error & 0x000c) >> 2) {
358 case 0:
359 return ("I");
360 case 1:
361 return ("D");
362 case 2:
363 return ("G");
364 }
365 return ("?");
366 }
367
368 static const char *
369 mca_error_level(uint16_t mca_error)
370 {
371
372 switch (mca_error & 0x0003) {
373 case 0:
374 return ("L0");
375 case 1:
376 return ("L1");
377 case 2:
378 return ("L2");
379 case 3:
380 return ("LG");
381 }
382 return ("L?");
383 }
384
385 static const char *
386 mca_error_request(uint16_t mca_error)
387 {
388
389 switch ((mca_error & 0x00f0) >> 4) {
390 case 0x0:
391 return ("ERR");
392 case 0x1:
393 return ("RD");
394 case 0x2:
395 return ("WR");
396 case 0x3:
397 return ("DRD");
398 case 0x4:
399 return ("DWR");
400 case 0x5:
401 return ("IRD");
402 case 0x6:
403 return ("PREFETCH");
404 case 0x7:
405 return ("EVICT");
406 case 0x8:
407 return ("SNOOP");
408 }
409 return ("???");
410 }
411
412 static const char *
413 mca_error_mmtype(uint16_t mca_error, enum mca_stat_types *event_type)
414 {
415
416 switch ((mca_error & 0x70) >> 4) {
417 case 0x0:
418 *event_type = MCA_T_MEMCONTROLLER_GEN;
419 return ("GEN");
420 case 0x1:
421 *event_type = MCA_T_MEMCONTROLLER_RD;
422 return ("RD");
423 case 0x2:
424 *event_type = MCA_T_MEMCONTROLLER_WR;
425 return ("WR");
426 case 0x3:
427 *event_type = MCA_T_MEMCONTROLLER_AC;
428 return ("AC");
429 case 0x4:
430 *event_type = MCA_T_MEMCONTROLLER_MS;
431 return ("MS");
432 }
433 *event_type = MCA_T_MEMCONTROLLER_OTHER;
434 return ("???");
435 }
436
437 static const char *
438 mca_addres_mode(uint64_t mca_misc)
439 {
440
441 switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) {
442 case 0x0:
443 return ("Segment Offset");
444 case 0x1:
445 return ("Linear Address");
446 case 0x2:
447 return ("Physical Address");
448 case 0x3:
449 return ("Memory Address");
450 case 0x7:
451 return ("Generic");
452 }
453 return ("???");
454 }
455
456 static int
457 mca_mute(const struct mca_record *rec)
458 {
459
460 /*
461 * Skip spurious corrected parity errors generated by Intel Haswell-
462 * and Broadwell-based CPUs (see the HSD131, HSM142, HSW131 and BDM48
463 * errata, respectively), unless reporting is enabled.
464 * Note that these errors have also been observed with the D0-stepping
465 * of Haswell, while at least initially the CPU specification updates
466 * suggested that only the C0-stepping was affected. Similarly, Celeron
467 * 2955U CPUs with a CPU ID of 0x45 are apparently affected by the
468 * same problem, even though HSM142 refers only to 0x3c and 0x46.
469 */
470 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
471 CPUID_TO_FAMILY(cpu_id) == 0x6 &&
472 (CPUID_TO_MODEL(cpu_id) == 0x3c || /* HSD131, HSM142, HSW131 */
473 CPUID_TO_MODEL(cpu_id) == 0x3d || /* BDM48 */
474 CPUID_TO_MODEL(cpu_id) == 0x45 ||
475 CPUID_TO_MODEL(cpu_id) == 0x46) && /* HSM142 */
476 rec->mr_bank == 0 &&
477 (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
478 !intel6h_HSD131)
479 return (1);
480
481 return (0);
482 }
483
484 /* Dump details about a single machine check. */
485 static void
486 mca_log(enum scan_mode mode, const struct mca_record *rec, bool fatal)
487 {
488 int error, numskipped;
489 uint16_t mca_error;
490 enum mca_stat_types event_type;
491 struct sbuf sb;
492 bool uncor, using_shared_buf;
493
494 if (mca_mute(rec))
495 return;
496
497 uncor = (rec->mr_status & MC_STATUS_UC) != 0;
498
499 if (!log_corrected && !uncor && (!tes_supported(rec->mr_mcg_cap) ||
500 ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2))
501 return;
502
503 /* Try to use an allocated buffer when not in an interrupt context. */
504 if (mode == POLLED && sbuf_new(&sb, NULL, 512, SBUF_AUTOEXTEND) != NULL)
505 using_shared_buf = false;
506 else {
507 using_shared_buf = true;
508 mtx_lock_spin(&mca_msg_buf_lock);
509 sbuf_new(&sb, mca_msg_buf, sizeof(mca_msg_buf), SBUF_FIXEDLEN);
510 }
511
512 sbuf_printf(&sb, "MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
513 (long long)rec->mr_status);
514 sbuf_printf(&sb, "MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
515 (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
516 sbuf_printf(&sb, "MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n",
517 cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id);
518 sbuf_printf(&sb, "MCA: CPU %d ", rec->mr_cpu);
519 if (rec->mr_status & MC_STATUS_UC)
520 sbuf_printf(&sb, "UNCOR ");
521 else {
522 sbuf_printf(&sb, "COR ");
523 if (cmci_supported(rec->mr_mcg_cap))
524 sbuf_printf(&sb, "(%lld) ", ((long long)rec->mr_status &
525 MC_STATUS_COR_COUNT) >> 38);
526 if (tes_supported(rec->mr_mcg_cap)) {
527 switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
528 case 0x1:
529 sbuf_printf(&sb, "(Green) ");
530 break;
531 case 0x2:
532 sbuf_printf(&sb, "(Yellow) ");
533 break;
534 }
535 }
536 }
537 if (rec->mr_status & MC_STATUS_EN)
538 sbuf_printf(&sb, "EN ");
539 if (rec->mr_status & MC_STATUS_PCC)
540 sbuf_printf(&sb, "PCC ");
541 if (ser_supported(rec->mr_mcg_cap)) {
542 if (rec->mr_status & MC_STATUS_S)
543 sbuf_printf(&sb, "S ");
544 if (rec->mr_status & MC_STATUS_AR)
545 sbuf_printf(&sb, "AR ");
546 }
547 if (rec->mr_status & MC_STATUS_OVER)
548 sbuf_printf(&sb, "OVER ");
549 mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
550 event_type = MCA_T_COUNT;
551 switch (mca_error) {
552 /* Simple error codes. */
553 case 0x0000:
554 sbuf_printf(&sb, "no error");
555 event_type = MCA_T_NONE;
556 break;
557 case 0x0001:
558 sbuf_printf(&sb, "unclassified error");
559 event_type = MCA_T_UNCLASSIFIED;
560 break;
561 case 0x0002:
562 sbuf_printf(&sb, "ucode ROM parity error");
563 event_type = MCA_T_UCODE_ROM_PARITY;
564 break;
565 case 0x0003:
566 sbuf_printf(&sb, "external error");
567 event_type = MCA_T_EXTERNAL;
568 break;
569 case 0x0004:
570 sbuf_printf(&sb, "FRC error");
571 event_type = MCA_T_FRC;
572 break;
573 case 0x0005:
574 sbuf_printf(&sb, "internal parity error");
575 event_type = MCA_T_INTERNAL_PARITY;
576 break;
577 case 0x0006:
578 sbuf_printf(&sb, "SMM handler code access violation");
579 event_type = MCA_T_SMM_HANDLER;
580 break;
581 case 0x0400:
582 sbuf_printf(&sb, "internal timer error");
583 event_type = MCA_T_INTERNAL_TIMER;
584 break;
585 case 0x0e0b:
586 sbuf_printf(&sb, "generic I/O error");
587 event_type = MCA_T_GENERIC_IO;
588 if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL &&
589 (rec->mr_status & MC_STATUS_MISCV)) {
590 sbuf_printf(&sb, " (pci%d:%d:%d:%d)",
591 (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32),
592 (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24),
593 (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19),
594 (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16));
595 }
596 break;
597 default:
598 if ((mca_error & 0xfc00) == 0x0400) {
599 sbuf_printf(&sb, "internal error %x",
600 mca_error & 0x03ff);
601 event_type = MCA_T_INTERNAL;
602 break;
603 }
604
605 /* Compound error codes. */
606
607 /* Memory hierarchy error. */
608 if ((mca_error & 0xeffc) == 0x000c) {
609 sbuf_printf(&sb, "%s memory error",
610 mca_error_level(mca_error));
611 event_type = MCA_T_MEMORY;
612 break;
613 }
614
615 /* TLB error. */
616 if ((mca_error & 0xeff0) == 0x0010) {
617 sbuf_printf(&sb, "%sTLB %s error",
618 mca_error_ttype(mca_error),
619 mca_error_level(mca_error));
620 event_type = MCA_T_TLB;
621 break;
622 }
623
624 /* Memory controller error. */
625 if ((mca_error & 0xef80) == 0x0080) {
626 sbuf_printf(&sb, "%s channel ",
627 mca_error_mmtype(mca_error, &event_type));
628 if ((mca_error & 0x000f) != 0x000f)
629 sbuf_printf(&sb, "%d", mca_error & 0x000f);
630 else
631 sbuf_printf(&sb, "??");
632 sbuf_printf(&sb, " memory error");
633 break;
634 }
635
636 /* Cache error. */
637 if ((mca_error & 0xef00) == 0x0100) {
638 sbuf_printf(&sb, "%sCACHE %s %s error",
639 mca_error_ttype(mca_error),
640 mca_error_level(mca_error),
641 mca_error_request(mca_error));
642 event_type = MCA_T_CACHE;
643 break;
644 }
645
646 /* Extended memory error. */
647 if ((mca_error & 0xef80) == 0x0280) {
648 sbuf_printf(&sb, "%s channel ",
649 mca_error_mmtype(mca_error, &event_type));
650 if ((mca_error & 0x000f) != 0x000f)
651 sbuf_printf(&sb, "%d", mca_error & 0x000f);
652 else
653 sbuf_printf(&sb, "??");
654 sbuf_printf(&sb, " extended memory error");
655 break;
656 }
657
658 /* Bus and/or Interconnect error. */
659 if ((mca_error & 0xe800) == 0x0800) {
660 sbuf_printf(&sb, "BUS%s ", mca_error_level(mca_error));
661 event_type = MCA_T_BUS;
662 switch ((mca_error & 0x0600) >> 9) {
663 case 0:
664 sbuf_printf(&sb, "Source");
665 break;
666 case 1:
667 sbuf_printf(&sb, "Responder");
668 break;
669 case 2:
670 sbuf_printf(&sb, "Observer");
671 break;
672 default:
673 sbuf_printf(&sb, "???");
674 break;
675 }
676 sbuf_printf(&sb, " %s ", mca_error_request(mca_error));
677 switch ((mca_error & 0x000c) >> 2) {
678 case 0:
679 sbuf_printf(&sb, "Memory");
680 break;
681 case 2:
682 sbuf_printf(&sb, "I/O");
683 break;
684 case 3:
685 sbuf_printf(&sb, "Other");
686 break;
687 default:
688 sbuf_printf(&sb, "???");
689 break;
690 }
691 if (mca_error & 0x0100)
692 sbuf_printf(&sb, " timed out");
693 break;
694 }
695
696 sbuf_printf(&sb, "unknown error %x", mca_error);
697 event_type = MCA_T_UNKNOWN;
698 break;
699 }
700 sbuf_printf(&sb, "\n");
701 if (rec->mr_status & MC_STATUS_ADDRV) {
702 sbuf_printf(&sb, "MCA: Address 0x%llx",
703 (long long)rec->mr_addr);
704 if (ser_supported(rec->mr_mcg_cap) &&
705 (rec->mr_status & MC_STATUS_MISCV)) {
706 sbuf_printf(&sb, " (Mode: %s, LSB: %d)",
707 mca_addres_mode(rec->mr_misc),
708 (int)(rec->mr_misc & MC_MISC_RA_LSB));
709 }
710 sbuf_printf(&sb, "\n");
711 }
712 if (rec->mr_status & MC_STATUS_MISCV)
713 sbuf_printf(&sb, "MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
714
715 if (event_type < 0 || event_type >= MCA_T_COUNT) {
716 KASSERT(0, ("%s: invalid event type (%d)", __func__,
717 event_type));
718 event_type = MCA_T_UNKNOWN;
719 }
720 numskipped = 0;
721 if (!fatal && !uncor) {
722 /*
723 * Update statistics and check the rate limit for
724 * correctable errors. The rate limit is only applied
725 * after the system records a reasonable number of errors
726 * of the same type. The goal is to reduce the impact of
727 * the system seeing and attempting to log a burst of
728 * similar errors, which (especially when printed to the
729 * console) can be expensive.
730 */
731 mtx_lock_spin(&mca_lock);
732 mca_stats[event_type]++;
733 if (mca_log_interval.tv_sec > 0 && mca_stats[event_type] > 50 &&
734 ratecheck(&mca_last_log_time, &mca_log_interval) == 0) {
735 mca_log_skipped++;
736 mtx_unlock_spin(&mca_lock);
737 goto done;
738 }
739 numskipped = mca_log_skipped;
740 mca_log_skipped = 0;
741 mtx_unlock_spin(&mca_lock);
742 }
743
744 error = sbuf_finish(&sb);
745 if (fatal || !mca_uselog) {
746 if (numskipped > 0)
747 printf("MCA: %d events skipped due to rate limit\n",
748 numskipped);
749 if (error)
750 printf("MCA: error logging message (sbuf error %d)\n",
751 error);
752 else
753 sbuf_putbuf(&sb);
754 } else {
755 if (numskipped > 0)
756 log(LOG_ERR,
757 "MCA: %d events skipped due to rate limit\n",
758 numskipped);
759 if (error)
760 log(LOG_ERR,
761 "MCA: error logging message (sbuf error %d)\n",
762 error);
763 else
764 log(uncor ? LOG_CRIT : LOG_ERR, "%s", sbuf_data(&sb));
765 }
766
767 done:
768 sbuf_delete(&sb);
769 if (using_shared_buf)
770 mtx_unlock_spin(&mca_msg_buf_lock);
771 }
772
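/*
 * Determine whether a valid error must be handled as a machine check
 * exception rather than a corrected, spurious or UCNA error, and clear
 * *recoverablep when the error is not software recoverable.
 */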
773 static bool
774 mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep)
775 {
776
777 /* Corrected error. */
778 if ((status & MC_STATUS_UC) == 0)
779 return (0);
780
781 /* Spurious MCA error. */
782 if ((status & MC_STATUS_EN) == 0)
783 return (0);
784
785 /* The processor does not support software error recovery. */
786 if (!ser_supported(mcg_cap)) {
787 *recoverablep = false;
788 return (1);
789 }
790
791 /* Context might have been corrupted. */
792 if (status & MC_STATUS_PCC) {
793 *recoverablep = false;
794 return (1);
795 }
796
797 /* Uncorrected software recoverable. */
798 if (status & MC_STATUS_S) {
799 /* Action required vs optional. */
800 if (status & MC_STATUS_AR)
801 *recoverablep = false;
802 return (1);
803 }
804
805 /* Uncorrected no action required. */
806 return (0);
807 }
808
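/*
 * Check a single bank for a valid error matching the requested scan mode.
 * On a match, fill in *rec from the bank's MSRs and return 1; corrected
 * and recoverable errors are cleared from the bank afterwards.
 */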
809 static int
810 mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank,
811 struct mca_record *rec, bool *recoverablep)
812 {
813 uint64_t status;
814 u_int p[4];
815 bool mce, recover;
816
817 status = rdmsr(mca_msr_ops.status(bank));
818 if (!(status & MC_STATUS_VAL)) {
819 #ifdef DIAGNOSTIC
820 /*
821 * Check if we have a pending artificial event to generate.
822 * Note that this is potentially racy with the sysctl. The
823 * tradeoff is deemed acceptable given the test nature
824 * of the code.
825 */
826 if (fake_status && bank == fake_bank) {
827 status = fake_status;
828 fake_status = 0;
829 }
830 if (!(status & MC_STATUS_VAL))
831 return (0);
832 #else
833 return (0);
834 #endif
835 }
836
837 recover = *recoverablep;
838 mce = mca_is_mce(mcg_cap, status, &recover);
839 if (mce != (mode == MCE))
840 return (0);
841 *recoverablep = recover;
842
843 /* Save exception information. */
844 rec->mr_status = status;
845 rec->mr_bank = bank;
846 rec->mr_addr = 0;
847 if (status & MC_STATUS_ADDRV)
848 rec->mr_addr = rdmsr(mca_msr_ops.addr(bank));
849 rec->mr_misc = 0;
850 if (status & MC_STATUS_MISCV)
851 rec->mr_misc = rdmsr(mca_msr_ops.misc(bank));
852 rec->mr_tsc = rdtsc();
853 rec->mr_apic_id = PCPU_GET(apic_id);
854 rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
855 rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
856 rec->mr_cpu_id = cpu_id;
857 rec->mr_cpu_vendor_id = cpu_vendor_id;
858 rec->mr_cpu = PCPU_GET(cpuid);
859
860 /*
861 * Clear machine check. Don't do this for uncorrectable
862 * errors so that the BIOS can see them.
863 */
864 if (!mce || recover) {
865 wrmsr(mca_msr_ops.status(bank), 0);
866 do_cpuid(0, p);
867 }
868 return (1);
869 }
870
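/*
 * Grow or shrink the preallocated free list of record structures, keeping
 * between the desired minimum and maximum number of entries.  Allocation
 * and freeing are done with the spin lock dropped.
 */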
871 static void
872 mca_resize_freelist(void)
873 {
874 struct mca_internal *next, *rec;
875 STAILQ_HEAD(, mca_internal) tmplist;
876 int count, i, desired_max, desired_min;
877
878 /*
879 * Ensure we have at least as many free records as the larger of the
880 * CPU count and the bank count, but no more than twice that amount.
881 */
882 desired_min = imax(mp_ncpus, mca_banks);
883 desired_max = imax(mp_ncpus, mca_banks) * 2;
884 STAILQ_INIT(&tmplist);
885 mtx_lock_spin(&mca_lock);
886 while (mca_freecount > desired_max) {
887 rec = STAILQ_FIRST(&mca_freelist);
888 KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty",
889 mca_freecount));
890 STAILQ_REMOVE_HEAD(&mca_freelist, link);
891 mca_freecount--;
892 STAILQ_INSERT_TAIL(&tmplist, rec, link);
893 }
894 while (mca_freecount < desired_min) {
895 count = desired_min - mca_freecount;
896 mtx_unlock_spin(&mca_lock);
897 for (i = 0; i < count; i++) {
898 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
899 STAILQ_INSERT_TAIL(&tmplist, rec, link);
900 }
901 mtx_lock_spin(&mca_lock);
902 STAILQ_CONCAT(&mca_freelist, &tmplist);
903 mca_freecount += count;
904 }
905 mtx_unlock_spin(&mca_lock);
906 STAILQ_FOREACH_SAFE(rec, &tmplist, link, next)
907 free(rec, M_MCA);
908 }
909
910 static void
911 mca_resize(void *context, int pending)
912 {
913
914 mca_resize_freelist();
915 }
916
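/*
 * Queue a freshly gathered record on the pending list.  When polling we
 * can sleep in malloc(); in exception or interrupt context we must take a
 * preallocated entry from the free list instead.
 */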
917 static void
918 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
919 {
920 struct mca_internal *rec;
921
922 if (mode == POLLED) {
923 rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
924 mtx_lock_spin(&mca_lock);
925 } else {
926 mtx_lock_spin(&mca_lock);
927 rec = STAILQ_FIRST(&mca_freelist);
928 if (rec == NULL) {
929 mtx_unlock_spin(&mca_lock);
930 printf("MCA: Unable to allocate space for an event.\n");
931 mca_log(mode, record, false);
932 return;
933 }
934 STAILQ_REMOVE_HEAD(&mca_freelist, link);
935 mca_freecount--;
936 }
937
938 rec->rec = *record;
939 STAILQ_INSERT_TAIL(&mca_pending, rec, link);
940 mtx_unlock_spin(&mca_lock);
941 }
942
943 #ifdef DEV_APIC
944 /*
945 * Update the interrupt threshold for a CMCI. The strategy is to use
946 * a low trigger that interrupts as soon as the first event occurs.
947 * However, if a steady stream of events arrives, the threshold is
948 * increased until the interrupts are throttled to once every
949 * cmc_throttle seconds or the periodic scan. If a periodic scan
950 * finds that the threshold is too high, it is lowered.
951 */
952 static int
953 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
954 int cur_threshold, int max_threshold)
955 {
956 u_int delta;
957 int limit;
958
959 delta = (u_int)(time_uptime - last_intr);
960 limit = cur_threshold;
961
962 /*
963 * If an interrupt was received less than cmc_throttle seconds
964 * since the previous interrupt and the count from the current
965 * event is greater than or equal to the current threshold,
966 * double the threshold up to the max.
967 */
968 if (mode == CMCI && valid) {
969 if (delta < cmc_throttle && count >= limit &&
970 limit < max_threshold) {
971 limit = min(limit << 1, max_threshold);
972 }
973 return (limit);
974 }
975
976 /*
977 * When the banks are polled, check to see if the threshold
978 * should be lowered.
979 */
980 if (mode != POLLED)
981 return (limit);
982
983 /* If a CMCI occurred recently, do nothing for now. */
984 if (delta < cmc_throttle)
985 return (limit);
986
987 /*
988 * Compute a new limit based on the average rate of events per
989 * cmc_throttle seconds since the last interrupt.
990 */
991 if (valid) {
992 limit = count * cmc_throttle / delta;
993 if (limit <= 0)
994 limit = 1;
995 else if (limit > max_threshold)
996 limit = max_threshold;
997 } else {
998 limit = 1;
999 }
1000 return (limit);
1001 }
1002
1003 static void
1004 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
1005 {
1006 struct cmc_state *cc;
1007 uint64_t ctl;
1008 int cur_threshold, new_threshold;
1009 int count;
1010
1011 /* Fetch the current limit for this bank. */
1012 cc = &cmc_state[PCPU_GET(cpuid)][bank];
1013 ctl = rdmsr(MSR_MC_CTL2(bank));
1014 count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
1015 cur_threshold = ctl & MC_CTL2_THRESHOLD;
1016
1017 new_threshold = update_threshold(mode, valid, cc->last_intr, count,
1018 cur_threshold, cc->max_threshold);
1019
1020 if (mode == CMCI && valid)
1021 cc->last_intr = time_uptime;
1022 if (new_threshold != cur_threshold) {
1023 ctl &= ~MC_CTL2_THRESHOLD;
1024 ctl |= new_threshold;
1025 wrmsr(MSR_MC_CTL2(bank), ctl);
1026 }
1027 }
1028
1029 static void
1030 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
1031 {
1032 struct amd_et_state *cc;
1033 uint64_t misc;
1034 int new_threshold;
1035 int count;
1036
1037 cc = &amd_et_state[PCPU_GET(cpuid)][bank];
1038 misc = rdmsr(mca_msr_ops.misc(bank));
1039 count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT;
1040 count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold);
1041
1042 new_threshold = update_threshold(mode, valid, cc->last_intr, count,
1043 cc->cur_threshold, MC_MISC_AMD_CNT_MAX);
1044
1045 cc->cur_threshold = new_threshold;
1046 misc &= ~MC_MISC_AMD_CNT_MASK;
1047 misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1048 << MC_MISC_AMD_CNT_SHIFT;
1049 misc &= ~MC_MISC_AMD_OVERFLOW;
1050 wrmsr(mca_msr_ops.misc(bank), misc);
1051 if (mode == CMCI && valid)
1052 cc->last_intr = time_uptime;
1053 }
1054 #endif
1055
1056 /*
1057 * This scans all the machine check banks of the current CPU to see if
1058 * there are any machine checks. Any non-recoverable errors are
1059 * reported immediately via mca_log(). The current thread must be
1060 * pinned when this is called. The 'mode' parameter indicates if we
1061 * are being called from the MC exception handler, the CMCI handler,
1062 * or the periodic poller.
1063 */
1064 static int
1065 mca_scan(enum scan_mode mode, bool *recoverablep)
1066 {
1067 struct mca_record rec;
1068 uint64_t mcg_cap;
1069 int count = 0, i, valid;
1070
1071 mcg_cap = rdmsr(MSR_MCG_CAP);
1072 for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1073 #ifdef DEV_APIC
1074 /*
1075 * For a CMCI, only check banks this CPU is
1076 * responsible for.
1077 */
1078 if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
1079 continue;
1080 #endif
1081
1082 valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep);
1083 if (valid) {
1084 count++;
1085 if (*recoverablep)
1086 mca_record_entry(mode, &rec);
1087 else
1088 mca_log(mode, &rec, true);
1089 }
1090
1091 #ifdef DEV_APIC
1092 /*
1093 * If this is a bank this CPU monitors via CMCI,
1094 * update the threshold.
1095 */
1096 if (PCPU_GET(cmci_mask) & 1 << i) {
1097 if (cmc_state != NULL)
1098 cmci_update(mode, i, valid, &rec);
1099 else
1100 amd_thresholding_update(mode, i, valid);
1101 }
1102 #endif
1103 }
1104 return (count);
1105 }
1106
1107 /*
1108 * Store a new record on the mca_records list while enforcing
1109 * mca_maxcount.
1110 */
1111 static void
1112 mca_store_record(struct mca_internal *mca)
1113 {
1114
1115 /*
1116 * If we are storing no records (mca_maxcount == 0),
1117 * we just free this record.
1118 *
1119 * If we are storing records (mca_maxcount != 0) and
1120 * we have free space on the list, store the record
1121 * and increment mca_count.
1122 *
1123 * If we are storing records and we do not have free
1124 * space on the list, store the new record at the
1125 * tail and free the oldest one from the head.
1126 */
1127 if (mca_maxcount != 0)
1128 STAILQ_INSERT_TAIL(&mca_records, mca, link);
1129 if (mca_maxcount < 0 || mca_count < mca_maxcount)
1130 mca_count++;
1131 else {
1132 if (mca_maxcount != 0) {
1133 mca = STAILQ_FIRST(&mca_records);
1134 STAILQ_REMOVE_HEAD(&mca_records, link);
1135 }
1136 STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1137 mca_freecount++;
1138 }
1139 }
1140
1141 /*
1142 * Do the work to process machine check records which have just been
1143 * gathered. Print any pending logs to the console. Queue them for storage.
1144 * Trigger a resizing of the free list.
1145 */
1146 static void
1147 mca_process_records(enum scan_mode mode)
1148 {
1149 struct mca_internal *mca;
1150 STAILQ_HEAD(, mca_internal) tmplist;
1151
1152 /*
1153 * If in an interrupt context, defer the post-scan activities to a
1154 * task queue.
1155 */
1156 if (mode != POLLED) {
1157 if (mca_startup_done)
1158 taskqueue_enqueue(mca_tq, &mca_postscan_task);
1159 return;
1160 }
1161
1162 /*
1163 * Copy the pending list to the stack so we can drop the spin lock
1164 * while we are emitting logs.
1165 */
1166 STAILQ_INIT(&tmplist);
1167 mtx_lock_spin(&mca_lock);
1168 STAILQ_SWAP(&mca_pending, &tmplist, mca_internal);
1169 mtx_unlock_spin(&mca_lock);
1170
1171 STAILQ_FOREACH(mca, &tmplist, link)
1172 mca_log(mode, &mca->rec, false);
1173
1174 mtx_lock_spin(&mca_lock);
1175 while ((mca = STAILQ_FIRST(&tmplist)) != NULL) {
1176 STAILQ_REMOVE_HEAD(&tmplist, link);
1177 mca_store_record(mca);
1178 }
1179 mtx_unlock_spin(&mca_lock);
1180 mca_resize_freelist();
1181 }
1182
1183 /*
1184 * Emit log entries and resize the free list. This is intended to be called
1185 * from a task queue to handle work which does not need to be done (or cannot
1186 * be done) in an interrupt context.
1187 */
1188 static void
1189 mca_postscan(void *context __unused, int pending __unused)
1190 {
1191
1192 mca_process_records(POLLED);
1193 }
1194
1195 /*
1196 * Scan the machine check banks on all CPUs by binding to each CPU in
1197 * turn. If any of the CPUs contained new machine check records, log
1198 * them to the console.
1199 */
1200 static void
1201 mca_scan_cpus(void *context, int pending)
1202 {
1203 struct thread *td;
1204 int cpu;
1205 bool recoverable = true;
1206
1207 mca_resize_freelist();
1208 td = curthread;
1209 thread_lock(td);
1210 CPU_FOREACH(cpu) {
1211 sched_bind(td, cpu);
1212 thread_unlock(td);
1213 mca_scan(POLLED, &recoverable);
1214 thread_lock(td);
1215 sched_unbind(td);
1216 }
1217 thread_unlock(td);
1218 if (!STAILQ_EMPTY(&mca_pending))
1219 mca_process_records(POLLED);
1220 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1221 mca_ticks * SBT_1S, 0, C_PREL(1));
1222 }
1223
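/*
 * Sysctl handler for hw.mca.force_scan: writing a non-zero value (e.g.
 * "sysctl hw.mca.force_scan=1") schedules an immediate scan of all CPUs.
 */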
1224 static int
1225 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
1226 {
1227 int error, i;
1228
1229 i = 0;
1230 error = sysctl_handle_int(oidp, &i, 0, req);
1231 if (error)
1232 return (error);
1233 if (i)
1234 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1235 0, 0, 0);
1236 return (0);
1237 }
1238
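/*
 * Sysctl handler for hw.mca.maxcount.  When the limit is lowered, excess
 * records are moved from the stored list back to the free list and a
 * free-list resize is scheduled.
 */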
1239 static int
1240 sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)
1241 {
1242 struct mca_internal *mca;
1243 int error, i;
1244 bool doresize;
1245
1246 i = mca_maxcount;
1247 error = sysctl_handle_int(oidp, &i, 0, req);
1248 if (error || req->newptr == NULL)
1249 return (error);
1250 mtx_lock_spin(&mca_lock);
1251 mca_maxcount = i;
1252 doresize = false;
1253 if (mca_maxcount >= 0)
1254 while (mca_count > mca_maxcount) {
1255 mca = STAILQ_FIRST(&mca_records);
1256 STAILQ_REMOVE_HEAD(&mca_records, link);
1257 mca_count--;
1258 STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1259 mca_freecount++;
1260 doresize = true;
1261 }
1262 mtx_unlock_spin(&mca_lock);
1263 if (doresize && mca_startup_done)
1264 taskqueue_enqueue(mca_tq, &mca_resize_task);
1265 return (error);
1266 }
1267
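/*
 * SYSINIT hook run once the scheduler is up: start the taskqueue thread,
 * arm the periodic scan and process any records queued by CMCIs that
 * occurred during boot.
 */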
1268 static void
1269 mca_startup(void *dummy)
1270 {
1271
1272 if (mca_banks <= 0)
1273 return;
1274
1275 taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
1276 taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1277 mca_ticks * SBT_1S, 0, C_PREL(1));
1278 mca_startup_done = true;
1279
1280 /*
1281 * CMCIs during boot may have recorded entries. Conduct the post-scan
1282 * activities now.
1283 */
1284 mca_postscan(NULL, 0);
1285 }
1286 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
1287
1288 #ifdef DEV_APIC
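/*
 * Allocate per-CPU, per-bank CMCI state on the BSP and register the
 * hw.mca.cmc_throttle sysctl.
 */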
1289 static void
1290 cmci_setup(void)
1291 {
1292 int i;
1293
1294 cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
1295 M_WAITOK);
1296 for (i = 0; i <= mp_maxid; i++)
1297 cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
1298 M_MCA, M_WAITOK | M_ZERO);
1299 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1300 "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1301 &cmc_throttle, 0, sysctl_positive_int, "I",
1302 "Interval in seconds to throttle corrected MC interrupts");
1303 }
1304
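/*
 * Allocate per-CPU, per-bank AMD error-thresholding state on the BSP and
 * register the hw.mca.cmc_throttle sysctl.
 */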
1305 static void
1306 amd_thresholding_setup(void)
1307 {
1308 u_int i;
1309
1310 amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *),
1311 M_MCA, M_WAITOK);
1312 for (i = 0; i <= mp_maxid; i++)
1313 amd_et_state[i] = malloc(sizeof(struct amd_et_state) *
1314 mca_banks, M_MCA, M_WAITOK | M_ZERO);
1315 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1316 "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1317 &cmc_throttle, 0, sysctl_positive_int, "I",
1318 "Interval in seconds to throttle corrected MC interrupts");
1319 }
1320 #endif
1321
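/*
 * One-time setup performed on the BSP from mca_init(): apply the Erratum
 * 383 workaround if applicable, initialize locks, lists and the taskqueue,
 * register the hw.mca sysctl nodes and set up CMCI or AMD thresholding
 * state when supported.
 */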
1322 static void
1323 mca_setup(uint64_t mcg_cap)
1324 {
1325
1326 /*
1327 * On AMD Family 10h processors, unless logging of level one TLB
1328 * parity (L1TP) errors is disabled, enable the recommended workaround
1329 * for Erratum 383.
1330 */
1331 if (cpu_vendor_id == CPU_VENDOR_AMD &&
1332 CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
1333 workaround_erratum383 = 1;
1334
1335 mca_banks = mcg_cap & MCG_CAP_COUNT;
1336 mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
1337 mtx_init(&mca_msg_buf_lock, "mca_msg_buf", NULL, MTX_SPIN);
1338 STAILQ_INIT(&mca_records);
1339 STAILQ_INIT(&mca_pending);
1340 mca_tq = taskqueue_create_fast("mca", M_WAITOK,
1341 taskqueue_thread_enqueue, &mca_tq);
1342 TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL);
1343 STAILQ_INIT(&mca_freelist);
1344 TASK_INIT(&mca_resize_task, 0, mca_resize, NULL);
1345 TASK_INIT(&mca_postscan_task, 0, mca_postscan, NULL);
1346 mca_resize_freelist();
1347 SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1348 "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
1349 "Record count");
1350 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1351 "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1352 &mca_maxcount, 0, sysctl_mca_maxcount, "I",
1353 "Maximum record count (-1 is unlimited)");
1354 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1355 "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1356 &mca_ticks, 0, sysctl_positive_int, "I",
1357 "Periodic interval in seconds to scan for machine checks");
1358 SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1359 "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records,
1360 "Machine check records");
1361 SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1362 "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1363 sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
1364 #ifdef DEV_APIC
1365 if (cmci_supported(mcg_cap))
1366 cmci_setup();
1367 else if (amd_thresholding_supported())
1368 amd_thresholding_setup();
1369 #endif
1370 }
1371
1372 #ifdef DEV_APIC
1373 /*
1374 * See if we should monitor CMCI for this bank. If CMCI_EN is already
1375 * set in MC_CTL2, then another CPU is responsible for this bank, so
1376 * ignore it. If CMCI_EN reads back as zero after being set, then this
1377 * bank does not support CMCI. If this CPU sets CMCI_EN, then it should
1378 * now monitor this bank.
1379 */
1380 static void
1381 cmci_monitor(int i)
1382 {
1383 struct cmc_state *cc;
1384 uint64_t ctl;
1385
1386 KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1387
1388 /*
1389 * It is possible for some APs to report CMCI support even if the BSP
1390 * does not, apparently due to a BIOS bug.
1391 */
1392 if (cmc_state == NULL) {
1393 if (bootverbose) {
1394 printf(
1395 "AP %d (%d,%d) reports CMCI support but the BSP does not\n",
1396 PCPU_GET(cpuid), PCPU_GET(apic_id),
1397 PCPU_GET(acpi_id));
1398 }
1399 return;
1400 }
1401
1402 ctl = rdmsr(MSR_MC_CTL2(i));
1403 if (ctl & MC_CTL2_CMCI_EN)
1404 /* Already monitored by another CPU. */
1405 return;
1406
1407 /* Set the threshold to one event for now. */
1408 ctl &= ~MC_CTL2_THRESHOLD;
1409 ctl |= MC_CTL2_CMCI_EN | 1;
1410 wrmsr(MSR_MC_CTL2(i), ctl);
1411 ctl = rdmsr(MSR_MC_CTL2(i));
1412 if (!(ctl & MC_CTL2_CMCI_EN))
1413 /* This bank does not support CMCI. */
1414 return;
1415
1416 cc = &cmc_state[PCPU_GET(cpuid)][i];
1417
1418 /* Determine maximum threshold. */
1419 ctl &= ~MC_CTL2_THRESHOLD;
1420 ctl |= 0x7fff;
1421 wrmsr(MSR_MC_CTL2(i), ctl);
1422 ctl = rdmsr(MSR_MC_CTL2(i));
1423 cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
1424
1425 /* Start off with a threshold of 1. */
1426 ctl &= ~MC_CTL2_THRESHOLD;
1427 ctl |= 1;
1428 wrmsr(MSR_MC_CTL2(i), ctl);
1429
1430 /* Mark this bank as monitored. */
1431 PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1432 }
1433
1434 /*
1435 * For resume, reset the threshold for any banks we monitor back to
1436 * one and throw away the timestamp of the last interrupt.
1437 */
1438 static void
1439 cmci_resume(int i)
1440 {
1441 struct cmc_state *cc;
1442 uint64_t ctl;
1443
1444 KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1445
1446 /* See cmci_monitor(). */
1447 if (cmc_state == NULL)
1448 return;
1449
1450 /* Ignore banks not monitored by this CPU. */
1451 if (!(PCPU_GET(cmci_mask) & 1 << i))
1452 return;
1453
1454 cc = &cmc_state[PCPU_GET(cpuid)][i];
1455 cc->last_intr = 0;
1456 ctl = rdmsr(MSR_MC_CTL2(i));
1457 ctl &= ~MC_CTL2_THRESHOLD;
1458 ctl |= MC_CTL2_CMCI_EN | 1;
1459 wrmsr(MSR_MC_CTL2(i), ctl);
1460 }
1461
1462 /*
1463 * Apply an AMD ET configuration to the corresponding MSR.
1464 */
1465 static void
1466 amd_thresholding_start(struct amd_et_state *cc, int bank)
1467 {
1468 uint64_t misc;
1469
1470 KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
1471
1472 misc = rdmsr(mca_msr_ops.misc(bank));
1473
1474 misc &= ~MC_MISC_AMD_INT_MASK;
1475 misc |= MC_MISC_AMD_INT_LVT;
1476
1477 misc &= ~MC_MISC_AMD_LVT_MASK;
1478 misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;
1479
1480 misc &= ~MC_MISC_AMD_CNT_MASK;
1481 misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1482 << MC_MISC_AMD_CNT_SHIFT;
1483
1484 misc &= ~MC_MISC_AMD_OVERFLOW;
1485 misc |= MC_MISC_AMD_CNTEN;
1486
1487 wrmsr(mca_msr_ops.misc(bank), misc);
1488 }
1489
1490 static void
1491 amd_thresholding_monitor(int i)
1492 {
1493 struct amd_et_state *cc;
1494 uint64_t misc;
1495
1496 /*
1497 * Kludge: On 10h, banks after 4 are not thresholding but also may have
1498 * bogus Valid bits. Skip them. This is definitely fixed in 15h, but
1499 * I have not investigated whether it is fixed in earlier models.
1500 */
1501 if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
1502 return;
1503
1504 /* The counter must be valid and present. */
1505 misc = rdmsr(mca_msr_ops.misc(i));
1506 if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
1507 (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
1508 return;
1509
1510 /* The register should not be locked. */
1511 if ((misc & MC_MISC_AMD_LOCK) != 0) {
1512 if (bootverbose)
1513 printf("%s: 0x%jx: Bank %d: locked\n", __func__,
1514 (uintmax_t)misc, i);
1515 return;
1516 }
1517
1518 /*
1519 * If counter is enabled then either the firmware or another CPU
1520 * has already claimed it.
1521 */
1522 if ((misc & MC_MISC_AMD_CNTEN) != 0) {
1523 if (bootverbose)
1524 printf("%s: 0x%jx: Bank %d: already enabled\n",
1525 __func__, (uintmax_t)misc, i);
1526 return;
1527 }
1528
1529 /*
1530 * Configure an Extended Interrupt LVT register for reporting
1531 * counter overflows if that feature is supported and the first
1532 * extended register is available.
1533 */
1534 amd_elvt = lapic_enable_mca_elvt();
1535 if (amd_elvt < 0) {
1536 printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
1537 __func__, i, amd_elvt);
1538 return;
1539 }
1540
1541 cc = &amd_et_state[PCPU_GET(cpuid)][i];
1542 cc->cur_threshold = 1;
1543 amd_thresholding_start(cc, i);
1544
1545 /* Mark this bank as monitored. */
1546 PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1547 }
1548
1549 static void
1550 amd_thresholding_resume(int i)
1551 {
1552 struct amd_et_state *cc;
1553
1554 KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1555
1556 /* Ignore banks not monitored by this CPU. */
1557 if (!(PCPU_GET(cmci_mask) & 1 << i))
1558 return;
1559
1560 cc = &amd_et_state[PCPU_GET(cpuid)][i];
1561 cc->last_intr = 0;
1562 cc->cur_threshold = 1;
1563 amd_thresholding_start(cc, i);
1564 }
1565 #endif
1566
1567 /*
1568 * Initializes per-CPU machine check registers and enables corrected
1569 * machine check interrupts.
1570 */
1571 static void
1572 _mca_init(int boot)
1573 {
1574 uint64_t mcg_cap;
1575 uint64_t ctl, mask;
1576 int i, skip, family;
1577
1578 family = CPUID_TO_FAMILY(cpu_id);
1579
1580 /* MCE is required. */
1581 if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1582 return;
1583
1584 if (cpu_feature & CPUID_MCA) {
1585 if (boot)
1586 PCPU_SET(cmci_mask, 0);
1587
1588 mcg_cap = rdmsr(MSR_MCG_CAP);
1589 if (mcg_cap & MCG_CAP_CTL_P)
1590 /* Enable MCA features. */
1591 wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1592 if (IS_BSP() && boot)
1593 mca_setup(mcg_cap);
1594
1595 /*
1596 * Disable logging of level one TLB parity (L1TP) errors by
1597 * the data cache as an alternative workaround for AMD Family
1598 * 10h Erratum 383. Unlike the recommended workaround, there
1599 * is no performance penalty to this workaround. However,
1600 * L1TP errors will go unreported.
1601 */
1602 if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
1603 !amd10h_L1TP) {
1604 mask = rdmsr(MSR_MC0_CTL_MASK);
1605 if ((mask & (1UL << 5)) == 0)
1606 wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1607 }
1608 if (amd_rascap & AMDRAS_SCALABLE_MCA) {
1609 mca_msr_ops.ctl = mca_smca_ctl_reg;
1610 mca_msr_ops.status = mca_smca_status_reg;
1611 mca_msr_ops.addr = mca_smca_addr_reg;
1612 mca_msr_ops.misc = mca_smca_misc_reg;
1613 }
1614
1615 /* Enable local MCE if supported. */
1616 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
1617 (mcg_cap & MCG_CAP_LMCE_P) &&
1618 (rdmsr(MSR_IA32_FEATURE_CONTROL) &
1619 IA32_FEATURE_CONTROL_LMCE_EN))
1620 wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1);
1621
1622 /*
1623 * The cmci_monitor() must not be executed
1624 * simultaneously by several CPUs.
1625 */
1626 if (boot)
1627 mtx_lock_spin(&mca_lock);
1628
1629 for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1630 /* By default enable logging of all errors. */
1631 ctl = 0xffffffffffffffffUL;
1632 skip = 0;
1633
1634 if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1635 /*
1636 * For P6 models before Nehalem MC0_CTL is
1637 * always enabled and reserved.
1638 */
1639 if (i == 0 && family == 0x6
1640 && CPUID_TO_MODEL(cpu_id) < 0x1a)
1641 skip = 1;
1642 } else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1643 /* BKDG for Family 10h: unset GartTblWkEn. */
1644 if (i == MC_AMDNB_BANK && family >= 0xf &&
1645 family < 0x17)
1646 ctl &= ~(1UL << 10);
1647 }
1648
1649 if (!skip)
1650 wrmsr(mca_msr_ops.ctl(i), ctl);
1651
1652 #ifdef DEV_APIC
1653 if (cmci_supported(mcg_cap)) {
1654 if (boot)
1655 cmci_monitor(i);
1656 else
1657 cmci_resume(i);
1658 } else if (amd_thresholding_supported()) {
1659 if (boot)
1660 amd_thresholding_monitor(i);
1661 else
1662 amd_thresholding_resume(i);
1663 }
1664 #endif
1665
1666 /* Clear all errors. */
1667 wrmsr(mca_msr_ops.status(i), 0);
1668 }
1669 if (boot)
1670 mtx_unlock_spin(&mca_lock);
1671
1672 #ifdef DEV_APIC
1673 if (cmci_supported(mcg_cap) &&
1674 PCPU_GET(cmci_mask) != 0 && boot)
1675 lapic_enable_cmc();
1676 #endif
1677 }
1678
1679 load_cr4(rcr4() | CR4_MCE);
1680 }
1681
1682 /* Must be executed on each CPU during boot. */
1683 void
1684 mca_init(void)
1685 {
1686
1687 _mca_init(1);
1688 }
1689
1690 /* Must be executed on each CPU during resume. */
1691 void
1692 mca_resume(void)
1693 {
1694
1695 _mca_init(0);
1696 }
1697
1698 /*
1699 * The machine check registers for the BSP cannot be initialized until
1700 * the local APIC is initialized. This happens at SI_SUB_CPU,
1701 * SI_ORDER_SECOND.
1702 */
1703 static void
1704 mca_init_bsp(void *arg __unused)
1705 {
1706
1707 mca_init();
1708 }
1709 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1710
1711 /* Called when a machine check exception fires. */
1712 void
1713 mca_intr(void)
1714 {
1715 uint64_t mcg_status;
1716 int count;
1717 bool lmcs, recoverable;
1718
1719 if (!(cpu_feature & CPUID_MCA)) {
1720 /*
1721 * Just print the values of the old Pentium registers
1722 * and panic.
1723 */
1724 printf("MC Type: 0x%jx Address: 0x%jx\n",
1725 (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1726 (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1727 panic("Machine check exception");
1728 }
1729
1730 /* Scan the banks and check for any non-recoverable errors. */
1731 mcg_status = rdmsr(MSR_MCG_STATUS);
1732 recoverable = (mcg_status & MCG_STATUS_RIPV) != 0;
1733 lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL ||
1734 (mcg_status & MCG_STATUS_LMCS));
1735 count = mca_scan(MCE, &recoverable);
1736
1737 if (!recoverable) {
1738 /*
1739 * Only panic if the error was detected local to this CPU.
1740 * Some errors will assert a machine check on all CPUs, but
1741 * only certain CPUs will find a valid bank to log.
1742 */
1743 while (!lmcs && count == 0)
1744 cpu_spinwait();
1745
1746 panic("Unrecoverable machine check exception");
1747 }
1748
1749 if (count)
1750 mca_process_records(MCE);
1751
1752 /* Clear MCIP. */
1753 wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1754 }
1755
1756 #ifdef DEV_APIC
1757 /* Called for a CMCI (correctable machine check interrupt). */
1758 void
1759 cmc_intr(void)
1760 {
1761 bool recoverable = true;
1762
1763 /*
1764 * Serialize MCA bank scanning to prevent collisions from
1765 * sibling threads.
1766 *
1767 * If we found anything, log them to the console.
1768 */
1769 if (mca_scan(CMCI, &recoverable) != 0)
1770 mca_process_records(CMCI);
1771 }
1772 #endif
1773