1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009 Hudson River Trading LLC
5  * Written by: John H. Baldwin <jhb@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * Support for x86 machine check architecture.
32  */
33 
34 #include <sys/cdefs.h>
35 #ifdef __amd64__
36 #define	DEV_APIC
37 #else
38 #include "opt_apic.h"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/bus.h>
43 #include <sys/interrupt.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/sbuf.h>
50 #include <sys/sched.h>
51 #include <sys/smp.h>
52 #include <sys/sysctl.h>
53 #include <sys/syslog.h>
54 #include <sys/systm.h>
55 #include <sys/taskqueue.h>
56 #include <machine/intr_machdep.h>
57 #include <x86/apicvar.h>
58 #include <machine/cpu.h>
59 #include <machine/cputypes.h>
60 #include <x86/mca.h>
61 #include <machine/md_var.h>
62 #include <machine/specialreg.h>
63 
/*
 * Modes for mca_scan().  The mode identifies the context the scan is
 * running in: the periodic poller, the machine check exception handler,
 * or the corrected machine check interrupt (CMCI) handler.
 */
enum scan_mode {
	POLLED,		/* Called from the periodic poller. */
	MCE,		/* Called from the MC exception handler. */
	CMCI,		/* Called from the CMCI handler. */
};
70 
71 #ifdef DEV_APIC
/*
 * State maintained for each monitored MCx bank to control the
 * corrected machine check interrupt threshold.
 */
struct cmc_state {
	int	max_threshold;	/* Upper bound handed to update_threshold(). */
	time_t	last_intr;	/* time_uptime of the last valid CMCI. */
};
80 
/*
 * Per-bank state used by amd_thresholding_update() to manage the AMD
 * error counter held in the bank's MISC register.
 */
struct amd_et_state {
	int	cur_threshold;	/* Threshold currently programmed. */
	time_t	last_intr;	/* time_uptime of the last valid interrupt. */
};
85 #endif
86 
/*
 * A machine check record together with the linkage needed to keep it
 * on one of the STAILQ lists (free list, pending list, record list).
 */
struct mca_internal {
	struct mca_record rec;			/* The captured event. */
	STAILQ_ENTRY(mca_internal) link;	/* List linkage. */
};
91 
/*
 * Table of functions mapping a bank index to the MSR number of that
 * bank's control, status, address and misc registers.
 */
struct mca_enumerator_ops {
        unsigned int (*ctl)(int);	/* MSR of the bank's CTL register. */
        unsigned int (*status)(int);	/* MSR of the bank's STATUS register. */
        unsigned int (*addr)(int);	/* MSR of the bank's ADDR register. */
        unsigned int (*misc)(int);	/* MSR of the bank's MISC register. */
};
98 
static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");

static volatile int mca_count;	/* Number of records stored. */
static int mca_banks;		/* Number of per-CPU register banks. */
static int mca_maxcount = -1;	/* Limit on records stored. (-1 = unlimited) */

/* Root of the hw.mca sysctl tree. */
static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Machine Check Architecture");

static int mca_enabled = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
    "Administrative toggle for machine check support");

/* When zero, mca_log() drops most corrected (non-UC) errors. */
static int log_corrected = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0,
    "Log corrected errors to the console");

static int amd10h_L1TP = 1;
SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
    "Administrative toggle for logging of level one TLB parity (L1TP) errors");

/* When zero, mca_mute() suppresses the spurious Haswell/Broadwell errors. */
static int intel6h_HSD131;
SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
    "Administrative toggle for logging of spurious corrected errors");

/* Not static: NOTE(review) presumably referenced from other files. */
int workaround_erratum383;
SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
    &workaround_erratum383, 0,
    "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");

#ifdef DIAGNOSTIC
/*
 * Test hooks: mca_check_status() consumes fake_status (and resets it
 * to zero) when the bank being checked equals fake_bank and the
 * hardware reports no valid error.
 */
static uint64_t fake_status;
SYSCTL_U64(_hw_mca, OID_AUTO, fake_status, CTLFLAG_RW,
    &fake_status, 0,
    "Insert artificial MCA with given status (testing purpose only)");
static int fake_bank;
SYSCTL_INT(_hw_mca, OID_AUTO, fake_bank, CTLFLAG_RW,
    &fake_bank, 0,
    "Bank to use for artificial MCAs (testing purpose only)");
#endif

/* When true, non-fatal events go through log(9) instead of printf(9). */
static bool mca_uselog = false;
SYSCTL_BOOL(_hw_mca, OID_AUTO, uselog, CTLFLAG_RWTUN, &mca_uselog, 0,
    "Should the system send non-fatal machine check errors to the log "
    "(instead of the console)?");

/* Preallocated records taken by mca_record_entry() when not polling. */
static STAILQ_HEAD(, mca_internal) mca_freelist;
static int mca_freecount;	/* Number of records on mca_freelist. */
/* Records returned to userland by sysctl_mca_records(). */
static STAILQ_HEAD(, mca_internal) mca_records;
/* Records queued by mca_record_entry(). */
static STAILQ_HEAD(, mca_internal) mca_pending;
static int mca_ticks = 300;	/* NOTE(review): presumably the poll interval; confirm units. */
static struct taskqueue *mca_tq;
static struct task mca_resize_task;
static struct task mca_postscan_task;
static struct timeout_task mca_scan_task;
/* Spin mutex guarding the record lists, their counters and mca_stats. */
static struct mtx mca_lock;
static bool mca_startup_done = false;

/* Static buffer to compose messages while in an interrupt context. */
static char mca_msg_buf[1024];
static struct mtx mca_msg_buf_lock;	/* Spin mutex guarding mca_msg_buf. */

/* Statistics on number of MCA events by type, updated with the mca_lock. */
static uint64_t mca_stats[MCA_T_COUNT];
SYSCTL_OPAQUE(_hw_mca, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_SKIP,
    mca_stats, MCA_T_COUNT * sizeof(mca_stats[0]),
    "S", "Array of MCA events by type");

/* Variables to track and control message rate limiting. */
static struct timeval mca_last_log_time;	/* ratecheck() state. */
static struct timeval mca_log_interval;		/* Only tv_sec is set via sysctl. */
static int mca_log_skipped;	/* Events dropped since the last logged one. */
171 
172 static int
sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS)173 sysctl_mca_log_interval(SYSCTL_HANDLER_ARGS)
174 {
175 	int error;
176 	u_int val;
177 
178 	val = mca_log_interval.tv_sec;
179 	error = sysctl_handle_int(oidp, &val, 0, req);
180 	if (error != 0 || req->newptr == NULL)
181 		return (error);
182 	mca_log_interval.tv_sec = val;
183 	return (0);
184 }
185 SYSCTL_PROC(_hw_mca, OID_AUTO, log_interval,
186     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_log_interval, 0,
187     sysctl_mca_log_interval, "IU",
188     "Minimum number of seconds between logging correctable MCAs"
189     " (0 = no limit)");
190 
/* Return the MSR number MSR_MC_CTL() yields for the given bank. */
static unsigned int
mca_ia32_ctl_reg(int bank)
{
	return (MSR_MC_CTL(bank));
}
196 
/* Return the MSR number MSR_MC_STATUS() yields for the given bank. */
static unsigned int
mca_ia32_status_reg(int bank)
{
	return (MSR_MC_STATUS(bank));
}
202 
/* Return the MSR number MSR_MC_ADDR() yields for the given bank. */
static unsigned int
mca_ia32_addr_reg(int bank)
{
	return (MSR_MC_ADDR(bank));
}
208 
/* Return the MSR number MSR_MC_MISC() yields for the given bank. */
static unsigned int
mca_ia32_misc_reg(int bank)
{
	return (MSR_MC_MISC(bank));
}
214 
/* Return the MSR number MSR_SMCA_MC_CTL() yields for the given bank. */
static unsigned int
mca_smca_ctl_reg(int bank)
{
        return (MSR_SMCA_MC_CTL(bank));
}
220 
/* Return the MSR number MSR_SMCA_MC_STATUS() yields for the given bank. */
static unsigned int
mca_smca_status_reg(int bank)
{
        return (MSR_SMCA_MC_STATUS(bank));
}
226 
/* Return the MSR number MSR_SMCA_MC_ADDR() yields for the given bank. */
static unsigned int
mca_smca_addr_reg(int bank)
{
        return (MSR_SMCA_MC_ADDR(bank));
}
232 
/* Return the MSR number MSR_SMCA_MC_MISC() yields for the given bank. */
static unsigned int
mca_smca_misc_reg(int bank)
{
        return (MSR_SMCA_MC_MISC(bank));
}
238 
/*
 * Bank register lookup table used for all bank MSR accesses in this
 * file.  It is initialized to the IA32 accessors.
 */
static struct mca_enumerator_ops mca_msr_ops = {
        .ctl    = mca_ia32_ctl_reg,
        .status = mca_ia32_status_reg,
        .addr   = mca_ia32_addr_reg,
        .misc   = mca_ia32_misc_reg
};
245 
246 #ifdef DEV_APIC
static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
static struct amd_et_state **amd_et_state;	/* Indexed by cpuid, bank. */
static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */

/* NOTE(review): presumably the AMD extended LVT index, -1 if unset; confirm. */
static int amd_elvt = -1;
252 
253 static inline bool
amd_thresholding_supported(void)254 amd_thresholding_supported(void)
255 {
256 	if (cpu_vendor_id != CPU_VENDOR_AMD &&
257 	    cpu_vendor_id != CPU_VENDOR_HYGON)
258 		return (false);
259 	/*
260 	 * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F).
261 	 *
262 	 * It begins to be documented in family 0x15 model 30 and family 0x16,
263 	 * but neither of these families documents the ScalableMca bit, which
264 	 * supposedly defines the presence of this feature on family 0x17.
265 	 */
266 	if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16)
267 		return (true);
268 	if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
269 		return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0);
270 	return (false);
271 }
272 #endif
273 
274 static inline bool
cmci_supported(uint64_t mcg_cap)275 cmci_supported(uint64_t mcg_cap)
276 {
277 	/*
278 	 * MCG_CAP_CMCI_P bit is reserved in AMD documentation.  Until
279 	 * it is defined, do not use it to check for CMCI support.
280 	 */
281 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
282 		return (false);
283 	return ((mcg_cap & MCG_CAP_CMCI_P) != 0);
284 }
285 
286 static inline bool
tes_supported(uint64_t mcg_cap)287 tes_supported(uint64_t mcg_cap)
288 {
289 
290 	/*
291 	 * MCG_CAP_TES_P bit is reserved in AMD documentation.  Until
292 	 * it is defined, do not use it to check for TES support.
293 	 */
294 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
295 		return (false);
296 	return ((mcg_cap & MCG_CAP_TES_P) != 0);
297 }
298 
299 static inline bool
ser_supported(uint64_t mcg_cap)300 ser_supported(uint64_t mcg_cap)
301 {
302 
303 	return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0);
304 }
305 
306 static int
sysctl_positive_int(SYSCTL_HANDLER_ARGS)307 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
308 {
309 	int error, value;
310 
311 	value = *(int *)arg1;
312 	error = sysctl_handle_int(oidp, &value, 0, req);
313 	if (error || req->newptr == NULL)
314 		return (error);
315 	if (value <= 0)
316 		return (EINVAL);
317 	*(int *)arg1 = value;
318 	return (0);
319 }
320 
321 static int
sysctl_mca_records(SYSCTL_HANDLER_ARGS)322 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
323 {
324 	int *name = (int *)arg1;
325 	u_int namelen = arg2;
326 	struct mca_record record;
327 	struct mca_internal *rec;
328 	int i;
329 
330 	if (namelen != 1)
331 		return (EINVAL);
332 
333 	if (name[0] < 0 || name[0] >= mca_count)
334 		return (EINVAL);
335 
336 	mtx_lock_spin(&mca_lock);
337 	if (name[0] >= mca_count) {
338 		mtx_unlock_spin(&mca_lock);
339 		return (EINVAL);
340 	}
341 	i = 0;
342 	STAILQ_FOREACH(rec, &mca_records, link) {
343 		if (i == name[0]) {
344 			record = rec->rec;
345 			break;
346 		}
347 		i++;
348 	}
349 	mtx_unlock_spin(&mca_lock);
350 	return (SYSCTL_OUT(req, &record, sizeof(record)));
351 }
352 
/*
 * Decode the two-bit transaction type field (bits 3:2) of an MCA error
 * code into its short name: "I", "D", "G", or "?" for the remaining
 * encoding.
 */
static const char *
mca_error_ttype(uint16_t mca_error)
{
	static const char *const ttype_names[4] = { "I", "D", "G", "?" };

	return (ttype_names[(mca_error >> 2) & 0x3]);
}
367 
/*
 * Decode the two-bit level field (bits 1:0) of an MCA error code into
 * its short name: "L0", "L1", "L2" or "LG".
 */
static const char *
mca_error_level(uint16_t mca_error)
{
	static const char *const level_names[4] = { "L0", "L1", "L2", "LG" };

	/* The mask keeps the index within the four-entry table. */
	return (level_names[mca_error & 0x0003]);
}
384 
/*
 * Decode the four-bit request field (bits 7:4) of an MCA error code
 * into its short name.  Encodings above 0x8 yield "???".
 */
static const char *
mca_error_request(uint16_t mca_error)
{
	static const char *const request_names[] = {
		"ERR",		/* 0x0 */
		"RD",		/* 0x1 */
		"WR",		/* 0x2 */
		"DRD",		/* 0x3 */
		"DWR",		/* 0x4 */
		"IRD",		/* 0x5 */
		"PREFETCH",	/* 0x6 */
		"EVICT",	/* 0x7 */
		"SNOOP",	/* 0x8 */
	};
	unsigned int req;

	req = (mca_error >> 4) & 0xf;
	if (req >= sizeof(request_names) / sizeof(request_names[0]))
		return ("???");
	return (request_names[req]);
}
411 
412 static const char *
mca_error_mmtype(uint16_t mca_error,enum mca_stat_types * event_type)413 mca_error_mmtype(uint16_t mca_error, enum mca_stat_types *event_type)
414 {
415 
416 	switch ((mca_error & 0x70) >> 4) {
417 	case 0x0:
418 		*event_type = MCA_T_MEMCONTROLLER_GEN;
419 		return ("GEN");
420 	case 0x1:
421 		*event_type = MCA_T_MEMCONTROLLER_RD;
422 		return ("RD");
423 	case 0x2:
424 		*event_type = MCA_T_MEMCONTROLLER_WR;
425 		return ("WR");
426 	case 0x3:
427 		*event_type = MCA_T_MEMCONTROLLER_AC;
428 		return ("AC");
429 	case 0x4:
430 		*event_type = MCA_T_MEMCONTROLLER_MS;
431 		return ("MS");
432 	}
433 	*event_type = MCA_T_MEMCONTROLLER_OTHER;
434 	return ("???");
435 }
436 
437 static const char *
mca_addres_mode(uint64_t mca_misc)438 mca_addres_mode(uint64_t mca_misc)
439 {
440 
441 	switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) {
442 	case 0x0:
443 		return ("Segment Offset");
444 	case 0x1:
445 		return ("Linear Address");
446 	case 0x2:
447 		return ("Physical Address");
448 	case 0x3:
449 		return ("Memory Address");
450 	case 0x7:
451 		return ("Generic");
452 	}
453 	return ("???");
454 }
455 
456 static int
mca_mute(const struct mca_record * rec)457 mca_mute(const struct mca_record *rec)
458 {
459 
460 	/*
461 	 * Skip spurious corrected parity errors generated by Intel Haswell-
462 	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
463 	 * erratum respectively), unless reporting is enabled.
464 	 * Note that these errors also have been observed with the D0-stepping
465 	 * of Haswell, while at least initially the CPU specification updates
466 	 * suggested only the C0-stepping to be affected.  Similarly, Celeron
467 	 * 2955U with a CPU ID of 0x45 apparently are also concerned with the
468 	 * same problem, with HSM142 only referring to 0x3c and 0x46.
469 	 */
470 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
471 	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
472 	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
473 	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
474 	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
475 	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
476 	    rec->mr_bank == 0 &&
477 	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
478 	    !intel6h_HSD131)
479 	    	return (1);
480 
481 	return (0);
482 }
483 
/*
 * Dump details about a single machine check.
 *
 * 'mode' is the context of the caller, 'rec' the record to describe and
 * 'fatal' forces the message to the console and bypasses rate limiting.
 *
 * Records rejected by mca_mute() are dropped, as are corrected errors
 * when log_corrected is off (unless TES is supported and the record's
 * TES status is 0x2, the value printed as "Yellow" below).  The text is
 * composed in an sbuf and then emitted with printf() or log() depending
 * on 'fatal' and mca_uselog.  Non-fatal corrected errors additionally
 * update mca_stats and are subject to the mca_log_interval rate limit.
 */
static void
mca_log(enum scan_mode mode, const struct mca_record *rec, bool fatal)
{
	int error, numskipped;
	uint16_t mca_error;
	enum mca_stat_types event_type;
	struct sbuf sb;
	bool uncor, using_shared_buf;

	if (mca_mute(rec))
		return;

	uncor = (rec->mr_status & MC_STATUS_UC) != 0;

	if (!log_corrected && !uncor && (!tes_supported(rec->mr_mcg_cap) ||
	    ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2))
		return;

	/* Try to use an allocated buffer when not in an interrupt context. */
	if (mode == POLLED && sbuf_new(&sb, NULL, 512, SBUF_AUTOEXTEND) != NULL)
		using_shared_buf = false;
	else {
		/*
		 * Fall back to the static buffer; its spin lock stays
		 * held until the 'done' label at the end.
		 */
		using_shared_buf = true;
		mtx_lock_spin(&mca_msg_buf_lock);
		sbuf_new(&sb, mca_msg_buf, sizeof(mca_msg_buf), SBUF_FIXEDLEN);
	}

	sbuf_printf(&sb, "MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
	    (long long)rec->mr_status);
	sbuf_printf(&sb, "MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
	sbuf_printf(&sb, "MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n",
	    cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id);
	sbuf_printf(&sb, "MCA: CPU %d ", rec->mr_cpu);
	/* Status flag summary. */
	if (rec->mr_status & MC_STATUS_UC)
		sbuf_printf(&sb, "UNCOR ");
	else {
		sbuf_printf(&sb, "COR ");
		/* The corrected error count is only printed with CMCI. */
		if (cmci_supported(rec->mr_mcg_cap))
			sbuf_printf(&sb, "(%lld) ", ((long long)rec->mr_status &
			    MC_STATUS_COR_COUNT) >> 38);
		if (tes_supported(rec->mr_mcg_cap)) {
			switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
			case 0x1:
				sbuf_printf(&sb, "(Green) ");
				break;
			case 0x2:
				sbuf_printf(&sb, "(Yellow) ");
				break;
			}
		}
	}
	if (rec->mr_status & MC_STATUS_EN)
		sbuf_printf(&sb, "EN ");
	if (rec->mr_status & MC_STATUS_PCC)
		sbuf_printf(&sb, "PCC ");
	if (ser_supported(rec->mr_mcg_cap)) {
		if (rec->mr_status & MC_STATUS_S)
			sbuf_printf(&sb, "S ");
		if (rec->mr_status & MC_STATUS_AR)
			sbuf_printf(&sb, "AR ");
	}
	if (rec->mr_status & MC_STATUS_OVER)
		sbuf_printf(&sb, "OVER ");
	/*
	 * Decode the MCA error code.  event_type starts out as the
	 * out-of-range value MCA_T_COUNT so that a decode path which
	 * forgets to set it is caught by the check further down.
	 */
	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
	event_type = MCA_T_COUNT;
	switch (mca_error) {
		/* Simple error codes. */
	case 0x0000:
		sbuf_printf(&sb, "no error");
		event_type = MCA_T_NONE;
		break;
	case 0x0001:
		sbuf_printf(&sb, "unclassified error");
		event_type = MCA_T_UNCLASSIFIED;
		break;
	case 0x0002:
		sbuf_printf(&sb, "ucode ROM parity error");
		event_type = MCA_T_UCODE_ROM_PARITY;
		break;
	case 0x0003:
		sbuf_printf(&sb, "external error");
		event_type = MCA_T_EXTERNAL;
		break;
	case 0x0004:
		sbuf_printf(&sb, "FRC error");
		event_type = MCA_T_FRC;
		break;
	case 0x0005:
		sbuf_printf(&sb, "internal parity error");
		event_type = MCA_T_INTERNAL_PARITY;
		break;
	case 0x0006:
		sbuf_printf(&sb, "SMM handler code access violation");
		event_type = MCA_T_SMM_HANDLER;
		break;
	case 0x0400:
		sbuf_printf(&sb, "internal timer error");
		event_type = MCA_T_INTERNAL_TIMER;
		break;
	case 0x0e0b:
		sbuf_printf(&sb, "generic I/O error");
		event_type = MCA_T_GENERIC_IO;
		/* On Intel, a valid MISC value carries the PCI location. */
		if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL &&
		    (rec->mr_status & MC_STATUS_MISCV)) {
			sbuf_printf(&sb, " (pci%d:%d:%d:%d)",
			    (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32),
			    (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24),
			    (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19),
			    (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16));
		}
		break;
	default:
		if ((mca_error & 0xfc00) == 0x0400) {
			sbuf_printf(&sb, "internal error %x",
			    mca_error & 0x03ff);
			event_type = MCA_T_INTERNAL;
			break;
		}

		/* Compound error codes. */

		/* Memory hierarchy error. */
		if ((mca_error & 0xeffc) == 0x000c) {
			sbuf_printf(&sb, "%s memory error",
			    mca_error_level(mca_error));
			event_type = MCA_T_MEMORY;
			break;
		}

		/* TLB error. */
		if ((mca_error & 0xeff0) == 0x0010) {
			sbuf_printf(&sb, "%sTLB %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error));
			event_type = MCA_T_TLB;
			break;
		}

		/* Memory controller error. */
		if ((mca_error & 0xef80) == 0x0080) {
			/* mca_error_mmtype() also sets event_type. */
			sbuf_printf(&sb, "%s channel ",
			    mca_error_mmtype(mca_error, &event_type));
			if ((mca_error & 0x000f) != 0x000f)
				sbuf_printf(&sb, "%d", mca_error & 0x000f);
			else
				sbuf_printf(&sb, "??");
			sbuf_printf(&sb, " memory error");
			break;
		}

		/* Cache error. */
		if ((mca_error & 0xef00) == 0x0100) {
			sbuf_printf(&sb, "%sCACHE %s %s error",
			    mca_error_ttype(mca_error),
			    mca_error_level(mca_error),
			    mca_error_request(mca_error));
			event_type = MCA_T_CACHE;
			break;
		}

		/* Extended memory error. */
		if ((mca_error & 0xef80) == 0x0280) {
			/* mca_error_mmtype() also sets event_type. */
			sbuf_printf(&sb, "%s channel ",
			    mca_error_mmtype(mca_error, &event_type));
			if ((mca_error & 0x000f) != 0x000f)
				sbuf_printf(&sb, "%d", mca_error & 0x000f);
			else
				sbuf_printf(&sb, "??");
			sbuf_printf(&sb, " extended memory error");
			break;
		}

		/* Bus and/or Interconnect error. */
		if ((mca_error & 0xe800) == 0x0800) {
			sbuf_printf(&sb, "BUS%s ", mca_error_level(mca_error));
			event_type = MCA_T_BUS;
			/* Participation field (bits 10:9). */
			switch ((mca_error & 0x0600) >> 9) {
			case 0:
				sbuf_printf(&sb, "Source");
				break;
			case 1:
				sbuf_printf(&sb, "Responder");
				break;
			case 2:
				sbuf_printf(&sb, "Observer");
				break;
			default:
				sbuf_printf(&sb, "???");
				break;
			}
			sbuf_printf(&sb, " %s ", mca_error_request(mca_error));
			/* Target field (bits 3:2). */
			switch ((mca_error & 0x000c) >> 2) {
			case 0:
				sbuf_printf(&sb, "Memory");
				break;
			case 2:
				sbuf_printf(&sb, "I/O");
				break;
			case 3:
				sbuf_printf(&sb, "Other");
				break;
			default:
				sbuf_printf(&sb, "???");
				break;
			}
			if (mca_error & 0x0100)
				sbuf_printf(&sb, " timed out");
			break;
		}

		sbuf_printf(&sb, "unknown error %x", mca_error);
		event_type = MCA_T_UNKNOWN;
		break;
	}
	sbuf_printf(&sb, "\n");
	if (rec->mr_status & MC_STATUS_ADDRV) {
		sbuf_printf(&sb, "MCA: Address 0x%llx",
		    (long long)rec->mr_addr);
		if (ser_supported(rec->mr_mcg_cap) &&
		    (rec->mr_status & MC_STATUS_MISCV)) {
			sbuf_printf(&sb, " (Mode: %s, LSB: %d)",
			    mca_addres_mode(rec->mr_misc),
			    (int)(rec->mr_misc & MC_MISC_RA_LSB));
		}
		sbuf_printf(&sb, "\n");
	}
	if (rec->mr_status & MC_STATUS_MISCV)
		sbuf_printf(&sb, "MCA: Misc 0x%llx\n", (long long)rec->mr_misc);

	/* event_type indexes mca_stats[] below, so it must be in range. */
	if (event_type < 0 || event_type >= MCA_T_COUNT) {
		KASSERT(0, ("%s: invalid event type (%d)", __func__,
		    event_type));
		event_type = MCA_T_UNKNOWN;
	}
	numskipped = 0;
	if (!fatal && !uncor) {
		/*
		 * Update statistics and check the rate limit for
		 * correctable errors. The rate limit is only applied
		 * after the system records a reasonable number of errors
		 * of the same type. The goal is to reduce the impact of
		 * the system seeing and attempting to log a burst of
		 * similar errors, which (especially when printed to the
		 * console) can be expensive.
		 */
		mtx_lock_spin(&mca_lock);
		mca_stats[event_type]++;
		if (mca_log_interval.tv_sec > 0 && mca_stats[event_type] > 50 &&
		    ratecheck(&mca_last_log_time, &mca_log_interval) == 0) {
			/* Rate limited: count it and discard the message. */
			mca_log_skipped++;
			mtx_unlock_spin(&mca_lock);
			goto done;
		}
		/* Report and reset the number of messages dropped so far. */
		numskipped = mca_log_skipped;
		mca_log_skipped = 0;
		mtx_unlock_spin(&mca_lock);
	}

	error = sbuf_finish(&sb);
	if (fatal || !mca_uselog) {
		/* Console output. */
		if (numskipped > 0)
			printf("MCA: %d events skipped due to rate limit\n",
			    numskipped);
		if (error)
			printf("MCA: error logging message (sbuf error %d)\n",
			    error);
		else
			sbuf_putbuf(&sb);
	} else {
		/* System log output. */
		if (numskipped > 0)
			log(LOG_ERR,
			    "MCA: %d events skipped due to rate limit\n",
			    numskipped);
		if (error)
			log(LOG_ERR,
			    "MCA: error logging message (sbuf error %d)\n",
			    error);
		else
			log(uncor ? LOG_CRIT : LOG_ERR, "%s", sbuf_data(&sb));
	}

done:
	sbuf_delete(&sb);
	if (using_shared_buf)
		mtx_unlock_spin(&mca_msg_buf_lock);
}
772 
773 static bool
mca_is_mce(uint64_t mcg_cap,uint64_t status,bool * recoverablep)774 mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep)
775 {
776 
777 	/* Corrected error. */
778 	if ((status & MC_STATUS_UC) == 0)
779 		return (0);
780 
781 	/* Spurious MCA error. */
782 	if ((status & MC_STATUS_EN) == 0)
783 		return (0);
784 
785 	/* The processor does not support software error recovery. */
786 	if (!ser_supported(mcg_cap)) {
787 		*recoverablep = false;
788 		return (1);
789 	}
790 
791 	/* Context might have been corrupted. */
792 	if (status & MC_STATUS_PCC) {
793 		*recoverablep = false;
794 		return (1);
795 	}
796 
797 	/* Uncorrected software recoverable. */
798 	if (status & MC_STATUS_S) {
799 		/* Action required vs optional. */
800 		if (status & MC_STATUS_AR)
801 			*recoverablep = false;
802 		return (1);
803 	}
804 
805 	/* Uncorrected no action required. */
806 	return (0);
807 }
808 
/*
 * Check one bank of the current CPU for a machine check event.
 *
 * Returns 0 when the bank holds no valid event, or when the event's
 * classification by mca_is_mce() does not match 'mode' (exception
 * events are only accepted in MCE mode, all others only outside of
 * it).  Otherwise fills in *rec, updates *recoverablep and returns 1.
 *
 * *recoverablep is only written when an event is accepted.
 */
static int
mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank,
    struct mca_record *rec, bool *recoverablep)
{
	uint64_t status;
	u_int p[4];
	bool mce, recover;

	status = rdmsr(mca_msr_ops.status(bank));
	if (!(status & MC_STATUS_VAL)) {
#ifdef DIAGNOSTIC
		/*
		 * Check if we have a pending artificial event to generate.
		 * Note that this is potentially racy with the sysctl. The
		 * tradeoff is deemed acceptable given the test nature
		 * of the code.
		 */
		if (fake_status && bank == fake_bank) {
			status = fake_status;
			fake_status = 0;
		}
		if (!(status & MC_STATUS_VAL))
			return (0);
#else
		return (0);
#endif
	}

	/* Work on a copy so a rejected event leaves *recoverablep alone. */
	recover = *recoverablep;
	mce = mca_is_mce(mcg_cap, status, &recover);
	if (mce != (mode == MCE))
		return (0);
	*recoverablep = recover;

	/* Save exception information. */
	rec->mr_status = status;
	rec->mr_bank = bank;
	/* ADDR and MISC are only read when the status marks them valid. */
	rec->mr_addr = 0;
	if (status & MC_STATUS_ADDRV)
		rec->mr_addr = rdmsr(mca_msr_ops.addr(bank));
	rec->mr_misc = 0;
	if (status & MC_STATUS_MISCV)
		rec->mr_misc = rdmsr(mca_msr_ops.misc(bank));
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);
	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
	rec->mr_cpu_id = cpu_id;
	rec->mr_cpu_vendor_id = cpu_vendor_id;
	rec->mr_cpu = PCPU_GET(cpuid);

	/*
	 * Clear machine check.  Don't do this for uncorrectable
	 * errors so that the BIOS can see them.
	 */
	if (!mce || recover) {
		wrmsr(mca_msr_ops.status(bank), 0);
		/* NOTE(review): presumably issued to serialize after the write; confirm. */
		do_cpuid(0, p);
	}
	return (1);
}
870 
/*
 * Grow or shrink mca_freelist so that mca_freecount lies between
 * max(mp_ncpus, mca_banks) and twice that value.
 *
 * Allocation and freeing use M_WAITOK malloc()/free() and are done
 * with mca_lock dropped; only the list manipulation itself runs under
 * the spin lock.
 */
static void
mca_resize_freelist(void)
{
	struct mca_internal *next, *rec;
	STAILQ_HEAD(, mca_internal) tmplist;
	int count, i, desired_max, desired_min;

	/*
	 * Ensure we have at least one record for each bank and one
	 * record per CPU, but no more than twice that amount.
	 */
	desired_min = imax(mp_ncpus, mca_banks);
	desired_max = imax(mp_ncpus, mca_banks) * 2;
	STAILQ_INIT(&tmplist);
	mtx_lock_spin(&mca_lock);
	/* Shrink: move surplus records onto tmplist to free them later. */
	while (mca_freecount > desired_max) {
		rec = STAILQ_FIRST(&mca_freelist);
		KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty",
		    mca_freecount));
		STAILQ_REMOVE_HEAD(&mca_freelist, link);
		mca_freecount--;
		STAILQ_INSERT_TAIL(&tmplist, rec, link);
	}
	/*
	 * Grow: the lock is dropped while allocating, so the shortfall
	 * is re-evaluated on every pass in case records were consumed
	 * in the meantime.
	 */
	while (mca_freecount < desired_min) {
		count = desired_min - mca_freecount;
		mtx_unlock_spin(&mca_lock);
		for (i = 0; i < count; i++) {
			rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
			STAILQ_INSERT_TAIL(&tmplist, rec, link);
		}
		mtx_lock_spin(&mca_lock);
		/* STAILQ_CONCAT leaves tmplist empty again. */
		STAILQ_CONCAT(&mca_freelist, &tmplist);
		mca_freecount += count;
	}
	mtx_unlock_spin(&mca_lock);
	/* Release whatever the shrink loop removed. */
	STAILQ_FOREACH_SAFE(rec, &tmplist, link, next)
		free(rec, M_MCA);
}
909 
/*
 * Task-style wrapper around mca_resize_freelist(); both arguments are
 * unused.
 */
static void
mca_resize(void *context, int pending)
{

	mca_resize_freelist();
}
916 
917 static void
mca_record_entry(enum scan_mode mode,const struct mca_record * record)918 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
919 {
920 	struct mca_internal *rec;
921 
922 	if (mode == POLLED) {
923 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
924 		mtx_lock_spin(&mca_lock);
925 	} else {
926 		mtx_lock_spin(&mca_lock);
927 		rec = STAILQ_FIRST(&mca_freelist);
928 		if (rec == NULL) {
929 			mtx_unlock_spin(&mca_lock);
930 			printf("MCA: Unable to allocate space for an event.\n");
931 			mca_log(mode, record, false);
932 			return;
933 		}
934 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
935 		mca_freecount--;
936 	}
937 
938 	rec->rec = *record;
939 	STAILQ_INSERT_TAIL(&mca_pending, rec, link);
940 	mtx_unlock_spin(&mca_lock);
941 }
942 
943 #ifdef DEV_APIC
944 /*
945  * Update the interrupt threshold for a CMCI.  The strategy is to use
946  * a low trigger that interrupts as soon as the first event occurs.
947  * However, if a steady stream of events arrive, the threshold is
948  * increased until the interrupts are throttled to once every
949  * cmc_throttle seconds or the periodic scan.  If a periodic scan
950  * finds that the threshold is too high, it is lowered.
951  */
952 static int
update_threshold(enum scan_mode mode,int valid,int last_intr,int count,int cur_threshold,int max_threshold)953 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
954     int cur_threshold, int max_threshold)
955 {
956 	u_int delta;
957 	int limit;
958 
959 	delta = (u_int)(time_uptime - last_intr);
960 	limit = cur_threshold;
961 
962 	/*
963 	 * If an interrupt was received less than cmc_throttle seconds
964 	 * since the previous interrupt and the count from the current
965 	 * event is greater than or equal to the current threshold,
966 	 * double the threshold up to the max.
967 	 */
968 	if (mode == CMCI && valid) {
969 		if (delta < cmc_throttle && count >= limit &&
970 		    limit < max_threshold) {
971 			limit = min(limit << 1, max_threshold);
972 		}
973 		return (limit);
974 	}
975 
976 	/*
977 	 * When the banks are polled, check to see if the threshold
978 	 * should be lowered.
979 	 */
980 	if (mode != POLLED)
981 		return (limit);
982 
983 	/* If a CMCI occurred recently, do nothing for now. */
984 	if (delta < cmc_throttle)
985 		return (limit);
986 
987 	/*
988 	 * Compute a new limit based on the average rate of events per
989 	 * cmc_throttle seconds since the last interrupt.
990 	 */
991 	if (valid) {
992 		limit = count * cmc_throttle / delta;
993 		if (limit <= 0)
994 			limit = 1;
995 		else if (limit > max_threshold)
996 			limit = max_threshold;
997 	} else {
998 		limit = 1;
999 	}
1000 	return (limit);
1001 }
1002 
1003 static void
cmci_update(enum scan_mode mode,int bank,int valid,struct mca_record * rec)1004 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
1005 {
1006 	struct cmc_state *cc;
1007 	uint64_t ctl;
1008 	int cur_threshold, new_threshold;
1009 	int count;
1010 
1011 	/* Fetch the current limit for this bank. */
1012 	cc = &cmc_state[PCPU_GET(cpuid)][bank];
1013 	ctl = rdmsr(MSR_MC_CTL2(bank));
1014 	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
1015 	cur_threshold = ctl & MC_CTL2_THRESHOLD;
1016 
1017 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
1018 	    cur_threshold, cc->max_threshold);
1019 
1020 	if (mode == CMCI && valid)
1021 		cc->last_intr = time_uptime;
1022 	if (new_threshold != cur_threshold) {
1023 		ctl &= ~MC_CTL2_THRESHOLD;
1024 		ctl |= new_threshold;
1025 		wrmsr(MSR_MC_CTL2(bank), ctl);
1026 	}
1027 }
1028 
1029 static void
amd_thresholding_update(enum scan_mode mode,int bank,int valid)1030 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
1031 {
1032 	struct amd_et_state *cc;
1033 	uint64_t misc;
1034 	int new_threshold;
1035 	int count;
1036 
1037 	cc = &amd_et_state[PCPU_GET(cpuid)][bank];
1038 	misc = rdmsr(mca_msr_ops.misc(bank));
1039 	count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT;
1040 	count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold);
1041 
1042 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
1043 	    cc->cur_threshold, MC_MISC_AMD_CNT_MAX);
1044 
1045 	cc->cur_threshold = new_threshold;
1046 	misc &= ~MC_MISC_AMD_CNT_MASK;
1047 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1048 	    << MC_MISC_AMD_CNT_SHIFT;
1049 	misc &= ~MC_MISC_AMD_OVERFLOW;
1050 	wrmsr(mca_msr_ops.misc(bank), misc);
1051 	if (mode == CMCI && valid)
1052 		cc->last_intr = time_uptime;
1053 }
1054 #endif
1055 
1056 /*
1057  * This scans all the machine check banks of the current CPU to see if
1058  * there are any machine checks.  Any non-recoverable errors are
1059  * reported immediately via mca_log().  The current thread must be
1060  * pinned when this is called.  The 'mode' parameter indicates if we
1061  * are being called from the MC exception handler, the CMCI handler,
1062  * or the periodic poller.
1063  */
1064 static int
mca_scan(enum scan_mode mode,bool * recoverablep)1065 mca_scan(enum scan_mode mode, bool *recoverablep)
1066 {
1067 	struct mca_record rec;
1068 	uint64_t mcg_cap;
1069 	int count = 0, i, valid;
1070 
1071 	mcg_cap = rdmsr(MSR_MCG_CAP);
1072 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1073 #ifdef DEV_APIC
1074 		/*
1075 		 * For a CMCI, only check banks this CPU is
1076 		 * responsible for.
1077 		 */
1078 		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
1079 			continue;
1080 #endif
1081 
1082 		valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep);
1083 		if (valid) {
1084 			count++;
1085 			if (*recoverablep)
1086 				mca_record_entry(mode, &rec);
1087 			else
1088 				mca_log(mode, &rec, true);
1089 		}
1090 
1091 #ifdef DEV_APIC
1092 		/*
1093 		 * If this is a bank this CPU monitors via CMCI,
1094 		 * update the threshold.
1095 		 */
1096 		if (PCPU_GET(cmci_mask) & 1 << i) {
1097 			if (cmc_state != NULL)
1098 				cmci_update(mode, i, valid, &rec);
1099 			else
1100 				amd_thresholding_update(mode, i, valid);
1101 		}
1102 #endif
1103 	}
1104 	return (count);
1105 }
1106 
1107 /*
1108  * Store a new record on the mca_records list while enforcing
1109  * mca_maxcount.
1110  */
1111 static void
mca_store_record(struct mca_internal * mca)1112 mca_store_record(struct mca_internal *mca)
1113 {
1114 
1115 	/*
1116 	 * If we are storing no records (mca_maxcount == 0),
1117 	 * we just free this record.
1118 	 *
1119 	 * If we are storing records (mca_maxcount != 0) and
1120 	 * we have free space on the list, store the record
1121 	 * and increment mca_count.
1122 	 *
1123 	 * If we are storing records and we do not have free
1124 	 * space on the list, store the new record at the
1125 	 * tail and free the oldest one from the head.
1126 	 */
1127 	if (mca_maxcount != 0)
1128 		STAILQ_INSERT_TAIL(&mca_records, mca, link);
1129 	if (mca_maxcount < 0 || mca_count < mca_maxcount)
1130 		mca_count++;
1131 	else {
1132 		if (mca_maxcount != 0) {
1133 			mca = STAILQ_FIRST(&mca_records);
1134 			STAILQ_REMOVE_HEAD(&mca_records, link);
1135 		}
1136 		STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1137 		mca_freecount++;
1138 	}
1139 }
1140 
1141 /*
1142  * Do the work to process machine check records which have just been
1143  * gathered. Print any pending logs to the console. Queue them for storage.
1144  * Trigger a resizing of the free list.
1145  */
1146 static void
mca_process_records(enum scan_mode mode)1147 mca_process_records(enum scan_mode mode)
1148 {
1149 	struct mca_internal *mca;
1150 	STAILQ_HEAD(, mca_internal) tmplist;
1151 
1152 	/*
1153 	 * If in an interrupt context, defer the post-scan activities to a
1154 	 * task queue.
1155 	 */
1156 	if (mode != POLLED) {
1157 		if (mca_startup_done)
1158 			taskqueue_enqueue(mca_tq, &mca_postscan_task);
1159 		return;
1160 	}
1161 
1162 	/*
1163 	 * Copy the pending list to the stack so we can drop the spin lock
1164 	 * while we are emitting logs.
1165 	 */
1166 	STAILQ_INIT(&tmplist);
1167 	mtx_lock_spin(&mca_lock);
1168 	STAILQ_SWAP(&mca_pending, &tmplist, mca_internal);
1169 	mtx_unlock_spin(&mca_lock);
1170 
1171 	STAILQ_FOREACH(mca, &tmplist, link)
1172 		mca_log(mode, &mca->rec, false);
1173 
1174 	mtx_lock_spin(&mca_lock);
1175 	while ((mca = STAILQ_FIRST(&tmplist)) != NULL) {
1176 		STAILQ_REMOVE_HEAD(&tmplist, link);
1177 		mca_store_record(mca);
1178 	}
1179 	mtx_unlock_spin(&mca_lock);
1180 	mca_resize_freelist();
1181 }
1182 
1183 /*
1184  * Emit log entries and resize the free list. This is intended to be called
1185  * from a task queue to handle work which does not need to be done (or cannot
1186  * be done) in an interrupt context.
1187  */
1188 static void
mca_postscan(void * context __unused,int pending __unused)1189 mca_postscan(void *context __unused, int pending __unused)
1190 {
1191 
1192 	mca_process_records(POLLED);
1193 }
1194 
1195 /*
1196  * Scan the machine check banks on all CPUs by binding to each CPU in
1197  * turn.  If any of the CPUs contained new machine check records, log
1198  * them to the console.
1199  */
1200 static void
mca_scan_cpus(void * context,int pending)1201 mca_scan_cpus(void *context, int pending)
1202 {
1203 	struct thread *td;
1204 	int cpu;
1205 	bool recoverable = true;
1206 
1207 	mca_resize_freelist();
1208 	td = curthread;
1209 	thread_lock(td);
1210 	CPU_FOREACH(cpu) {
1211 		sched_bind(td, cpu);
1212 		thread_unlock(td);
1213 		mca_scan(POLLED, &recoverable);
1214 		thread_lock(td);
1215 		sched_unbind(td);
1216 	}
1217 	thread_unlock(td);
1218 	if (!STAILQ_EMPTY(&mca_pending))
1219 		mca_process_records(POLLED);
1220 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1221 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1222 }
1223 
1224 static int
sysctl_mca_scan(SYSCTL_HANDLER_ARGS)1225 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
1226 {
1227 	int error, i;
1228 
1229 	i = 0;
1230 	error = sysctl_handle_int(oidp, &i, 0, req);
1231 	if (error)
1232 		return (error);
1233 	if (i)
1234 		taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1235 		    0, 0, 0);
1236 	return (0);
1237 }
1238 
1239 static int
sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)1240 sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)
1241 {
1242 	struct mca_internal *mca;
1243 	int error, i;
1244 	bool doresize;
1245 
1246 	i = mca_maxcount;
1247 	error = sysctl_handle_int(oidp, &i, 0, req);
1248 	if (error || req->newptr == NULL)
1249 		return (error);
1250 	mtx_lock_spin(&mca_lock);
1251 	mca_maxcount = i;
1252 	doresize = false;
1253 	if (mca_maxcount >= 0)
1254 		while (mca_count > mca_maxcount) {
1255 			mca = STAILQ_FIRST(&mca_records);
1256 			STAILQ_REMOVE_HEAD(&mca_records, link);
1257 			mca_count--;
1258 			STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1259 			mca_freecount++;
1260 			doresize = true;
1261 		}
1262 	mtx_unlock_spin(&mca_lock);
1263 	if (doresize && mca_startup_done)
1264 		taskqueue_enqueue(mca_tq, &mca_resize_task);
1265 	return (error);
1266 }
1267 
1268 static void
mca_startup(void * dummy)1269 mca_startup(void *dummy)
1270 {
1271 
1272 	if (mca_banks <= 0)
1273 		return;
1274 
1275 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
1276 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1277 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1278 	mca_startup_done = true;
1279 
1280 	/*
1281 	 * CMCIs during boot may have recorded entries. Conduct the post-scan
1282 	 * activities now.
1283 	 */
1284 	mca_postscan(NULL, 0);
1285 }
1286 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
1287 
1288 #ifdef DEV_APIC
1289 static void
cmci_setup(void)1290 cmci_setup(void)
1291 {
1292 	int i;
1293 
1294 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
1295 	    M_WAITOK);
1296 	for (i = 0; i <= mp_maxid; i++)
1297 		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
1298 		    M_MCA, M_WAITOK | M_ZERO);
1299 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1300 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1301 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1302 	    "Interval in seconds to throttle corrected MC interrupts");
1303 }
1304 
1305 static void
amd_thresholding_setup(void)1306 amd_thresholding_setup(void)
1307 {
1308 	u_int i;
1309 
1310 	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *),
1311 	    M_MCA, M_WAITOK);
1312 	for (i = 0; i <= mp_maxid; i++)
1313 		amd_et_state[i] = malloc(sizeof(struct amd_et_state) *
1314 		    mca_banks, M_MCA, M_WAITOK | M_ZERO);
1315 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1316 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1317 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1318 	    "Interval in seconds to throttle corrected MC interrupts");
1319 }
1320 #endif
1321 
1322 static void
mca_setup(uint64_t mcg_cap)1323 mca_setup(uint64_t mcg_cap)
1324 {
1325 
1326 	/*
1327 	 * On AMD Family 10h processors, unless logging of level one TLB
1328 	 * parity (L1TP) errors is disabled, enable the recommended workaround
1329 	 * for Erratum 383.
1330 	 */
1331 	if (cpu_vendor_id == CPU_VENDOR_AMD &&
1332 	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
1333 		workaround_erratum383 = 1;
1334 
1335 	mca_banks = mcg_cap & MCG_CAP_COUNT;
1336 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
1337 	mtx_init(&mca_msg_buf_lock, "mca_msg_buf", NULL, MTX_SPIN);
1338 	STAILQ_INIT(&mca_records);
1339 	STAILQ_INIT(&mca_pending);
1340 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
1341 	    taskqueue_thread_enqueue, &mca_tq);
1342 	TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL);
1343 	STAILQ_INIT(&mca_freelist);
1344 	TASK_INIT(&mca_resize_task, 0, mca_resize, NULL);
1345 	TASK_INIT(&mca_postscan_task, 0, mca_postscan, NULL);
1346 	mca_resize_freelist();
1347 	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1348 	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
1349 	    "Record count");
1350 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1351 	    "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1352 	    &mca_maxcount, 0, sysctl_mca_maxcount, "I",
1353 	    "Maximum record count (-1 is unlimited)");
1354 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1355 	    "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1356 	    &mca_ticks, 0, sysctl_positive_int, "I",
1357 	    "Periodic interval in seconds to scan for machine checks");
1358 	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1359 	    "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records,
1360 	    "Machine check records");
1361 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1362 	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1363 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
1364 #ifdef DEV_APIC
1365 	if (cmci_supported(mcg_cap))
1366 		cmci_setup();
1367 	else if (amd_thresholding_supported())
1368 		amd_thresholding_setup();
1369 #endif
1370 }
1371 
1372 #ifdef DEV_APIC
1373 /*
1374  * See if we should monitor CMCI for this bank.  If CMCI_EN is already
1375  * set in MC_CTL2, then another CPU is responsible for this bank, so
1376  * ignore it.  If CMCI_EN returns zero after being set, then this bank
1377  * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
1378  * now monitor this bank.
1379  */
1380 static void
cmci_monitor(int i)1381 cmci_monitor(int i)
1382 {
1383 	struct cmc_state *cc;
1384 	uint64_t ctl;
1385 
1386 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1387 
1388 	/*
1389 	 * It is possible for some APs to report CMCI support even if the BSP
1390 	 * does not, apparently due to a BIOS bug.
1391 	 */
1392 	if (cmc_state == NULL) {
1393 		if (bootverbose) {
1394 			printf(
1395 		    "AP %d (%d,%d) reports CMCI support but the BSP does not\n",
1396 			    PCPU_GET(cpuid), PCPU_GET(apic_id),
1397 			    PCPU_GET(acpi_id));
1398 		}
1399 		return;
1400 	}
1401 
1402 	ctl = rdmsr(MSR_MC_CTL2(i));
1403 	if (ctl & MC_CTL2_CMCI_EN)
1404 		/* Already monitored by another CPU. */
1405 		return;
1406 
1407 	/* Set the threshold to one event for now. */
1408 	ctl &= ~MC_CTL2_THRESHOLD;
1409 	ctl |= MC_CTL2_CMCI_EN | 1;
1410 	wrmsr(MSR_MC_CTL2(i), ctl);
1411 	ctl = rdmsr(MSR_MC_CTL2(i));
1412 	if (!(ctl & MC_CTL2_CMCI_EN))
1413 		/* This bank does not support CMCI. */
1414 		return;
1415 
1416 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1417 
1418 	/* Determine maximum threshold. */
1419 	ctl &= ~MC_CTL2_THRESHOLD;
1420 	ctl |= 0x7fff;
1421 	wrmsr(MSR_MC_CTL2(i), ctl);
1422 	ctl = rdmsr(MSR_MC_CTL2(i));
1423 	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
1424 
1425 	/* Start off with a threshold of 1. */
1426 	ctl &= ~MC_CTL2_THRESHOLD;
1427 	ctl |= 1;
1428 	wrmsr(MSR_MC_CTL2(i), ctl);
1429 
1430 	/* Mark this bank as monitored. */
1431 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1432 }
1433 
1434 /*
1435  * For resume, reset the threshold for any banks we monitor back to
1436  * one and throw away the timestamp of the last interrupt.
1437  */
1438 static void
cmci_resume(int i)1439 cmci_resume(int i)
1440 {
1441 	struct cmc_state *cc;
1442 	uint64_t ctl;
1443 
1444 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1445 
1446 	/* See cmci_monitor(). */
1447 	if (cmc_state == NULL)
1448 		return;
1449 
1450 	/* Ignore banks not monitored by this CPU. */
1451 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1452 		return;
1453 
1454 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1455 	cc->last_intr = 0;
1456 	ctl = rdmsr(MSR_MC_CTL2(i));
1457 	ctl &= ~MC_CTL2_THRESHOLD;
1458 	ctl |= MC_CTL2_CMCI_EN | 1;
1459 	wrmsr(MSR_MC_CTL2(i), ctl);
1460 }
1461 
1462 /*
1463  * Apply an AMD ET configuration to the corresponding MSR.
1464  */
1465 static void
amd_thresholding_start(struct amd_et_state * cc,int bank)1466 amd_thresholding_start(struct amd_et_state *cc, int bank)
1467 {
1468 	uint64_t misc;
1469 
1470 	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
1471 
1472 	misc = rdmsr(mca_msr_ops.misc(bank));
1473 
1474 	misc &= ~MC_MISC_AMD_INT_MASK;
1475 	misc |= MC_MISC_AMD_INT_LVT;
1476 
1477 	misc &= ~MC_MISC_AMD_LVT_MASK;
1478 	misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;
1479 
1480 	misc &= ~MC_MISC_AMD_CNT_MASK;
1481 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1482 	    << MC_MISC_AMD_CNT_SHIFT;
1483 
1484 	misc &= ~MC_MISC_AMD_OVERFLOW;
1485 	misc |= MC_MISC_AMD_CNTEN;
1486 
1487 	wrmsr(mca_msr_ops.misc(bank), misc);
1488 }
1489 
1490 static void
amd_thresholding_monitor(int i)1491 amd_thresholding_monitor(int i)
1492 {
1493 	struct amd_et_state *cc;
1494 	uint64_t misc;
1495 
1496 	/*
1497 	 * Kludge: On 10h, banks after 4 are not thresholding but also may have
1498 	 * bogus Valid bits.  Skip them.  This is definitely fixed in 15h, but
1499 	 * I have not investigated whether it is fixed in earlier models.
1500 	 */
1501 	if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
1502 		return;
1503 
1504 	/* The counter must be valid and present. */
1505 	misc = rdmsr(mca_msr_ops.misc(i));
1506 	if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
1507 	    (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
1508 		return;
1509 
1510 	/* The register should not be locked. */
1511 	if ((misc & MC_MISC_AMD_LOCK) != 0) {
1512 		if (bootverbose)
1513 			printf("%s: 0x%jx: Bank %d: locked\n", __func__,
1514 			    (uintmax_t)misc, i);
1515 		return;
1516 	}
1517 
1518 	/*
1519 	 * If counter is enabled then either the firmware or another CPU
1520 	 * has already claimed it.
1521 	 */
1522 	if ((misc & MC_MISC_AMD_CNTEN) != 0) {
1523 		if (bootverbose)
1524 			printf("%s: 0x%jx: Bank %d: already enabled\n",
1525 			    __func__, (uintmax_t)misc, i);
1526 		return;
1527 	}
1528 
1529 	/*
1530 	 * Configure an Extended Interrupt LVT register for reporting
1531 	 * counter overflows if that feature is supported and the first
1532 	 * extended register is available.
1533 	 */
1534 	amd_elvt = lapic_enable_mca_elvt();
1535 	if (amd_elvt < 0) {
1536 		printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
1537 		    __func__, i, amd_elvt);
1538 		return;
1539 	}
1540 
1541 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1542 	cc->cur_threshold = 1;
1543 	amd_thresholding_start(cc, i);
1544 
1545 	/* Mark this bank as monitored. */
1546 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1547 }
1548 
1549 static void
amd_thresholding_resume(int i)1550 amd_thresholding_resume(int i)
1551 {
1552 	struct amd_et_state *cc;
1553 
1554 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1555 
1556 	/* Ignore banks not monitored by this CPU. */
1557 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1558 		return;
1559 
1560 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1561 	cc->last_intr = 0;
1562 	cc->cur_threshold = 1;
1563 	amd_thresholding_start(cc, i);
1564 }
1565 #endif
1566 
1567 /*
1568  * Initializes per-CPU machine check registers and enables corrected
1569  * machine check interrupts.
1570  */
1571 static void
_mca_init(int boot)1572 _mca_init(int boot)
1573 {
1574 	uint64_t mcg_cap;
1575 	uint64_t ctl, mask;
1576 	int i, skip, family;
1577 
1578 	family = CPUID_TO_FAMILY(cpu_id);
1579 
1580 	/* MCE is required. */
1581 	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1582 		return;
1583 
1584 	if (cpu_feature & CPUID_MCA) {
1585 		if (boot)
1586 			PCPU_SET(cmci_mask, 0);
1587 
1588 		mcg_cap = rdmsr(MSR_MCG_CAP);
1589 		if (mcg_cap & MCG_CAP_CTL_P)
1590 			/* Enable MCA features. */
1591 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1592 		if (IS_BSP() && boot)
1593 			mca_setup(mcg_cap);
1594 
1595 		/*
1596 		 * Disable logging of level one TLB parity (L1TP) errors by
1597 		 * the data cache as an alternative workaround for AMD Family
1598 		 * 10h Erratum 383.  Unlike the recommended workaround, there
1599 		 * is no performance penalty to this workaround.  However,
1600 		 * L1TP errors will go unreported.
1601 		 */
1602 		if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
1603 		    !amd10h_L1TP) {
1604 			mask = rdmsr(MSR_MC0_CTL_MASK);
1605 			if ((mask & (1UL << 5)) == 0)
1606 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1607 		}
1608 		if (amd_rascap & AMDRAS_SCALABLE_MCA) {
1609 			mca_msr_ops.ctl = mca_smca_ctl_reg;
1610 			mca_msr_ops.status = mca_smca_status_reg;
1611 			mca_msr_ops.addr = mca_smca_addr_reg;
1612 			mca_msr_ops.misc = mca_smca_misc_reg;
1613 		}
1614 
1615 		/* Enable local MCE if supported. */
1616 		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
1617 		    (mcg_cap & MCG_CAP_LMCE_P) &&
1618 		    (rdmsr(MSR_IA32_FEATURE_CONTROL) &
1619 		     IA32_FEATURE_CONTROL_LMCE_EN))
1620 			wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1);
1621 
1622 		/*
1623 		 * The cmci_monitor() must not be executed
1624 		 * simultaneously by several CPUs.
1625 		 */
1626 		if (boot)
1627 			mtx_lock_spin(&mca_lock);
1628 
1629 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1630 			/* By default enable logging of all errors. */
1631 			ctl = 0xffffffffffffffffUL;
1632 			skip = 0;
1633 
1634 			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1635 				/*
1636 				 * For P6 models before Nehalem MC0_CTL is
1637 				 * always enabled and reserved.
1638 				 */
1639 				if (i == 0 && family == 0x6
1640 				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
1641 					skip = 1;
1642 			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1643 				/* BKDG for Family 10h: unset GartTblWkEn. */
1644 				if (i == MC_AMDNB_BANK && family >= 0xf &&
1645 				    family < 0x17)
1646 					ctl &= ~(1UL << 10);
1647 			}
1648 
1649 			if (!skip)
1650 				wrmsr(mca_msr_ops.ctl(i), ctl);
1651 
1652 #ifdef DEV_APIC
1653 			if (cmci_supported(mcg_cap)) {
1654 				if (boot)
1655 					cmci_monitor(i);
1656 				else
1657 					cmci_resume(i);
1658 			} else if (amd_thresholding_supported()) {
1659 				if (boot)
1660 					amd_thresholding_monitor(i);
1661 				else
1662 					amd_thresholding_resume(i);
1663 			}
1664 #endif
1665 
1666 			/* Clear all errors. */
1667 			wrmsr(mca_msr_ops.status(i), 0);
1668 		}
1669 		if (boot)
1670 			mtx_unlock_spin(&mca_lock);
1671 
1672 #ifdef DEV_APIC
1673 		if (cmci_supported(mcg_cap) &&
1674 		    PCPU_GET(cmci_mask) != 0 && boot)
1675 			lapic_enable_cmc();
1676 #endif
1677 	}
1678 
1679 	load_cr4(rcr4() | CR4_MCE);
1680 }
1681 
/*
 * Boot-time machine check initialization for the current CPU.  Every
 * CPU has to run this once while booting.
 */
void
mca_init(void)
{

	_mca_init(1);
}
1689 
/*
 * Resume-time machine check re-initialization for the current CPU.
 * Every CPU has to run this when the system resumes.
 */
void
mca_resume(void)
{

	_mca_init(0);
}
1697 
1698 /*
1699  * The machine check registers for the BSP cannot be initialized until
1700  * the local APIC is initialized.  This happens at SI_SUB_CPU,
1701  * SI_ORDER_SECOND.
1702  */
1703 static void
mca_init_bsp(void * arg __unused)1704 mca_init_bsp(void *arg __unused)
1705 {
1706 
1707 	mca_init();
1708 }
1709 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1710 
1711 /* Called when a machine check exception fires. */
1712 void
mca_intr(void)1713 mca_intr(void)
1714 {
1715 	uint64_t mcg_status;
1716 	int count;
1717 	bool lmcs, recoverable;
1718 
1719 	if (!(cpu_feature & CPUID_MCA)) {
1720 		/*
1721 		 * Just print the values of the old Pentium registers
1722 		 * and panic.
1723 		 */
1724 		printf("MC Type: 0x%jx  Address: 0x%jx\n",
1725 		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1726 		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1727 		panic("Machine check exception");
1728 	}
1729 
1730 	/* Scan the banks and check for any non-recoverable errors. */
1731 	mcg_status = rdmsr(MSR_MCG_STATUS);
1732 	recoverable = (mcg_status & MCG_STATUS_RIPV) != 0;
1733 	lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL ||
1734 	    (mcg_status & MCG_STATUS_LMCS));
1735 	count = mca_scan(MCE, &recoverable);
1736 
1737 	if (!recoverable) {
1738 		/*
1739 		 * Only panic if the error was detected local to this CPU.
1740 		 * Some errors will assert a machine check on all CPUs, but
1741 		 * only certain CPUs will find a valid bank to log.
1742 		 */
1743 		while (!lmcs && count == 0)
1744 			cpu_spinwait();
1745 
1746 		panic("Unrecoverable machine check exception");
1747 	}
1748 
1749 	if (count)
1750 		mca_process_records(MCE);
1751 
1752 	/* Clear MCIP. */
1753 	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1754 }
1755 
1756 #ifdef DEV_APIC
1757 /* Called for a CMCI (correctable machine check interrupt). */
1758 void
cmc_intr(void)1759 cmc_intr(void)
1760 {
1761 	bool recoverable = true;
1762 
1763 	/*
1764 	 * Serialize MCA bank scanning to prevent collisions from
1765 	 * sibling threads.
1766 	 *
1767 	 * If we found anything, log them to the console.
1768 	 */
1769 	if (mca_scan(CMCI, &recoverable) != 0)
1770 		mca_process_records(CMCI);
1771 }
1772 #endif
1773