xref: /linux/arch/x86/kernel/cpu/mce/core.c (revision dd9a41bc61cc62d38306465ed62373b98df0049e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Machine check handler.
4  *
5  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
6  * Rest from unknown author(s).
7  * 2004 Andi Kleen. Rewrote most of it.
8  * Copyright 2008 Intel Corporation
9  * Author: Andi Kleen
10  */
11 
12 #include <linux/thread_info.h>
13 #include <linux/capability.h>
14 #include <linux/miscdevice.h>
15 #include <linux/ratelimit.h>
16 #include <linux/rcupdate.h>
17 #include <linux/kobject.h>
18 #include <linux/uaccess.h>
19 #include <linux/kdebug.h>
20 #include <linux/kernel.h>
21 #include <linux/percpu.h>
22 #include <linux/string.h>
23 #include <linux/device.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/delay.h>
26 #include <linux/ctype.h>
27 #include <linux/sched.h>
28 #include <linux/sysfs.h>
29 #include <linux/types.h>
30 #include <linux/slab.h>
31 #include <linux/init.h>
32 #include <linux/kmod.h>
33 #include <linux/poll.h>
34 #include <linux/nmi.h>
35 #include <linux/cpu.h>
36 #include <linux/ras.h>
37 #include <linux/smp.h>
38 #include <linux/fs.h>
39 #include <linux/mm.h>
40 #include <linux/debugfs.h>
41 #include <linux/irq_work.h>
42 #include <linux/export.h>
43 #include <linux/jump_label.h>
44 #include <linux/set_memory.h>
45 
46 #include <asm/intel-family.h>
47 #include <asm/processor.h>
48 #include <asm/traps.h>
49 #include <asm/tlbflush.h>
50 #include <asm/mce.h>
51 #include <asm/msr.h>
52 #include <asm/reboot.h>
53 
54 #include "internal.h"
55 
56 /* sysfs synchronization */
57 static DEFINE_MUTEX(mce_sysfs_mutex);
58 
59 #define CREATE_TRACE_POINTS
60 #include <trace/events/mce.h>
61 
62 #define SPINUNIT		100	/* 100ns */
63 
64 DEFINE_PER_CPU(unsigned, mce_exception_count);
65 
66 DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
67 
68 struct mce_bank {
69 	u64			ctl;			/* subevents to enable */
70 	bool			init;			/* initialise bank? */
71 };
72 static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
73 
74 #define ATTR_LEN               16
75 /* One object for each MCE bank, shared by all CPUs */
76 struct mce_bank_dev {
77 	struct device_attribute	attr;			/* device attribute */
78 	char			attrname[ATTR_LEN];	/* attribute name */
79 	u8			bank;			/* bank number */
80 };
81 static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
82 
83 struct mce_vendor_flags mce_flags __read_mostly;
84 
85 struct mca_config mca_cfg __read_mostly = {
86 	.bootlog  = -1,
87 	/*
88 	 * Tolerant levels:
89 	 * 0: always panic on uncorrected errors, log corrected errors
90 	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
91 	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
92 	 * 3: never panic or SIGBUS, log all errors (for testing only)
93 	 */
94 	.tolerant = 1,
95 	.monarch_timeout = -1
96 };
97 
98 static DEFINE_PER_CPU(struct mce, mces_seen);
99 static unsigned long mce_need_notify;
100 static int cpu_missing;
101 
102 /*
103  * MCA banks polled by the periodic polling timer for corrected events.
104  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
105  */
106 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
107 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
108 };
109 
110 /*
111  * MCA banks controlled through firmware first for corrected errors.
112  * This is a global list of banks for which we won't enable CMCI and we
113  * won't poll. Firmware controls these banks and is responsible for
114  * reporting corrected errors through GHES. Uncorrected/recoverable
115  * errors are still notified through a machine check.
116  */
117 mce_banks_t mce_banks_ce_disabled;
118 
119 static struct work_struct mce_work;
120 static struct irq_work mce_irq_work;
121 
122 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
123 
124 /*
125  * CPU/chipset specific EDAC code can register a notifier call here to print
126  * MCE errors in a human-readable form.
127  */
128 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
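
/*
 * Illustrative sketch (not part of this file): a hypothetical external
 * decoder (e.g. an EDAC-style module) would hook into the chain above
 * roughly like this. The callback receives the logged struct mce as @data.
 * Note that registering anything beyond the mandatory notifiers makes
 * mce_default_notifier() further below stop printing the raw record itself.
 */
#if 0
static int example_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	/* Decode m->status, m->addr, m->misc into something readable here. */

	return NOTIFY_OK;
}

static struct notifier_block example_mce_dec_nb = {
	.notifier_call	= example_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};

/* A module would pair these calls in its init/exit paths: */
static int __init example_dec_init(void)
{
	mce_register_decode_chain(&example_mce_dec_nb);
	return 0;
}

static void __exit example_dec_exit(void)
{
	mce_unregister_decode_chain(&example_mce_dec_nb);
}
#endif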
129 
130 /* Do initial initialization of a struct mce */
131 void mce_setup(struct mce *m)
132 {
133 	memset(m, 0, sizeof(struct mce));
134 	m->cpu = m->extcpu = smp_processor_id();
135 	/* need the internal __ version to avoid deadlocks */
136 	m->time = __ktime_get_real_seconds();
137 	m->cpuvendor = boot_cpu_data.x86_vendor;
138 	m->cpuid = cpuid_eax(1);
139 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
140 	m->apicid = cpu_data(m->extcpu).initial_apicid;
141 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
142 
143 	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
144 		rdmsrl(MSR_PPIN, m->ppin);
145 	else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
146 		rdmsrl(MSR_AMD_PPIN, m->ppin);
147 
148 	m->microcode = boot_cpu_data.microcode;
149 }
150 
151 DEFINE_PER_CPU(struct mce, injectm);
152 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
153 
154 void mce_log(struct mce *m)
155 {
156 	if (!mce_gen_pool_add(m))
157 		irq_work_queue(&mce_irq_work);
158 }
159 EXPORT_SYMBOL_GPL(mce_log);
160 
161 /*
162  * We run the default notifier only when the UC, the first and the
163  * default notifiers are the only ones registered, i.e., only the
164  * mandatory NUM_DEFAULT_NOTIFIERS notifiers are on the chain.
165  */
166 #define NUM_DEFAULT_NOTIFIERS	3
167 static atomic_t num_notifiers;
168 
169 void mce_register_decode_chain(struct notifier_block *nb)
170 {
171 	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
172 		return;
173 
174 	atomic_inc(&num_notifiers);
175 
176 	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
177 }
178 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
179 
180 void mce_unregister_decode_chain(struct notifier_block *nb)
181 {
182 	atomic_dec(&num_notifiers);
183 
184 	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
185 }
186 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
187 
188 static inline u32 ctl_reg(int bank)
189 {
190 	return MSR_IA32_MCx_CTL(bank);
191 }
192 
193 static inline u32 status_reg(int bank)
194 {
195 	return MSR_IA32_MCx_STATUS(bank);
196 }
197 
198 static inline u32 addr_reg(int bank)
199 {
200 	return MSR_IA32_MCx_ADDR(bank);
201 }
202 
203 static inline u32 misc_reg(int bank)
204 {
205 	return MSR_IA32_MCx_MISC(bank);
206 }
207 
208 static inline u32 smca_ctl_reg(int bank)
209 {
210 	return MSR_AMD64_SMCA_MCx_CTL(bank);
211 }
212 
213 static inline u32 smca_status_reg(int bank)
214 {
215 	return MSR_AMD64_SMCA_MCx_STATUS(bank);
216 }
217 
218 static inline u32 smca_addr_reg(int bank)
219 {
220 	return MSR_AMD64_SMCA_MCx_ADDR(bank);
221 }
222 
223 static inline u32 smca_misc_reg(int bank)
224 {
225 	return MSR_AMD64_SMCA_MCx_MISC(bank);
226 }
227 
228 struct mca_msr_regs msr_ops = {
229 	.ctl	= ctl_reg,
230 	.status	= status_reg,
231 	.addr	= addr_reg,
232 	.misc	= misc_reg
233 };
234 
235 static void __print_mce(struct mce *m)
236 {
237 	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
238 		 m->extcpu,
239 		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
240 		 m->mcgstatus, m->bank, m->status);
241 
242 	if (m->ip) {
243 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
244 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
245 			m->cs, m->ip);
246 
247 		if (m->cs == __KERNEL_CS)
248 			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
249 		pr_cont("\n");
250 	}
251 
252 	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
253 	if (m->addr)
254 		pr_cont("ADDR %llx ", m->addr);
255 	if (m->misc)
256 		pr_cont("MISC %llx ", m->misc);
257 
258 	if (mce_flags.smca) {
259 		if (m->synd)
260 			pr_cont("SYND %llx ", m->synd);
261 		if (m->ipid)
262 			pr_cont("IPID %llx ", m->ipid);
263 	}
264 
265 	pr_cont("\n");
266 	/*
267 	 * Note this output is parsed by external tools and old fields
268 	 * should not be changed.
269 	 */
270 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
271 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
272 		m->microcode);
273 }
274 
275 static void print_mce(struct mce *m)
276 {
277 	__print_mce(m);
278 
279 	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
280 		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
281 }
282 
283 #define PANIC_TIMEOUT 5 /* 5 seconds */
284 
285 static atomic_t mce_panicked;
286 
287 static int fake_panic;
288 static atomic_t mce_fake_panicked;
289 
290 /* Panic in progress. Enable interrupts and wait for final IPI */
291 static void wait_for_panic(void)
292 {
293 	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
294 
295 	preempt_disable();
296 	local_irq_enable();
297 	while (timeout-- > 0)
298 		udelay(1);
299 	if (panic_timeout == 0)
300 		panic_timeout = mca_cfg.panic_timeout;
301 	panic("Panicking machine check CPU died");
302 }
303 
304 static void mce_panic(const char *msg, struct mce *final, char *exp)
305 {
306 	int apei_err = 0;
307 	struct llist_node *pending;
308 	struct mce_evt_llist *l;
309 
310 	if (!fake_panic) {
311 		/*
312 		 * Make sure only one CPU runs in machine check panic
313 		 */
314 		if (atomic_inc_return(&mce_panicked) > 1)
315 			wait_for_panic();
316 		barrier();
317 
318 		bust_spinlocks(1);
319 		console_verbose();
320 	} else {
321 		/* Don't log too much for fake panic */
322 		if (atomic_inc_return(&mce_fake_panicked) > 1)
323 			return;
324 	}
325 	pending = mce_gen_pool_prepare_records();
326 	/* First print corrected ones that are still unlogged */
327 	llist_for_each_entry(l, pending, llnode) {
328 		struct mce *m = &l->mce;
329 		if (!(m->status & MCI_STATUS_UC)) {
330 			print_mce(m);
331 			if (!apei_err)
332 				apei_err = apei_write_mce(m);
333 		}
334 	}
335 	/* Now print uncorrected but with the final one last */
336 	llist_for_each_entry(l, pending, llnode) {
337 		struct mce *m = &l->mce;
338 		if (!(m->status & MCI_STATUS_UC))
339 			continue;
340 		if (!final || mce_cmp(m, final)) {
341 			print_mce(m);
342 			if (!apei_err)
343 				apei_err = apei_write_mce(m);
344 		}
345 	}
346 	if (final) {
347 		print_mce(final);
348 		if (!apei_err)
349 			apei_err = apei_write_mce(final);
350 	}
351 	if (cpu_missing)
352 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
353 	if (exp)
354 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
355 	if (!fake_panic) {
356 		if (panic_timeout == 0)
357 			panic_timeout = mca_cfg.panic_timeout;
358 		panic(msg);
359 	} else
360 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
361 }
362 
363 /* Support code for software error injection */
364 
365 static int msr_to_offset(u32 msr)
366 {
367 	unsigned bank = __this_cpu_read(injectm.bank);
368 
369 	if (msr == mca_cfg.rip_msr)
370 		return offsetof(struct mce, ip);
371 	if (msr == msr_ops.status(bank))
372 		return offsetof(struct mce, status);
373 	if (msr == msr_ops.addr(bank))
374 		return offsetof(struct mce, addr);
375 	if (msr == msr_ops.misc(bank))
376 		return offsetof(struct mce, misc);
377 	if (msr == MSR_IA32_MCG_STATUS)
378 		return offsetof(struct mce, mcgstatus);
379 	return -1;
380 }
381 
382 /* MSR access wrappers used for error injection */
383 static u64 mce_rdmsrl(u32 msr)
384 {
385 	u64 v;
386 
387 	if (__this_cpu_read(injectm.finished)) {
388 		int offset = msr_to_offset(msr);
389 
390 		if (offset < 0)
391 			return 0;
392 		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
393 	}
394 
395 	if (rdmsrl_safe(msr, &v)) {
396 		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
397 		/*
398 		 * Return zero in case the access faulted. This should
399 		 * not happen normally but can happen if the CPU does
400 		 * something weird, or if the code is buggy.
401 		 */
402 		v = 0;
403 	}
404 
405 	return v;
406 }
407 
408 static void mce_wrmsrl(u32 msr, u64 v)
409 {
410 	if (__this_cpu_read(injectm.finished)) {
411 		int offset = msr_to_offset(msr);
412 
413 		if (offset >= 0)
414 			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
415 		return;
416 	}
417 	wrmsrl(msr, v);
418 }
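
/*
 * Illustrative sketch (not part of this file): how an error injector can
 * stage a fake record in the per-CPU 'injectm' so that the wrappers above
 * serve it instead of touching real hardware MSRs (the in-tree mce-inject
 * module uses this redirection). Selecting the target CPU and actually
 * raising the machine check are omitted here.
 */
#if 0
static void example_stage_injected_mce(struct mce *m)
{
	struct mce *i = this_cpu_ptr(&injectm);

	/* Invalidate the record while it is being rewritten. */
	i->finished = 0;
	wmb();
	*i = *m;
	i->finished = 0;	/* in case the copy carried a stale value */
	wmb();
	/* Arm the redirection: mce_rdmsrl()/mce_wrmsrl() now use *i. */
	i->finished = 1;
}
#endif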
419 
420 /*
421  * Collect all global (w.r.t. this processor) status about this machine
422  * check into our "mce" struct so that we can use it later to assess
423  * the severity of the problem as we read per-bank specific details.
424  */
425 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
426 {
427 	mce_setup(m);
428 
429 	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
430 	if (regs) {
431 		/*
432 		 * Get the address of the instruction at the time of
433 		 * the machine check error.
434 		 */
435 		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
436 			m->ip = regs->ip;
437 			m->cs = regs->cs;
438 
439 			/*
440 			 * When in VM86 mode make the cs look like ring 3
441 			 * always. This is a lie, but it's better than passing
442 			 * the additional vm86 bit around everywhere.
443 			 */
444 			if (v8086_mode(regs))
445 				m->cs |= 3;
446 		}
447 		/* Use accurate RIP reporting if available. */
448 		if (mca_cfg.rip_msr)
449 			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
450 	}
451 }
452 
453 int mce_available(struct cpuinfo_x86 *c)
454 {
455 	if (mca_cfg.disabled)
456 		return 0;
457 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
458 }
459 
460 static void mce_schedule_work(void)
461 {
462 	if (!mce_gen_pool_empty())
463 		schedule_work(&mce_work);
464 }
465 
466 static void mce_irq_work_cb(struct irq_work *entry)
467 {
468 	mce_schedule_work();
469 }
470 
471 /*
472  * Check if the address reported by the CPU is in a format we can parse.
473  * It would be possible to add code for most other cases, but all would
474  * be somewhat complicated (e.g. segment offset would require an instruction
475  * parser). So only support physical addresses up to page granularity for now.
476  */
477 int mce_usable_address(struct mce *m)
478 {
479 	if (!(m->status & MCI_STATUS_ADDRV))
480 		return 0;
481 
482 	/* Checks after this one are Intel/Zhaoxin-specific: */
483 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
484 	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
485 		return 1;
486 
487 	if (!(m->status & MCI_STATUS_MISCV))
488 		return 0;
489 
490 	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
491 		return 0;
492 
493 	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
494 		return 0;
495 
496 	return 1;
497 }
498 EXPORT_SYMBOL_GPL(mce_usable_address);
499 
500 bool mce_is_memory_error(struct mce *m)
501 {
502 	switch (m->cpuvendor) {
503 	case X86_VENDOR_AMD:
504 	case X86_VENDOR_HYGON:
505 		return amd_mce_is_memory_error(m);
506 
507 	case X86_VENDOR_INTEL:
508 	case X86_VENDOR_ZHAOXIN:
509 		/*
510 		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
511 		 *
512 		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
513 		 * indicating a memory error. Bit 8 is used for indicating a
514 		 * cache hierarchy error. The combination of bit 2 and bit 3
515 		 * is used for indicating a `generic' cache hierarchy error.
516 		 * But we can't just blindly check the above bits, because if
517 		 * bit 11 is set, then it is a bus/interconnect error - and
518 		 * either way the above bits just give more detail on what
519 		 * bus/interconnect error happened. Note that bit 12 can be
520 		 * ignored, as it's the "filter" bit.
521 		 */
522 		return (m->status & 0xef80) == BIT(7) ||
523 		       (m->status & 0xef00) == BIT(8) ||
524 		       (m->status & 0xeffc) == 0xc;
525 
526 	default:
527 		return false;
528 	}
529 }
530 EXPORT_SYMBOL_GPL(mce_is_memory_error);
531 
532 bool mce_is_correctable(struct mce *m)
533 {
534 	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
535 		return false;
536 
537 	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
538 		return false;
539 
540 	if (m->status & MCI_STATUS_UC)
541 		return false;
542 
543 	return true;
544 }
545 EXPORT_SYMBOL_GPL(mce_is_correctable);
546 
547 static bool cec_add_mce(struct mce *m)
548 {
549 	if (!m)
550 		return false;
551 
552 	/* We eat only correctable DRAM errors with usable addresses. */
553 	if (mce_is_memory_error(m) &&
554 	    mce_is_correctable(m)  &&
555 	    mce_usable_address(m))
556 		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
557 			return true;
558 
559 	return false;
560 }
561 
562 static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
563 			      void *data)
564 {
565 	struct mce *m = (struct mce *)data;
566 
567 	if (!m)
568 		return NOTIFY_DONE;
569 
570 	if (cec_add_mce(m))
571 		return NOTIFY_STOP;
572 
573 	/* Emit the trace record: */
574 	trace_mce_record(m);
575 
576 	set_bit(0, &mce_need_notify);
577 
578 	mce_notify_irq();
579 
580 	return NOTIFY_DONE;
581 }
582 
583 static struct notifier_block first_nb = {
584 	.notifier_call	= mce_first_notifier,
585 	.priority	= MCE_PRIO_FIRST,
586 };
587 
588 static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
589 			      void *data)
590 {
591 	struct mce *mce = (struct mce *)data;
592 	unsigned long pfn;
593 
594 	if (!mce || !mce_usable_address(mce))
595 		return NOTIFY_DONE;
596 
597 	if (mce->severity != MCE_AO_SEVERITY &&
598 	    mce->severity != MCE_DEFERRED_SEVERITY)
599 		return NOTIFY_DONE;
600 
601 	pfn = mce->addr >> PAGE_SHIFT;
602 	if (!memory_failure(pfn, 0))
603 		set_mce_nospec(pfn);
604 
605 	return NOTIFY_OK;
606 }
607 
608 static struct notifier_block mce_uc_nb = {
609 	.notifier_call	= uc_decode_notifier,
610 	.priority	= MCE_PRIO_UC,
611 };
612 
613 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
614 				void *data)
615 {
616 	struct mce *m = (struct mce *)data;
617 
618 	if (!m)
619 		return NOTIFY_DONE;
620 
621 	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
622 		return NOTIFY_DONE;
623 
624 	__print_mce(m);
625 
626 	return NOTIFY_DONE;
627 }
628 
629 static struct notifier_block mce_default_nb = {
630 	.notifier_call	= mce_default_notifier,
631 	/* lowest prio, we want it to run last. */
632 	.priority	= MCE_PRIO_LOWEST,
633 };
634 
635 /*
636  * Read ADDR and MISC registers.
637  */
638 static void mce_read_aux(struct mce *m, int i)
639 {
640 	if (m->status & MCI_STATUS_MISCV)
641 		m->misc = mce_rdmsrl(msr_ops.misc(i));
642 
643 	if (m->status & MCI_STATUS_ADDRV) {
644 		m->addr = mce_rdmsrl(msr_ops.addr(i));
645 
646 		/*
647 		 * Mask the reported address by the reported granularity.
648 		 */
649 		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
650 			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
651 			m->addr >>= shift;
652 			m->addr <<= shift;
653 		}
654 
655 		/*
656 		 * Extract [55:<lsb>] where lsb is the least significant
657 		 * *valid* bit of the address bits.
658 		 */
659 		if (mce_flags.smca) {
660 			u8 lsb = (m->addr >> 56) & 0x3f;
661 
662 			m->addr &= GENMASK_ULL(55, lsb);
663 		}
664 	}
665 
666 	if (mce_flags.smca) {
667 		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
668 
669 		if (m->status & MCI_STATUS_SYNDV)
670 			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
671 	}
672 }
673 
674 DEFINE_PER_CPU(unsigned, mce_poll_count);
675 
676 /*
677  * Poll for corrected events or events that happened before reset.
678  * Those are just logged through /dev/mcelog.
679  *
680  * This is executed in standard interrupt context.
681  *
682  * Note: the spec recommends panicking for fatal unsignalled
683  * errors here. However, this would be quite problematic --
684  * we would need to reimplement the Monarch handling and
685  * it would mess up the exclusion between the exception handler
686  * and the poll handler -- so we skip this for now.
687  * These cases should not happen anyway, or only when the CPU
688  * is already totally confused. In this case it's likely it will
689  * not fully execute the machine check handler either.
690  */
691 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
692 {
693 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
694 	bool error_seen = false;
695 	struct mce m;
696 	int i;
697 
698 	this_cpu_inc(mce_poll_count);
699 
700 	mce_gather_info(&m, NULL);
701 
702 	if (flags & MCP_TIMESTAMP)
703 		m.tsc = rdtsc();
704 
705 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
706 		if (!mce_banks[i].ctl || !test_bit(i, *b))
707 			continue;
708 
709 		m.misc = 0;
710 		m.addr = 0;
711 		m.bank = i;
712 
713 		barrier();
714 		m.status = mce_rdmsrl(msr_ops.status(i));
715 
716 		/* If this entry is not valid, ignore it */
717 		if (!(m.status & MCI_STATUS_VAL))
718 			continue;
719 
720 		/*
721 		 * If we are logging everything (at CPU online) or this
722 		 * is a corrected error, then we must log it.
723 		 */
724 		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
725 			goto log_it;
726 
727 		/*
728 		 * Newer Intel systems that support software error
729 		 * recovery need to make additional checks. Other
730 		 * CPUs should skip over uncorrected errors, but log
731 		 * everything else.
732 		 */
733 		if (!mca_cfg.ser) {
734 			if (m.status & MCI_STATUS_UC)
735 				continue;
736 			goto log_it;
737 		}
738 
739 		/* Log "not enabled" (speculative) errors */
740 		if (!(m.status & MCI_STATUS_EN))
741 			goto log_it;
742 
743 		/*
744 		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
745 		 * UC == 1 && PCC == 0 && S == 0
746 		 */
747 		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
748 			goto log_it;
749 
750 		/*
751 		 * Skip anything else. Presumption is that our read of this
752 		 * bank is racing with a machine check. Leave the log alone
753 		 * for do_machine_check() to deal with it.
754 		 */
755 		continue;
756 
757 log_it:
758 		error_seen = true;
759 
760 		if (flags & MCP_DONTLOG)
761 			goto clear_it;
762 
763 		mce_read_aux(&m, i);
764 		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
765 		/*
766 		 * Don't get the IP here because it's unlikely to
767 		 * have anything to do with the actual error location.
768 		 */
769 
770 		if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
771 			goto clear_it;
772 
773 		mce_log(&m);
774 
775 clear_it:
776 		/*
777 		 * Clear state for this bank.
778 		 */
779 		mce_wrmsrl(msr_ops.status(i), 0);
780 	}
781 
782 	/*
783 	 * Don't clear MCG_STATUS here because it's only defined for
784 	 * exceptions.
785 	 */
786 
787 	sync_core();
788 
789 	return error_seen;
790 }
791 EXPORT_SYMBOL_GPL(machine_check_poll);
792 
793 /*
794  * Do a quick check if any of the events requires a panic.
795  * This decides if we keep the events around or clear them.
796  */
797 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
798 			  struct pt_regs *regs)
799 {
800 	char *tmp = *msg;
801 	int i;
802 
803 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
804 		m->status = mce_rdmsrl(msr_ops.status(i));
805 		if (!(m->status & MCI_STATUS_VAL))
806 			continue;
807 
808 		__set_bit(i, validp);
809 		if (quirk_no_way_out)
810 			quirk_no_way_out(i, m, regs);
811 
812 		m->bank = i;
813 		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
814 			mce_read_aux(m, i);
815 			*msg = tmp;
816 			return 1;
817 		}
818 	}
819 	return 0;
820 }
821 
822 /*
823  * Variable to establish order between CPUs while scanning.
824  * Each CPU spins initially until mce_executing equals its number.
825  */
826 static atomic_t mce_executing;
827 
828 /*
829  * Defines order of CPUs on entry. First CPU becomes Monarch.
830  */
831 static atomic_t mce_callin;
832 
833 /*
834  * Check if a timeout waiting for other CPUs happened.
835  */
836 static int mce_timed_out(u64 *t, const char *msg)
837 {
838 	/*
839 	 * The others already did panic for some reason.
840 	 * Bail out like in a timeout.
841 	 * rmb() to tell the compiler that system_state
842 	 * might have been modified by someone else.
843 	 */
844 	rmb();
845 	if (atomic_read(&mce_panicked))
846 		wait_for_panic();
847 	if (!mca_cfg.monarch_timeout)
848 		goto out;
849 	if ((s64)*t < SPINUNIT) {
850 		if (mca_cfg.tolerant <= 1)
851 			mce_panic(msg, NULL, NULL);
852 		cpu_missing = 1;
853 		return 1;
854 	}
855 	*t -= SPINUNIT;
856 out:
857 	touch_nmi_watchdog();
858 	return 0;
859 }
860 
861 /*
862  * The Monarch's reign.  The Monarch is the CPU who entered
863  * the machine check handler first. It waits for the others to
864  * raise the exception too and then grades them. When any
865  * error is fatal, it panics. Only then does it let the others continue.
866  *
867  * The other CPUs entering the MCE handler will be controlled by the
868  * Monarch. They are called Subjects.
869  *
870  * This way we prevent any potential data corruption in an unrecoverable case
871  * and also make sure that all CPUs' errors are always examined.
872  *
873  * Also this detects the case of a machine check event coming from outer
874  * space (not detected by any CPU). In this case some external agent wants
875  * us to shut down, so panic too.
876  *
877  * The other CPUs might still decide to panic if the handler happens
878  * in an unrecoverable place, but in this case the system is in a semi-stable
879  * state and won't corrupt anything by itself. It's ok to let the others
880  * continue for a bit first.
881  *
882  * All the spin loops have timeouts; when a timeout happens a CPU
883  * typically elects itself to be Monarch.
884  */
885 static void mce_reign(void)
886 {
887 	int cpu;
888 	struct mce *m = NULL;
889 	int global_worst = 0;
890 	char *msg = NULL;
891 	char *nmsg = NULL;
892 
893 	/*
894 	 * This CPU is the Monarch and the other CPUs have run
895 	 * through their handlers.
896 	 * Grade the severity of the errors of all the CPUs.
897 	 */
898 	for_each_possible_cpu(cpu) {
899 		int severity = mce_severity(&per_cpu(mces_seen, cpu),
900 					    mca_cfg.tolerant,
901 					    &nmsg, true);
902 		if (severity > global_worst) {
903 			msg = nmsg;
904 			global_worst = severity;
905 			m = &per_cpu(mces_seen, cpu);
906 		}
907 	}
908 
909 	/*
910 	 * Cannot recover? Panic here then.
911 	 * This dumps all the mces in the log buffer and stops the
912 	 * other CPUs.
913 	 */
914 	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
915 		mce_panic("Fatal machine check", m, msg);
916 
917 	/*
918  * For a UC error somewhere we let the CPU that detects it handle it.
919  * We also must let the others continue, otherwise the handling
920 	 * CPU could deadlock on a lock.
921 	 */
922 
923 	/*
924 	 * No machine check event found. Must be some external
925 	 * source or one CPU is hung. Panic.
926 	 */
927 	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
928 		mce_panic("Fatal machine check from unknown source", NULL, NULL);
929 
930 	/*
931 	 * Now clear all the mces_seen so that they don't reappear on
932 	 * the next mce.
933 	 */
934 	for_each_possible_cpu(cpu)
935 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
936 }
937 
938 static atomic_t global_nwo;
939 
940 /*
941  * Start of Monarch synchronization. This waits until all CPUs have
942  * entered the exception handler and then determines if any of them
943  * saw a fatal event that requires a panic. Then it lets them run
944  * their handlers in the entry order.
945  * TBD double check parallel CPU hotunplug
946  */
947 static int mce_start(int *no_way_out)
948 {
949 	int order;
950 	int cpus = num_online_cpus();
951 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
952 
953 	if (!timeout)
954 		return -1;
955 
956 	atomic_add(*no_way_out, &global_nwo);
957 	/*
958 	 * Rely on the implied barrier below, such that global_nwo
959 	 * is updated before mce_callin.
960 	 */
961 	order = atomic_inc_return(&mce_callin);
962 
963 	/*
964 	 * Wait for everyone.
965 	 */
966 	while (atomic_read(&mce_callin) != cpus) {
967 		if (mce_timed_out(&timeout,
968 				  "Timeout: Not all CPUs entered broadcast exception handler")) {
969 			atomic_set(&global_nwo, 0);
970 			return -1;
971 		}
972 		ndelay(SPINUNIT);
973 	}
974 
975 	/*
976 	 * mce_callin should be read before global_nwo
977 	 */
978 	smp_rmb();
979 
980 	if (order == 1) {
981 		/*
982 		 * Monarch: Starts executing now, the others wait.
983 		 */
984 		atomic_set(&mce_executing, 1);
985 	} else {
986 		/*
987 		 * Subject: Now start the scanning loop one by one in
988 		 * the original callin order.
989 		 * This way, when there are any shared banks, each will be
990 		 * seen by only one CPU before being cleared, avoiding duplicates.
991 		 */
992 		while (atomic_read(&mce_executing) < order) {
993 			if (mce_timed_out(&timeout,
994 					  "Timeout: Subject CPUs unable to finish machine check processing")) {
995 				atomic_set(&global_nwo, 0);
996 				return -1;
997 			}
998 			ndelay(SPINUNIT);
999 		}
1000 	}
1001 
1002 	/*
1003 	 * Cache the global no_way_out state.
1004 	 */
1005 	*no_way_out = atomic_read(&global_nwo);
1006 
1007 	return order;
1008 }
1009 
1010 /*
1011  * Synchronize between CPUs after main scanning loop.
1012  * This invokes the bulk of the Monarch processing.
1013  */
1014 static int mce_end(int order)
1015 {
1016 	int ret = -1;
1017 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1018 
1019 	if (!timeout)
1020 		goto reset;
1021 	if (order < 0)
1022 		goto reset;
1023 
1024 	/*
1025 	 * Allow others to run.
1026 	 */
1027 	atomic_inc(&mce_executing);
1028 
1029 	if (order == 1) {
1030 		/* CHECKME: Can this race with a parallel hotplug? */
1031 		int cpus = num_online_cpus();
1032 
1033 		/*
1034 		 * Monarch: Wait for everyone to go through their scanning
1035 		 * loops.
1036 		 */
1037 		while (atomic_read(&mce_executing) <= cpus) {
1038 			if (mce_timed_out(&timeout,
1039 					  "Timeout: Monarch CPU unable to finish machine check processing"))
1040 				goto reset;
1041 			ndelay(SPINUNIT);
1042 		}
1043 
1044 		mce_reign();
1045 		barrier();
1046 		ret = 0;
1047 	} else {
1048 		/*
1049 		 * Subject: Wait for Monarch to finish.
1050 		 */
1051 		while (atomic_read(&mce_executing) != 0) {
1052 			if (mce_timed_out(&timeout,
1053 					  "Timeout: Monarch CPU did not finish machine check processing"))
1054 				goto reset;
1055 			ndelay(SPINUNIT);
1056 		}
1057 
1058 		/*
1059 		 * Don't reset anything. That's done by the Monarch.
1060 		 */
1061 		return 0;
1062 	}
1063 
1064 	/*
1065 	 * Reset all global state.
1066 	 */
1067 reset:
1068 	atomic_set(&global_nwo, 0);
1069 	atomic_set(&mce_callin, 0);
1070 	barrier();
1071 
1072 	/*
1073 	 * Let others run again.
1074 	 */
1075 	atomic_set(&mce_executing, 0);
1076 	return ret;
1077 }
1078 
1079 static void mce_clear_state(unsigned long *toclear)
1080 {
1081 	int i;
1082 
1083 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1084 		if (test_bit(i, toclear))
1085 			mce_wrmsrl(msr_ops.status(i), 0);
1086 	}
1087 }
1088 
1089 static int do_memory_failure(struct mce *m)
1090 {
1091 	int flags = MF_ACTION_REQUIRED;
1092 	int ret;
1093 
1094 	pr_err("Uncorrected hardware memory error in user-access at %llx\n", m->addr);
1095 	if (!(m->mcgstatus & MCG_STATUS_RIPV))
1096 		flags |= MF_MUST_KILL;
1097 	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1098 	if (ret)
1099 		pr_err("Memory error not recovered\n");
1100 	else
1101 		set_mce_nospec(m->addr >> PAGE_SHIFT);
1102 	return ret;
1103 }
1104 
1105 
1106 /*
1107  * Cases where we avoid rendezvous handler timeout:
1108  * 1) If this CPU is offline.
1109  *
1110  * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1111  *  skip those CPUs which remain looping in the 1st kernel - see
1112  *  crash_nmi_callback().
1113  *
1114  * Note: there still is a small window between kexec-ing and the new,
1115  * kdump kernel establishing a new #MC handler where a broadcasted MCE
1116  * might not get handled properly.
1117  */
1118 static bool __mc_check_crashing_cpu(int cpu)
1119 {
1120 	if (cpu_is_offline(cpu) ||
1121 	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
1122 		u64 mcgstatus;
1123 
1124 		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1125 
1126 		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1127 			if (mcgstatus & MCG_STATUS_LMCES)
1128 				return false;
1129 		}
1130 
1131 		if (mcgstatus & MCG_STATUS_RIPV) {
1132 			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1133 			return true;
1134 		}
1135 	}
1136 	return false;
1137 }
1138 
1139 static void __mc_scan_banks(struct mce *m, struct mce *final,
1140 			    unsigned long *toclear, unsigned long *valid_banks,
1141 			    int no_way_out, int *worst)
1142 {
1143 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1144 	struct mca_config *cfg = &mca_cfg;
1145 	int severity, i;
1146 
1147 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1148 		__clear_bit(i, toclear);
1149 		if (!test_bit(i, valid_banks))
1150 			continue;
1151 
1152 		if (!mce_banks[i].ctl)
1153 			continue;
1154 
1155 		m->misc = 0;
1156 		m->addr = 0;
1157 		m->bank = i;
1158 
1159 		m->status = mce_rdmsrl(msr_ops.status(i));
1160 		if (!(m->status & MCI_STATUS_VAL))
1161 			continue;
1162 
1163 		/*
1164 		 * Corrected or non-signaled errors are handled by
1165 		 * machine_check_poll(). Leave them alone, unless this panics.
1166 		 */
1167 		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1168 			!no_way_out)
1169 			continue;
1170 
1171 		/* Set taint even when machine check was not enabled. */
1172 		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1173 
1174 		severity = mce_severity(m, cfg->tolerant, NULL, true);
1175 
1176 		/*
1177 		 * When the machine check was for a corrected/deferred error,
1178 		 * don't touch it, unless we're panicking.
1179 		 */
1180 		if ((severity == MCE_KEEP_SEVERITY ||
1181 		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1182 			continue;
1183 
1184 		__set_bit(i, toclear);
1185 
1186 		/* Machine check event was not enabled. Clear, but ignore. */
1187 		if (severity == MCE_NO_SEVERITY)
1188 			continue;
1189 
1190 		mce_read_aux(m, i);
1191 
1192 		/* assuming valid severity level != 0 */
1193 		m->severity = severity;
1194 
1195 		mce_log(m);
1196 
1197 		if (severity > *worst) {
1198 			*final = *m;
1199 			*worst = severity;
1200 		}
1201 	}
1202 
1203 	/* mce_clear_state will clear *final, save locally for use later */
1204 	*m = *final;
1205 }
1206 
1207 /*
1208  * The actual machine check handler. This only handles real
1209  * exceptions when something got corrupted coming in through int 18.
1210  *
1211  * This is executed in NMI context not subject to normal locking rules. This
1212  * implies that most kernel services cannot be safely used. Don't even
1213  * think about putting a printk in there!
1214  *
1215  * On Intel systems this is entered on all CPUs in parallel through
1216  * MCE broadcast. However some CPUs might be broken beyond repair,
1217  * so always be careful when synchronizing with others.
1218  *
1219  * Tracing and kprobes are disabled: if we interrupted a kernel context
1220  * with IF=1, we need to minimize stack usage.  There are also recursion
1221  * issues: if the machine check was due to a failure of the memory
1222  * backing the user stack, tracing that reads the user stack will cause
1223  * potentially infinite recursion.
1224  */
1225 void notrace do_machine_check(struct pt_regs *regs, long error_code)
1226 {
1227 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1228 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1229 	struct mca_config *cfg = &mca_cfg;
1230 	int cpu = smp_processor_id();
1231 	struct mce m, *final;
1232 	char *msg = NULL;
1233 	int worst = 0;
1234 
1235 	/*
1236 	 * Establish sequential order between the CPUs entering the machine
1237 	 * check handler.
1238 	 */
1239 	int order = -1;
1240 
1241 	/*
1242 	 * If no_way_out gets set, there is no safe way to recover from this
1243 	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1244 	 */
1245 	int no_way_out = 0;
1246 
1247 	/*
1248 	 * If kill_it gets set, there might be a way to recover from this
1249 	 * error.
1250 	 */
1251 	int kill_it = 0;
1252 
1253 	/*
1254 	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1255 	 * on Intel.
1256 	 */
1257 	int lmce = 1;
1258 
1259 	if (__mc_check_crashing_cpu(cpu))
1260 		return;
1261 
1262 	ist_enter(regs);
1263 
1264 	this_cpu_inc(mce_exception_count);
1265 
1266 	mce_gather_info(&m, regs);
1267 	m.tsc = rdtsc();
1268 
1269 	final = this_cpu_ptr(&mces_seen);
1270 	*final = m;
1271 
1272 	memset(valid_banks, 0, sizeof(valid_banks));
1273 	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1274 
1275 	barrier();
1276 
1277 	/*
1278 	 * When there is no restart IP, we might need to kill or panic.
1279 	 * Assume the worst for now, but if we find the
1280 	 * severity is MCE_AR_SEVERITY we have other options.
1281 	 */
1282 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1283 		kill_it = 1;
1284 
1285 	/*
1286 	 * Check if this MCE is signaled to only this logical processor,
1287 	 * on Intel, Zhaoxin only.
1288 	 */
1289 	if (m.cpuvendor == X86_VENDOR_INTEL ||
1290 	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
1291 		lmce = m.mcgstatus & MCG_STATUS_LMCES;
1292 
1293 	/*
1294 	 * Local machine check may already know that we have to panic.
1295 	 * Broadcast machine check begins rendezvous in mce_start().
1296 	 * Go through all banks in exclusion of the other CPUs. This way we
1297 	 * don't report duplicated events on shared banks because the first one
1298 	 * to see it will clear it.
1299 	 */
1300 	if (lmce) {
1301 		if (no_way_out)
1302 			mce_panic("Fatal local machine check", &m, msg);
1303 	} else {
1304 		order = mce_start(&no_way_out);
1305 	}
1306 
1307 	__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
1308 
1309 	if (!no_way_out)
1310 		mce_clear_state(toclear);
1311 
1312 	/*
1313 	 * Do most of the synchronization with other CPUs.
1314 	 * When there's any problem use only local no_way_out state.
1315 	 */
1316 	if (!lmce) {
1317 		if (mce_end(order) < 0)
1318 			no_way_out = worst >= MCE_PANIC_SEVERITY;
1319 	} else {
1320 		/*
1321 		 * If there was a fatal machine check we should have
1322 		 * already called mce_panic earlier in this function.
1323 		 * Since we re-read the banks, we might have found
1324 		 * something new. Check again to see if we found a
1325 		 * fatal error. We call "mce_severity()" again to
1326 		 * make sure we have the right "msg".
1327 		 */
1328 		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1329 			mce_severity(&m, cfg->tolerant, &msg, true);
1330 			mce_panic("Local fatal machine check!", &m, msg);
1331 		}
1332 	}
1333 
1334 	/*
1335 	 * If tolerant is at an insane level we drop requests to kill
1336 	 * processes and continue even when there is no way out.
1337 	 */
1338 	if (cfg->tolerant == 3)
1339 		kill_it = 0;
1340 	else if (no_way_out)
1341 		mce_panic("Fatal machine check on current CPU", &m, msg);
1342 
1343 	if (worst > 0)
1344 		irq_work_queue(&mce_irq_work);
1345 
1346 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1347 
1348 	sync_core();
1349 
1350 	if (worst != MCE_AR_SEVERITY && !kill_it)
1351 		goto out_ist;
1352 
1353 	/* Fault was in user mode and we need to take some action */
1354 	if ((m.cs & 3) == 3) {
1355 		ist_begin_non_atomic(regs);
1356 		local_irq_enable();
1357 
1358 		if (kill_it || do_memory_failure(&m))
1359 			force_sig(SIGBUS);
1360 		local_irq_disable();
1361 		ist_end_non_atomic();
1362 	} else {
1363 		if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
1364 			mce_panic("Failed kernel mode recovery", &m, msg);
1365 	}
1366 
1367 out_ist:
1368 	ist_exit(regs);
1369 }
1370 EXPORT_SYMBOL_GPL(do_machine_check);
1371 NOKPROBE_SYMBOL(do_machine_check);
1372 
1373 #ifndef CONFIG_MEMORY_FAILURE
1374 int memory_failure(unsigned long pfn, int flags)
1375 {
1376 	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1377 	BUG_ON(flags & MF_ACTION_REQUIRED);
1378 	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1379 	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1380 	       pfn);
1381 
1382 	return 0;
1383 }
1384 #endif
1385 
1386 /*
1387  * Periodic polling timer for "silent" machine check errors.  If the
1388  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1389  * errors, poll 2x slower (up to check_interval seconds).
1390  */
1391 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1392 
1393 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1394 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1395 
1396 static unsigned long mce_adjust_timer_default(unsigned long interval)
1397 {
1398 	return interval;
1399 }
1400 
1401 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1402 
1403 static void __start_timer(struct timer_list *t, unsigned long interval)
1404 {
1405 	unsigned long when = jiffies + interval;
1406 	unsigned long flags;
1407 
1408 	local_irq_save(flags);
1409 
1410 	if (!timer_pending(t) || time_before(when, t->expires))
1411 		mod_timer(t, round_jiffies(when));
1412 
1413 	local_irq_restore(flags);
1414 }
1415 
1416 static void mce_timer_fn(struct timer_list *t)
1417 {
1418 	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1419 	unsigned long iv;
1420 
1421 	WARN_ON(cpu_t != t);
1422 
1423 	iv = __this_cpu_read(mce_next_interval);
1424 
1425 	if (mce_available(this_cpu_ptr(&cpu_info))) {
1426 		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1427 
1428 		if (mce_intel_cmci_poll()) {
1429 			iv = mce_adjust_timer(iv);
1430 			goto done;
1431 		}
1432 	}
1433 
1434 	/*
1435 	 * Alert userspace if needed. If we logged an MCE, reduce the polling
1436 	 * interval, otherwise increase the polling interval.
1437 	 */
1438 	if (mce_notify_irq())
1439 		iv = max(iv / 2, (unsigned long) HZ/100);
1440 	else
1441 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1442 
1443 done:
1444 	__this_cpu_write(mce_next_interval, iv);
1445 	__start_timer(t, iv);
1446 }
1447 
1448 /*
1449  * Ensure that the timer is firing in @interval from now.
1450  */
1451 void mce_timer_kick(unsigned long interval)
1452 {
1453 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1454 	unsigned long iv = __this_cpu_read(mce_next_interval);
1455 
1456 	__start_timer(t, interval);
1457 
1458 	if (interval < iv)
1459 		__this_cpu_write(mce_next_interval, interval);
1460 }
1461 
1462 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1463 static void mce_timer_delete_all(void)
1464 {
1465 	int cpu;
1466 
1467 	for_each_online_cpu(cpu)
1468 		del_timer_sync(&per_cpu(mce_timer, cpu));
1469 }
1470 
1471 /*
1472  * Notify the user(s) about new machine check events.
1473  * Can be called from interrupt context, but not from machine check/NMI
1474  * context.
1475  */
1476 int mce_notify_irq(void)
1477 {
1478 	/* Not more than two messages every minute */
1479 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1480 
1481 	if (test_and_clear_bit(0, &mce_need_notify)) {
1482 		mce_work_trigger();
1483 
1484 		if (__ratelimit(&ratelimit))
1485 			pr_info(HW_ERR "Machine check events logged\n");
1486 
1487 		return 1;
1488 	}
1489 	return 0;
1490 }
1491 EXPORT_SYMBOL_GPL(mce_notify_irq);
1492 
1493 static void __mcheck_cpu_mce_banks_init(void)
1494 {
1495 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1496 	u8 n_banks = this_cpu_read(mce_num_banks);
1497 	int i;
1498 
1499 	for (i = 0; i < n_banks; i++) {
1500 		struct mce_bank *b = &mce_banks[i];
1501 
1502 		/*
1503 		 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
1504 		 * the required vendor quirks before
1505 		 * __mcheck_cpu_init_clear_banks() does the final bank setup.
1506 		 */
1507 		b->ctl = -1ULL;
1508 		b->init = 1;
1509 	}
1510 }
1511 
1512 /*
1513  * Initialize Machine Checks for a CPU.
1514  */
1515 static void __mcheck_cpu_cap_init(void)
1516 {
1517 	u64 cap;
1518 	u8 b;
1519 
1520 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1521 
1522 	b = cap & MCG_BANKCNT_MASK;
1523 
1524 	if (b > MAX_NR_BANKS) {
1525 		pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1526 			smp_processor_id(), MAX_NR_BANKS, b);
1527 		b = MAX_NR_BANKS;
1528 	}
1529 
1530 	this_cpu_write(mce_num_banks, b);
1531 
1532 	__mcheck_cpu_mce_banks_init();
1533 
1534 	/* Use accurate RIP reporting if available. */
1535 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1536 		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1537 
1538 	if (cap & MCG_SER_P)
1539 		mca_cfg.ser = 1;
1540 }
1541 
1542 static void __mcheck_cpu_init_generic(void)
1543 {
1544 	enum mcp_flags m_fl = 0;
1545 	mce_banks_t all_banks;
1546 	u64 cap;
1547 
1548 	if (!mca_cfg.bootlog)
1549 		m_fl = MCP_DONTLOG;
1550 
1551 	/*
1552 	 * Log the machine checks left over from the previous reset.
1553 	 */
1554 	bitmap_fill(all_banks, MAX_NR_BANKS);
1555 	machine_check_poll(MCP_UC | m_fl, &all_banks);
1556 
1557 	cr4_set_bits(X86_CR4_MCE);
1558 
1559 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1560 	if (cap & MCG_CTL_P)
1561 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1562 }
1563 
1564 static void __mcheck_cpu_init_clear_banks(void)
1565 {
1566 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1567 	int i;
1568 
1569 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1570 		struct mce_bank *b = &mce_banks[i];
1571 
1572 		if (!b->init)
1573 			continue;
1574 		wrmsrl(msr_ops.ctl(i), b->ctl);
1575 		wrmsrl(msr_ops.status(i), 0);
1576 	}
1577 }
1578 
1579 /*
1580  * Do a final check to see if there are any unused/RAZ banks.
1581  *
1582  * This must be done after the banks have been initialized and any quirks have
1583  * been applied.
1584  *
1585  * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1586  * Otherwise, a user who disables a bank will not be able to re-enable it
1587  * without a system reboot.
1588  */
1589 static void __mcheck_cpu_check_banks(void)
1590 {
1591 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1592 	u64 msrval;
1593 	int i;
1594 
1595 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1596 		struct mce_bank *b = &mce_banks[i];
1597 
1598 		if (!b->init)
1599 			continue;
1600 
1601 		rdmsrl(msr_ops.ctl(i), msrval);
1602 		b->init = !!msrval;
1603 	}
1604 }
1605 
1606 /*
1607  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1608  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1609  * Vol 3B Table 15-20). But this confuses both the code that determines
1610  * whether the machine check occurred in kernel or user mode, and also
1611  * the severity assessment code. Pretend that EIPV was set, and take the
1612  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1613  */
1614 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1615 {
1616 	if (bank != 0)
1617 		return;
1618 	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1619 		return;
1620 	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1621 		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1622 			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1623 			  MCACOD)) !=
1624 			 (MCI_STATUS_UC|MCI_STATUS_EN|
1625 			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1626 			  MCI_STATUS_AR|MCACOD_INSTR))
1627 		return;
1628 
1629 	m->mcgstatus |= MCG_STATUS_EIPV;
1630 	m->ip = regs->ip;
1631 	m->cs = regs->cs;
1632 }
1633 
1634 /* Add per CPU specific workarounds here */
1635 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1636 {
1637 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1638 	struct mca_config *cfg = &mca_cfg;
1639 
1640 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1641 		pr_info("unknown CPU type - not enabling MCE support\n");
1642 		return -EOPNOTSUPP;
1643 	}
1644 
1645 	/* This should be disabled by the BIOS, but isn't always */
1646 	if (c->x86_vendor == X86_VENDOR_AMD) {
1647 		if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
1648 			/*
1649 			 * disable GART TBL walk error reporting, which
1650 			 * trips off incorrectly with the IOMMU & 3ware
1651 			 * & Cerberus:
1652 			 */
1653 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1654 		}
1655 		if (c->x86 < 0x11 && cfg->bootlog < 0) {
1656 			/*
1657 			 * Lots of broken BIOSes around that don't clear them
1658 			 * by default and leave crap in there. Don't log:
1659 			 */
1660 			cfg->bootlog = 0;
1661 		}
1662 		/*
1663 		 * Various K7s with broken bank 0 around. Always disable
1664 		 * by default.
1665 		 */
1666 		if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
1667 			mce_banks[0].ctl = 0;
1668 
1669 		/*
1670 		 * overflow_recov is supported for F15h Models 00h-0fh
1671 		 * even though we don't have a CPUID bit for it.
1672 		 */
1673 		if (c->x86 == 0x15 && c->x86_model <= 0xf)
1674 			mce_flags.overflow_recov = 1;
1675 
1676 	}
1677 
1678 	if (c->x86_vendor == X86_VENDOR_INTEL) {
1679 		/*
1680 		 * SDM documents that on family 6 bank 0 should not be written
1681 		 * because it aliases to another special BIOS controlled
1682 		 * register.
1683 		 * But it's not aliased anymore on model 0x1a+.
1684 		 * Don't ignore bank 0 completely because there could be a
1685 		 * valid event later, merely don't write CTL0.
1686 		 */
1687 
1688 		if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
1689 			mce_banks[0].init = 0;
1690 
1691 		/*
1692 		 * All newer Intel systems support MCE broadcasting. Enable
1693 		 * synchronization with a one second timeout.
1694 		 */
1695 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1696 			cfg->monarch_timeout < 0)
1697 			cfg->monarch_timeout = USEC_PER_SEC;
1698 
1699 		/*
1700 		 * There are also broken BIOSes on some Pentium M and
1701 		 * earlier systems:
1702 		 */
1703 		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1704 			cfg->bootlog = 0;
1705 
1706 		if (c->x86 == 6 && c->x86_model == 45)
1707 			quirk_no_way_out = quirk_sandybridge_ifu;
1708 	}
1709 
1710 	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1711 		/*
1712 		 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
1713 		 * synchronization with a one second timeout.
1714 		 */
1715 		if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1716 			if (cfg->monarch_timeout < 0)
1717 				cfg->monarch_timeout = USEC_PER_SEC;
1718 		}
1719 	}
1720 
1721 	if (cfg->monarch_timeout < 0)
1722 		cfg->monarch_timeout = 0;
1723 	if (cfg->bootlog != 0)
1724 		cfg->panic_timeout = 30;
1725 
1726 	return 0;
1727 }
1728 
1729 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1730 {
1731 	if (c->x86 != 5)
1732 		return 0;
1733 
1734 	switch (c->x86_vendor) {
1735 	case X86_VENDOR_INTEL:
1736 		intel_p5_mcheck_init(c);
1737 		return 1;
1738 		break;
1739 	case X86_VENDOR_CENTAUR:
1740 		winchip_mcheck_init(c);
1741 		return 1;
1742 		break;
1743 	default:
1744 		return 0;
1745 	}
1746 
1747 	return 0;
1748 }
1749 
1750 /*
1751  * Init basic CPU features needed for early decoding of MCEs.
1752  */
1753 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1754 {
1755 	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
1756 		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1757 		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
1758 		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
1759 
1760 		if (mce_flags.smca) {
1761 			msr_ops.ctl	= smca_ctl_reg;
1762 			msr_ops.status	= smca_status_reg;
1763 			msr_ops.addr	= smca_addr_reg;
1764 			msr_ops.misc	= smca_misc_reg;
1765 		}
1766 	}
1767 }
1768 
1769 static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1770 {
1771 	struct mca_config *cfg = &mca_cfg;
1772 
1773 	 /*
1774 	  * All newer Centaur CPUs support MCE broadcasting. Enable
1775 	  * synchronization with a one second timeout.
1776 	  */
1777 	if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1778 	     c->x86 > 6) {
1779 		if (cfg->monarch_timeout < 0)
1780 			cfg->monarch_timeout = USEC_PER_SEC;
1781 	}
1782 }
1783 
1784 static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1785 {
1786 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1787 
1788 	/*
1789 	 * These CPUs have MCA bank 8 which reports only one error type called
1790 	 * SVAD (System View Address Decoder). The reporting of that error is
1791 	 * controlled by IA32_MC8.CTL.0.
1792 	 *
1793 	 * If enabled, prefetching on these CPUs will cause SVAD MCE when
1794 	 * virtual machines start and result in a system panic. Always disable
1795 	 * bank 8 SVAD error by default.
1796 	 */
1797 	if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1798 	    (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1799 		if (this_cpu_read(mce_num_banks) > 8)
1800 			mce_banks[8].ctl = 0;
1801 	}
1802 
1803 	intel_init_cmci();
1804 	intel_init_lmce();
1805 	mce_adjust_timer = cmci_intel_adjust_timer;
1806 }
1807 
1808 static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
1809 {
1810 	intel_clear_lmce();
1811 }
1812 
1813 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1814 {
1815 	switch (c->x86_vendor) {
1816 	case X86_VENDOR_INTEL:
1817 		mce_intel_feature_init(c);
1818 		mce_adjust_timer = cmci_intel_adjust_timer;
1819 		break;
1820 
1821 	case X86_VENDOR_AMD: {
1822 		mce_amd_feature_init(c);
1823 		break;
1824 		}
1825 
1826 	case X86_VENDOR_HYGON:
1827 		mce_hygon_feature_init(c);
1828 		break;
1829 
1830 	case X86_VENDOR_CENTAUR:
1831 		mce_centaur_feature_init(c);
1832 		break;
1833 
1834 	case X86_VENDOR_ZHAOXIN:
1835 		mce_zhaoxin_feature_init(c);
1836 		break;
1837 
1838 	default:
1839 		break;
1840 	}
1841 }
1842 
1843 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1844 {
1845 	switch (c->x86_vendor) {
1846 	case X86_VENDOR_INTEL:
1847 		mce_intel_feature_clear(c);
1848 		break;
1849 
1850 	case X86_VENDOR_ZHAOXIN:
1851 		mce_zhaoxin_feature_clear(c);
1852 		break;
1853 
1854 	default:
1855 		break;
1856 	}
1857 }
1858 
1859 static void mce_start_timer(struct timer_list *t)
1860 {
1861 	unsigned long iv = check_interval * HZ;
1862 
1863 	if (mca_cfg.ignore_ce || !iv)
1864 		return;
1865 
1866 	this_cpu_write(mce_next_interval, iv);
1867 	__start_timer(t, iv);
1868 }
1869 
1870 static void __mcheck_cpu_setup_timer(void)
1871 {
1872 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1873 
1874 	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1875 }
1876 
1877 static void __mcheck_cpu_init_timer(void)
1878 {
1879 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1880 
1881 	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1882 	mce_start_timer(t);
1883 }
1884 
1885 bool filter_mce(struct mce *m)
1886 {
1887 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1888 		return amd_filter_mce(m);
1889 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1890 		return intel_filter_mce(m);
1891 
1892 	return false;
1893 }
1894 
1895 /* Handle unconfigured int18 (should never happen) */
1896 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1897 {
1898 	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1899 	       smp_processor_id());
1900 }
1901 
1902 /* Call the installed machine check handler for this CPU setup. */
1903 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1904 						unexpected_machine_check;
1905 
1906 dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
1907 {
1908 	machine_check_vector(regs, error_code);
1909 }
1910 NOKPROBE_SYMBOL(do_mce);
1911 
1912 /*
1913  * Called for each booted CPU to set up machine checks.
1914  * Must be called with preempt off:
1915  */
1916 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1917 {
1918 	if (mca_cfg.disabled)
1919 		return;
1920 
1921 	if (__mcheck_cpu_ancient_init(c))
1922 		return;
1923 
1924 	if (!mce_available(c))
1925 		return;
1926 
1927 	__mcheck_cpu_cap_init();
1928 
1929 	if (__mcheck_cpu_apply_quirks(c) < 0) {
1930 		mca_cfg.disabled = 1;
1931 		return;
1932 	}
1933 
1934 	if (mce_gen_pool_init()) {
1935 		mca_cfg.disabled = 1;
1936 		pr_emerg("Couldn't allocate MCE records pool!\n");
1937 		return;
1938 	}
1939 
1940 	machine_check_vector = do_machine_check;
1941 
1942 	__mcheck_cpu_init_early(c);
1943 	__mcheck_cpu_init_generic();
1944 	__mcheck_cpu_init_vendor(c);
1945 	__mcheck_cpu_init_clear_banks();
1946 	__mcheck_cpu_check_banks();
1947 	__mcheck_cpu_setup_timer();
1948 }
1949 
1950 /*
1951  * Called for each booted CPU to clear some machine check opt-ins
1952  */
1953 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1954 {
1955 	if (mca_cfg.disabled)
1956 		return;
1957 
1958 	if (!mce_available(c))
1959 		return;
1960 
1961 	/*
1962 	 * Possibly to clear general settings generic to x86
1963 	 * __mcheck_cpu_clear_generic(c);
1964 	 */
1965 	__mcheck_cpu_clear_vendor(c);
1966 
1967 }
1968 
1969 static void __mce_disable_bank(void *arg)
1970 {
1971 	int bank = *((int *)arg);
1972 	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1973 	cmci_disable_bank(bank);
1974 }
1975 
1976 void mce_disable_bank(int bank)
1977 {
1978 	if (bank >= this_cpu_read(mce_num_banks)) {
1979 		pr_warn(FW_BUG
1980 			"Ignoring request to disable invalid MCA bank %d.\n",
1981 			bank);
1982 		return;
1983 	}
1984 	set_bit(bank, mce_banks_ce_disabled);
1985 	on_each_cpu(__mce_disable_bank, &bank, 1);
1986 }
1987 
1988 /*
1989  * mce=off Disables machine check
1990  * mce=no_cmci Disables CMCI
1991  * mce=no_lmce Disables LMCE
1992  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1993  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1994  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1995  *	monarchtimeout is how long to wait for other CPUs on machine
1996  *	check, or 0 to not wait
1997  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1998  *	and older.
1999  * mce=nobootlog Don't log MCEs from before booting.
2000  * mce=bios_cmci_threshold Don't program the CMCI threshold
2001  * mce=recovery force enable memcpy_mcsafe()
2002  */
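/*
 * Illustrative examples, assumed from the parsing in mcheck_enable() below
 * rather than taken from external documentation: "mce=2,100" sets tolerant=2
 * and monarch_timeout=100 via two get_option() calls, while string options
 * such as no_cmci and dont_log_ce are passed one per "mce=" instance, e.g.
 * "mce=no_cmci mce=dont_log_ce".
 */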
2003 static int __init mcheck_enable(char *str)
2004 {
2005 	struct mca_config *cfg = &mca_cfg;
2006 
2007 	if (*str == 0) {
2008 		enable_p5_mce();
2009 		return 1;
2010 	}
2011 	if (*str == '=')
2012 		str++;
2013 	if (!strcmp(str, "off"))
2014 		cfg->disabled = 1;
2015 	else if (!strcmp(str, "no_cmci"))
2016 		cfg->cmci_disabled = true;
2017 	else if (!strcmp(str, "no_lmce"))
2018 		cfg->lmce_disabled = 1;
2019 	else if (!strcmp(str, "dont_log_ce"))
2020 		cfg->dont_log_ce = true;
2021 	else if (!strcmp(str, "ignore_ce"))
2022 		cfg->ignore_ce = true;
2023 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2024 		cfg->bootlog = (str[0] == 'b');
2025 	else if (!strcmp(str, "bios_cmci_threshold"))
2026 		cfg->bios_cmci_threshold = 1;
2027 	else if (!strcmp(str, "recovery"))
2028 		cfg->recovery = 1;
2029 	else if (isdigit(str[0])) {
2030 		if (get_option(&str, &cfg->tolerant) == 2)
2031 			get_option(&str, &(cfg->monarch_timeout));
2032 	} else {
2033 		pr_info("mce argument %s ignored. Please use /sys\n", str);
2034 		return 0;
2035 	}
2036 	return 1;
2037 }
2038 __setup("mce", mcheck_enable);
2039 
2040 int __init mcheck_init(void)
2041 {
2042 	mcheck_intel_therm_init();
2043 	mce_register_decode_chain(&first_nb);
2044 	mce_register_decode_chain(&mce_uc_nb);
2045 	mce_register_decode_chain(&mce_default_nb);
2046 	mcheck_vendor_init_severity();
2047 
2048 	INIT_WORK(&mce_work, mce_gen_pool_process);
2049 	init_irq_work(&mce_irq_work, mce_irq_work_cb);
2050 
2051 	return 0;
2052 }
2053 
2054 /*
2055  * mce_syscore: PM support
2056  */
2057 
2058 /*
2059  * Disable machine checks on suspend and shutdown. We can't really handle
2060  * them later.
2061  */
2062 static void mce_disable_error_reporting(void)
2063 {
2064 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2065 	int i;
2066 
2067 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2068 		struct mce_bank *b = &mce_banks[i];
2069 
2070 		if (b->init)
2071 			wrmsrl(msr_ops.ctl(i), 0);
2072 	}
2073 	return;
2074 }
2075 
2076 static void vendor_disable_error_reporting(void)
2077 {
2078 	/*
2079 	 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2080 	 * MSRs are socket-wide. Disabling them for just a single offlined CPU
2081 	 * is bad, since it will inhibit reporting for all shared resources on
2082 	 * the socket like the last level cache (LLC), the integrated memory
2083 	 * controller (iMC), etc.
2084 	 */
2085 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
2086 	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
2087 	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2088 	    boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2089 		return;
2090 
2091 	mce_disable_error_reporting();
2092 }
2093 
2094 static int mce_syscore_suspend(void)
2095 {
2096 	vendor_disable_error_reporting();
2097 	return 0;
2098 }
2099 
2100 static void mce_syscore_shutdown(void)
2101 {
2102 	vendor_disable_error_reporting();
2103 }
2104 
2105 /*
2106  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2107  * Only one CPU is active at this time, the others get re-added later using
2108  * CPU hotplug:
2109  */
2110 static void mce_syscore_resume(void)
2111 {
2112 	__mcheck_cpu_init_generic();
2113 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2114 	__mcheck_cpu_init_clear_banks();
2115 }
2116 
2117 static struct syscore_ops mce_syscore_ops = {
2118 	.suspend	= mce_syscore_suspend,
2119 	.shutdown	= mce_syscore_shutdown,
2120 	.resume		= mce_syscore_resume,
2121 };
2122 
2123 /*
2124  * mce_device: Sysfs support
2125  */
2126 
2127 static void mce_cpu_restart(void *data)
2128 {
2129 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2130 		return;
2131 	__mcheck_cpu_init_generic();
2132 	__mcheck_cpu_init_clear_banks();
2133 	__mcheck_cpu_init_timer();
2134 }
2135 
2136 /* Reinit MCEs after user configuration changes */
2137 static void mce_restart(void)
2138 {
2139 	mce_timer_delete_all();
2140 	on_each_cpu(mce_cpu_restart, NULL, 1);
2141 }
2142 
2143 /* Toggle features for corrected errors */
2144 static void mce_disable_cmci(void *data)
2145 {
2146 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2147 		return;
2148 	cmci_clear();
2149 }
2150 
2151 static void mce_enable_ce(void *all)
2152 {
2153 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2154 		return;
2155 	cmci_reenable();
2156 	cmci_recheck();
2157 	if (all)
2158 		__mcheck_cpu_init_timer();
2159 }
2160 
2161 static struct bus_type mce_subsys = {
2162 	.name		= "machinecheck",
2163 	.dev_name	= "machinecheck",
2164 };
2165 
2166 DEFINE_PER_CPU(struct device *, mce_device);
2167 
2168 static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
2169 {
2170 	return container_of(attr, struct mce_bank_dev, attr);
2171 }
2172 
2173 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2174 			 char *buf)
2175 {
2176 	u8 bank = attr_to_bank(attr)->bank;
2177 	struct mce_bank *b;
2178 
2179 	if (bank >= per_cpu(mce_num_banks, s->id))
2180 		return -EINVAL;
2181 
2182 	b = &per_cpu(mce_banks_array, s->id)[bank];
2183 
2184 	if (!b->init)
2185 		return -ENODEV;
2186 
2187 	return sprintf(buf, "%llx\n", b->ctl);
2188 }
2189 
2190 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2191 			const char *buf, size_t size)
2192 {
2193 	u8 bank = attr_to_bank(attr)->bank;
2194 	struct mce_bank *b;
2195 	u64 new;
2196 
2197 	if (kstrtou64(buf, 0, &new) < 0)
2198 		return -EINVAL;
2199 
2200 	if (bank >= per_cpu(mce_num_banks, s->id))
2201 		return -EINVAL;
2202 
2203 	b = &per_cpu(mce_banks_array, s->id)[bank];
2204 
2205 	if (!b->init)
2206 		return -ENODEV;
2207 
2208 	b->ctl = new;
2209 	mce_restart();
2210 
2211 	return size;
2212 }
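
/*
 * A sketch of how the bank attributes above are typically exercised from
 * user space; the sysfs paths are an assumption based on the "machinecheck"
 * subsystem registered below, not something spelled out in this file:
 *
 *	# read the subevent mask of bank 0 as stored for CPU 2
 *	cat /sys/devices/system/machinecheck/machinecheck2/bank0
 *
 *	# update CPU 2's copy of the bank 0 control value; set_bank() stores
 *	# it and calls mce_restart(), which rewrites the bank control MSRs
 *	# on every CPU from that CPU's own mce_banks_array
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck2/bank0
 */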
2213 
2214 static ssize_t set_ignore_ce(struct device *s,
2215 			     struct device_attribute *attr,
2216 			     const char *buf, size_t size)
2217 {
2218 	u64 new;
2219 
2220 	if (kstrtou64(buf, 0, &new) < 0)
2221 		return -EINVAL;
2222 
2223 	mutex_lock(&mce_sysfs_mutex);
2224 	if (mca_cfg.ignore_ce ^ !!new) {
2225 		if (new) {
2226 			/* disable ce features */
2227 			mce_timer_delete_all();
2228 			on_each_cpu(mce_disable_cmci, NULL, 1);
2229 			mca_cfg.ignore_ce = true;
2230 		} else {
2231 			/* enable ce features */
2232 			mca_cfg.ignore_ce = false;
2233 			on_each_cpu(mce_enable_ce, (void *)1, 1);
2234 		}
2235 	}
2236 	mutex_unlock(&mce_sysfs_mutex);
2237 
2238 	return size;
2239 }
2240 
2241 static ssize_t set_cmci_disabled(struct device *s,
2242 				 struct device_attribute *attr,
2243 				 const char *buf, size_t size)
2244 {
2245 	u64 new;
2246 
2247 	if (kstrtou64(buf, 0, &new) < 0)
2248 		return -EINVAL;
2249 
2250 	mutex_lock(&mce_sysfs_mutex);
2251 	if (mca_cfg.cmci_disabled ^ !!new) {
2252 		if (new) {
2253 			/* disable cmci */
2254 			on_each_cpu(mce_disable_cmci, NULL, 1);
2255 			mca_cfg.cmci_disabled = true;
2256 		} else {
2257 			/* enable cmci */
2258 			mca_cfg.cmci_disabled = false;
2259 			on_each_cpu(mce_enable_ce, NULL, 1);
2260 		}
2261 	}
2262 	mutex_unlock(&mce_sysfs_mutex);
2263 
2264 	return size;
2265 }
2266 
2267 static ssize_t store_int_with_restart(struct device *s,
2268 				      struct device_attribute *attr,
2269 				      const char *buf, size_t size)
2270 {
2271 	unsigned long old_check_interval = check_interval;
2272 	ssize_t ret = device_store_ulong(s, attr, buf, size);
2273 
2274 	if (check_interval == old_check_interval)
2275 		return ret;
2276 
2277 	mutex_lock(&mce_sysfs_mutex);
2278 	mce_restart();
2279 	mutex_unlock(&mce_sysfs_mutex);
2280 
2281 	return ret;
2282 }
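
/*
 * Example, with the same assumed sysfs path as for the bank attributes:
 * "echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval"
 * stores a new polling interval in seconds (mce_start_timer() multiplies it
 * by HZ) and, because the value changed, calls mce_restart() under
 * mce_sysfs_mutex.
 */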
2283 
2284 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2285 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2286 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2287 
2288 static struct dev_ext_attribute dev_attr_check_interval = {
2289 	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2290 	&check_interval
2291 };
2292 
2293 static struct dev_ext_attribute dev_attr_ignore_ce = {
2294 	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2295 	&mca_cfg.ignore_ce
2296 };
2297 
2298 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2299 	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2300 	&mca_cfg.cmci_disabled
2301 };
2302 
2303 static struct device_attribute *mce_device_attrs[] = {
2304 	&dev_attr_tolerant.attr,
2305 	&dev_attr_check_interval.attr,
2306 #ifdef CONFIG_X86_MCELOG_LEGACY
2307 	&dev_attr_trigger,
2308 #endif
2309 	&dev_attr_monarch_timeout.attr,
2310 	&dev_attr_dont_log_ce.attr,
2311 	&dev_attr_ignore_ce.attr,
2312 	&dev_attr_cmci_disabled.attr,
2313 	NULL
2314 };
2315 
2316 static cpumask_var_t mce_device_initialized;
2317 
2318 static void mce_device_release(struct device *dev)
2319 {
2320 	kfree(dev);
2321 }
2322 
2323 /* Per CPU device init. All of the CPUs still share the same bank device: */
2324 static int mce_device_create(unsigned int cpu)
2325 {
2326 	struct device *dev;
2327 	int err;
2328 	int i, j;
2329 
2330 	if (!mce_available(&boot_cpu_data))
2331 		return -EIO;
2332 
2333 	dev = per_cpu(mce_device, cpu);
2334 	if (dev)
2335 		return 0;
2336 
2337 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2338 	if (!dev)
2339 		return -ENOMEM;
2340 	dev->id  = cpu;
2341 	dev->bus = &mce_subsys;
2342 	dev->release = &mce_device_release;
2343 
2344 	err = device_register(dev);
2345 	if (err) {
2346 		put_device(dev);
2347 		return err;
2348 	}
2349 
2350 	for (i = 0; mce_device_attrs[i]; i++) {
2351 		err = device_create_file(dev, mce_device_attrs[i]);
2352 		if (err)
2353 			goto error;
2354 	}
2355 	for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
2356 		err = device_create_file(dev, &mce_bank_devs[j].attr);
2357 		if (err)
2358 			goto error2;
2359 	}
2360 	cpumask_set_cpu(cpu, mce_device_initialized);
2361 	per_cpu(mce_device, cpu) = dev;
2362 
2363 	return 0;
2364 error2:
2365 	while (--j >= 0)
2366 		device_remove_file(dev, &mce_bank_devs[j].attr);
2367 error:
2368 	while (--i >= 0)
2369 		device_remove_file(dev, mce_device_attrs[i]);
2370 
2371 	device_unregister(dev);
2372 
2373 	return err;
2374 }
2375 
2376 static void mce_device_remove(unsigned int cpu)
2377 {
2378 	struct device *dev = per_cpu(mce_device, cpu);
2379 	int i;
2380 
2381 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2382 		return;
2383 
2384 	for (i = 0; mce_device_attrs[i]; i++)
2385 		device_remove_file(dev, mce_device_attrs[i]);
2386 
2387 	for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
2388 		device_remove_file(dev, &mce_bank_devs[i].attr);
2389 
2390 	device_unregister(dev);
2391 	cpumask_clear_cpu(cpu, mce_device_initialized);
2392 	per_cpu(mce_device, cpu) = NULL;
2393 }
2394 
2395 /* Make sure there are no machine checks on offlined CPUs. */
2396 static void mce_disable_cpu(void)
2397 {
2398 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2399 		return;
2400 
2401 	if (!cpuhp_tasks_frozen)
2402 		cmci_clear();
2403 
2404 	vendor_disable_error_reporting();
2405 }
2406 
2407 static void mce_reenable_cpu(void)
2408 {
2409 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2410 	int i;
2411 
2412 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2413 		return;
2414 
2415 	if (!cpuhp_tasks_frozen)
2416 		cmci_reenable();
2417 	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2418 		struct mce_bank *b = &mce_banks[i];
2419 
2420 		if (b->init)
2421 			wrmsrl(msr_ops.ctl(i), b->ctl);
2422 	}
2423 }
2424 
2425 static int mce_cpu_dead(unsigned int cpu)
2426 {
2427 	mce_intel_hcpu_update(cpu);
2428 
2429 	/* intentionally ignoring frozen here */
2430 	if (!cpuhp_tasks_frozen)
2431 		cmci_rediscover();
2432 	return 0;
2433 }
2434 
2435 static int mce_cpu_online(unsigned int cpu)
2436 {
2437 	struct timer_list *t = this_cpu_ptr(&mce_timer);
2438 	int ret;
2439 
2440 	mce_device_create(cpu);
2441 
2442 	ret = mce_threshold_create_device(cpu);
2443 	if (ret) {
2444 		mce_device_remove(cpu);
2445 		return ret;
2446 	}
2447 	mce_reenable_cpu();
2448 	mce_start_timer(t);
2449 	return 0;
2450 }
2451 
2452 static int mce_cpu_pre_down(unsigned int cpu)
2453 {
2454 	struct timer_list *t = this_cpu_ptr(&mce_timer);
2455 
2456 	mce_disable_cpu();
2457 	del_timer_sync(t);
2458 	mce_threshold_remove_device(cpu);
2459 	mce_device_remove(cpu);
2460 	return 0;
2461 }
2462 
2463 static __init void mce_init_banks(void)
2464 {
2465 	int i;
2466 
2467 	for (i = 0; i < MAX_NR_BANKS; i++) {
2468 		struct mce_bank_dev *b = &mce_bank_devs[i];
2469 		struct device_attribute *a = &b->attr;
2470 
2471 		b->bank = i;
2472 
2473 		sysfs_attr_init(&a->attr);
2474 		a->attr.name	= b->attrname;
2475 		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2476 
2477 		a->attr.mode	= 0644;
2478 		a->show		= show_bank;
2479 		a->store	= set_bank;
2480 	}
2481 }
2482 
2483 static __init int mcheck_init_device(void)
2484 {
2485 	int err;
2486 
2487 	/*
2488 	 * Check if we have a spare virtual bit. This will only become
2489 	 * a problem if/when we move beyond 5-level page tables.
2490 	 */
2491 	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2492 
2493 	if (!mce_available(&boot_cpu_data)) {
2494 		err = -EIO;
2495 		goto err_out;
2496 	}
2497 
2498 	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2499 		err = -ENOMEM;
2500 		goto err_out;
2501 	}
2502 
2503 	mce_init_banks();
2504 
2505 	err = subsys_system_register(&mce_subsys, NULL);
2506 	if (err)
2507 		goto err_out_mem;
2508 
2509 	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2510 				mce_cpu_dead);
2511 	if (err)
2512 		goto err_out_mem;
2513 
2514 	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2515 				mce_cpu_online, mce_cpu_pre_down);
2516 	if (err < 0)
2517 		goto err_out_online;
2518 
2519 	register_syscore_ops(&mce_syscore_ops);
2520 
2521 	return 0;
2522 
2523 err_out_online:
2524 	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2525 
2526 err_out_mem:
2527 	free_cpumask_var(mce_device_initialized);
2528 
2529 err_out:
2530 	pr_err("Unable to init MCE device (rc: %d)\n", err);
2531 
2532 	return err;
2533 }
2534 device_initcall_sync(mcheck_init_device);
2535 
2536 /*
2537  * Old style boot options parsing. Only for compatibility.
2538  */
2539 static int __init mcheck_disable(char *str)
2540 {
2541 	mca_cfg.disabled = 1;
2542 	return 1;
2543 }
2544 __setup("nomce", mcheck_disable);
2545 
2546 #ifdef CONFIG_DEBUG_FS
2547 struct dentry *mce_get_debugfs_dir(void)
2548 {
2549 	static struct dentry *dmce;
2550 
2551 	if (!dmce)
2552 		dmce = debugfs_create_dir("mce", NULL);
2553 
2554 	return dmce;
2555 }
2556 
2557 static void mce_reset(void)
2558 {
2559 	cpu_missing = 0;
2560 	atomic_set(&mce_fake_panicked, 0);
2561 	atomic_set(&mce_executing, 0);
2562 	atomic_set(&mce_callin, 0);
2563 	atomic_set(&global_nwo, 0);
2564 }
2565 
2566 static int fake_panic_get(void *data, u64 *val)
2567 {
2568 	*val = fake_panic;
2569 	return 0;
2570 }
2571 
2572 static int fake_panic_set(void *data, u64 val)
2573 {
2574 	mce_reset();
2575 	fake_panic = val;
2576 	return 0;
2577 }
2578 
2579 DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2580 			 "%llu\n");
2581 
2582 static void __init mcheck_debugfs_init(void)
2583 {
2584 	struct dentry *dmce;
2585 
2586 	dmce = mce_get_debugfs_dir();
2587 	debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2588 				   &fake_panic_fops);
2589 }
2590 #else
2591 static void __init mcheck_debugfs_init(void) { }
2592 #endif
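
/*
 * With CONFIG_DEBUG_FS enabled and debugfs mounted in its usual place (an
 * assumption about the system, not something this file controls), the knob
 * above appears as /sys/kernel/debug/mce/fake_panic. Reading it reports the
 * current fake_panic value; fake_panic_set() additionally resets the MCE
 * rendezvous state via mce_reset() before storing the new value.
 */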
2593 
2594 DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2595 EXPORT_SYMBOL_GPL(mcsafe_key);
2596 
2597 static int __init mcheck_late_init(void)
2598 {
2599 	if (mca_cfg.recovery)
2600 		static_branch_inc(&mcsafe_key);
2601 
2602 	mcheck_debugfs_init();
2603 	cec_init();
2604 
2605 	/*
2606 	 * Flush out everything that has been logged during early boot, now that
2607 	 * everything has been initialized (workqueues, decoders, ...).
2608 	 */
2609 	mce_schedule_work();
2610 
2611 	return 0;
2612 }
2613 late_initcall(mcheck_late_init);
2614