xref: /linux/arch/x86/kernel/nmi.c (revision 553222f3e81f18da31b2552e18dc519715198590)
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *  Copyright (C) 2011	Don Zickus Red Hat, Inc.
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <linux/mca.h>

#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>

#define NMI_MAX_NAMELEN	16
struct nmiaction {
	struct list_head list;
	nmi_handler_t handler;
	unsigned int flags;
	char *name;
};

struct nmi_desc {
	spinlock_t lock;
	struct list_head head;
};

static struct nmi_desc nmi_desc[NMI_MAX] =
{
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
		.head = LIST_HEAD_INIT(nmi_desc[0].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
		.head = LIST_HEAD_INIT(nmi_desc[1].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
		.head = LIST_HEAD_INIT(nmi_desc[2].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
		.head = LIST_HEAD_INIT(nmi_desc[3].head),
	},

};

struct nmi_stats {
	unsigned int normal;
	unsigned int unknown;
	unsigned int external;
	unsigned int swallow;
};

static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);

static int ignore_nmis;

int unknown_nmi_panic;
/*
 * Prevent the NMI reason port (0x61) from being accessed simultaneously;
 * this lock may only be taken from the NMI handler.
 */
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);

static int __init setup_unknown_nmi_panic(char *str)
{
	unknown_nmi_panic = 1;
	return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);

#define nmi_to_desc(type) (&nmi_desc[type])

static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *a;
	int handled = 0;

	rcu_read_lock();

	/*
	 * NMIs are edge-triggered, which means if you have enough
	 * of them concurrently, you can lose some because only one
	 * can be latched at any given time.  Walk the whole list
	 * to handle those situations.
	 */
	list_for_each_entry_rcu(a, &desc->head, list)
		handled += a->handler(type, regs);

	rcu_read_unlock();

	/* return total number of NMI events handled */
	return handled;
}
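
/*
 * Illustrative sketch (not part of this file): each handler on the list
 * returns how many NMI events it claimed, and nmi_handle() sums those
 * counts.  A hypothetical driver handler that drains every event its
 * hardware has pending might look roughly like this, where
 * my_dev_pending() and my_dev_ack() stand in for driver-provided helpers:
 *
 *	static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
 *	{
 *		int claimed = 0;
 *
 *		while (my_dev_pending()) {
 *			my_dev_ack();
 *			claimed++;
 *		}
 *		return claimed;
 *	}
 *
 * Returning 0 tells nmi_handle() the NMI was not ours; for NMI_LOCAL
 * handlers, returning more than one also feeds the swallow logic in
 * default_do_nmi() below.
 */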

static int __setup_nmi(unsigned int type, struct nmiaction *action)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);

	/*
	 * Most handlers of type NMI_UNKNOWN never return the NMI as
	 * unhandled because they simply assume it is theirs.  These
	 * WARNs are just a sanity check to manage expectations.
	 */
	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));

	/*
	 * Some handlers need to run first; otherwise a fake event
	 * confuses the handlers that come after them (kdump uses this
	 * flag; see the registration sketch below this function).
	 */
	if (action->flags & NMI_FLAG_FIRST)
		list_add_rcu(&action->list, &desc->head);
	else
		list_add_tail_rcu(&action->list, &desc->head);

	spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
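
/*
 * Registration sketch for the NMI_FLAG_FIRST case noted above.  This is
 * only an illustration; the callback name crash_callback is made up and
 * stands for whatever a kdump-style user would pass in:
 *
 *	register_nmi_handler(NMI_LOCAL, crash_callback, NMI_FLAG_FIRST,
 *			     "crash");
 *
 * Passing NMI_FLAG_FIRST places the action at the head of the NMI_LOCAL
 * list, so it sees the NMI before any other handler (perf, watchdog, ...)
 * gets a chance to claim it.
 */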

static struct nmiaction *__free_nmi(unsigned int type, const char *name)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *n;
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);

	list_for_each_entry_rcu(n, &desc->head, list) {
		/*
		 * the name passed in to describe the nmi handler
		 * is used as the lookup key
		 */
		if (!strcmp(n->name, name)) {
			WARN(in_nmi(),
				"Trying to free NMI (%s) from NMI context!\n", n->name);
			list_del_rcu(&n->list);
			break;
		}
	}

	spin_unlock_irqrestore(&desc->lock, flags);
	synchronize_rcu();
	return n;
}

int register_nmi_handler(unsigned int type, nmi_handler_t handler,
			unsigned long nmiflags, const char *devname)
{
	struct nmiaction *action;
	int retval = -ENOMEM;

	if (!handler)
		return -EINVAL;

	action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
	if (!action)
		goto fail_action;

	action->handler = handler;
	action->flags = nmiflags;
	action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
	if (!action->name)
		goto fail_action_name;

	retval = __setup_nmi(type, action);

	if (retval)
		goto fail_setup_nmi;

	return retval;

fail_setup_nmi:
	kfree(action->name);
fail_action_name:
	kfree(action);
fail_action:

	return retval;
}
EXPORT_SYMBOL_GPL(register_nmi_handler);

void unregister_nmi_handler(unsigned int type, const char *name)
{
	struct nmiaction *a;

	a = __free_nmi(type, name);
	if (a) {
		kfree(a->name);
		kfree(a);
	}
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
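
/*
 * Lifecycle sketch from a caller's point of view (illustrative only; the
 * "mydev" name and my_nmi_handler are made-up examples, not in-tree
 * users):
 *
 *	err = register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "mydev");
 *	if (err)
 *		return err;
 *	...
 *	unregister_nmi_handler(NMI_LOCAL, "mydev");
 *
 * The devname string doubles as the lookup key in __free_nmi(), so it
 * should be unique among handlers registered for the same type.
 */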

static notrace __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_SERR, regs, false))
		return;

	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	/*
	 * On some machines, PCI SERR line is used to report memory
	 * errors. EDAC makes use of it.
	 */
#if defined(CONFIG_EDAC)
	if (edac_handler_set()) {
		edac_atomic_assert_error();
		return;
	}
#endif

	if (panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");

	/* Clear and disable the PCI SERR error line. */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}

static notrace __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
	unsigned long i;

	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_IO_CHECK, regs, false))
		return;

	pr_emerg(
	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());
	show_registers(regs);

	if (panic_on_io_nmi)
		panic("NMI IOCK error: Not continuing");

	/* Re-enable the IOCK line, wait for a few seconds */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);

	i = 20000;
	while (--i) {
		touch_nmi_watchdog();
		udelay(100);
	}

	reason &= ~NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);
}

static notrace __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	int handled;

	/*
	 * Use 'false' as back-to-back NMIs are dealt with one level up.
	 * Of course this makes having multiple 'unknown' handlers useless,
	 * as only the first one is ever run (unless it can actually
	 * determine if it caused the NMI).
	 */
	handled = nmi_handle(NMI_UNKNOWN, regs, false);
	if (handled) {
		__this_cpu_add(nmi_stats.unknown, handled);
		return;
	}

	__this_cpu_add(nmi_stats.unknown, 1);

#ifdef CONFIG_MCA
	/*
	 * Might actually be able to figure out what the guilty party
	 * is:
	 */
	if (MCA_bus) {
		mca_handle_nmi();
		return;
	}
#endif
	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	pr_emerg("Do you have a strange power saving mode enabled?\n");
	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
		panic("NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");
}

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);

static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
	unsigned char reason = 0;
	int handled;
	bool b2b = false;

	/*
	 * A CPU-specific NMI must be processed before a non-CPU-specific
	 * NMI; otherwise we may lose it, because the CPU-specific
	 * NMI cannot be detected/processed on other CPUs.
	 */

	/*
	 * Back-to-back NMIs are interesting because they can either
	 * be two NMIs or more than two NMIs (anything over two is
	 * dropped due to NMIs being edge-triggered).  If this is the
	 * second half of a back-to-back NMI, assume we dropped things
	 * and process more handlers.  Otherwise, reset the 'swallow'
	 * NMI behaviour.
	 */
	if (regs->ip == __this_cpu_read(last_nmi_rip))
		b2b = true;
	else
		__this_cpu_write(swallow_nmi, false);

	__this_cpu_write(last_nmi_rip, regs->ip);

	handled = nmi_handle(NMI_LOCAL, regs, b2b);
	__this_cpu_add(nmi_stats.normal, handled);
	if (handled) {
		/*
		 * There are cases when an NMI handler handles multiple
		 * events in the current NMI.  One of these events may
		 * be queued for delivery as the next NMI.  Because that
		 * event has already been handled, the next NMI will
		 * result in an unknown NMI.  Instead, let's flag this
		 * for a potential NMI to swallow.
		 */
		if (handled > 1)
			__this_cpu_write(swallow_nmi, true);
		return;
	}

	/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
	raw_spin_lock(&nmi_reason_lock);
	reason = x86_platform.get_nmi_reason();

	if (reason & NMI_REASON_MASK) {
		if (reason & NMI_REASON_SERR)
			pci_serr_error(reason, regs);
		else if (reason & NMI_REASON_IOCHK)
			io_check_error(reason, regs);
#ifdef CONFIG_X86_32
		/*
		 * Reassert NMI in case it became active
		 * meanwhile as it's edge-triggered:
		 */
		reassert_nmi();
#endif
		__this_cpu_add(nmi_stats.external, 1);
		raw_spin_unlock(&nmi_reason_lock);
		return;
	}
	raw_spin_unlock(&nmi_reason_lock);

	/*
	 * Only one NMI can be latched at a time.  To handle
	 * this we may process multiple NMI handlers at once to
	 * cover the case where an NMI is dropped.  The downside
	 * to this approach is we may process an NMI prematurely,
	 * while its real NMI is sitting latched.  This will cause
	 * an unknown NMI on the next run of the NMI processing.
	 *
	 * We tried to flag that condition above, by setting the
	 * swallow_nmi flag when we process more than one event.
	 * This condition is also only present on the second half
	 * of a back-to-back NMI, so we flag that condition too.
	 *
	 * If both are true, we assume we already processed this
	 * NMI previously and we swallow it.  Otherwise we reset
	 * the logic.
	 *
	 * There are scenarios where we may accidentally swallow
	 * a 'real' unknown NMI.  For example, while processing
	 * a perf NMI, another perf NMI comes in along with a
	 * 'real' unknown NMI.  These two NMIs get combined into
	 * one (as described above).  When the next NMI gets
	 * processed, it will be flagged by perf as handled, but
	 * no one will know that there was a 'real' unknown NMI sent
	 * also.  As a result it gets swallowed.  Or if the first
	 * perf NMI returns two events handled, then the second
	 * NMI will get eaten by the logic below, again losing a
	 * 'real' unknown NMI.  But this is the best we can do
	 * for now.
	 */
	if (b2b && __this_cpu_read(swallow_nmi))
		__this_cpu_add(nmi_stats.swallow, 1);
	else
		unknown_nmi_error(reason, regs);
}
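
/*
 * A compact trace of the swallow logic above, purely illustrative and
 * assuming a single perf-style handler on the NMI_LOCAL list:
 *
 *	NMI #1: handler reports 2 events handled
 *		-> nmi_stats.normal += 2, swallow_nmi = true
 *	NMI #2: arrives at the same regs->ip, no handler claims it
 *		-> b2b && swallow_nmi, so nmi_stats.swallow is bumped
 *		   instead of calling unknown_nmi_error()
 *	NMI #3: arrives at a different regs->ip
 *		-> swallow_nmi is reset and normal processing resumes
 */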

/*
 * An NMI can hit a breakpoint, which causes the CPU to lose its NMI
 * context when the breakpoint handler does an iret.
 */
#ifdef CONFIG_X86_32
/*
 * For i386, NMIs use the same stack as the kernel, and we can
 * add a workaround to the iret problem in C.  Simply have three
 * states the NMI can be in:
 *
 *  1) not running
 *  2) executing
 *  3) latched
 *
 * When no NMI is in progress, it is in the "not running" state.
 * When an NMI comes in, it goes into the "executing" state.
 * Normally, if another NMI is triggered, it does not interrupt
 * the running NMI and the HW will simply latch it so that when
 * the first NMI finishes, it will restart the second NMI.
 * (Note, the latch is binary, thus multiple NMIs triggering
 *  while one is running are ignored.  Only one NMI is restarted.)
 *
 * If an NMI hits a breakpoint that executes an iret, another
 * NMI can preempt it.  We do not want to allow this new NMI
 * to run, but we want to execute it when the first one finishes.
 * We set the state to "latched", and the first NMI will perform
 * a cmpxchg on the state; if it doesn't successfully reset the
 * state to "not running", it will restart the next NMI.
 */
enum nmi_states {
	NMI_NOT_RUNNING,
	NMI_EXECUTING,
	NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);

#define nmi_nesting_preprocess(regs)					\
	do {								\
		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
			return;						\
		}							\
	nmi_restart:							\
		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
	} while (0)

#define nmi_nesting_postprocess()					\
	do {								\
		if (cmpxchg(&__get_cpu_var(nmi_state),			\
		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
			goto nmi_restart;				\
	} while (0)
#else /* x86_64 */
/*
 * On x86_64 things are a bit more difficult.  There is the same problem
 * where an NMI hitting a breakpoint that calls iret will remove the
 * NMI context, allowing a nested NMI to enter.  What makes this more
 * difficult is that both NMIs and breakpoints have their own stack.
 * When a new NMI or breakpoint is executed, the stack is set to a fixed
 * point.  If an NMI is nested, it will have its stack set at that same
 * fixed address that the first NMI had, and will start corrupting the
 * stack.  This is handled in entry_64.S, but the same problem exists with
 * the breakpoint stack.
 *
 * If a breakpoint is being processed and the debug stack is in use, and
 * an NMI comes in and also hits a breakpoint, the stack pointer
 * will be set to the same fixed address as that of the breakpoint that
 * was interrupted, causing that stack to be corrupted.  To handle this
 * case, check whether the stack that was interrupted is the debug stack,
 * and if so, change the IDT so that new breakpoints will use the current
 * stack and not switch to the fixed address.  On return from the NMI,
 * switch back to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);

static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the NMI handler will have breakpoints too. We need to
	 * change the IDT such that breakpoints that happen here
	 * continue to use the NMI stack.
	 */
	if (unlikely(is_debug_stack(regs->sp))) {
		debug_stack_set_zero();
		__get_cpu_var(update_debug_stack) = 1;
	}
}

static inline void nmi_nesting_postprocess(void)
{
	if (unlikely(__get_cpu_var(update_debug_stack)))
		debug_stack_reset();
}
#endif

dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
	nmi_nesting_preprocess(regs);

	nmi_enter();

	inc_irq_stat(__nmi_count);

	if (!ignore_nmis)
		default_do_nmi(regs);

	nmi_exit();

	/* On i386, may loop back to preprocess */
	nmi_nesting_postprocess();
}

void stop_nmi(void)
{
	ignore_nmis++;
}

void restart_nmi(void)
{
	ignore_nmis--;
}

/* reset the back-to-back NMI logic */
void local_touch_nmi(void)
{
	__this_cpu_write(last_nmi_rip, 0);
}