/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 * Copyright (C) 2011 Don Zickus Red Hat, Inc.
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>

#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif

#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>

#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>

struct nmi_desc {
	spinlock_t lock;
	struct list_head head;
};

static struct nmi_desc nmi_desc[NMI_MAX] =
{
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
		.head = LIST_HEAD_INIT(nmi_desc[0].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
		.head = LIST_HEAD_INIT(nmi_desc[1].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
		.head = LIST_HEAD_INIT(nmi_desc[2].head),
	},
	{
		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
		.head = LIST_HEAD_INIT(nmi_desc[3].head),
	},

};

struct nmi_stats {
	unsigned int normal;
	unsigned int unknown;
	unsigned int external;
	unsigned int swallow;
};

static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);

static int ignore_nmis;

int unknown_nmi_panic;
/*
 * Prevent NMI reason port (0x61) being accessed simultaneously, can
 * only be used in NMI handler.
 */
static DEFINE_RAW_SPINLOCK(nmi_reason_lock);

static int __init setup_unknown_nmi_panic(char *str)
{
	unknown_nmi_panic = 1;
	return 1;
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);

#define nmi_to_desc(type) (&nmi_desc[type])

static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;

static int __init nmi_warning_debugfs(void)
{
	debugfs_create_u64("nmi_longest_ns", 0644,
			arch_debugfs_dir, &nmi_longest_ns);
	return 0;
}
fs_initcall(nmi_warning_debugfs);

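/*
 * Usage note: "nmi_longest_ns" is created under arch_debugfs_dir, so with
 * debugfs mounted in the usual place the warning threshold above can be
 * read or tuned at run time, e.g.:
 *
 *	cat /sys/kernel/debug/x86/nmi_longest_ns
 *	echo 2000000 > /sys/kernel/debug/x86/nmi_longest_ns	(2 ms)
 */
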
static void nmi_max_handler(struct irq_work *w)
{
	struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
	int remainder_ns, decimal_msecs;
	u64 whole_msecs = ACCESS_ONCE(a->max_duration);

	remainder_ns = do_div(whole_msecs, (1000 * 1000));
	decimal_msecs = remainder_ns / 1000;

	printk_ratelimited(KERN_INFO
		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
		a->handler, whole_msecs, decimal_msecs);
}

static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *a;
	int handled = 0;

	rcu_read_lock();

	/*
	 * NMIs are edge-triggered, which means if you have enough
	 * of them concurrently, you can lose some because only one
	 * can be latched at any given time.  Walk the whole list
	 * to handle those situations.
	 */
	list_for_each_entry_rcu(a, &desc->head, list) {
		int thishandled;
		u64 delta;

		delta = sched_clock();
		thishandled = a->handler(type, regs);
		handled += thishandled;
		delta = sched_clock() - delta;
		trace_nmi_handler(a->handler, (int)delta, thishandled);

		if (delta < nmi_longest_ns || delta < a->max_duration)
			continue;

		a->max_duration = delta;
		irq_work_queue(&a->irq_work);
	}

	rcu_read_unlock();

	/* return total number of NMI events handled */
	return handled;
}
NOKPROBE_SYMBOL(nmi_handle);

int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	unsigned long flags;

	if (!action->handler)
		return -EINVAL;

	init_irq_work(&action->irq_work, nmi_max_handler);

	spin_lock_irqsave(&desc->lock, flags);

	/*
	 * most handlers of type NMI_UNKNOWN never return because
	 * they just assume the NMI is theirs.  Just a sanity check
	 * to manage expectations
	 */
	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));

	/*
	 * some handlers need to be executed first otherwise a fake
	 * event confuses some handlers (kdump uses this flag)
	 */
	if (action->flags & NMI_FLAG_FIRST)
		list_add_rcu(&action->list, &desc->head);
	else
		list_add_tail_rcu(&action->list, &desc->head);

	spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
EXPORT_SYMBOL(__register_nmi_handler);

void unregister_nmi_handler(unsigned int type, const char *name)
{
	struct nmi_desc *desc = nmi_to_desc(type);
	struct nmiaction *n;
	unsigned long flags;

	spin_lock_irqsave(&desc->lock, flags);

	list_for_each_entry_rcu(n, &desc->head, list) {
		/*
		 * the name passed in to describe the nmi handler
		 * is used as the lookup key
		 */
		if (!strcmp(n->name, name)) {
			WARN(in_nmi(),
				"Trying to free NMI (%s) from NMI context!\n", n->name);
			list_del_rcu(&n->list);
			break;
		}
	}

	spin_unlock_irqrestore(&desc->lock, flags);
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);

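/*
 * Usage sketch: callers normally go through the register_nmi_handler()
 * wrapper in <asm/nmi.h>, which builds a static struct nmiaction and hands
 * it to __register_nmi_handler().  A handler returns the number of events
 * it handled (NMI_HANDLED) or NMI_DONE when the NMI was not its own.  The
 * device checks below are hypothetical placeholders:
 *
 *	static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
 *	{
 *		if (!my_device_raised_nmi())
 *			return NMI_DONE;
 *		my_device_ack_nmi();
 *		return NMI_HANDLED;
 *	}
 *
 *	if (register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_nmi"))
 *		pr_err("could not register NMI handler\n");
 *	...
 *	unregister_nmi_handler(NMI_LOCAL, "my_nmi");
 */
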
static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_SERR, regs))
		return;

	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	/*
	 * On some machines, PCI SERR line is used to report memory
	 * errors. EDAC makes use of it.
	 */
#if defined(CONFIG_EDAC)
	if (edac_handler_set()) {
		edac_atomic_assert_error();
		return;
	}
#endif

	if (panic_on_unrecovered_nmi)
		nmi_panic(regs, "NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");

	/* Clear and disable the PCI SERR error line. */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
	outb(reason, NMI_REASON_PORT);
}
NOKPROBE_SYMBOL(pci_serr_error);

static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
	unsigned long i;

	/* check to see if anyone registered against these types of errors */
	if (nmi_handle(NMI_IO_CHECK, regs))
		return;

	pr_emerg(
	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
		 reason, smp_processor_id());
	show_regs(regs);

	if (panic_on_io_nmi) {
		nmi_panic(regs, "NMI IOCK error: Not continuing");

		/*
		 * If we end up here, it means we have received an NMI while
		 * processing panic(). Simply return without delaying and
		 * re-enabling NMIs.
		 */
		return;
	}

	/* Re-enable the IOCK line, wait for a few seconds */
	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);

	i = 20000;
	while (--i) {
		touch_nmi_watchdog();
		udelay(100);
	}

	reason &= ~NMI_REASON_CLEAR_IOCHK;
	outb(reason, NMI_REASON_PORT);
}
NOKPROBE_SYMBOL(io_check_error);

static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
	int handled;

	/*
	 * Use 'false' as back-to-back NMIs are dealt with one level up.
	 * Of course this makes having multiple 'unknown' handlers useless
	 * as only the first one is ever run (unless it can actually determine
	 * if it caused the NMI).
	 */
	handled = nmi_handle(NMI_UNKNOWN, regs);
	if (handled) {
		__this_cpu_add(nmi_stats.unknown, handled);
		return;
	}

	__this_cpu_add(nmi_stats.unknown, 1);

	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
		 reason, smp_processor_id());

	pr_emerg("Do you have a strange power saving mode enabled?\n");
	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
		nmi_panic(regs, "NMI: Not continuing");

	pr_emerg("Dazed and confused, but trying to continue\n");
}
NOKPROBE_SYMBOL(unknown_nmi_error);

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);

static void default_do_nmi(struct pt_regs *regs)
{
	unsigned char reason = 0;
	int handled;
	bool b2b = false;

	/*
	 * CPU-specific NMI must be processed before non-CPU-specific
	 * NMI, otherwise we may lose it, because the CPU-specific
	 * NMI can not be detected/processed on other CPUs.
	 */

	/*
	 * Back-to-back NMIs are interesting because they can either
	 * be two NMIs or more than two NMIs (anything over two is dropped
	 * due to NMI being edge-triggered). If this is the second half
	 * of the back-to-back NMI, assume we dropped things and process
	 * more handlers. Otherwise reset the 'swallow' NMI behaviour.
	 */
	if (regs->ip == __this_cpu_read(last_nmi_rip))
		b2b = true;
	else
		__this_cpu_write(swallow_nmi, false);

	__this_cpu_write(last_nmi_rip, regs->ip);

	handled = nmi_handle(NMI_LOCAL, regs);
	__this_cpu_add(nmi_stats.normal, handled);
	if (handled) {
		/*
		 * There are cases when an NMI handler handles multiple
		 * events in the current NMI. One of these events may
		 * be queued for the next NMI. Because the event is
		 * already handled, the next NMI will result in an unknown
		 * NMI. Instead let's flag this for a potential NMI to
		 * swallow.
		 */
		if (handled > 1)
			__this_cpu_write(swallow_nmi, true);
		return;
	}

	/*
	 * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
	 *
	 * Another CPU may be processing panic routines while holding
	 * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
	 * and if so, call its callback directly. If there is no CPU preparing
	 * crash dump, we simply loop here.
	 */
	while (!raw_spin_trylock(&nmi_reason_lock)) {
		run_crash_ipi_callback(regs);
		cpu_relax();
	}

	reason = x86_platform.get_nmi_reason();

	if (reason & NMI_REASON_MASK) {
		if (reason & NMI_REASON_SERR)
			pci_serr_error(reason, regs);
		else if (reason & NMI_REASON_IOCHK)
			io_check_error(reason, regs);
#ifdef CONFIG_X86_32
		/*
		 * Reassert NMI in case it became active
		 * meanwhile as it's edge-triggered:
		 */
		reassert_nmi();
#endif
		__this_cpu_add(nmi_stats.external, 1);
		raw_spin_unlock(&nmi_reason_lock);
		return;
	}
	raw_spin_unlock(&nmi_reason_lock);

	/*
	 * Only one NMI can be latched at a time. To handle
	 * this we may process multiple nmi handlers at once to
	 * cover the case where an NMI is dropped. The downside
	 * to this approach is we may process an NMI prematurely,
	 * while its real NMI is sitting latched. This will cause
	 * an unknown NMI on the next run of the NMI processing.
	 *
	 * We tried to flag that condition above, by setting the
	 * swallow_nmi flag when we process more than one event.
	 * This condition is also only present on the second half
	 * of a back-to-back NMI, so we flag that condition too.
	 *
	 * If both are true, we assume we already processed this
	 * NMI previously and we swallow it. Otherwise we reset
	 * the logic.
	 *
	 * There are scenarios where we may accidentally swallow
	 * a 'real' unknown NMI. For example, while processing
	 * a perf NMI another perf NMI comes in along with a
	 * 'real' unknown NMI. These two NMIs get combined into
	 * one (as described above). When the next NMI gets
	 * processed, it will be flagged by perf as handled, but
	 * no one will know that there was a 'real' unknown NMI sent
	 * also. As a result it gets swallowed. Or if the first
	 * perf NMI returns two events handled then the second
	 * NMI will get eaten by the logic below, again losing a
	 * 'real' unknown NMI. But this is the best we can do
	 * for now.
	 */
	if (b2b && __this_cpu_read(swallow_nmi))
		__this_cpu_add(nmi_stats.swallow, 1);
	else
		unknown_nmi_error(reason, regs);
}
NOKPROBE_SYMBOL(default_do_nmi);

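/*
 * Summary of the swallow decision at the bottom of default_do_nmi(), which
 * is only reached when no handler claimed the NMI and no external reason
 * bit was set:
 *
 *	b2b	swallow_nmi	result
 *	false	(just cleared)	unknown_nmi_error()
 *	true	false		unknown_nmi_error()
 *	true	true		counted in nmi_stats.swallow and dropped
 */
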
/*
 * An NMI can page fault or hit a breakpoint, which will cause it to lose
 * its NMI context with the CPU when the breakpoint or page fault does an IRET.
 *
 * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
 * NMI processing. On x86_64, the asm glue protects us from nested NMIs
 * if the outer NMI came from kernel mode, but we can still nest if the
 * outer NMI came from user mode.
 *
 * To handle these nested NMIs, we have three states:
 *
 *  1) not running
 *  2) executing
 *  3) latched
 *
 * When no NMI is in progress, it is in the "not running" state.
 * When an NMI comes in, it goes into the "executing" state.
 * Normally, if another NMI is triggered, it does not interrupt
 * the running NMI and the HW will simply latch it so that when
 * the first NMI finishes, it will restart the second NMI.
 * (Note, the latch is binary, thus multiple NMIs triggering,
 *  when one is running, are ignored. Only one NMI is restarted.)
 *
 * If an NMI executes an iret, another NMI can preempt it. We do not
 * want to allow this new NMI to run, but we want to execute it when the
 * first one finishes. We set the state to "latched", and the exit of
 * the first NMI will perform a dec_return; if the result is zero
 * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
 * dec_return would have set the state to NMI_EXECUTING (what we want it
 * to be when we are running). In this case, we simply jump back to
 * rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
 * at this point.
 *
 * In case the NMI takes a page fault, we need to save off the CR2
 * because the NMI could have preempted another page fault and corrupted
 * the CR2 that is about to be read. As nested NMIs must be restarted
 * and they can not take breakpoints or page faults, the update of the
 * CR2 must be done before converting the nmi state back to NOT_RUNNING.
 * Otherwise, there would be a race of another nested NMI coming in
 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
 */
enum nmi_states {
	NMI_NOT_RUNNING = 0,
	NMI_EXECUTING,
	NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);

#ifdef CONFIG_X86_64
/*
 * On x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
 * some care, the inner breakpoint will clobber the outer breakpoint's
 * stack.
 *
 * If a breakpoint is being processed on the debug stack and an NMI comes
 * in that also hits a breakpoint, the stack pointer will be set to the
 * same fixed address as the interrupted breakpoint, corrupting that stack.
 * To handle this case, check if the stack that was interrupted is the
 * debug stack, and if so, change the IDT so that new breakpoints will use
 * the current stack and not switch to the fixed address. On return of
 * the NMI, switch back to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);
#endif

dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
	if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
		this_cpu_write(nmi_state, NMI_LATCHED);
		return;
	}
	this_cpu_write(nmi_state, NMI_EXECUTING);
	this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:

#ifdef CONFIG_X86_64
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too. We need to
	 * change the IDT such that breakpoints that happen here
	 * continue to use the NMI stack.
	 */
	if (unlikely(is_debug_stack(regs->sp))) {
		debug_stack_set_zero();
		this_cpu_write(update_debug_stack, 1);
	}
#endif

	nmi_enter();

	inc_irq_stat(__nmi_count);

	if (!ignore_nmis)
		default_do_nmi(regs);

	nmi_exit();

#ifdef CONFIG_X86_64
	if (unlikely(this_cpu_read(update_debug_stack))) {
		debug_stack_reset();
		this_cpu_write(update_debug_stack, 0);
	}
#endif

	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
		write_cr2(this_cpu_read(nmi_cr2));
	if (this_cpu_dec_return(nmi_state))
		goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);

void stop_nmi(void)
{
	ignore_nmis++;
}

void restart_nmi(void)
{
	ignore_nmis--;
}

/* reset the back-to-back NMI logic */
void local_touch_nmi(void)
{
	__this_cpu_write(last_nmi_rip, 0);
}
EXPORT_SYMBOL_GPL(local_touch_nmi);

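/*
 * Usage sketch for local_touch_nmi(): it is meant for code that sits at a
 * single RIP for a long time (an idle loop, for instance), where two
 * unrelated NMIs hitting the same instruction would otherwise look like a
 * back-to-back pair.  A caller (hypothetical here) would simply do:
 *
 *	static void my_idle(void)
 *	{
 *		local_touch_nmi();
 *		safe_halt();
 *	}
 */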