/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/magic.h>		/* STACK_END_MAGIC		*/
#include <linux/sched.h>		/* test_thread_flag(), ...	*/
#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
#include <linux/module.h>		/* search_exception_table	*/
#include <linux/bootmem.h>		/* max_low_pfn			*/
#include <linux/kprobes.h>		/* __kprobes, ...		*/
#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
#include <linux/perf_event.h>		/* perf_sw_event		*/

#include <asm/traps.h>			/* dotraplinkage, ...		*/
#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/

/*
 * Page fault error code bits:
 *
 *   bit 0 ==  0: no page found        1: protection fault
 *   bit 1 ==  0: read access          1: write access
 *   bit 2 ==  0: kernel-mode access   1: user-mode access
 *   bit 3 ==                          1: use of reserved bit detected
 *   bit 4 ==                          1: fault was an instruction fetch
 */
enum x86_pf_error_code {

        PF_PROT  = 1 << 0,
        PF_WRITE = 1 << 1,
        PF_USER  = 1 << 2,
        PF_RSVD  = 1 << 3,
        PF_INSTR = 1 << 4,
};
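
/*
 * Worked example of decoding the error code above (illustrative values,
 * not an exhaustive list):
 *
 *   error_code == 0x6 == PF_USER | PF_WRITE
 *       a user-mode write to a not-present page, e.g. the first store
 *       into a freshly mmap()ed region (ordinary demand paging).
 *
 *   error_code == 0x7 == PF_USER | PF_WRITE | PF_PROT
 *       a user-mode write to a page that is present but write-protected,
 *       e.g. a copy-on-write fault on a MAP_PRIVATE page.
 *
 *   error_code == 0x11 == PF_INSTR | PF_PROT
 *       a kernel-mode instruction fetch from a present page marked NX;
 *       see the nx_warning path in show_fault_oops() below.
 */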

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static inline int __kprobes
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
        return 0;
}

static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (kprobes_built_in() && !user_mode_vm(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        unsigned char instr_hi = opcode & 0xf0;
        unsigned char instr_lo = opcode & 0x0f;

        switch (instr_hi) {
        case 0x20:
        case 0x30:
                /*
                 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                 * In X86_64 long mode, the CPU will signal invalid
                 * opcode if some of these prefixes are present, so
                 * X86_64 will never get here anyway.
                 */
                return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
        case 0x40:
                /*
                 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
                 * Need to figure out under what instruction mode the
                 * instruction was issued. Could check the LDT for lm,
                 * but for now it's good enough to assume that long
                 * mode only uses well known segments or kernel.
                 */
                return (!user_mode(regs)) || (regs->cs == __USER_CS);
#endif
        case 0x60:
                /* 0x64 thru 0x67 are valid prefixes in all modes. */
                return (instr_lo & 0xC) == 0x4;
        case 0xF0:
                /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                return !instr_lo || (instr_lo>>1) == 1;
        case 0x00:
                /* Prefetch instruction is 0x0F0D or 0x0F18 */
                if (probe_kernel_address(instr, opcode))
                        return 0;

                *prefetch = (instr_lo == 0xF) &&
                            (opcode == 0x0D || opcode == 0x18);
                return 0;
        default:
                return 0;
        }
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *max_instr;
        unsigned char *instr;
        int prefetch = 0;

        /*
         * If it was an exec (instruction fetch) fault on an NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;

        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (instr < max_instr) {
                unsigned char opcode;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr++;

                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
                        break;
        }
        return prefetch;
}
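
/*
 * Example of the scan above (illustrative): "prefetchnta (%rax)" encodes
 * as 0f 18 00. The loop in is_prefetch() reads the first byte 0x0f, so
 * instr_hi == 0x00 and instr_lo == 0xf; check_prefetch_opcode() then
 * fetches the following byte, sees 0x18 and sets *prefetch. Recognised
 * prefix bytes (segment overrides, REX, 0x64-0x67, lock/rep) merely keep
 * the loop going; any other byte terminates it with prefetch == 0 and the
 * fault is handled normally.
 */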

static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
                     struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code  = si_code;
        info.si_addr  = (void __user *)address;
        info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;

        force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */
        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;

        if (!pmd_present(*pmd))
                set_pmd(pmd, *pmd_k);
        else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

        return pmd_k;
}

void vmalloc_sync_all(void)
{
        unsigned long address;

        if (SHARED_KERNEL_PMD)
                return;

        for (address = VMALLOC_START & PMD_MASK;
             address >= TASK_SIZE && address < FIXADDR_TOP;
             address += PMD_SIZE) {

                unsigned long flags;
                struct page *page;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        if (!vmalloc_sync_one(page_address(page), address))
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;

        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;

        return 0;
}

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
        unsigned long bit;

        if (!v8086_mode(regs))
                return;

        bit = (address - 0xA0000) >> PAGE_SHIFT;
        if (bit < 32)
                tsk->thread.screen_bitmap |= 1 << bit;
}
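
/*
 * Example for the bitmap update above (illustrative): a vm86 task that
 * touches 0xA3000 yields bit = (0xA3000 - 0xA0000) >> PAGE_SHIFT = 3
 * (PAGE_SHIFT is 12 on x86), so bit 3 of screen_bitmap is set, marking
 * the 4k page at 0xA3000 inside the legacy VGA window 0xA0000-0xBFFFF
 * as referenced.
 */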

static bool low_pfn(unsigned long pfn)
{
        return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3());
        pgd_t *pgd = &base[pgd_index(address)];
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_X86_PAE
        printk("*pdpt = %016Lx ", pgd_val(*pgd));
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
#endif
        pmd = pmd_offset(pud_offset(pgd, address), address);
        printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
        unsigned long address;

        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
             address += PGDIR_SIZE) {

                const pgd_t *pgd_ref = pgd_offset_k(address);
                unsigned long flags;
                struct page *page;

                if (pgd_none(*pgd_ref))
                        continue;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        if (pgd_none(*pgd))
                                set_pgd(pgd, *pgd_ref);
                        else
                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the latter
         * case just flush:
         */
        pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;

        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /*
         * Below here mismatches are bugs because these lower tables
         * are shared:
         */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;

        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();

        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;

        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();

        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;

        pte = pte_offset_kernel(pmd, address);

        /*
         * Don't use pte_page here, because the mappings can point
         * outside mem_map, and the NUMA hash lookup cannot handle
         * that:
         */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();

        return 0;
}
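
/*
 * Illustrative scenario for the 64-bit path above: a driver calls
 * vmalloc(), which populates the reference tables rooted at init_mm.pgd.
 * A task that later dereferences that vmalloc address may still have an
 * empty PGD slot for the vmalloc region in its own page tables; the
 * resulting fault lands here, vmalloc_fault() copies the single top-level
 * entry from the reference table (the lower levels are shared), and the
 * access is retried. Only if even the reference tables have no mapping
 * does the fault fall through to the normal oops path.
 */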

static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
        unsigned long dummy;

        return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
        pgd_t *pgd = base + pgd_index(address);
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto bad;

        printk("PGD %lx ", pgd_val(*pgd));

        if (!pgd_present(*pgd))
                goto out;

        pud = pud_offset(pgd, address);
        if (bad_address(pud))
                goto bad;

        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;

        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;

        printk("PTE %lx", pte_val(*pte));
out:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in the kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if (address != regs->ip)
                return 0;

        if ((address >> 32) != 0)
                return 0;

        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                printk_once(errata93_warning);
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}
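
/*
 * Worked example for is_errata93() (illustrative addresses): with the
 * erratum, a return to kernel code at RIP 0xffffffff8102abcd may fault
 * with the truncated value 0x000000008102abcd in both CR2 and regs->ip.
 * OR-ing 0xffffffff00000000 back in reconstructs the intended address;
 * if it lands inside the kernel text or the module area, regs->ip is
 * patched and execution simply resumes at the original instruction.
 */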

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
                return 1;
#endif
        return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        unsigned long nr;

        /*
         * Pentium F0 0F C7 C8 bug workaround:
         */
        if (boot_cpu_data.f00f_bug) {
                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return 1;
                }
        }
#endif
        return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
        if (!oops_may_print())
                return;

        if (error_code & PF_INSTR) {
                unsigned int level;

                pte_t *pte = lookup_address(address, &level);

                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(nx_warning, current_uid());
        }

        printk(KERN_ALERT "BUG: unable to handle kernel ");
        if (address < PAGE_SIZE)
                printk(KERN_CONT "NULL pointer dereference");
        else
                printk(KERN_CONT "paging request");

        printk(KERN_CONT " at %p\n", (void *) address);
        printk(KERN_ALERT "IP:");
        printk_address(regs->ip, 1);

        dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        struct task_struct *tsk;
        unsigned long flags;
        int sig;

        flags = oops_begin();
        tsk = current;
        sig = SIGKILL;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               tsk->comm, address);
        dump_pagetable(address);

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

        if (__die("Bad pagetable", regs, error_code))
                sig = 0;

        oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
           unsigned long address)
{
        struct task_struct *tsk = current;
        unsigned long *stackend;
        unsigned long flags;
        int sig;

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * 32-bit:
         *
         *   Valid to do another page fault here, because if this fault
         *   had been triggered by is_prefetch fixup_exception would have
         *   handled it.
         *
         * 64-bit:
         *
         *   Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        stackend = end_of_stack(tsk);
        if (tsk != &init_task && *stackend != STACK_END_MAGIC)
                printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        if (!unhandled_signal(tsk, SIGSEGV))
                return;

        if (!printk_ratelimit())
                return;

        printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
               task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
               tsk->comm, task_pid_nr(tsk), address,
               (void *)regs->ip, (void *)regs->sp, error_code);

        print_vma_addr(KERN_CONT " in ", regs->ip);

        printk(KERN_CONT "\n");
}
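
/*
 * The resulting line looks roughly like this (illustrative values):
 *
 *   app[1234]: segfault at 0 ip 00000000004004f6 sp 00007fff12345678 error 6
 *
 * followed by something like " in app[400000+1000]" appended by
 * print_vma_addr(). Here "error 6" decodes as PF_USER|PF_WRITE: a
 * user-mode write to a not-present page (in this case a NULL pointer
 * store).
 */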

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, int si_code)
{
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here:
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space:
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

                if (unlikely(show_unhandled_signals))
                        show_signal_msg(regs, error_code, address, tsk);

                /* Kernel addresses are always protection faults: */
                tsk->thread.cr2 = address;
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;

                force_sig_info_fault(SIGSEGV, si_code, address, tsk);

                return;
        }

        if (is_f00f_bug(regs, address))
                return;

        no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int si_code)
{
        struct mm_struct *mm = current->mm;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        up_read(&mm->mmap_sem);

        __bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void
out_of_memory(struct pt_regs *regs, unsigned long error_code,
              unsigned long address)
{
        /*
         * We ran out of memory, call the OOM killer, and return to userspace
         * (which will retry the fault, or kill us if we got oom-killed):
         */
        up_read(&current->mm->mmap_sem);

        pagefault_out_of_memory();
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          unsigned int fault)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
        int code = BUS_ADRERR;

        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die: */
        if (!(error_code & PF_USER)) {
                no_context(regs, error_code, address);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & VM_FAULT_HWPOISON) {
                printk(KERN_ERR
                       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                       tsk->comm, tsk->pid, address);
                code = BUS_MCEERR_AR;
        }
#endif
        force_sig_info_fault(SIGBUS, code, address, tsk);
}
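
/*
 * Note on the hwpoison case above (illustrative): when memory-failure
 * handling is enabled and the fault hit a poisoned page, the SIGBUS code
 * is upgraded from plain BUS_ADRERR to BUS_MCEERR_AR, and
 * force_sig_info_fault() then fills in si_addr_lsb = PAGE_SHIFT so that
 * the signal handler can tell that (at least) one whole 4k page around
 * si_addr is affected.
 */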

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
               unsigned long address, unsigned int fault)
{
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
                        do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
        }
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;

        if ((error_code & PF_INSTR) && !pte_exec(*pte))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_USER | PF_RSVD))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        ret = spurious_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
}
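
/*
 * Illustrative sequence for the spurious-fault path above: CPU 0 maps a
 * kernel page read-only and CPU 1 caches that translation in its TLB.
 * CPU 0 later upgrades the mapping to read-write (e.g. via
 * set_memory_rw()) without broadcasting a TLB flush, since permissions
 * were only added. The first write from CPU 1 still hits the stale
 * read-only TLB entry and faults; spurious_fault() walks init_mm's page
 * tables, sees that the PTE already allows the write, and returns 1.
 * The faulting instruction is restarted and the stale entry gets
 * refreshed, with no cross-processor IPI ever needed.
 */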

int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
{
        if (write) {
                /* write, present and write, not present: */
                if (unlikely(!(vma->vm_flags & VM_WRITE)))
                        return 1;
                return 0;
        }

        /* read, present: */
        if (unlikely(error_code & PF_PROT))
                return 1;

        /* read, not present: */
        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
                return 1;

        return 0;
}
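
/*
 * Decision table for access_error(), derived from the checks above
 * (a return value of 1 ends up as SIGSEGV with SEGV_ACCERR):
 *
 *   write fault, VM_WRITE set            -> 0, let handle_mm_fault() run
 *   write fault, VM_WRITE clear          -> 1
 *   read fault,  PF_PROT set             -> 1 (present page, no access)
 *   read fault,  vma has no R/W/X flags  -> 1 (e.g. a PROT_NONE mapping)
 *   read fault,  otherwise               -> 0, let handle_mm_fault() run
 */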

static int fault_in_kernel_space(unsigned long address)
{
        return address >= TASK_SIZE_MAX;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        unsigned long address;
        struct mm_struct *mm;
        int write;
        int fault;

        tsk = current;
        mm = tsk->mm;

        /* Get the faulting address: */
        address = read_cr2();

        /*
         * Detect and handle instructions that would cause a page fault for
         * both a tracked kernel page and a userspace page.
         */
        if (kmemcheck_active(regs))
                kmemcheck_hide(regs);
        prefetchw(&mm->mmap_sem);

        if (unlikely(kmmio_fault(regs, address)))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(fault_in_kernel_space(address))) {
                if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
                        if (vmalloc_fault(address) >= 0)
                                return;

                        if (kmemcheck_fault(regs, address, error_code))
                                return;
                }

                /* Can handle a stale RO->RW TLB: */
                if (spurious_fault(error_code, address))
                        return;

                /* kprobes don't want to hook the spurious faults: */
                if (notify_page_fault(regs))
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock:
                 */
                bad_area_nosemaphore(regs, error_code, address);

                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (unlikely(notify_page_fault(regs)))
                return;
        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet:
         */
        if (user_mode_vm(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
        } else {
                if (regs->flags & X86_EFLAGS_IF)
                        local_irq_enable();
        }

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

        /*
         * If we're in an interrupt, have no user context or are running
         * in an atomic region then we must not take the fault:
         */
        if (unlikely(in_atomic() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space. All other faults represent errors in
         * the kernel and should generate an OOPS. Unfortunately, in the
         * case of an erroneous fault occurring in a code path which already
         * holds mmap_sem we will deadlock attempting to validate the fault
         * against the address space. Luckily the kernel only validly
         * references user space from well defined areas of code, which are
         * listed in the exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a
         * deadlock. Attempt to lock the address space, if we cannot we then
         * validate the source. If this is invalid we can skip the address
         * space check, thus avoiding the deadlock:
         */
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip)) {
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
                down_read(&mm->mmap_sem);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in
                 * which case we'll have missed the might_sleep() from
                 * down_read():
                 */
                might_sleep();
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535, $31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        write = error_code & PF_WRITE;

        if (unlikely(access_error(error_code, write, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault:
         */
        fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }

        if (fault & VM_FAULT_MAJOR) {
                tsk->maj_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                              regs, address);
        } else {
                tsk->min_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                              regs, address);
        }

        check_v8086_mode(regs, address, tsk);

        up_read(&mm->mmap_sem);
}
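
/*
 * Rough summary of the dispatch above: kernel-space addresses are tried
 * against vmalloc_fault(), kmemcheck and spurious_fault() without taking
 * any locks; everything else goes through the mmap_sem / find_vma() /
 * access_error() path into handle_mm_fault(), with the bad_area*()
 * helpers turning unresolved faults into SIGSEGV for user mode or an
 * oops via no_context() for kernel mode.
 */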