1 /* 2 * arch/s390/mm/fault.c 3 * 4 * S390 version 5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation 6 * Author(s): Hartmut Penner (hp@de.ibm.com) 7 * Ulrich Weigand (uweigand@de.ibm.com) 8 * 9 * Derived from "arch/i386/mm/fault.c" 10 * Copyright (C) 1995 Linus Torvalds 11 */ 12 13 #include <linux/signal.h> 14 #include <linux/sched.h> 15 #include <linux/kernel.h> 16 #include <linux/errno.h> 17 #include <linux/string.h> 18 #include <linux/types.h> 19 #include <linux/ptrace.h> 20 #include <linux/mman.h> 21 #include <linux/mm.h> 22 #include <linux/smp.h> 23 #include <linux/smp_lock.h> 24 #include <linux/init.h> 25 #include <linux/console.h> 26 #include <linux/module.h> 27 #include <linux/hardirq.h> 28 #include <linux/kprobes.h> 29 30 #include <asm/system.h> 31 #include <asm/uaccess.h> 32 #include <asm/pgtable.h> 33 #include <asm/kdebug.h> 34 #include <asm/s390_ext.h> 35 36 #ifndef CONFIG_64BIT 37 #define __FAIL_ADDR_MASK 0x7ffff000 38 #define __FIXUP_MASK 0x7fffffff 39 #define __SUBCODE_MASK 0x0200 40 #define __PF_RES_FIELD 0ULL 41 #else /* CONFIG_64BIT */ 42 #define __FAIL_ADDR_MASK -4096L 43 #define __FIXUP_MASK ~0L 44 #define __SUBCODE_MASK 0x0600 45 #define __PF_RES_FIELD 0x8000000000000000ULL 46 #endif /* CONFIG_64BIT */ 47 48 #ifdef CONFIG_SYSCTL 49 extern int sysctl_userprocess_debug; 50 #endif 51 52 extern void die(const char *,struct pt_regs *,long); 53 54 #ifdef CONFIG_KPROBES 55 static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); 56 int register_page_fault_notifier(struct notifier_block *nb) 57 { 58 return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); 59 } 60 61 int unregister_page_fault_notifier(struct notifier_block *nb) 62 { 63 return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); 64 } 65 66 static inline int notify_page_fault(enum die_val val, const char *str, 67 struct pt_regs *regs, long err, int trap, int sig) 68 { 69 struct die_args args = { 70 .regs = regs, 71 .str = str, 72 .err = err, 73 .trapnr 
= trap, 74 .signr = sig 75 }; 76 return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); 77 } 78 #else 79 static inline int notify_page_fault(enum die_val val, const char *str, 80 struct pt_regs *regs, long err, int trap, int sig) 81 { 82 return NOTIFY_DONE; 83 } 84 #endif 85 86 extern spinlock_t timerlist_lock; 87 88 /* 89 * Unlock any spinlocks which will prevent us from getting the 90 * message out (timerlist_lock is acquired through the 91 * console unblank code) 92 */ 93 void bust_spinlocks(int yes) 94 { 95 if (yes) { 96 oops_in_progress = 1; 97 } else { 98 int loglevel_save = console_loglevel; 99 console_unblank(); 100 oops_in_progress = 0; 101 /* 102 * OK, the message is on the console. Now we call printk() 103 * without oops_in_progress set so that printk will give klogd 104 * a poke. Hold onto your hats... 105 */ 106 console_loglevel = 15; 107 printk(" "); 108 console_loglevel = loglevel_save; 109 } 110 } 111 112 /* 113 * Check which address space is addressed by the access 114 * register in S390_lowcore.exc_access_id. 115 * Returns 1 for user space and 0 for kernel space. 116 */ 117 static int __check_access_register(struct pt_regs *regs, int error_code) 118 { 119 int areg = S390_lowcore.exc_access_id; 120 121 if (areg == 0) 122 /* Access via access register 0 -> kernel address */ 123 return 0; 124 save_access_regs(current->thread.acrs); 125 if (regs && areg < NUM_ACRS && current->thread.acrs[areg] <= 1) 126 /* 127 * access register contains 0 -> kernel address, 128 * access register contains 1 -> user space address 129 */ 130 return current->thread.acrs[areg]; 131 132 /* Something unhealthy was done with the access registers... */ 133 die("page fault via unknown access register", regs, error_code); 134 do_exit(SIGKILL); 135 return 0; 136 } 137 138 /* 139 * Check which address space the address belongs to. 140 * May return 1 or 2 for user space and 0 for kernel space. 
141 * Returns 2 for user space in primary addressing mode with 142 * CONFIG_S390_EXEC_PROTECT on and kernel parameter noexec=on. 143 */ 144 static inline int check_user_space(struct pt_regs *regs, int error_code) 145 { 146 /* 147 * The lowest two bits of S390_lowcore.trans_exc_code indicate 148 * which paging table was used: 149 * 0: Primary Segment Table Descriptor 150 * 1: STD determined via access register 151 * 2: Secondary Segment Table Descriptor 152 * 3: Home Segment Table Descriptor 153 */ 154 int descriptor = S390_lowcore.trans_exc_code & 3; 155 if (unlikely(descriptor == 1)) 156 return __check_access_register(regs, error_code); 157 if (descriptor == 2) 158 return current->thread.mm_segment.ar4; 159 return ((descriptor != 0) ^ (switch_amode)) << s390_noexec; 160 } 161 162 /* 163 * Send SIGSEGV to task. This is an external routine 164 * to keep the stack usage of do_page_fault small. 165 */ 166 static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, 167 int si_code, unsigned long address) 168 { 169 struct siginfo si; 170 171 #if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) 172 #if defined(CONFIG_SYSCTL) 173 if (sysctl_userprocess_debug) 174 #endif 175 { 176 printk("User process fault: interruption code 0x%lX\n", 177 error_code); 178 printk("failing address: %lX\n", address); 179 show_regs(regs); 180 } 181 #endif 182 si.si_signo = SIGSEGV; 183 si.si_code = si_code; 184 si.si_addr = (void __user *) address; 185 force_sig_info(SIGSEGV, &si, current); 186 } 187 188 #ifdef CONFIG_S390_EXEC_PROTECT 189 extern long sys_sigreturn(struct pt_regs *regs); 190 extern long sys_rt_sigreturn(struct pt_regs *regs); 191 extern long sys32_sigreturn(struct pt_regs *regs); 192 extern long sys32_rt_sigreturn(struct pt_regs *regs); 193 194 static inline void do_sigreturn(struct mm_struct *mm, struct pt_regs *regs, 195 int rt) 196 { 197 up_read(&mm->mmap_sem); 198 clear_tsk_thread_flag(current, TIF_SINGLE_STEP); 199 #ifdef CONFIG_COMPAT 200 if 
(test_tsk_thread_flag(current, TIF_31BIT)) { 201 if (rt) 202 sys32_rt_sigreturn(regs); 203 else 204 sys32_sigreturn(regs); 205 return; 206 } 207 #endif /* CONFIG_COMPAT */ 208 if (rt) 209 sys_rt_sigreturn(regs); 210 else 211 sys_sigreturn(regs); 212 return; 213 } 214 215 static int signal_return(struct mm_struct *mm, struct pt_regs *regs, 216 unsigned long address, unsigned long error_code) 217 { 218 pgd_t *pgd; 219 pmd_t *pmd; 220 pte_t *pte; 221 u16 *instruction; 222 unsigned long pfn, uaddr = regs->psw.addr; 223 224 spin_lock(&mm->page_table_lock); 225 pgd = pgd_offset(mm, uaddr); 226 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 227 goto out_fault; 228 pmd = pmd_offset(pgd, uaddr); 229 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 230 goto out_fault; 231 pte = pte_offset_map(pmd_offset(pgd_offset(mm, uaddr), uaddr), uaddr); 232 if (!pte || !pte_present(*pte)) 233 goto out_fault; 234 pfn = pte_pfn(*pte); 235 if (!pfn_valid(pfn)) 236 goto out_fault; 237 spin_unlock(&mm->page_table_lock); 238 239 instruction = (u16 *) ((pfn << PAGE_SHIFT) + (uaddr & (PAGE_SIZE-1))); 240 if (*instruction == 0x0a77) 241 do_sigreturn(mm, regs, 0); 242 else if (*instruction == 0x0aad) 243 do_sigreturn(mm, regs, 1); 244 else { 245 printk("- XXX - do_exception: task = %s, primary, NO EXEC " 246 "-> SIGSEGV\n", current->comm); 247 up_read(&mm->mmap_sem); 248 current->thread.prot_addr = address; 249 current->thread.trap_no = error_code; 250 do_sigsegv(regs, error_code, SEGV_MAPERR, address); 251 } 252 return 0; 253 out_fault: 254 spin_unlock(&mm->page_table_lock); 255 return -EFAULT; 256 } 257 #endif /* CONFIG_S390_EXEC_PROTECT */ 258 259 /* 260 * This routine handles page faults. It determines the address, 261 * and the problem, and then passes it off to one of the appropriate 262 * routines. 
 *
 * error_code:
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 *
 * is_protection is non-zero for protection exceptions; it is also passed
 * to handle_mm_fault() as its last argument.
 */
static inline void __kprobes
do_exception(struct pt_regs *regs, unsigned long error_code, int is_protection)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	int user_address;
	const struct exception_table_entry *fixup;
	int si_code = SEGV_MAPERR;

	tsk = current;
	mm = tsk->mm;

	/* Give registered notifiers the chance to claim the fault. */
	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (is_protection && !(S390_lowcore.trans_exc_code & 4)) {

		/* Low-address protection hit in kernel mode means
		   NULL pointer write access in kernel mode.  */
		if (!(regs->psw.mask & PSW_MASK_PSTATE)) {
			address = 0;
			user_address = 0;
			goto no_context;
		}

		/* Low-address protection hit in user mode 'cannot happen'.  */
		die ("Low-address protection", regs, error_code);
		do_exit(SIGKILL);
	}

	/*
	 * get the failing address
	 * more specific the segment and page table portion of
	 * the address
	 */
	address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK;
	user_address = check_user_space(regs, error_code);

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	if (user_address == 0 || in_atomic() || !mm)
		goto no_context;

	/*
	 * When we get here, the fault happened in the current
	 * task's user address space, so we can switch on the
	 * interrupts again and then search the VMAs
	 */
	local_irq_enable();

	down_read(&mm->mmap_sem);

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;

#ifdef CONFIG_S390_EXEC_PROTECT
	if (unlikely((user_address == 2) && !(vma->vm_flags & VM_EXEC)))
		if (!signal_return(mm, regs, address, error_code))
			/*
			 * signal_return() has done an up_read(&mm->mmap_sem)
			 * if it returns 0.
			 */
			return;
#endif

	if (vma->vm_start <= address)
		goto good_area;
	/* Address lies below the vma: only acceptable for a growable stack. */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	/* From here on a failure is an access error, not a missing mapping. */
	si_code = SEGV_ACCERR;
	if (!is_protection) {
		/* page not present, check vm flags */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	} else {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	}

survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, is_protection)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	case VM_FAULT_OOM:
		goto out_of_memory;
	default:
		BUG();
	}

	up_read(&mm->mmap_sem);
	/*
	 * The instruction that caused the program check will
	 * be repeated. Don't signal single step via SIGTRAP.
	 */
	clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

	/* User mode accesses just cause a SIGSEGV */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		tsk->thread.prot_addr = address;
		tsk->thread.trap_no = error_code;
		do_sigsegv(regs, error_code, si_code, address);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK);
	if (fixup) {
		/* Resume execution at the fixup address. */
		regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE;
		return;
	}

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	if (user_address == 0)
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " at virtual kernel address %p\n", (void *)address);
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " at virtual user address %p\n", (void *)address);

	die("Oops", regs, error_code);
	do_exit(SIGKILL);


/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	/* The init task is not killed: yield, retake the lock and retry. */
	if (is_init(tsk)) {
		yield();
		down_read(&mm->mmap_sem);
		goto survive;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (regs->psw.mask & PSW_MASK_PSTATE)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.prot_addr = address;
	tsk->thread.trap_no = error_code;
	force_sig(SIGBUS, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!(regs->psw.mask & PSW_MASK_PSTATE))
		goto no_context;
}

/*
 * Entry point for protection exceptions.  Moves regs->psw.addr back by
 * (error_code >> 16) and then runs do_exception() with a fixed error
 * code of 4 and is_protection set.
 * NOTE(review): the upper half of error_code is presumably the length of
 * the faulting instruction, so that it is re-executed - confirm.
 */
void do_protection_exception(struct pt_regs *regs, unsigned long error_code)
{
	regs->psw.addr -= (error_code >> 16);
	do_exception(regs, 4, 1);
}

/*
 * Entry point for translation (DAT) exceptions.  Passes the low 8 bits
 * of error_code to do_exception() with is_protection cleared.
 */
void do_dat_exception(struct pt_regs *regs, unsigned long error_code)
{
	do_exception(regs, error_code & 0xff, 0);
}

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */
static ext_int_info_t ext_int_pfault;
/* Non-zero once pfault handling is turned off (boot option or init failure). */
static int pfault_disable = 0;

/* Handler for the "nopfault" kernel parameter: disable pfault handling. */
static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

/*
 * Parameter block handed to diagnose 0x258.
 * NOTE(review): field meanings are not visible in this file; see the
 * initializers in pfault_init() and pfault_fini() for the values used.
 */
typedef struct {
	__u16 refdiagc;
	__u16 reffcode;
	__u16 refdwlen;
	__u16 refversn;
	__u64 refgaddr;
	__u64 refselmk;
	__u64 refcmpmk;
	__u64 reserved;
} __attribute__ ((packed)) pfault_refbk_t;

/*
 * Issue diagnose 0x258 with function code 0 to set up pfault handling.
 *
 * Returns -1 without doing anything if not running under VM or if pfault
 * is disabled.  Otherwise returns the value left in rc by the diag, or 8
 * if the diag instruction itself faults (label 1 is the exception table
 * fixup for label 0).  Bit 9 of control register 0 is set on every path
 * that reaches the diag.
 */
int pfault_init(void)
{
	pfault_refbk_t refbk =
		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48,
		  __PF_RES_FIELD };
        int rc;

	if (!MACHINE_IS_VM || pfault_disable)
		return -1;
	asm volatile(
		" diag %1,%0,0x258\n"
		"0: j 2f\n"
		"1: la %0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
        __ctl_set_bit(0, 9);
        return rc;
}

/*
 * Counterpart of pfault_init(): clear bit 9 of control register 0 and
 * issue diagnose 0x258 with function code 1.  Does nothing if not running
 * under VM or if pfault is disabled.  A fault on the diag is ignored via
 * the exception table entry.
 */
void pfault_fini(void)
{
	pfault_refbk_t refbk =
	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };

	if (!MACHINE_IS_VM || pfault_disable)
		return;
	__ctl_clear_bit(0,9);
	asm volatile(
		" diag %0,0,0x258\n"
		"0:\n"
		EX_TABLE(0b,0b)
		: : "a" (&refbk), "m" (refbk) : "cc");
}

/*
 * External interrupt handler for pfault interrupts (registered for code
 * 0x2603 in pfault_irq_init()).
 *
 * thread.pfault_wait is used as a small state machine shared between the
 * initial and the completion interrupt:
 *    0  no pfault outstanding
 *    1  initial interrupt seen, task put to sleep
 *   -1  completion interrupt arrived before the initial one
 */
static void pfault_interrupt(__u16 error_code)
{
	struct task_struct *tsk;
	__u16 subcode;

	/*
	 * Get the external interruption subcode & pfault
	 * initial/completion signal bit. VM stores this
	 * in the 'cpu address' field associated with the
	 * external interrupt.
	 */
	subcode = S390_lowcore.cpu_addr;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;

	/*
	 * Get the token (= address of the task structure of the affected task).
	 */
	tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;

	if (subcode & 0x0080) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (xchg(&tsk->thread.pfault_wait, -1) != 0) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			wake_up_process(tsk);
			/* Drop the reference taken by the initial interrupt. */
			put_task_struct(tsk);
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		get_task_struct(tsk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
			/* Completion interrupt was faster than the initial
			 * interrupt (swapped in a -1 for pfault_wait). Set
			 * pfault_wait back to zero and exit. This can be
			 * done safely because tsk is running in kernel
			 * mode and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			set_task_state(tsk, TASK_RUNNING);
			put_task_struct(tsk);
		} else
			set_tsk_need_resched(tsk);
	}
}

/*
 * Boot-time setup of pfault handling.  Does nothing unless running under
 * VM.  Registers pfault_interrupt() for external interrupt 0x2603 (panics
 * if that fails) and calls pfault_init(); if pfault_init() does not
 * return 0, pfault is disabled and the handler is unregistered again.
 */
void __init pfault_irq_init(void)
{
	if (!MACHINE_IS_VM)
		return;

	/*
	 * Try to get pfault pseudo page faults going.
	 */
	if (register_early_external_interrupt(0x2603, pfault_interrupt,
					      &ext_int_pfault) != 0)
		panic("Couldn't request external interrupt 0x2603");

	if (pfault_init() == 0)
		return;

	/* Tough luck, no pfault. */
	pfault_disable = 1;
	unregister_early_external_interrupt(0x2603, pfault_interrupt,
					    &ext_int_pfault);
}
#endif