// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <asm/current.h>
#include <asm/tlbflush.h>
#include <arch.h>
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
#include <skas.h>

/*
 * NOTE: UML does not have exception tables. As such, this is almost a copy
 * of the code in mm/memory.c, only adjusting the logic to simply check whether
 * we are coming from the kernel instead of doing an additional lookup in the
 * exception table.
 * We can do this simplification because we never get here if the exception was
 * fixable.
 */
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (!is_user)
		return false;

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
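/*
 * Purely illustrative sketch of the upgrade described above, kept under
 * "#if 0" (the same convention used later in this file) so it is never
 * compiled.  RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED are private to
 * kernel/locking/rwsem.c, and a real implementation would also need the
 * lockdep annotations mentioned above, so treat this as a sketch of the
 * idea rather than working code.
 */
#if 0
static inline bool mmap_upgrade_trylock_sketch(struct mm_struct *mm)
{
	/* We hold exactly one read reference on the lock. */
	long expected = RWSEM_READER_BIAS;

	/* Atomically trade our single reader bias for the writer bit. */
	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &expected,
					       RWSEM_WRITER_LOCKED);
}
#endif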
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
{
	mmap_read_unlock(mm);
	if (!is_user)
		return false;

	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks whether we are coming from the kernel (UML has no
 * exception tables to consult - see the note above) in order to only
 * do all this for faults that are actually expected.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
static struct vm_area_struct *
um_lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, bool is_user)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, is_user))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, is_user))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}

/*
 * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
 * segv().
 */
int handle_page_fault(unsigned long address, unsigned long ip,
		      int is_write, int is_user, int *code_out)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	pmd_t *pmd;
	pte_t *pte;
	int err = -EFAULT;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	*code_out = SEGV_MAPERR;

	/*
	 * If the fault happened with pagefaults disabled, don't take the
	 * fault, just fail.
	 */
	if (faulthandler_disabled())
		goto out_nosemaphore;

	if (is_user)
		flags |= FAULT_FLAG_USER;
retry:
	vma = um_lock_mm_and_find_vma(mm, address, is_user);
	if (!vma)
		goto out_nosemaphore;

	*code_out = SEGV_ACCERR;
	if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto out;
		flags |= FAULT_FLAG_WRITE;
	} else {
		/* Don't require VM_READ|VM_EXEC for write faults! */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto out;
	}

	do {
		vm_fault_t fault;

		fault = handle_mm_fault(vma, address, flags, NULL);

		if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
			goto out_nosemaphore;

		/* The fault is fully completed (including releasing mmap lock) */
		if (fault & VM_FAULT_COMPLETED)
			return 0;

		if (unlikely(fault & VM_FAULT_ERROR)) {
			if (fault & VM_FAULT_OOM) {
				goto out_of_memory;
			} else if (fault & VM_FAULT_SIGSEGV) {
				goto out;
			} else if (fault & VM_FAULT_SIGBUS) {
				err = -EACCES;
				goto out;
			}
			BUG();
		}
		if (fault & VM_FAULT_RETRY) {
			flags |= FAULT_FLAG_TRIED;

			goto retry;
		}

		pmd = pmd_off(mm, address);
		pte = pte_offset_kernel(pmd, address);
	} while (!pte_present(*pte));
	err = 0;
	/*
	 * The below warning was added in place of
	 *	pte_mkyoung(); if (is_write) pte_mkdirty();
	 * If it's triggered, we'd normally see a hang here (a clean pte is
	 * marked read-only to emulate the dirty bit).
	 * However, the generic code can mark a PTE writable but clean on a
	 * concurrent read fault, triggering this harmlessly. So comment it out.
	 */
#if 0
	WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte)));
#endif

out:
	mmap_read_unlock(mm);
out_nosemaphore:
	return err;

out_of_memory:
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 */
	mmap_read_unlock(mm);
	if (!is_user)
		goto out_nosemaphore;
	pagefault_out_of_memory();
	return 0;
}
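/*
 * Print a rate-limited "segfault at ..." diagnostic, similar to the one
 * other architectures emit, for a faulting process that has no SIGSEGV
 * handler installed.  PID 1 is logged at KERN_EMERG, everything else at
 * KERN_INFO.
 */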
static void show_segv_info(struct uml_pt_regs *regs)
{
	struct task_struct *tsk = current;
	struct faultinfo *fi = UPT_FAULTINFO(regs);

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
		(void *)UPT_IP(regs), (void *)UPT_SP(regs),
		fi->error_code);

	print_vma_addr(KERN_CONT " in ", UPT_IP(regs));
	printk(KERN_CONT "\n");
}

static void bad_segv(struct faultinfo fi, unsigned long ip)
{
	current->thread.arch.faultinfo = fi;
	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
}

void fatal_sigsegv(void)
{
	force_fatal_sig(SIGSEGV);
	do_signal(&current->thread.regs);
	/*
	 * This is to tell gcc that we're not returning - do_signal
	 * can, in general, return, but in this case it won't, since
	 * we just got a fatal SIGSEGV queued.
	 */
	os_dump_core();
}

/**
 * segv_handler() - the SIGSEGV handler
 * @sig: the signal number
 * @unused_si: the signal info struct; unused in this handler
 * @regs: the ptrace register information
 * @mc: the mcontext of the signal
 *
 * The handler first extracts the faultinfo from the UML ptrace regs struct.
 * If the fault came from a UML userspace process and is not fixable,
 * bad_segv() is called. Otherwise the signal is handled by segv().
 */
void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
		  void *mc)
{
	struct faultinfo *fi = UPT_FAULTINFO(regs);

	if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
		show_segv_info(regs);
		bad_segv(*fi, UPT_IP(regs));
		return;
	}
	segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc);
}

/*
 * We give a *copy* of the faultinfo in the regs to segv.
 * This must be done, since nesting SEGVs could overwrite
 * the info in the regs. A pointer to the info then would
 * give us bad data!
 */
unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
		   struct uml_pt_regs *regs, void *mc)
{
	int si_code;
	int err;
	int is_write = FAULT_WRITE(fi);
	unsigned long address = FAULT_ADDRESS(fi);

	if (!is_user && regs)
		current->thread.segv_regs = container_of(regs, struct pt_regs, regs);

	if (!is_user && init_mm.context.sync_tlb_range_to) {
		/*
		 * Kernel has pending updates from set_ptes that were not
		 * flushed yet. Syncing them should fix the pagefault (if not
		 * we'll get here again and panic).
		 */
		err = um_tlb_sync(&init_mm);
		if (err == -ENOMEM)
			report_enomem();
		if (err)
			panic("Failed to sync kernel TLBs: %d", err);
		goto out;
	}
	else if (current->pagefault_disabled) {
		if (!mc) {
			show_regs(container_of(regs, struct pt_regs, regs));
			panic("Segfault with pagefaults disabled but no mcontext");
		}
		if (!current->thread.segv_continue) {
			show_regs(container_of(regs, struct pt_regs, regs));
			panic("Segfault without recovery target");
		}
		mc_set_rip(mc, current->thread.segv_continue);
		current->thread.segv_continue = NULL;
		goto out;
	}
	else if (current->mm == NULL) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Segfault with no mm");
	}
	else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
		       address, ip);
	}

	if (SEGV_IS_FIXABLE(&fi))
		err = handle_page_fault(address, ip, is_write, is_user,
					&si_code);
	else {
		err = -EFAULT;
		/*
		 * A thread accessed NULL; we got a fault, but CR2 is invalid.
		 * This code is used in __do_copy_from_user() of TT mode.
		 * XXX tt mode is gone, so maybe this isn't needed any more
		 */
		address = 0;
	}

	if (!err)
		goto out;
	else if (!is_user && arch_fixup(ip, regs))
		goto out;

	if (!is_user) {
		show_regs(container_of(regs, struct pt_regs, regs));
		panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
		      address, ip);
	}

	show_segv_info(regs);

	if (err == -EACCES) {
		current->thread.arch.faultinfo = fi;
		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
	} else {
		BUG_ON(err != -EFAULT);
		current->thread.arch.faultinfo = fi;
		force_sig_fault(SIGSEGV, si_code, (void __user *) address);
	}

out:
	if (regs)
		current->thread.segv_regs = NULL;

	return 0;
}

void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs,
		  void *mc)
{
	int code, err;

	if (!UPT_IS_USER(regs)) {
		if (sig == SIGBUS)
			printk(KERN_ERR "Bus error - the host /dev/shm or /tmp "
			       "mount likely just ran out of space\n");
		panic("Kernel mode signal %d", sig);
	}

	arch_examine_signal(sig, regs);

	/*
	 * Is the signal layout for the signal known?
	 * Signal data must be scrubbed to prevent information leaks.
	 */
	code = si->si_code;
	err = si->si_errno;
	if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
		struct faultinfo *fi = UPT_FAULTINFO(regs);

		current->thread.arch.faultinfo = *fi;
		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
	} else {
		printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
		       sig, code, err);
		force_sig(sig);
	}
}

void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
	   void *mc)
{
	do_IRQ(WINCH_IRQ, regs);
}
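/*
 * Note: the handlers above are not installed as host signal handlers
 * directly; the host-side signal code under arch/um/os-Linux/ (the
 * sig_info[] dispatch table in signal.c) extracts the uml_pt_regs and
 * mcontext from the host signal frame before calling into this file.
 */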