// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
	vm_flags_t vm_flags = vma->vm_flags;
	pgprot_t vm_page_prot;

	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
	if (vma_wants_writenotify(vma, vm_page_prot)) {
		vm_flags &= ~VM_SHARED;
		vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
	}
	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}
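
/*
 * Illustrative sketch only: a lockless reader of vm_page_prot is expected
 * to pair with the WRITE_ONCE() above, e.g.:
 *
 *	pgprot_t prot = READ_ONCE(vma->vm_page_prot);
 *
 * so the compiler cannot tear or re-read the value while the reader walks
 * page tables without holding mmap_lock.
 */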

/*
 * check_brk_limits() - Use platform specific check of range & verify mlock
 * limits.
 * @addr: The address to check
 * @len: The size of increase.
 *
 * Return: 0 on success.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
	unsigned long mapped_addr;

	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
	if (IS_ERR_VALUE(mapped_addr))
		return mapped_addr;

	return mlock_future_ok(current->mm,
			       current->mm->def_flags & VM_LOCKED, len)
		? 0 : -EAGAIN;
}

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long newbrk, oldbrk, origbrk;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *brkvma, *next = NULL;
	unsigned long min_brk;
	bool populate = false;
	LIST_HEAD(uf);
	struct vma_iterator vmi;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	origbrk = mm->brk;

	min_brk = mm->start_brk;
#ifdef CONFIG_COMPAT_BRK
	/*
	 * CONFIG_COMPAT_BRK can still be overridden by setting
	 * randomize_va_space to 2, which will still cause mm->start_brk
	 * to be arbitrarily shifted
	 */
	if (!current->brk_randomized)
		min_brk = mm->end_data;
#endif
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit in the case where the limit is
	 * not page aligned -Ram Gupta
	 */
	if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
			      mm->end_data, mm->start_data))
		goto out;

	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk) {
		mm->brk = brk;
		goto success;
	}

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		/* Search one past newbrk */
		vma_iter_init(&vmi, mm, newbrk);
		brkvma = vma_find(&vmi, oldbrk);
		if (!brkvma || brkvma->vm_start >= oldbrk)
			goto out; /* mapping intersects with an existing non-brk vma. */
		/*
		 * mm->brk must be protected by write mmap_lock.
		 * do_vmi_align_munmap() will drop the lock on success, so
		 * update it before calling do_vma_munmap().
		 */
		mm->brk = brk;
		if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
					/* unlock = */ true))
			goto out;

		goto success_unlocked;
	}

	if (check_brk_limits(oldbrk, newbrk - oldbrk))
		goto out;

	/*
	 * Only check if the next VMA is within the stack_guard_gap of the
	 * expansion area
	 */
	vma_iter_init(&vmi, mm, oldbrk);
	next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
		goto out;

	brkvma = vma_prev_limit(&vmi, mm->start_brk);
	/* Ok, looks good - let it rip. */
	if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk,
			 EMPTY_VMA_FLAGS) < 0)
		goto out;

	mm->brk = brk;
	if (mm->def_flags & VM_LOCKED)
		populate = true;

success:
	mmap_write_unlock(mm);
success_unlocked:
	userfaultfd_unmap_complete(mm, &uf);
	if (populate)
		mm_populate(oldbrk, newbrk - oldbrk);
	return brk;

out:
	mm->brk = origbrk;
	mmap_write_unlock(mm);
	return origbrk;
}
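
/*
 * Worked example (illustrative): with 4 KiB pages, moving mm->brk from
 * 0x10234 to 0x10abc leaves PAGE_ALIGN() of both at 0x11000, so only
 * mm->brk is updated and no VMA changes; growing to 0x11004 instead
 * aligns to 0x12000 and takes the do_brk_flags() expansion path above.
 */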

/*
 * If a hint addr is less than mmap_min_addr change hint to be as
 * low as possible but still greater than mmap_min_addr
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
	hint &= PAGE_MASK;
	if (((void *)hint != NULL) &&
	    (hint < mmap_min_addr))
		return PAGE_ALIGN(mmap_min_addr);
	return hint;
}

bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
		     unsigned long bytes)
{
	unsigned long locked_pages, limit_pages;

	if (!is_vma_locked || capable(CAP_IPC_LOCK))
		return true;

	locked_pages = bytes >> PAGE_SHIFT;
	locked_pages += mm->locked_vm;

	limit_pages = rlimit(RLIMIT_MEMLOCK);
	limit_pages >>= PAGE_SHIFT;

	return locked_pages <= limit_pages;
}

static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		return MAX_LFS_FILESIZE;

	if (S_ISBLK(inode->i_mode))
		return MAX_LFS_FILESIZE;

	if (S_ISSOCK(inode->i_mode))
		return MAX_LFS_FILESIZE;

	/* Special "we do even unsigned file positions" case */
	if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
		return 0;

	/* Yes, random drivers might want more. But I'm tired of buggy drivers */
	return ULONG_MAX;
}

static inline bool file_mmap_ok(struct file *file, struct inode *inode,
				unsigned long pgoff, unsigned long len)
{
	u64 maxsize = file_mmap_size_max(file, inode);

	if (maxsize && len > maxsize)
		return false;
	maxsize -= len;
	if (pgoff > maxsize >> PAGE_SHIFT)
		return false;
	return true;
}
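
/*
 * Worked example (illustrative): with 4 KiB pages, RLIMIT_MEMLOCK = 64 KiB
 * gives limit_pages = 16. A locked request of bytes = 32 KiB (8 pages) on
 * an mm with mm->locked_vm = 10 yields locked_pages = 18 > 16, so
 * mlock_future_ok() fails unless the task has CAP_IPC_LOCK. Similarly,
 * file_mmap_ok() rejects a pgoff that would place the end of the mapping
 * past file_mmap_size_max(), making the caller return -EOVERFLOW.
 */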

/**
 * do_mmap() - Perform a userland memory mapping into the current process
 * address space of length @len with protection bits @prot, mmap flags @flags
 * (from which VMA flags will be inferred), and any additional VMA flags to
 * apply @vm_flags. If this is a file-backed mapping then the file is specified
 * in @file and page offset into the file via @pgoff.
 *
 * This function does not perform security checks on the file and assumes, if
 * @uf is non-NULL, the caller has provided a list head to track unmap events
 * for userfaultfd @uf.
 *
 * It also simply indicates whether memory population is required by setting
 * @populate, which must be non-NULL, expecting the caller to actually perform
 * this task itself if appropriate.
 *
 * This function will invoke architecture-specific (and if provided and
 * relevant, file system-specific) logic to determine the most appropriate
 * unmapped area in which to place the mapping if not MAP_FIXED.
 *
 * Callers which require userland mmap() behaviour should invoke vm_mmap(),
 * which is also exported for module use.
 *
 * Those which require this behaviour minus the security checks, userfaultfd
 * and populate behaviour, and which handle the mmap write lock themselves,
 * should call this function.
 *
 * Note that the returned address may reside within a merged VMA if an
 * appropriate merge were to take place, so it doesn't necessarily specify the
 * start of a VMA, rather only the start of a valid mapped range of length
 * @len bytes, rounded down to the nearest page size.
 *
 * The caller must write-lock current->mm->mmap_lock.
 *
 * @file: An optional struct file pointer describing the file which is to be
 * mapped, if a file-backed mapping.
 * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
 * address at which to perform this mapping. See mmap(2) for details. Must be
 * page-aligned.
 * @len: The length of the mapping. Will be page-aligned and must be at least 1
 * page in size.
 * @prot: Protection bits describing access required to the mapping. See
 * mmap(2) for details.
 * @flags: Flags specifying how the mapping should be performed, see mmap(2)
 * for details.
 * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
 * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
 * @populate: A pointer to a value which will be set to 0 if no population of
 * the range is required, or the number of bytes to populate if it is. Must be
 * non-NULL. See mmap(2) for details as to under what circumstances population
 * of the range occurs.
 * @uf: An optional pointer to a list head to track userfaultfd unmap events
 * should unmapping events arise. If provided, it is up to the caller to manage
 * this.
 *
 * Returns: Either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, vm_flags_t vm_flags,
			unsigned long pgoff, unsigned long *populate,
			struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	int pkey = 0;

	*populate = 0;

	mmap_assert_write_locked(mm);

	if (!len)
		return -EINVAL;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we don't add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && path_noexec(&file->f_path)))
			prot |= PROT_EXEC;

	/* force arch specific MAP_FIXED handling in get_unmapped_area */
	if (flags & MAP_FIXED_NOREPLACE)
		flags |= MAP_FIXED;

	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows.. */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Too many mappings? */
	if (mm->map_count > get_sysctl_max_map_count())
		return -ENOMEM;

	/*
	 * addr is returned from get_unmapped_area.
	 * There are two cases:
	 * 1) MAP_FIXED == false
	 *	unallocated memory, no need to check sealing.
	 * 2) MAP_FIXED == true
	 *	sealing is checked inside mmap_region when
	 *	do_vmi_munmap is called.
	 */

	if (prot == PROT_EXEC) {
		pkey = execute_only_pkey(mm);
		if (pkey < 0)
			pkey = 0;
	}

	/* Do simple checking here so the lower-level routines won't have
	 * to. We assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
401 */ 402 vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | 403 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 404 405 /* Obtain the address to map to. we verify (or select) it and ensure 406 * that it represents a valid section of the address space. 407 */ 408 addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); 409 if (IS_ERR_VALUE(addr)) 410 return addr; 411 412 if (flags & MAP_FIXED_NOREPLACE) { 413 if (find_vma_intersection(mm, addr, addr + len)) 414 return -EEXIST; 415 } 416 417 if (flags & MAP_LOCKED) 418 if (!can_do_mlock()) 419 return -EPERM; 420 421 if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len)) 422 return -EAGAIN; 423 424 if (file) { 425 struct inode *inode = file_inode(file); 426 unsigned long flags_mask; 427 int err; 428 429 if (!file_mmap_ok(file, inode, pgoff, len)) 430 return -EOVERFLOW; 431 432 flags_mask = LEGACY_MAP_MASK; 433 if (file->f_op->fop_flags & FOP_MMAP_SYNC) 434 flags_mask |= MAP_SYNC; 435 436 switch (flags & MAP_TYPE) { 437 case MAP_SHARED: 438 /* 439 * Force use of MAP_SHARED_VALIDATE with non-legacy 440 * flags. E.g. MAP_SYNC is dangerous to use with 441 * MAP_SHARED as you don't know which consistency model 442 * you will get. We silently ignore unsupported flags 443 * with MAP_SHARED to preserve backward compatibility. 444 */ 445 flags &= LEGACY_MAP_MASK; 446 fallthrough; 447 case MAP_SHARED_VALIDATE: 448 if (flags & ~flags_mask) 449 return -EOPNOTSUPP; 450 if (prot & PROT_WRITE) { 451 if (!(file->f_mode & FMODE_WRITE)) 452 return -EACCES; 453 if (IS_SWAPFILE(file->f_mapping->host)) 454 return -ETXTBSY; 455 } 456 457 /* 458 * Make sure we don't allow writing to an append-only 459 * file.. 460 */ 461 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 462 return -EACCES; 463 464 vm_flags |= VM_SHARED | VM_MAYSHARE; 465 if (!(file->f_mode & FMODE_WRITE)) 466 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 467 fallthrough; 468 case MAP_PRIVATE: 469 if (!(file->f_mode & FMODE_READ)) 470 return -EACCES; 471 if (path_noexec(&file->f_path)) { 472 if (vm_flags & VM_EXEC) 473 return -EPERM; 474 vm_flags &= ~VM_MAYEXEC; 475 } 476 477 if (!can_mmap_file(file)) 478 return -ENODEV; 479 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 480 return -EINVAL; 481 break; 482 483 default: 484 return -EINVAL; 485 } 486 487 /* 488 * Check to see if we are violating any seals and update VMA 489 * flags if necessary to avoid future seal violations. 490 */ 491 err = memfd_check_seals_mmap(file, &vm_flags); 492 if (err) 493 return (unsigned long)err; 494 } else { 495 switch (flags & MAP_TYPE) { 496 case MAP_SHARED: 497 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 498 return -EINVAL; 499 /* 500 * Ignore pgoff. 501 */ 502 pgoff = 0; 503 vm_flags |= VM_SHARED | VM_MAYSHARE; 504 break; 505 case MAP_DROPPABLE: 506 if (VM_DROPPABLE == VM_NONE) 507 return -ENOTSUPP; 508 /* 509 * A locked or stack area makes no sense to be droppable. 510 * 511 * Also, since droppable pages can just go away at any time 512 * it makes no sense to copy them on fork or dump them. 513 * 514 * And don't attempt to combine with hugetlb for now. 515 */ 516 if (flags & (MAP_LOCKED | MAP_HUGETLB)) 517 return -EINVAL; 518 if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) 519 return -EINVAL; 520 521 vm_flags |= VM_DROPPABLE; 522 523 /* 524 * If the pages can be dropped, then it doesn't make 525 * sense to reserve them. 526 */ 527 vm_flags |= VM_NORESERVE; 528 529 /* 530 * Likewise, they're volatile enough that they 531 * shouldn't survive forks or coredumps. 
532 */ 533 vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; 534 fallthrough; 535 case MAP_PRIVATE: 536 /* 537 * Set pgoff according to addr for anon_vma. 538 */ 539 pgoff = addr >> PAGE_SHIFT; 540 break; 541 default: 542 return -EINVAL; 543 } 544 } 545 546 /* 547 * Set 'VM_NORESERVE' if we should not account for the 548 * memory use of this mapping. 549 */ 550 if (flags & MAP_NORESERVE) { 551 /* We honor MAP_NORESERVE if allowed to overcommit */ 552 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 553 vm_flags |= VM_NORESERVE; 554 555 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 556 if (file && is_file_hugepages(file)) 557 vm_flags |= VM_NORESERVE; 558 } 559 560 addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); 561 if (!IS_ERR_VALUE(addr) && 562 ((vm_flags & VM_LOCKED) || 563 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 564 *populate = len; 565 return addr; 566 } 567 568 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, 569 unsigned long prot, unsigned long flags, 570 unsigned long fd, unsigned long pgoff) 571 { 572 struct file *file = NULL; 573 unsigned long retval; 574 575 if (!(flags & MAP_ANONYMOUS)) { 576 audit_mmap_fd(fd, flags); 577 file = fget(fd); 578 if (!file) 579 return -EBADF; 580 if (is_file_hugepages(file)) { 581 len = ALIGN(len, huge_page_size(hstate_file(file))); 582 } else if (unlikely(flags & MAP_HUGETLB)) { 583 retval = -EINVAL; 584 goto out_fput; 585 } 586 } else if (flags & MAP_HUGETLB) { 587 struct hstate *hs; 588 589 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 590 if (!hs) 591 return -EINVAL; 592 593 len = ALIGN(len, huge_page_size(hs)); 594 /* 595 * VM_NORESERVE is used because the reservations will be 596 * taken when vm_ops->mmap() is called 597 */ 598 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 599 mk_vma_flags(VMA_NORESERVE_BIT), 600 HUGETLB_ANONHUGE_INODE, 601 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 602 if (IS_ERR(file)) 603 return PTR_ERR(file); 604 } 605 606 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 607 out_fput: 608 if (file) 609 fput(file); 610 return retval; 611 } 612 613 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 614 unsigned long, prot, unsigned long, flags, 615 unsigned long, fd, unsigned long, pgoff) 616 { 617 return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); 618 } 619 620 #ifdef __ARCH_WANT_SYS_OLD_MMAP 621 struct mmap_arg_struct { 622 unsigned long addr; 623 unsigned long len; 624 unsigned long prot; 625 unsigned long flags; 626 unsigned long fd; 627 unsigned long offset; 628 }; 629 630 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 631 { 632 struct mmap_arg_struct a; 633 634 if (copy_from_user(&a, arg, sizeof(a))) 635 return -EFAULT; 636 if (offset_in_page(a.offset)) 637 return -EINVAL; 638 639 return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 640 a.offset >> PAGE_SHIFT); 641 } 642 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 643 644 /* 645 * Determine if the allocation needs to ensure that there is no 646 * existing mapping within it's guard gaps, for use as start_gap. 647 */ 648 static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) 649 { 650 if (vm_flags & VM_SHADOW_STACK) 651 return PAGE_SIZE; 652 653 return 0; 654 } 655 656 /* 657 * Search for an unmapped address range. 658 * 659 * We are looking for a range that: 660 * - does not intersect with any VMA; 661 * - is contained within the [low_limit, high_limit) interval; 662 * - is at least the desired size. 

SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
struct mmap_arg_struct {
	unsigned long addr;
	unsigned long len;
	unsigned long prot;
	unsigned long flags;
	unsigned long fd;
	unsigned long offset;
};

SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
	struct mmap_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	if (offset_in_page(a.offset))
		return -EINVAL;

	return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
			       a.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Determine if the allocation needs to ensure that there is no
 * existing mapping within its guard gaps, for use as start_gap.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
	if (vm_flags & VM_SHADOW_STACK)
		return PAGE_SIZE;

	return 0;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size;
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask).
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
	unsigned long addr;

	if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
		addr = unmapped_area_topdown(info);
	else
		addr = unmapped_area(info);

	trace_vm_unmapped_area(addr, info);
	return addr;
}
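
/*
 * Illustrative sketch (not built): a caller searching bottom-up for a
 * 16 KiB gap within the default mmap range might fill in the descriptor
 * as follows, leaving all other fields zeroed:
 *
 *	struct vm_unmapped_area_info info = {
 *		.length = SZ_16K,
 *		.low_limit = current->mm->mmap_base,
 *		.high_limit = TASK_SIZE,
 *	};
 *	unsigned long addr = vm_unmapped_area(&info);
 *
 * A failure is signalled via the low-bits-set convention described in the
 * comment below.
 */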

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
			  unsigned long len, unsigned long pgoff,
			  unsigned long flags, vm_flags_t vm_flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vm_unmapped_area_info info = {};
	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

	if (len > mmap_end - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED)
		return addr;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma_prev(mm, addr, &prev);
		if (mmap_end - len >= addr && addr >= mmap_min_addr &&
		    (!vma || addr + len <= vm_start_gap(vma)) &&
		    (!prev || addr >= vm_end_gap(prev)))
			return addr;
	}

	info.length = len;
	info.low_limit = mm->mmap_base;
	info.high_limit = mmap_end;
	info.start_gap = stack_guard_placement(vm_flags);
	if (filp && is_file_hugepages(filp))
		info.align_mask = huge_page_mask_align(filp);
	return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
		       unsigned long len, unsigned long pgoff,
		       unsigned long flags, vm_flags_t vm_flags)
{
	return generic_get_unmapped_area(filp, addr, len, pgoff, flags,
					 vm_flags);
}
#endif

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
				  unsigned long len, unsigned long pgoff,
				  unsigned long flags, vm_flags_t vm_flags)
{
	struct vm_area_struct *vma, *prev;
	struct mm_struct *mm = current->mm;
	struct vm_unmapped_area_info info = {};
	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

	/* requested length too big for entire address space */
	if (len > mmap_end - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED)
		return addr;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma_prev(mm, addr, &prev);
		if (mmap_end - len >= addr && addr >= mmap_min_addr &&
		    (!vma || addr + len <= vm_start_gap(vma)) &&
		    (!prev || addr >= vm_end_gap(prev)))
			return addr;
	}

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
	info.start_gap = stack_guard_placement(vm_flags);
	if (filp && is_file_hugepages(filp))
		info.align_mask = huge_page_mask_align(filp);
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (offset_in_page(addr)) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = mmap_end;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
			       unsigned long len, unsigned long pgoff,
			       unsigned long flags, vm_flags_t vm_flags)
{
	return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
						 vm_flags);
}
#endif
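
/*
 * Note (illustrative): an architecture wanting its own placement policy
 * supplies arch_get_unmapped_area() itself and defines
 * HAVE_ARCH_UNMAPPED_AREA (and/or the _TOPDOWN variant) in its headers,
 * so the generic fallbacks above are compiled out.
 */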

unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
					   unsigned long len, unsigned long pgoff,
					   unsigned long flags, vm_flags_t vm_flags)
{
	if (mm_flags_test(MMF_TOPDOWN, current->mm))
		return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
						      flags, vm_flags);
	return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
}

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
		    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
	unsigned long (*get_area)(struct file *, unsigned long,
				  unsigned long, unsigned long, unsigned long)
				  = NULL;

	unsigned long error = arch_mmap_check(addr, len, flags);
	if (error)
		return error;

	/* Careful about overflows.. */
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (file) {
		if (file->f_op->get_unmapped_area)
			get_area = file->f_op->get_unmapped_area;
	} else if (flags & MAP_SHARED) {
		/*
		 * mmap_region() will call shmem_zero_setup() to create a file,
		 * so use shmem's get_unmapped_area in case it can be huge.
		 */
		get_area = shmem_get_unmapped_area;
	}

	/* Always treat pgoff as zero for anonymous memory. */
	if (!file)
		pgoff = 0;

	if (get_area) {
		addr = get_area(file, addr, len, pgoff, flags);
	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
		   && !addr /* no hint */
		   && IS_ALIGNED(len, PMD_SIZE)) {
		/* Ensures that larger anonymous mappings are THP aligned. */
		addr = thp_get_unmapped_area_vmflags(file, addr, len,
						     pgoff, flags, vm_flags);
	} else {
		addr = mm_get_unmapped_area_vmflags(file, addr, len,
						    pgoff, flags, vm_flags);
	}
	if (IS_ERR_VALUE(addr))
		return addr;

	if (addr > TASK_SIZE - len)
		return -ENOMEM;
	if (offset_in_page(addr))
		return -EINVAL;

	error = security_mmap_addr(addr);
	return error ? error : addr;
}

unsigned long
mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
		     unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);

/**
 * find_vma_intersection() - Look up the first VMA which intersects the interval
 * @mm: The process address space.
 * @start_addr: The inclusive start user address.
 * @end_addr: The exclusive end user address.
 *
 * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
 * start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
					     unsigned long start_addr,
					     unsigned long end_addr)
{
	unsigned long index = start_addr;

	mmap_assert_locked(mm);
	return mt_find(&mm->mm_mt, &index, end_addr - 1);
}
EXPORT_SYMBOL(find_vma_intersection);

/**
 * find_vma() - Find the VMA for a given address, or the next VMA.
 * @mm: The mm_struct to check
 * @addr: The address
 *
 * Returns: The VMA associated with addr, or the next VMA.
 * May return %NULL in the case of no VMA at addr or above.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	unsigned long index = addr;

	mmap_assert_locked(mm);
	return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
EXPORT_SYMBOL(find_vma);

/**
 * find_vma_prev() - Find the VMA for a given address, or the next vma and
 * set %pprev to the previous VMA, if any.
 * @mm: The mm_struct to check
 * @addr: The address
 * @pprev: The pointer to set to the previous VMA
 *
 * Note that the RCU lock is not taken here since the external mmap_lock is
 * used instead.
 *
 * Returns: The VMA associated with @addr, or the next vma.
 * May return %NULL in the case of no vma at addr or above.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
	      struct vm_area_struct **pprev)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, addr);

	vma = vma_iter_load(&vmi);
	*pprev = vma_prev(&vmi);
	if (!vma)
		vma = vma_next(&vmi);
	return vma;
}

/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

static int __init cmdline_parse_stack_guard_gap(char *p)
{
	unsigned long val;
	char *endptr;

	val = simple_strtoul(p, &endptr, 10);
	if (!*endptr)
		stack_guard_gap = val << PAGE_SHIFT;

	return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
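
/*
 * Worked example (illustrative): stack_guard_gap is expressed in pages, so
 * the default of 256UL << PAGE_SHIFT is 1 MiB with 4 KiB pages, and booting
 * with "stack_guard_gap=512" doubles the enforced gap to 2 MiB.
 */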

#ifdef CONFIG_STACK_GROWSUP
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
	return expand_upwards(vma, address);
}

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma, *prev;

	addr &= PAGE_MASK;
	vma = find_vma_prev(mm, addr, &prev);
	if (vma && (vma->vm_start <= addr))
		return vma;
	if (!prev)
		return NULL;
	if (expand_stack_locked(prev, addr))
		return NULL;
	if (prev->vm_flags & VM_LOCKED)
		populate_vma_page_range(prev, addr, prev->vm_end, NULL);
	return prev;
}
#else
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
	return expand_downwards(vma, address);
}

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(mm, addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	start = vma->vm_start;
	if (expand_stack_locked(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED)
		populate_vma_page_range(vma, addr, start, NULL);
	return vma;
}
#endif

#if defined(CONFIG_STACK_GROWSUP)

#define vma_expand_up(vma, addr)	expand_upwards(vma, addr)
#define vma_expand_down(vma, addr)	(-EFAULT)

#else

#define vma_expand_up(vma, addr)	(-EFAULT)
#define vma_expand_down(vma, addr)	expand_downwards(vma, addr)

#endif

/*
 * expand_stack(): legacy interface for page faulting. Don't use unless
 * you have to.
 *
 * This is called with the mm locked for reading, drops the lock, takes
 * the lock for writing, tries to look up a vma again, expands it if
 * necessary, and downgrades the lock to reading again.
 *
 * If no vma is found or it can't be expanded, it returns NULL and has
 * dropped the lock.
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma, *prev;

	mmap_read_unlock(mm);
	if (mmap_write_lock_killable(mm))
		return NULL;

	vma = find_vma_prev(mm, addr, &prev);
	if (vma && vma->vm_start <= addr)
		goto success;

	if (prev && !vma_expand_up(prev, addr)) {
		vma = prev;
		goto success;
	}

	if (vma && !vma_expand_down(vma, addr))
		goto success;

	mmap_write_unlock(mm);
	return NULL;

success:
	mmap_write_downgrade(mm);
	return vma;
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
	      struct list_head *uf)
{
	VMA_ITERATOR(vmi, mm, start);

	return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}

int vm_munmap(unsigned long start, size_t len)
{
	return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);

SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
	addr = untagged_addr(addr);
	return __vm_munmap(addr, len, true);
}
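
/*
 * Illustrative sketch (not built): vm_munmap() is the exported entry point
 * for modules; a driver tearing down a mapping it previously established
 * with vm_mmap() would typically just do:
 *
 *	if (vm_munmap(addr, len))
 *		pr_warn("failed to unmap region\n");
 */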

/*
 * Emulation of deprecated remap_file_pages() syscall.
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long populate = 0;
	unsigned long ret = -EINVAL;
	struct file *file;
	vm_flags_t vm_flags;

	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
		     current->comm, current->pid);

	if (prot)
		return ret;
	start = start & PAGE_MASK;
	size = size & PAGE_MASK;

	if (start + size <= start)
		return ret;

	/* Does pgoff wrap? */
	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
		return ret;

	if (mmap_read_lock_killable(mm))
		return -EINTR;

	/*
	 * Look up VMA under read lock first so we can perform the security
	 * check without holding locks (which can be problematic). We reacquire
	 * a write lock later and check nothing changed underneath us.
	 */
	vma = vma_lookup(mm, start);

	if (!vma || !(vma->vm_flags & VM_SHARED)) {
		mmap_read_unlock(mm);
		return -EINVAL;
	}

	prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
	prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
	prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

	flags &= MAP_NONBLOCK;
	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
	if (vma->vm_flags & VM_LOCKED)
		flags |= MAP_LOCKED;

	/* Save vm_flags used to calculate prot and flags, and recheck later. */
	vm_flags = vma->vm_flags;
	file = get_file(vma->vm_file);

	mmap_read_unlock(mm);

	/* Call outside mmap_lock to be consistent with other callers. */
	ret = security_mmap_file(file, prot, flags);
	if (ret) {
		fput(file);
		return ret;
	}

	ret = -EINVAL;

	/* OK security check passed, take write lock + let it rip. */
	if (mmap_write_lock_killable(mm)) {
		fput(file);
		return -EINTR;
	}

	vma = vma_lookup(mm, start);

	if (!vma)
		goto out;

	/* Make sure things didn't change under us. */
	if (vma->vm_flags != vm_flags)
		goto out;
	if (vma->vm_file != file)
		goto out;

	if (start + size > vma->vm_end) {
		VMA_ITERATOR(vmi, mm, vma->vm_end);
		struct vm_area_struct *next, *prev = vma;

		for_each_vma_range(vmi, next, start + size) {
			/* hole between vmas ? */
			if (next->vm_start != prev->vm_end)
				goto out;

			if (next->vm_file != vma->vm_file)
				goto out;

			if (next->vm_flags != vma->vm_flags)
				goto out;

			if (start + size <= next->vm_end)
				break;

			prev = next;
		}

		if (!next)
			goto out;
	}

	ret = do_mmap(vma->vm_file, start, size,
		      prot, flags, 0, pgoff, &populate, NULL);
out:
	mmap_write_unlock(mm);
	fput(file);
	if (populate)
		mm_populate(ret, populate);
	if (!IS_ERR_VALUE(ret))
		ret = 0;
	return ret;
}

int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec)
{
	const vma_flags_t vma_flags = is_exec ?
		mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	unsigned long len;
	int ret;
	bool populate;
	LIST_HEAD(uf);
	VMA_ITERATOR(vmi, mm, addr);

	len = PAGE_ALIGN(request);
	if (len < request)
		return -ENOMEM;
	if (!len)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	ret = check_brk_limits(addr, len);
	if (ret)
		goto limits_failed;

	ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
	if (ret)
		goto munmap_failed;

	vma = vma_prev(&vmi);
	ret = do_brk_flags(&vmi, vma, addr, len, vma_flags);
	populate = ((mm->def_flags & VM_LOCKED) != 0);
	mmap_write_unlock(mm);
	userfaultfd_unmap_complete(mm, &uf);
	if (populate && !ret)
		mm_populate(addr, len);
	return ret;

munmap_failed:
limits_failed:
	mmap_write_unlock(mm);
	return ret;
}

static
unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
			     struct vm_area_struct *vma, unsigned long end)
{
	unsigned long nr_accounted = 0;
	int count = 0;

	mmap_assert_write_locked(mm);
	vma_iter_set(vmi, vma->vm_end);
	do {
		if (vma->vm_flags & VM_ACCOUNT)
			nr_accounted += vma_pages(vma);
		vma_mark_detached(vma);
		remove_vma(vma);
		count++;
		cond_resched();
		vma = vma_next(vmi);
	} while (vma && vma->vm_end <= end);

	VM_WARN_ON_ONCE(count != mm->map_count);
	return nr_accounted;
}

/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma;
	unsigned long nr_accounted = 0;
	VMA_ITERATOR(vmi, mm, 0);
	struct unmap_desc unmap;

	/* mm's last user has gone, and it's about to be pulled down */
	mmu_notifier_release(mm);

	mmap_read_lock(mm);
	arch_exit_mmap(mm);

	vma = vma_next(&vmi);
	if (!vma) {
		/* Can happen if dup_mmap() received an OOM */
		mmap_read_unlock(mm);
		mmap_write_lock(mm);
		goto destroy;
	}

	unmap_all_init(&unmap, &vmi, vma);
	flush_cache_mm(mm);
	tlb_gather_mmu_fullmm(&tlb, mm);
	/* update_hiwater_rss(mm) here? but nobody should be looking */
	/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
	unmap_vmas(&tlb, &unmap);
	mmap_read_unlock(mm);

	/*
	 * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
	 * because the memory has been already freed.
	 */
	mm_flags_set(MMF_OOM_SKIP, mm);
	mmap_write_lock(mm);
	unmap.mm_wr_locked = true;
	mt_clear_in_rcu(&mm->mm_mt);
	unmap_pgtable_init(&unmap, &vmi);
	free_pgtables(&tlb, &unmap);
	tlb_finish_mmu(&tlb);

	/*
	 * Walk the list again, actually closing and freeing it, with preemption
	 * enabled, without holding any MM locks besides the unreachable
	 * mmap_write_lock.
	 */
	nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);

destroy:
	__mt_destroy(&mm->mm_mt);
	trace_exit_mmap(mm);
	mmap_write_unlock(mm);
	vm_unacct_memory(nr_accounted);
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
 */
bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags,
		   unsigned long npages)
{
	if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
		return false;

	if (is_data_mapping_vma_flags(vma_flags) &&
	    mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
		/* Workaround for Valgrind */
		if (rlimit(RLIMIT_DATA) == 0 &&
		    mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
			return true;

		pr_warn_once("%s (%d): VmData %lu exceeds data ulimit %lu. Update limits%s.\n",
			     current->comm, current->pid,
			     (mm->data_vm + npages) << PAGE_SHIFT,
			     rlimit(RLIMIT_DATA),
			     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

		if (!ignore_rlimit_data)
			return false;
	}

	return true;
}

void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

	if (is_exec_mapping(flags))
		mm->exec_vm += npages;
	else if (is_stack_mapping(flags))
		mm->stack_vm += npages;
	else if (is_data_mapping(flags))
		mm->data_vm += npages;
}

static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Close hook, called for unmap() and on the old vma for mremap().
 *
 * Having a close hook prevents vma merging regardless of flags.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
	const struct vm_special_mapping *sm = vma->vm_private_data;

	if (sm->close)
		sm->close(sm, vma);
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
	struct vm_special_mapping *sm = new_vma->vm_private_data;

	if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
		return -EFAULT;

	if (sm->mremap)
		return sm->mremap(sm, new_vma);

	return 0;
}

static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
	/*
	 * Forbid splitting special mappings - the kernel has expectations
	 * about the number of pages in the mapping. Together with
	 * VM_DONTEXPAND, the size of the vma should stay the same over the
	 * special mapping's lifetime.
	 */
	return -EINVAL;
}

static const struct vm_operations_struct special_mapping_vmops = {
	.close = special_mapping_close,
	.fault = special_mapping_fault,
	.mremap = special_mapping_mremap,
	.name = special_mapping_name,
	/* vDSO code relies on VVAR not being accessible remotely */
	.access = NULL,
	.may_split = special_mapping_split,
};

static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	pgoff_t pgoff;
	struct page **pages;
	struct vm_special_mapping *sm = vma->vm_private_data;

	if (sm->fault)
		return sm->fault(sm, vmf->vma, vmf);

	pages = sm->pages;

	for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
		pgoff--;

	if (*pages) {
		struct page *page = *pages;
		get_page(page);
		vmf->page = page;
		return 0;
	}

	return VM_FAULT_SIGBUS;
}

static struct vm_area_struct *__install_special_mapping(
	struct mm_struct *mm,
	unsigned long addr, unsigned long len,
	vm_flags_t vm_flags, void *priv,
	const struct vm_operations_struct *ops)
{
	int ret;
	struct vm_area_struct *vma;

	vma = vm_area_alloc(mm);
	if (unlikely(vma == NULL))
		return ERR_PTR(-ENOMEM);

	vma_set_range(vma, addr, addr + len, 0);
	vm_flags |= mm->def_flags | VM_DONTEXPAND;
	if (pgtable_supports_soft_dirty())
		vm_flags |= VM_SOFTDIRTY;
	vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK);
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	vma->vm_ops = ops;
	vma->vm_private_data = priv;

	ret = insert_vm_struct(mm, vma);
	if (ret)
		goto out;

	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);

	perf_event_mmap(vma);

	return vma;

out:
	vm_area_free(vma);
	return ERR_PTR(ret);
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
			    const struct vm_special_mapping *sm)
{
	return vma->vm_private_data == sm &&
	       vma->vm_ops == &special_mapping_vmops;
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 */
struct vm_area_struct *_install_special_mapping(
	struct mm_struct *mm,
	unsigned long addr, unsigned long len,
	vm_flags_t vm_flags, const struct vm_special_mapping *spec)
{
	return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
					 &special_mapping_vmops);
}
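
/*
 * Illustrative sketch (not built): installing a one-page special mapping
 * backed by a static page array, in the style of the vDSO code. The names
 * here are hypothetical:
 *
 *	static struct page *example_pages[2];	second slot NULL-terminates
 *	static const struct vm_special_mapping example_mapping = {
 *		.name = "[example]",
 *		.pages = example_pages,
 *	};
 *
 *	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
 *				       VM_READ | VM_MAYREAD,
 *				       &example_mapping);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 */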

#ifdef CONFIG_SYSCTL
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
	defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif

static const struct ctl_table mmap_table[] = {
	{
		.procname	= "max_map_count",
		.data		= &sysctl_max_map_count,
		.maxlen		= sizeof(sysctl_max_map_count),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
	defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
	{
		.procname	= "legacy_va_layout",
		.data		= &sysctl_legacy_va_layout,
		.maxlen		= sizeof(sysctl_legacy_va_layout),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
	{
		.procname	= "mmap_rnd_bits",
		.data		= &mmap_rnd_bits,
		.maxlen		= sizeof(mmap_rnd_bits),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= (void *)&mmap_rnd_bits_min,
		.extra2		= (void *)&mmap_rnd_bits_max,
	},
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	{
		.procname	= "mmap_rnd_compat_bits",
		.data		= &mmap_rnd_compat_bits,
		.maxlen		= sizeof(mmap_rnd_compat_bits),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= (void *)&mmap_rnd_compat_bits_min,
		.extra2		= (void *)&mmap_rnd_compat_bits_max,
	},
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * initialise the percpu counter for VM, initialise VMA state.
 */
void __init mmap_init(void)
{
	int ret;

	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
	VM_BUG_ON(ret);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", mmap_table);
#endif
	vma_state_init();
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB)
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int init_user_reserve(void)
{
	unsigned long free_kbytes;

	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

	sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
	return 0;
}
subsys_initcall(init_user_reserve);
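
/*
 * Worked example (illustrative): free_kbytes / 32 is roughly 3% of free
 * memory. With 2 GiB free, free_kbytes = 2097152, so the reserve is
 * 65536 kB (64 MiB); with 8 GiB free the 3% figure would be 256 MiB, so
 * the SZ_128K cap (128 MiB expressed in kB) applies instead.
 */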

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
	unsigned long free_kbytes;

	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
	return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
				unsigned long action, void *data)
{
	unsigned long tmp, free_kbytes;

	switch (action) {
	case MEM_ONLINE:
		/* Default max is 128MB. Leave alone if modified by operator. */
		tmp = sysctl_user_reserve_kbytes;
		if (tmp > 0 && tmp < SZ_128K)
			init_user_reserve();

		/* Default max is 8MB. Leave alone if modified by operator. */
		tmp = sysctl_admin_reserve_kbytes;
		if (tmp > 0 && tmp < SZ_8K)
			init_admin_reserve();

		break;
	case MEM_OFFLINE:
		free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

		if (sysctl_user_reserve_kbytes > free_kbytes) {
			init_user_reserve();
			pr_info("vm.user_reserve_kbytes reset to %lu\n",
				sysctl_user_reserve_kbytes);
		}

		if (sysctl_admin_reserve_kbytes > free_kbytes) {
			init_admin_reserve();
			pr_info("vm.admin_reserve_kbytes reset to %lu\n",
				sysctl_admin_reserve_kbytes);
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int __meminit init_reserve_notifier(void)
{
	if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
		pr_err("Failed registering memory add/remove notifier for admin reserve\n");

	return 0;
}
subsys_initcall(init_reserve_notifier);

/*
 * Obtain a read lock on mm->mmap_lock. If the specified address is below the
 * start of the VMA, the intent is to perform a write, and it is a
 * downward-growing stack, then attempt to expand the stack to contain it.
 *
 * This function is intended only for obtaining an argument page from an ELF
 * image, and is almost certainly NOT what you want to use for any other
 * purpose.
 *
 * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
 * VMA referenced must not be linked in any user-visible tree, i.e. it must be
 * a new VMA being mapped.
 *
 * The function assumes that addr is either contained within the VMA or below
 * it, and makes no attempt to validate this value beyond that.
 *
 * Returns true if the read lock was obtained and a stack was perhaps expanded,
 * false if the stack expansion failed.
 *
 * On stack expansion the function temporarily acquires an mmap write lock
 * before downgrading it.
 */
bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
				 struct vm_area_struct *new_vma,
				 unsigned long addr, bool write)
{
	if (!write || addr >= new_vma->vm_start) {
		mmap_read_lock(mm);
		return true;
	}

	if (!(new_vma->vm_flags & VM_GROWSDOWN))
		return false;

	mmap_write_lock(mm);
	if (expand_downwards(new_vma, addr)) {
		mmap_write_unlock(mm);
		return false;
	}

	mmap_write_downgrade(mm);
	return true;
}
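
/*
 * Illustrative sketch (not built): the intended calling pattern, roughly
 * how the ELF loader would use this while setting up argument pages:
 *
 *	if (!mmap_read_lock_maybe_expand(mm, vma, addr, write))
 *		return NULL;	no lock is held on failure
 *	... touch the argument page at addr ...
 *	mmap_read_unlock(mm);
 */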
1815 */ 1816 vma_iter_bulk_store(&vmi, tmp); 1817 1818 mm->map_count++; 1819 1820 if (tmp->vm_ops && tmp->vm_ops->open) 1821 tmp->vm_ops->open(tmp); 1822 1823 file = tmp->vm_file; 1824 if (file) { 1825 struct address_space *mapping = file->f_mapping; 1826 1827 get_file(file); 1828 i_mmap_lock_write(mapping); 1829 if (vma_is_shared_maywrite(tmp)) 1830 mapping_allow_writable(mapping); 1831 flush_dcache_mmap_lock(mapping); 1832 /* insert tmp into the share list, just after mpnt */ 1833 vma_interval_tree_insert_after(tmp, mpnt, 1834 &mapping->i_mmap); 1835 flush_dcache_mmap_unlock(mapping); 1836 i_mmap_unlock_write(mapping); 1837 } 1838 1839 if (!(tmp->vm_flags & VM_WIPEONFORK)) 1840 retval = copy_page_range(tmp, mpnt); 1841 1842 if (retval) { 1843 mpnt = vma_next(&vmi); 1844 goto loop_out; 1845 } 1846 } 1847 /* a new mm has just been created */ 1848 retval = arch_dup_mmap(oldmm, mm); 1849 loop_out: 1850 vma_iter_free(&vmi); 1851 if (!retval) { 1852 mt_set_in_rcu(vmi.mas.tree); 1853 ksm_fork(mm, oldmm); 1854 khugepaged_fork(mm, oldmm); 1855 } else { 1856 unsigned long end; 1857 1858 /* 1859 * The entire maple tree has already been duplicated, but 1860 * replacing the vmas failed at mpnt (which could be NULL if 1861 * all were allocated but the last vma was not fully set up). 1862 * Use the start address of the failure point to clean up the 1863 * partially initialized tree. 1864 */ 1865 if (!mm->map_count) { 1866 /* zero vmas were written to the new tree. */ 1867 end = 0; 1868 } else if (mpnt) { 1869 /* partial tree failure */ 1870 end = mpnt->vm_start; 1871 } else { 1872 /* All vmas were written to the new tree */ 1873 end = ULONG_MAX; 1874 } 1875 1876 /* Hide mm from oom killer because the memory is being freed */ 1877 mm_flags_set(MMF_OOM_SKIP, mm); 1878 if (end) { 1879 vma_iter_set(&vmi, 0); 1880 tmp = vma_next(&vmi); 1881 UNMAP_STATE(unmap, &vmi, /* first = */ tmp, 1882 /* vma_start = */ 0, /* vma_end = */ end, 1883 /* prev = */ NULL, /* next = */ NULL); 1884 1885 /* 1886 * Don't iterate over vmas beyond the failure point for 1887 * both unmap_vma() and free_pgtables(). 1888 */ 1889 unmap.tree_end = end; 1890 flush_cache_mm(mm); 1891 unmap_region(&unmap); 1892 charge = tear_down_vmas(mm, &vmi, tmp, end); 1893 vm_unacct_memory(charge); 1894 } 1895 __mt_destroy(&mm->mm_mt); 1896 /* 1897 * The mm_struct is going to exit, but the locks will be dropped 1898 * first. Set the mm_struct as unstable is advisable as it is 1899 * not fully initialised. 1900 */ 1901 mm_flags_set(MMF_UNSTABLE, mm); 1902 } 1903 out: 1904 mmap_write_unlock(mm); 1905 flush_tlb_mm(oldmm); 1906 mmap_write_unlock(oldmm); 1907 if (!retval) 1908 dup_userfaultfd_complete(&uf); 1909 else 1910 dup_userfaultfd_fail(&uf); 1911 return retval; 1912 1913 fail_nomem_anon_vma_fork: 1914 mpol_put(vma_policy(tmp)); 1915 fail_nomem_policy: 1916 vm_area_free(tmp); 1917 fail_nomem: 1918 retval = -ENOMEM; 1919 vm_unacct_memory(charge); 1920 goto loop_out; 1921 } 1922