// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/nommu.c
 *
 * Replacement code for mm functions to support CPU's that don't
 * have any form of memory management unit (thus no virtual memory).
 *
 * See Documentation/admin-guide/mm/nommu-mmap.rst
 *
 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
 * Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/printk.h>

#include <linux/uaccess.h>
#include <linux/uio.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include "internal.h"

unsigned long highest_memmap_pfn;
int heap_stack_gap = 0;

atomic_long_t mmap_pages_allocated;


/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
struct rb_root nommu_region_tree = RB_ROOT;
DECLARE_RWSEM(nommu_region_sem);

const struct vm_operations_struct generic_file_vm_ops = {
};

/*
 * Return the total memory allocated for this pointer, not
 * just what the caller asked for.
 *
 * Doesn't have to be accurate, i.e. may have races.
 */
unsigned int kobjsize(const void *objp)
{
	struct page *page;

	/*
	 * If the object we have should not have ksize performed on it,
	 * return size of 0
	 */
	if (!objp || !virt_addr_valid(objp))
		return 0;

	page = virt_to_head_page(objp);

	/*
	 * If the allocator sets PageSlab, we know the pointer came from
	 * kmalloc().
	 */
	if (PageSlab(page))
		return ksize(objp);

	/*
	 * If it's not a compound page, see if we have a matching VMA
	 * region. This test is intentionally done in reverse order,
	 * so if there's no VMA, we still fall through and hand back
	 * PAGE_SIZE for 0-order pages.
	 */
	if (!PageCompound(page)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, (unsigned long)objp);
		if (vma)
			return vma->vm_end - vma->vm_start;
	}

	/*
	 * The ksize() function is only guaranteed to work for pointers
	 * returned by kmalloc(). So handle arbitrary pointers here.
	 */
	return page_size(page);
}

void vfree(const void *addr)
{
	kfree(addr);
}
EXPORT_SYMBOL(vfree);

void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
	/*
	 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
	 * returns only a logical address.
	 */
	return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
EXPORT_SYMBOL(__vmalloc_noprof);

void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
	return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
}

void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
		unsigned long start, unsigned long end, gfp_t gfp_mask,
		pgprot_t prot, unsigned long vm_flags, int node,
		const void *caller)
{
	return __vmalloc_noprof(size, gfp_mask);
}

void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
		int node, const void *caller)
{
	return __vmalloc_noprof(size, gfp_mask);
}

static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
	void *ret;

	ret = __vmalloc(size, flags);
	if (ret) {
		struct vm_area_struct *vma;

		mmap_write_lock(current->mm);
		vma = find_vma(current->mm, (unsigned long)ret);
		if (vma)
			vm_flags_set(vma, VM_USERMAP);
		mmap_write_unlock(current->mm);
	}

	return ret;
}

void *vmalloc_user_noprof(unsigned long size)
{
	return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vmalloc_user_noprof);

struct page *vmalloc_to_page(const void *addr)
{
	return virt_to_page(addr);
}
EXPORT_SYMBOL(vmalloc_to_page);

unsigned long vmalloc_to_pfn(const void *addr)
{
	return page_to_pfn(virt_to_page(addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);

long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	return copy_to_iter(addr, count, iter);
}

/*
 * vmalloc - allocate virtually contiguous memory
 *
 * @size:	allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc_noprof(unsigned long size)
{
	return __vmalloc_noprof(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_noprof);

/*
 * vmalloc_huge_node - allocate virtually contiguous memory, on a node
 *
 * @size:	allocation size
 * @gfp_mask:	flags for the page level allocator
 * @node:	node to use for allocation or NUMA_NO_NODE
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * Due to NOMMU implications the node argument and HUGE page attribute
 * are ignored.
 */
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
	return __vmalloc_noprof(size, gfp_mask);
}

/*
 * vzalloc - allocate virtually contiguous memory with zero fill
 *
 * @size:	allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vzalloc_noprof(unsigned long size)
{
	return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc_noprof);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:	allocation size
 * @node:	numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
	return vmalloc_noprof(size);
}
EXPORT_SYMBOL(vmalloc_node_noprof);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:	allocation size
 * @node:	numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vzalloc_node_noprof(unsigned long size, int node)
{
	return vzalloc_noprof(size);
}
EXPORT_SYMBOL(vzalloc_node_noprof);

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size:	allocation size
 *
 * Allocate enough 32bit PA addressable pages to cover @size from the
 * page level allocator and map them into contiguous kernel virtual space.
 */
void *vmalloc_32_noprof(unsigned long size)
{
	return __vmalloc_noprof(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32_noprof);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:	allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
 * remap_vmalloc_range() are permissible.
 */
void *vmalloc_32_user_noprof(unsigned long size)
{
	/*
	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
	 * but for now this can simply use vmalloc_user() directly.
	 */
	return vmalloc_user_noprof(size);
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);
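
/*
 * Mapping helpers that rely on rearranging page tables cannot be implemented
 * without an MMU, so the stubs below simply BUG() if anything calls them;
 * vm_unmap_aliases() is a harmless no-op and the vm_insert_page()/
 * vm_map_pages() family reports -EINVAL.
 */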
void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vmap);

void vunmap(const void *addr)
{
	BUG();
}
EXPORT_SYMBOL(vunmap);

void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vm_map_ram);

void vm_unmap_ram(const void *mem, unsigned int count)
{
	BUG();
}
EXPORT_SYMBOL(vm_unmap_ram);

void vm_unmap_aliases(void)
{
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

void free_vm_area(struct vm_struct *area)
{
	BUG();
}
EXPORT_SYMBOL_GPL(free_vm_area);

int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_page);

int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_pages);

int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
			unsigned long num)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_map_pages);

int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
			unsigned long num)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  in this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	struct mm_struct *mm = current->mm;

	if (brk < mm->start_brk || brk > mm->context.end_brk)
		return mm->brk;

	if (mm->brk == brk)
		return mm->brk;

	/*
	 * Always allow shrinking brk
	 */
	if (brk <= mm->brk) {
		mm->brk = brk;
		return brk;
	}

	/*
	 * Ok, looks good - let it rip.
	 */
	flush_icache_user_range(mm->brk, brk);
	return mm->brk = brk;
}
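
/*
 * Minimum number of excess pages before do_mmap_private() trims a
 * power-of-2 sized private allocation back down to the number of pages
 * actually requested; 0 disables trimming.  Exposed as
 * /proc/sys/vm/nr_trim_pages via the table below.
 */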
static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;

static const struct ctl_table nommu_table[] = {
	{
		.procname	= "nr_trim_pages",
		.data		= &sysctl_nr_trim_pages,
		.maxlen		= sizeof(sysctl_nr_trim_pages),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
};

/*
 * initialise the percpu counter for VM and region record slabs
 */
void __init mmap_init(void)
{
	int ret;

	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
	VM_BUG_ON(ret);
	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
	register_sysctl_init("vm", nommu_table);
}

/*
 * validate the region tree
 * - the caller must hold the region lock
 */
#ifdef CONFIG_DEBUG_NOMMU_REGIONS
static noinline void validate_nommu_regions(void)
{
	struct vm_region *region, *last;
	struct rb_node *p, *lastp;

	lastp = rb_first(&nommu_region_tree);
	if (!lastp)
		return;

	last = rb_entry(lastp, struct vm_region, vm_rb);
	BUG_ON(last->vm_end <= last->vm_start);
	BUG_ON(last->vm_top < last->vm_end);

	while ((p = rb_next(lastp))) {
		region = rb_entry(p, struct vm_region, vm_rb);
		last = rb_entry(lastp, struct vm_region, vm_rb);

		BUG_ON(region->vm_end <= region->vm_start);
		BUG_ON(region->vm_top < region->vm_end);
		BUG_ON(region->vm_start < last->vm_top);

		lastp = p;
	}
}
#else
static void validate_nommu_regions(void)
{
}
#endif

/*
 * add a region into the global tree
 */
static void add_nommu_region(struct vm_region *region)
{
	struct vm_region *pregion;
	struct rb_node **p, *parent;

	validate_nommu_regions();

	parent = NULL;
	p = &nommu_region_tree.rb_node;
	while (*p) {
		parent = *p;
		pregion = rb_entry(parent, struct vm_region, vm_rb);
		if (region->vm_start < pregion->vm_start)
			p = &(*p)->rb_left;
		else if (region->vm_start > pregion->vm_start)
			p = &(*p)->rb_right;
		else if (pregion == region)
			return;
		else
			BUG();
	}

	rb_link_node(&region->vm_rb, parent, p);
	rb_insert_color(&region->vm_rb, &nommu_region_tree);

	validate_nommu_regions();
}

/*
 * delete a region from the global tree
 */
static void delete_nommu_region(struct vm_region *region)
{
	BUG_ON(!nommu_region_tree.rb_node);

	validate_nommu_regions();
	rb_erase(&region->vm_rb, &nommu_region_tree);
	validate_nommu_regions();
}

/*
 * free a contiguous series of pages
 */
static void free_page_series(unsigned long from, unsigned long to)
{
	for (; from < to; from += PAGE_SIZE) {
		struct page *page = virt_to_page((void *)from);

		atomic_long_dec(&mmap_pages_allocated);
		put_page(page);
	}
}

/*
 * release a reference to a region
 * - the caller must hold the region semaphore for writing, which this releases
 * - the region may not have been added to the tree yet, in which case vm_top
 *   will equal vm_start
 */
static void __put_nommu_region(struct vm_region *region)
	__releases(nommu_region_sem)
{
	BUG_ON(!nommu_region_tree.rb_node);

	if (--region->vm_usage == 0) {
		if (region->vm_top > region->vm_start)
			delete_nommu_region(region);
		up_write(&nommu_region_sem);

		if (region->vm_file)
			fput(region->vm_file);

		/* IO memory and memory shared directly out of the pagecache
		 * from ramfs/tmpfs mustn't be released here */
		if (region->vm_flags & VM_MAPPED_COPY)
			free_page_series(region->vm_start, region->vm_top);
		kmem_cache_free(vm_region_jar, region);
	} else {
		up_write(&nommu_region_sem);
	}
}

/*
 * release a reference to a region
 */
static void put_nommu_region(struct vm_region *region)
{
	down_write(&nommu_region_sem);
	__put_nommu_region(region);
}

static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
{
	vma->vm_mm = mm;

	/* add the VMA to the mapping */
	if (vma->vm_file) {
		struct address_space *mapping = vma->vm_file->f_mapping;

		i_mmap_lock_write(mapping);
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
		i_mmap_unlock_write(mapping);
	}
}

static void cleanup_vma_from_mm(struct vm_area_struct *vma)
{
	vma->vm_mm->map_count--;
	/* remove the VMA from the mapping */
	if (vma->vm_file) {
		struct address_space *mapping;
		mapping = vma->vm_file->f_mapping;

		i_mmap_lock_write(mapping);
		flush_dcache_mmap_lock(mapping);
		vma_interval_tree_remove(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
		i_mmap_unlock_write(mapping);
	}
}

/*
 * delete a VMA from its owning mm_struct and address space
 */
static int delete_vma_from_mm(struct vm_area_struct *vma)
{
	VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);

	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
	if (vma_iter_prealloc(&vmi, NULL)) {
		pr_warn("Allocation of vma tree for process %d failed\n",
		       current->pid);
		return -ENOMEM;
	}
	cleanup_vma_from_mm(vma);

	/* remove from the MM's tree and list */
	vma_iter_clear(&vmi);
	return 0;
}
/*
 * destroy a VMA record
 */
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	vma_close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
	put_nommu_region(vma->vm_region);
	vm_area_free(vma);
}

struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
					     unsigned long start_addr,
					     unsigned long end_addr)
{
	unsigned long index = start_addr;

	mmap_assert_locked(mm);
	return mt_find(&mm->mm_mt, &index, end_addr - 1);
}
EXPORT_SYMBOL(find_vma_intersection);

/*
 * look up the first VMA in which addr resides, NULL if none
 * - should be called with mm->mmap_lock at least held readlocked
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	VMA_ITERATOR(vmi, mm, addr);

	return vma_iter_load(&vmi);
}
EXPORT_SYMBOL(find_vma);

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

/*
 * expand a stack to a given address
 * - not supported under NOMMU conditions
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
{
	return -ENOMEM;
}

struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
	mmap_read_unlock(mm);
	return NULL;
}

/*
 * look up the first VMA that exactly matches addr
 * - should be called with mm->mmap_lock at least held readlocked
 */
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
					     unsigned long addr,
					     unsigned long len)
{
	struct vm_area_struct *vma;
	unsigned long end = addr + len;
	VMA_ITERATOR(vmi, mm, addr);

	vma = vma_iter_load(&vmi);
	if (!vma)
		return NULL;
	if (vma->vm_start != addr)
		return NULL;
	if (vma->vm_end != end)
		return NULL;

	return vma;
}

/*
 * determine whether a mapping should be permitted and, if so, what sort of
 * mapping we're capable of supporting
 */
static int validate_mmap_request(struct file *file,
				 unsigned long addr,
				 unsigned long len,
				 unsigned long prot,
				 unsigned long flags,
				 unsigned long pgoff,
				 unsigned long *_capabilities)
{
	unsigned long capabilities, rlen;
	int ret;

	/* do the simple checks first */
	if (flags & MAP_FIXED)
		return -EINVAL;

	if ((flags & MAP_TYPE) != MAP_PRIVATE &&
	    (flags & MAP_TYPE) != MAP_SHARED)
		return -EINVAL;

	if (!len)
		return -EINVAL;

	/* Careful about overflows.. */
	rlen = PAGE_ALIGN(len);
	if (!rlen || rlen > TASK_SIZE)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	if (file) {
		/* files must support mmap */
		if (!file->f_op->mmap)
			return -ENODEV;

		/* work out if what we've got could possibly be shared
		 * - we support chardevs that provide their own "memory"
		 * - we support files/blockdevs that are memory backed
		 */
		if (file->f_op->mmap_capabilities) {
			capabilities = file->f_op->mmap_capabilities(file);
		} else {
			/* no explicit capabilities set, so assume some
			 * defaults */
			switch (file_inode(file)->i_mode & S_IFMT) {
			case S_IFREG:
			case S_IFBLK:
				capabilities = NOMMU_MAP_COPY;
				break;

			case S_IFCHR:
				capabilities =
					NOMMU_MAP_DIRECT |
					NOMMU_MAP_READ |
					NOMMU_MAP_WRITE;
				break;

			default:
				return -EINVAL;
			}
		}

		/* eliminate any capabilities that we can't support on this
		 * device */
		if (!file->f_op->get_unmapped_area)
			capabilities &= ~NOMMU_MAP_DIRECT;
		if (!(file->f_mode & FMODE_CAN_READ))
			capabilities &= ~NOMMU_MAP_COPY;

		/* The file shall have been opened with read permission. */
		if (!(file->f_mode & FMODE_READ))
			return -EACCES;

		if (flags & MAP_SHARED) {
			/* do checks for writing, appending and locking */
			if ((prot & PROT_WRITE) &&
			    !(file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (IS_APPEND(file_inode(file)) &&
			    (file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (!(capabilities & NOMMU_MAP_DIRECT))
				return -ENODEV;

			/* we mustn't privatise shared mappings */
			capabilities &= ~NOMMU_MAP_COPY;
		} else {
			/* we're going to read the file into private memory we
			 * allocate */
			if (!(capabilities & NOMMU_MAP_COPY))
				return -ENODEV;

			/* we don't permit a private writable mapping to be
			 * shared with the backing device */
			if (prot & PROT_WRITE)
				capabilities &= ~NOMMU_MAP_DIRECT;
		}

		if (capabilities & NOMMU_MAP_DIRECT) {
			if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
			    ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
			    ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
			    ) {
				capabilities &= ~NOMMU_MAP_DIRECT;
				if (flags & MAP_SHARED) {
					pr_warn("MAP_SHARED not completely supported on !MMU\n");
					return -EINVAL;
				}
			}
		}

		/* handle executable mappings and implied executable
		 * mappings */
		if (path_noexec(&file->f_path)) {
			if (prot & PROT_EXEC)
				return -EPERM;
		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
			/* handle implication of PROT_EXEC by PROT_READ */
			if (current->personality & READ_IMPLIES_EXEC) {
				if (capabilities & NOMMU_MAP_EXEC)
					prot |= PROT_EXEC;
			}
		} else if ((prot & PROT_READ) &&
			   (prot & PROT_EXEC) &&
			   !(capabilities & NOMMU_MAP_EXEC)
			   ) {
			/* backing file is not executable, try to copy */
			capabilities &= ~NOMMU_MAP_DIRECT;
		}
	} else {
		/* anonymous mappings are always memory backed and can be
		 * privately mapped
		 */
		capabilities = NOMMU_MAP_COPY;

		/* handle PROT_EXEC implication by PROT_READ */
		if ((prot & PROT_READ) &&
		    (current->personality & READ_IMPLIES_EXEC))
			prot |= PROT_EXEC;
	}

	/* allow the security API to have its say */
	ret = security_mmap_addr(addr);
	if (ret < 0)
		return ret;

	/* looks okay */
	*_capabilities = capabilities;
	return 0;
}

/*
 * we've determined that we can make the mapping, now translate what we
 * now know into VMA flags
 */
static unsigned long determine_vm_flags(struct file *file,
					unsigned long prot,
					unsigned long flags,
					unsigned long capabilities)
{
	unsigned long vm_flags;

	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);

	if (!file) {
		/*
		 * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
		 * there is no fork().
		 */
		vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
	} else if (flags & MAP_PRIVATE) {
		/* MAP_PRIVATE file mapping */
		if (capabilities & NOMMU_MAP_DIRECT)
			vm_flags |= (capabilities & NOMMU_VMFLAGS);
		else
			vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

		if (!(prot & PROT_WRITE) && !current->ptrace)
			/*
			 * R/O private file mapping which cannot be used to
			 * modify memory, especially also not via active ptrace
			 * (e.g., set breakpoints) or later by upgrading
			 * permissions (no mprotect()). We can try overlaying
			 * the file mapping, which will work e.g., on chardevs,
			 * ramfs/tmpfs/shmfs and romfs/cramfs.
			 */
			vm_flags |= VM_MAYOVERLAY;
	} else {
		/* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
		vm_flags |= VM_SHARED | VM_MAYSHARE |
			    (capabilities & NOMMU_VMFLAGS);
	}

	return vm_flags;
}

/*
 * set up a shared mapping on a file (the driver or filesystem provides and
 * pins the storage)
 */
static int do_mmap_shared_file(struct vm_area_struct *vma)
{
	int ret;

	ret = mmap_file(vma->vm_file, vma);
	if (ret == 0) {
		vma->vm_region->vm_top = vma->vm_region->vm_end;
		return 0;
	}
	if (ret != -ENOSYS)
		return ret;

	/* getting -ENOSYS indicates that direct mmap isn't possible (as
	 * opposed to tried but failed) so we can only give a suitable error as
	 * it's not possible to make a private copy if MAP_SHARED was given */
	return -ENODEV;
}

/*
 * set up a private mapping or an anonymous shared mapping
 */
static int do_mmap_private(struct vm_area_struct *vma,
			   struct vm_region *region,
			   unsigned long len,
			   unsigned long capabilities)
{
	unsigned long total, point;
	void *base;
	int ret, order;

	/*
	 * Invoke the file's mapping function so that it can keep track of
	 * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
	 * it may attempt to share, which will make is_nommu_shared_mapping()
	 * happy.
	 */
	if (capabilities & NOMMU_MAP_DIRECT) {
		ret = mmap_file(vma->vm_file, vma);
		/* shouldn't return success if we're not sharing */
		if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
			ret = -ENOSYS;
		if (ret == 0) {
			vma->vm_region->vm_top = vma->vm_region->vm_end;
			return 0;
		}
		if (ret != -ENOSYS)
			return ret;

		/* getting an ENOSYS error indicates that direct mmap isn't
		 * possible (as opposed to tried but failed) so we'll try to
		 * make a private copy of the data and map that instead */
	}


	/* allocate some memory to hold the mapping
	 * - note that this may not return a page-aligned address if the object
	 *   we're allocating is smaller than a page
	 */
	order = get_order(len);
	total = 1 << order;
	point = len >> PAGE_SHIFT;

	/* we don't want to allocate a power-of-2 sized page set */
	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
		total = point;

	base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
	if (!base)
		goto enomem;

	atomic_long_add(total, &mmap_pages_allocated);

	vm_flags_set(vma, VM_MAPPED_COPY);
	region->vm_flags = vma->vm_flags;
	region->vm_start = (unsigned long) base;
	region->vm_end = region->vm_start + len;
	region->vm_top = region->vm_start + (total << PAGE_SHIFT);

	vma->vm_start = region->vm_start;
	vma->vm_end = region->vm_start + len;

	if (vma->vm_file) {
		/* read the contents of a file into the copy */
		loff_t fpos;

		fpos = vma->vm_pgoff;
		fpos <<= PAGE_SHIFT;

		ret = kernel_read(vma->vm_file, base, len, &fpos);
		if (ret < 0)
			goto error_free;

		/* clear the last little bit */
		if (ret < len)
			memset(base + ret, 0, len - ret);

	} else {
		vma_set_anonymous(vma);
	}

	return 0;

error_free:
	free_page_series(region->vm_start, region->vm_top);
	region->vm_start = vma->vm_start = 0;
	region->vm_end = vma->vm_end = 0;
	region->vm_top = 0;
	return ret;

enomem:
	pr_err("Allocation of length %lu from process %d (%s) failed\n",
	       len, current->pid, current->comm);
	show_mem();
	return -ENOMEM;
}

/*
 * handle mapping creation for uClinux
 */
unsigned long do_mmap(struct file *file,
			unsigned long addr,
			unsigned long len,
			unsigned long prot,
			unsigned long flags,
			vm_flags_t vm_flags,
			unsigned long pgoff,
			unsigned long *populate,
			struct list_head *uf)
{
	struct vm_area_struct *vma;
	struct vm_region *region;
	struct rb_node *rb;
	unsigned long capabilities, result;
	int ret;
	VMA_ITERATOR(vmi, current->mm, 0);

	*populate = 0;

	/* decide whether we should attempt the mapping, and if so what sort of
	 * mapping */
	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
				    &capabilities);
	if (ret < 0)
		return ret;

	/* we ignore the address hint */
	addr = 0;
	len = PAGE_ALIGN(len);

	/* we've determined that we can make the mapping, now translate what we
	 * now know into VMA flags */
	vm_flags |= determine_vm_flags(file, prot, flags, capabilities);


	/* we're going to need to record the mapping */
	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
	if (!region)
		goto error_getting_region;

	vma = vm_area_alloc(current->mm);
	if (!vma)
		goto error_getting_vma;

	region->vm_usage = 1;
	region->vm_flags = vm_flags;
	region->vm_pgoff = pgoff;

	vm_flags_init(vma, vm_flags);
	vma->vm_pgoff = pgoff;

	if (file) {
		region->vm_file = get_file(file);
		vma->vm_file = get_file(file);
	}

	down_write(&nommu_region_sem);

	/* if we want to share, we need to check for regions created by other
	 * mmap() calls that overlap with our proposed mapping
	 * - we can only share with a superset match on most regular files
	 * - shared mappings on character devices and memory backed files are
	 *   permitted to overlap inexactly as far as we are concerned for in
	 *   these cases, sharing is handled in the driver or filesystem rather
	 *   than here
	 */
	if (is_nommu_shared_mapping(vm_flags)) {
		struct vm_region *pregion;
		unsigned long pglen, rpglen, pgend, rpgend, start;

		pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		pgend = pgoff + pglen;

		for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
			pregion = rb_entry(rb, struct vm_region, vm_rb);

			if (!is_nommu_shared_mapping(pregion->vm_flags))
				continue;

			/* search for overlapping mappings on the same file */
			if (file_inode(pregion->vm_file) !=
			    file_inode(file))
				continue;

			if (pregion->vm_pgoff >= pgend)
				continue;

			rpglen = pregion->vm_end - pregion->vm_start;
			rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
			rpgend = pregion->vm_pgoff + rpglen;
			if (pgoff >= rpgend)
				continue;

			/* handle inexactly overlapping matches between
			 * mappings */
			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
				/* new mapping is not a subset of the region */
				if (!(capabilities & NOMMU_MAP_DIRECT))
					goto sharing_violation;
				continue;
			}

			/* we've found a region we can share */
			pregion->vm_usage++;
			vma->vm_region = pregion;
			start = pregion->vm_start;
			start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
			vma->vm_start = start;
			vma->vm_end = start + len;

			if (pregion->vm_flags & VM_MAPPED_COPY)
				vm_flags_set(vma, VM_MAPPED_COPY);
			else {
				ret = do_mmap_shared_file(vma);
				if (ret < 0) {
					vma->vm_region = NULL;
					vma->vm_start = 0;
					vma->vm_end = 0;
					pregion->vm_usage--;
					pregion = NULL;
					goto error_just_free;
				}
			}
			fput(region->vm_file);
			kmem_cache_free(vm_region_jar, region);
			region = pregion;
			result = start;
			goto share;
		}

		/* obtain the address at which to make a shared mapping
		 * - this is the hook for quasi-memory character devices to
		 *   tell us the location of a shared mapping
		 */
		if (capabilities & NOMMU_MAP_DIRECT) {
			addr = file->f_op->get_unmapped_area(file, addr, len,
							     pgoff, flags);
			if (IS_ERR_VALUE(addr)) {
				ret = addr;
				if (ret != -ENOSYS)
					goto error_just_free;

				/* the driver refused to tell us where to site
				 * the mapping so we'll have to attempt to copy
				 * it */
				ret = -ENODEV;
				if (!(capabilities & NOMMU_MAP_COPY))
					goto error_just_free;

				capabilities &= ~NOMMU_MAP_DIRECT;
			} else {
				vma->vm_start = region->vm_start = addr;
				vma->vm_end = region->vm_end = addr + len;
			}
		}
	}

	vma->vm_region = region;

	/* set up the mapping
	 * - the region is filled in if NOMMU_MAP_DIRECT is still set
	 */
	if (file && vma->vm_flags & VM_SHARED)
		ret = do_mmap_shared_file(vma);
	else
		ret = do_mmap_private(vma, region, len, capabilities);
	if (ret < 0)
		goto error_just_free;
	add_nommu_region(region);

	/* clear anonymous mappings that don't ask for uninitialized data */
	if (!vma->vm_file &&
	    (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
	     !(flags & MAP_UNINITIALIZED)))
		memset((void *)region->vm_start, 0,
		       region->vm_end - region->vm_start);

	/* okay... we have a mapping; now we have to register it */
	result = vma->vm_start;

	current->mm->total_vm += len >> PAGE_SHIFT;

share:
	BUG_ON(!vma->vm_region);
	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
	if (vma_iter_prealloc(&vmi, vma))
		goto error_just_free;

	setup_vma_to_mm(vma, current->mm);
	current->mm->map_count++;
	/* add the VMA to the tree */
	vma_iter_store_new(&vmi, vma);

	/* we flush the region from the icache only when the first executable
	 * mapping of it is made */
	if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
		flush_icache_user_range(region->vm_start, region->vm_end);
		region->vm_icache_flushed = true;
	}

	up_write(&nommu_region_sem);

	return result;

error_just_free:
	up_write(&nommu_region_sem);
error:
	vma_iter_free(&vmi);
	if (region->vm_file)
		fput(region->vm_file);
	kmem_cache_free(vm_region_jar, region);
	if (vma->vm_file)
		fput(vma->vm_file);
	vm_area_free(vma);
	return ret;

sharing_violation:
	up_write(&nommu_region_sem);
	pr_warn("Attempt to share mismatched mappings\n");
	ret = -EINVAL;
	goto error;

error_getting_vma:
	kmem_cache_free(vm_region_jar, region);
	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
		len, current->pid);
	show_mem();
	return -ENOMEM;

error_getting_region:
	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
		len, current->pid);
	show_mem();
	return -ENOMEM;
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
			      unsigned long prot, unsigned long flags,
			      unsigned long fd, unsigned long pgoff)
{
	struct file *file = NULL;
	unsigned long retval = -EBADF;

	audit_mmap_fd(fd, flags);
	if (!(flags & MAP_ANONYMOUS)) {
		file = fget(fd);
		if (!file)
			goto out;
	}

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);

	if (file)
		fput(file);
out:
	return retval;
}

SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
struct mmap_arg_struct {
	unsigned long addr;
	unsigned long len;
	unsigned long prot;
	unsigned long flags;
	unsigned long fd;
	unsigned long offset;
};

SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
	struct mmap_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	if (offset_in_page(a.offset))
		return -EINVAL;

	return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
			       a.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * split a vma into two pieces at address 'addr', a new vma is allocated either
 * for the first part or the tail.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
		     unsigned long addr, int new_below)
{
	struct vm_area_struct *new;
	struct vm_region *region;
	unsigned long npages;
	struct mm_struct *mm;

	/* we're only permitted to split anonymous regions (these should have
	 * only a single usage on the region) */
	if (vma->vm_file)
		return -ENOMEM;

	mm = vma->vm_mm;
	if (mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	new = vm_area_dup(vma);
	if (!new)
		goto err_vma_dup;

	/* most fields are the same, copy all, and then fixup */
	*region = *vma->vm_region;
	new->vm_region = region;

	npages = (addr - vma->vm_start) >> PAGE_SHIFT;

	if (new_below) {
		region->vm_top = region->vm_end = new->vm_end = addr;
	} else {
		region->vm_start = new->vm_start = addr;
		region->vm_pgoff = new->vm_pgoff += npages;
	}

	vma_iter_config(vmi, new->vm_start, new->vm_end);
	if (vma_iter_prealloc(vmi, vma)) {
		pr_warn("Allocation of vma tree for process %d failed\n",
			current->pid);
		goto err_vmi_preallocate;
	}

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	down_write(&nommu_region_sem);
	delete_nommu_region(vma->vm_region);
	if (new_below) {
		vma->vm_region->vm_start = vma->vm_start = addr;
		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
	} else {
		vma->vm_region->vm_end = vma->vm_end = addr;
		vma->vm_region->vm_top = addr;
	}
	add_nommu_region(vma->vm_region);
	add_nommu_region(new->vm_region);
	up_write(&nommu_region_sem);

	setup_vma_to_mm(vma, mm);
	setup_vma_to_mm(new, mm);
	vma_iter_store_new(vmi, new);
	mm->map_count++;
	return 0;

err_vmi_preallocate:
	vm_area_free(new);
err_vma_dup:
	kmem_cache_free(vm_region_jar, region);
	return -ENOMEM;
}

/*
 * shrink a VMA by removing the specified chunk from either the beginning or
 * the end
 */
static int vmi_shrink_vma(struct vma_iterator *vmi,
			  struct vm_area_struct *vma,
			  unsigned long from, unsigned long to)
{
	struct vm_region *region;

	/* adjust the VMA's pointers, which may reposition it in the MM's tree
	 * and list */
	if (from > vma->vm_start) {
		if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL))
			return -ENOMEM;
		vma->vm_end = from;
	} else {
		if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL))
			return -ENOMEM;
		vma->vm_start = to;
	}

	/* cut the backing region down to size */
	region = vma->vm_region;
	BUG_ON(region->vm_usage != 1);

	down_write(&nommu_region_sem);
	delete_nommu_region(region);
	if (from > region->vm_start) {
		to = region->vm_top;
		region->vm_top = region->vm_end = from;
	} else {
		region->vm_start = to;
	}
	add_nommu_region(region);
	up_write(&nommu_region_sem);

	free_page_series(from, to);
	return 0;
}

/*
 * release a mapping
 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
 *   VMA, though it need not cover the whole VMA
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
	VMA_ITERATOR(vmi, mm, start);
	struct vm_area_struct *vma;
	unsigned long end;
	int ret = 0;

	len = PAGE_ALIGN(len);
	if (len == 0)
		return -EINVAL;

	end = start + len;

	/* find the first potentially overlapping VMA */
	vma = vma_find(&vmi, end);
	if (!vma) {
		static int limit;
		if (limit < 5) {
			pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
					current->pid, current->comm,
					start, start + len - 1);
			limit++;
		}
		return -EINVAL;
	}

	/* we're allowed to split an anonymous VMA but not a file-backed one */
	if (vma->vm_file) {
		do {
			if (start > vma->vm_start)
				return -EINVAL;
			if (end == vma->vm_end)
				goto erase_whole_vma;
			vma = vma_find(&vmi, end);
		} while (vma);
		return -EINVAL;
	} else {
		/* the chunk must be a subset of the VMA found */
		if (start == vma->vm_start && end == vma->vm_end)
			goto erase_whole_vma;
		if (start < vma->vm_start || end > vma->vm_end)
			return -EINVAL;
		if (offset_in_page(start))
			return -EINVAL;
		if (end != vma->vm_end && offset_in_page(end))
			return -EINVAL;
		if (start != vma->vm_start && end != vma->vm_end) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret < 0)
				return ret;
		}
		return vmi_shrink_vma(&vmi, vma, start, end);
	}

erase_whole_vma:
	if (delete_vma_from_mm(vma))
		ret = -ENOMEM;
	else
		delete_vma(mm, vma);
	return ret;
}

int vm_munmap(unsigned long addr, size_t len)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_write_lock(mm);
	ret = do_munmap(mm, addr, len, NULL);
	mmap_write_unlock(mm);
	return ret;
}
EXPORT_SYMBOL(vm_munmap);

SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
	return vm_munmap(addr, len);
}

/*
 * release all the mappings made in a process's VM space
 */
void exit_mmap(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	if (!mm)
		return;

	mm->total_vm = 0;

	/*
	 * Lock the mm to avoid assert complaining even though this is the only
	 * user of the mm
	 */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		cleanup_vma_from_mm(vma);
		delete_vma(mm, vma);
		cond_resched();
	}
	__mt_destroy(&mm->mm_mt);
	mmap_write_unlock(mm);
}

/*
 * expand (or shrink) an existing mapping, potentially moving it at the same
 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * under NOMMU conditions, we only permit changing a mapping's size, and only
 * as long as it stays within the region allocated by do_mmap_private() and the
 * block is not shareable
 *
 * MREMAP_FIXED is not supported under NOMMU conditions
 */
static unsigned long do_mremap(unsigned long addr,
			unsigned long old_len, unsigned long new_len,
			unsigned long flags, unsigned long new_addr)
{
	struct vm_area_struct *vma;

	/* insanity checks first */
	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);
	if (old_len == 0 || new_len == 0)
		return (unsigned long) -EINVAL;

	if (offset_in_page(addr))
		return -EINVAL;

	if (flags & MREMAP_FIXED && new_addr != addr)
		return (unsigned long) -EINVAL;

	vma = find_vma_exact(current->mm, addr, old_len);
	if (!vma)
		return (unsigned long) -EINVAL;

	if (vma->vm_end != vma->vm_start + old_len)
		return (unsigned long) -EFAULT;

	if (is_nommu_shared_mapping(vma->vm_flags))
		return (unsigned long) -EPERM;

	if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
		return (unsigned long) -ENOMEM;

	/* all checks complete - do it */
	vma->vm_end = vma->vm_start + new_len;
	return vma->vm_start;
}

SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	unsigned long ret;

	mmap_write_lock(current->mm);
	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
	mmap_write_unlock(current->mm);
	return ret;
}

int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn, unsigned long size, pgprot_t prot)
{
	if (addr != (pfn << PAGE_SHIFT))
		return -EINVAL;

	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
	return 0;
}
EXPORT_SYMBOL(remap_pfn_range);

int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long pfn = start >> PAGE_SHIFT;
	unsigned long vm_len = vma->vm_end - vma->vm_start;

	pfn += vma->vm_pgoff;
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
			unsigned long pgoff)
{
	unsigned int size = vma->vm_end - vma->vm_start;

	if (!(vma->vm_flags & VM_USERMAP))
		return -EINVAL;

	vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
	vma->vm_end = vma->vm_start + size;

	return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range);

vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	BUG();
	return 0;
}
EXPORT_SYMBOL(filemap_fault);

vm_fault_t filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	BUG();
	return 0;
}
EXPORT_SYMBOL(filemap_map_pages);

static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
			      void *buf, int len, unsigned int gup_flags)
{
	struct vm_area_struct *vma;
	int write = gup_flags & FOLL_WRITE;

	if (mmap_read_lock_killable(mm))
		return 0;

	/* the access must start within one of the target process's mappings */
	vma = find_vma(mm, addr);
	if (vma) {
		/* don't overrun this mapping */
		if (addr + len >= vma->vm_end)
			len = vma->vm_end - addr;

		/* only read or write mappings where it is permitted */
		if (write && vma->vm_flags & VM_MAYWRITE)
			copy_to_user_page(vma, NULL, addr,
					  (void *) addr, buf, len);
		else if (!write && vma->vm_flags & VM_MAYREAD)
			copy_from_user_page(vma, NULL, addr,
					    buf, (void *) addr, len);
		else
			len = 0;
	} else {
		len = 0;
	}

	mmap_read_unlock(mm);

	return len;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * - source/target buffer must be kernel space
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
		unsigned int gup_flags)
{
	struct mm_struct *mm;

	if (addr + len < addr)
		return 0;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	len = __access_remote_vm(mm, addr, buf, len, gup_flags);

	mmput(mm);
	return len;
}
EXPORT_SYMBOL_GPL(access_process_vm);

#ifdef CONFIG_BPF_SYSCALL
/*
 * Copy a string from another process's address space as given in mm.
 * If there is any error return -EFAULT.
 */
static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
				void *buf, int len)
{
	unsigned long addr_end;
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	*(char *)buf = '\0';

	if (mmap_read_lock_killable(mm))
		return ret;

	/* the access must start within one of the target process's mappings */
	vma = find_vma(mm, addr);
	if (!vma)
		goto out;

	if (check_add_overflow(addr, len, &addr_end))
		goto out;

	/* don't overrun this mapping */
	if (addr_end > vma->vm_end)
		len = vma->vm_end - addr;

	/* only read mappings where it is permitted */
	if (vma->vm_flags & VM_MAYREAD) {
		ret = strscpy(buf, (char *)addr, len);
		if (ret < 0)
			ret = len - 1;
	}

out:
	mmap_read_unlock(mm);
	return ret;
}

/**
 * copy_remote_vm_str - copy a string from another process's address space.
 * @tsk:	the task of the target address space
 * @addr:	start address to read from
 * @buf:	destination buffer
 * @len:	number of bytes to copy
 * @gup_flags:	flags modifying lookup behaviour (unused)
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from @addr (source) to @buf (destination);
 * not including the trailing NUL. Always guaranteed to leave a NUL-terminated
 * buffer. On any error, return -EFAULT.
 */
int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
		       void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	if (unlikely(len == 0))
		return 0;

	mm = get_task_mm(tsk);
	if (!mm) {
		*(char *)buf = '\0';
		return -EFAULT;
	}

	ret = __copy_remote_vm_str(mm, addr, buf, len);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(copy_remote_vm_str);
#endif /* CONFIG_BPF_SYSCALL */

/**
 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
 * @inode: The inode to check
 * @size: The current filesize of the inode
 * @newsize: The proposed filesize of the inode
 *
 * Check the shared mappings on an inode on behalf of a shrinking truncate to
 * make sure that any outstanding VMAs aren't broken and then shrink the
 * vm_regions that extend beyond so that do_mmap() doesn't
 * automatically grant mappings that are too large.
 */
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
				size_t newsize)
{
	struct vm_area_struct *vma;
	struct vm_region *region;
	pgoff_t low, high;
	size_t r_size, r_top;

	low = newsize >> PAGE_SHIFT;
	high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	down_write(&nommu_region_sem);
	i_mmap_lock_read(inode->i_mapping);

	/* search for VMAs that fall within the dead zone */
	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
		/* found one - only interested if it's shared out of the page
		 * cache */
		if (vma->vm_flags & VM_SHARED) {
			i_mmap_unlock_read(inode->i_mapping);
			up_write(&nommu_region_sem);
			return -ETXTBSY; /* not quite true, but near enough */
		}
	}

	/* reduce any regions that overlap the dead zone - if in existence,
	 * these will be pointed to by VMAs that don't overlap the dead zone
	 *
	 * we don't check for any regions that start beyond the EOF as there
	 * shouldn't be any
	 */
	vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
		if (!(vma->vm_flags & VM_SHARED))
			continue;

		region = vma->vm_region;
		r_size = region->vm_top - region->vm_start;
		r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;

		if (r_top > newsize) {
			region->vm_top -= r_top - newsize;
			if (region->vm_end > region->vm_top)
				region->vm_end = region->vm_top;
		}
	}

	i_mmap_unlock_read(inode->i_mapping);
	up_write(&nommu_region_sem);
	return 0;
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB)
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int __meminit init_user_reserve(void)
{
	unsigned long free_kbytes;

	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
	return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int __meminit init_admin_reserve(void)
{
	unsigned long free_kbytes;

	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
	return 0;
}
subsys_initcall(init_admin_reserve);