1 /* 2 * linux/mm/nommu.c 3 * 4 * Replacement code for mm functions to support CPU's that don't 5 * have any form of memory management unit (thus no virtual memory). 6 * 7 * See Documentation/nommu-mmap.txt 8 * 9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> 14 */ 15 16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 18 #include <linux/export.h> 19 #include <linux/mm.h> 20 #include <linux/vmacache.h> 21 #include <linux/mman.h> 22 #include <linux/swap.h> 23 #include <linux/file.h> 24 #include <linux/highmem.h> 25 #include <linux/pagemap.h> 26 #include <linux/slab.h> 27 #include <linux/vmalloc.h> 28 #include <linux/blkdev.h> 29 #include <linux/backing-dev.h> 30 #include <linux/compiler.h> 31 #include <linux/mount.h> 32 #include <linux/personality.h> 33 #include <linux/security.h> 34 #include <linux/syscalls.h> 35 #include <linux/audit.h> 36 #include <linux/sched/sysctl.h> 37 #include <linux/printk.h> 38 39 #include <asm/uaccess.h> 40 #include <asm/tlb.h> 41 #include <asm/tlbflush.h> 42 #include <asm/mmu_context.h> 43 #include "internal.h" 44 45 void *high_memory; 46 EXPORT_SYMBOL(high_memory); 47 struct page *mem_map; 48 unsigned long max_mapnr; 49 EXPORT_SYMBOL(max_mapnr); 50 unsigned long highest_memmap_pfn; 51 struct percpu_counter vm_committed_as; 52 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 53 int sysctl_overcommit_ratio = 50; /* default is 50% */ 54 unsigned long sysctl_overcommit_kbytes __read_mostly; 55 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 56 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 57 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 58 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 59 int heap_stack_gap = 0; 60 61 atomic_long_t mmap_pages_allocated; 62 63 /* 64 * The global memory commitment made in the system can be a metric 65 * that can be used to drive ballooning decisions when Linux is hosted 66 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 67 * balancing memory across competing virtual machines that are hosted. 68 * Several metrics drive this policy engine including the guest reported 69 * memory commitment. 70 */ 71 unsigned long vm_memory_committed(void) 72 { 73 return percpu_counter_read_positive(&vm_committed_as); 74 } 75 76 EXPORT_SYMBOL_GPL(vm_memory_committed); 77 78 EXPORT_SYMBOL(mem_map); 79 80 /* list of mapped, potentially shareable regions */ 81 static struct kmem_cache *vm_region_jar; 82 struct rb_root nommu_region_tree = RB_ROOT; 83 DECLARE_RWSEM(nommu_region_sem); 84 85 const struct vm_operations_struct generic_file_vm_ops = { 86 }; 87 88 /* 89 * Return the total memory allocated for this pointer, not 90 * just what the caller asked for. 91 * 92 * Doesn't have to be accurate, i.e. may have races. 93 */ 94 unsigned int kobjsize(const void *objp) 95 { 96 struct page *page; 97 98 /* 99 * If the object we have should not have ksize performed on it, 100 * return size of 0 101 */ 102 if (!objp || !virt_addr_valid(objp)) 103 return 0; 104 105 page = virt_to_head_page(objp); 106 107 /* 108 * If the allocator sets PageSlab, we know the pointer came from 109 * kmalloc(). 110 */ 111 if (PageSlab(page)) 112 return ksize(objp); 113 114 /* 115 * If it's not a compound page, see if we have a matching VMA 116 * region. This test is intentionally done in reverse order, 117 * so if there's no VMA, we still fall through and hand back 118 * PAGE_SIZE for 0-order pages. 119 */ 120 if (!PageCompound(page)) { 121 struct vm_area_struct *vma; 122 123 vma = find_vma(current->mm, (unsigned long)objp); 124 if (vma) 125 return vma->vm_end - vma->vm_start; 126 } 127 128 /* 129 * The ksize() function is only guaranteed to work for pointers 130 * returned by kmalloc(). So handle arbitrary pointers here. 131 */ 132 return PAGE_SIZE << compound_order(page); 133 } 134 135 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 136 unsigned long start, unsigned long nr_pages, 137 unsigned int foll_flags, struct page **pages, 138 struct vm_area_struct **vmas, int *nonblocking) 139 { 140 struct vm_area_struct *vma; 141 unsigned long vm_flags; 142 int i; 143 144 /* calculate required read or write permissions. 145 * If FOLL_FORCE is set, we only require the "MAY" flags. 146 */ 147 vm_flags = (foll_flags & FOLL_WRITE) ? 148 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 149 vm_flags &= (foll_flags & FOLL_FORCE) ? 150 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 151 152 for (i = 0; i < nr_pages; i++) { 153 vma = find_vma(mm, start); 154 if (!vma) 155 goto finish_or_fault; 156 157 /* protect what we can, including chardevs */ 158 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || 159 !(vm_flags & vma->vm_flags)) 160 goto finish_or_fault; 161 162 if (pages) { 163 pages[i] = virt_to_page(start); 164 if (pages[i]) 165 page_cache_get(pages[i]); 166 } 167 if (vmas) 168 vmas[i] = vma; 169 start = (start + PAGE_SIZE) & PAGE_MASK; 170 } 171 172 return i; 173 174 finish_or_fault: 175 return i ? : -EFAULT; 176 } 177 178 /* 179 * get a list of pages in an address range belonging to the specified process 180 * and indicate the VMA that covers each page 181 * - this is potentially dodgy as we may end incrementing the page count of a 182 * slab page or a secondary page from a compound page 183 * - don't permit access to VMAs that don't support it, such as I/O mappings 184 */ 185 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 186 unsigned long start, unsigned long nr_pages, 187 int write, int force, struct page **pages, 188 struct vm_area_struct **vmas) 189 { 190 int flags = 0; 191 192 if (write) 193 flags |= FOLL_WRITE; 194 if (force) 195 flags |= FOLL_FORCE; 196 197 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, 198 NULL); 199 } 200 EXPORT_SYMBOL(get_user_pages); 201 202 long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, 203 unsigned long start, unsigned long nr_pages, 204 int write, int force, struct page **pages, 205 int *locked) 206 { 207 return get_user_pages(tsk, mm, start, nr_pages, write, force, 208 pages, NULL); 209 } 210 EXPORT_SYMBOL(get_user_pages_locked); 211 212 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 213 unsigned long start, unsigned long nr_pages, 214 int write, int force, struct page **pages, 215 unsigned int gup_flags) 216 { 217 long ret; 218 down_read(&mm->mmap_sem); 219 ret = get_user_pages(tsk, mm, start, nr_pages, write, force, 220 pages, NULL); 221 up_read(&mm->mmap_sem); 222 return ret; 223 } 224 EXPORT_SYMBOL(__get_user_pages_unlocked); 225 226 long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 227 unsigned long start, unsigned long nr_pages, 228 int write, int force, struct page **pages) 229 { 230 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, 231 force, pages, 0); 232 } 233 EXPORT_SYMBOL(get_user_pages_unlocked); 234 235 /** 236 * follow_pfn - look up PFN at a user virtual address 237 * @vma: memory mapping 238 * @address: user virtual address 239 * @pfn: location to store found PFN 240 * 241 * Only IO mappings and raw PFN mappings are allowed. 242 * 243 * Returns zero and the pfn at @pfn on success, -ve otherwise. 244 */ 245 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 246 unsigned long *pfn) 247 { 248 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 249 return -EINVAL; 250 251 *pfn = address >> PAGE_SHIFT; 252 return 0; 253 } 254 EXPORT_SYMBOL(follow_pfn); 255 256 LIST_HEAD(vmap_area_list); 257 258 void vfree(const void *addr) 259 { 260 kfree(addr); 261 } 262 EXPORT_SYMBOL(vfree); 263 264 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 265 { 266 /* 267 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() 268 * returns only a logical address. 269 */ 270 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); 271 } 272 EXPORT_SYMBOL(__vmalloc); 273 274 void *vmalloc_user(unsigned long size) 275 { 276 void *ret; 277 278 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 279 PAGE_KERNEL); 280 if (ret) { 281 struct vm_area_struct *vma; 282 283 down_write(¤t->mm->mmap_sem); 284 vma = find_vma(current->mm, (unsigned long)ret); 285 if (vma) 286 vma->vm_flags |= VM_USERMAP; 287 up_write(¤t->mm->mmap_sem); 288 } 289 290 return ret; 291 } 292 EXPORT_SYMBOL(vmalloc_user); 293 294 struct page *vmalloc_to_page(const void *addr) 295 { 296 return virt_to_page(addr); 297 } 298 EXPORT_SYMBOL(vmalloc_to_page); 299 300 unsigned long vmalloc_to_pfn(const void *addr) 301 { 302 return page_to_pfn(virt_to_page(addr)); 303 } 304 EXPORT_SYMBOL(vmalloc_to_pfn); 305 306 long vread(char *buf, char *addr, unsigned long count) 307 { 308 /* Don't allow overflow */ 309 if ((unsigned long) buf + count < count) 310 count = -(unsigned long) buf; 311 312 memcpy(buf, addr, count); 313 return count; 314 } 315 316 long vwrite(char *buf, char *addr, unsigned long count) 317 { 318 /* Don't allow overflow */ 319 if ((unsigned long) addr + count < count) 320 count = -(unsigned long) addr; 321 322 memcpy(addr, buf, count); 323 return count; 324 } 325 326 /* 327 * vmalloc - allocate virtually contiguous memory 328 * 329 * @size: allocation size 330 * 331 * Allocate enough pages to cover @size from the page level 332 * allocator and map them into contiguous kernel virtual space. 333 * 334 * For tight control over page level allocator and protection flags 335 * use __vmalloc() instead. 336 */ 337 void *vmalloc(unsigned long size) 338 { 339 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 340 } 341 EXPORT_SYMBOL(vmalloc); 342 343 /* 344 * vzalloc - allocate virtually contiguous memory with zero fill 345 * 346 * @size: allocation size 347 * 348 * Allocate enough pages to cover @size from the page level 349 * allocator and map them into contiguous kernel virtual space. 350 * The memory allocated is set to zero. 351 * 352 * For tight control over page level allocator and protection flags 353 * use __vmalloc() instead. 354 */ 355 void *vzalloc(unsigned long size) 356 { 357 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 358 PAGE_KERNEL); 359 } 360 EXPORT_SYMBOL(vzalloc); 361 362 /** 363 * vmalloc_node - allocate memory on a specific node 364 * @size: allocation size 365 * @node: numa node 366 * 367 * Allocate enough pages to cover @size from the page level 368 * allocator and map them into contiguous kernel virtual space. 369 * 370 * For tight control over page level allocator and protection flags 371 * use __vmalloc() instead. 372 */ 373 void *vmalloc_node(unsigned long size, int node) 374 { 375 return vmalloc(size); 376 } 377 EXPORT_SYMBOL(vmalloc_node); 378 379 /** 380 * vzalloc_node - allocate memory on a specific node with zero fill 381 * @size: allocation size 382 * @node: numa node 383 * 384 * Allocate enough pages to cover @size from the page level 385 * allocator and map them into contiguous kernel virtual space. 386 * The memory allocated is set to zero. 387 * 388 * For tight control over page level allocator and protection flags 389 * use __vmalloc() instead. 390 */ 391 void *vzalloc_node(unsigned long size, int node) 392 { 393 return vzalloc(size); 394 } 395 EXPORT_SYMBOL(vzalloc_node); 396 397 #ifndef PAGE_KERNEL_EXEC 398 # define PAGE_KERNEL_EXEC PAGE_KERNEL 399 #endif 400 401 /** 402 * vmalloc_exec - allocate virtually contiguous, executable memory 403 * @size: allocation size 404 * 405 * Kernel-internal function to allocate enough pages to cover @size 406 * the page level allocator and map them into contiguous and 407 * executable kernel virtual space. 408 * 409 * For tight control over page level allocator and protection flags 410 * use __vmalloc() instead. 411 */ 412 413 void *vmalloc_exec(unsigned long size) 414 { 415 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 416 } 417 418 /** 419 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 420 * @size: allocation size 421 * 422 * Allocate enough 32bit PA addressable pages to cover @size from the 423 * page level allocator and map them into contiguous kernel virtual space. 424 */ 425 void *vmalloc_32(unsigned long size) 426 { 427 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); 428 } 429 EXPORT_SYMBOL(vmalloc_32); 430 431 /** 432 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 433 * @size: allocation size 434 * 435 * The resulting memory area is 32bit addressable and zeroed so it can be 436 * mapped to userspace without leaking data. 437 * 438 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to 439 * remap_vmalloc_range() are permissible. 440 */ 441 void *vmalloc_32_user(unsigned long size) 442 { 443 /* 444 * We'll have to sort out the ZONE_DMA bits for 64-bit, 445 * but for now this can simply use vmalloc_user() directly. 446 */ 447 return vmalloc_user(size); 448 } 449 EXPORT_SYMBOL(vmalloc_32_user); 450 451 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) 452 { 453 BUG(); 454 return NULL; 455 } 456 EXPORT_SYMBOL(vmap); 457 458 void vunmap(const void *addr) 459 { 460 BUG(); 461 } 462 EXPORT_SYMBOL(vunmap); 463 464 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) 465 { 466 BUG(); 467 return NULL; 468 } 469 EXPORT_SYMBOL(vm_map_ram); 470 471 void vm_unmap_ram(const void *mem, unsigned int count) 472 { 473 BUG(); 474 } 475 EXPORT_SYMBOL(vm_unmap_ram); 476 477 void vm_unmap_aliases(void) 478 { 479 } 480 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 481 482 /* 483 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 484 * have one. 485 */ 486 void __weak vmalloc_sync_all(void) 487 { 488 } 489 490 /** 491 * alloc_vm_area - allocate a range of kernel address space 492 * @size: size of the area 493 * 494 * Returns: NULL on failure, vm_struct on success 495 * 496 * This function reserves a range of kernel address space, and 497 * allocates pagetables to map that range. No actual mappings 498 * are created. If the kernel address space is not shared 499 * between processes, it syncs the pagetable across all 500 * processes. 501 */ 502 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 503 { 504 BUG(); 505 return NULL; 506 } 507 EXPORT_SYMBOL_GPL(alloc_vm_area); 508 509 void free_vm_area(struct vm_struct *area) 510 { 511 BUG(); 512 } 513 EXPORT_SYMBOL_GPL(free_vm_area); 514 515 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 516 struct page *page) 517 { 518 return -EINVAL; 519 } 520 EXPORT_SYMBOL(vm_insert_page); 521 522 /* 523 * sys_brk() for the most part doesn't need the global kernel 524 * lock, except when an application is doing something nasty 525 * like trying to un-brk an area that has already been mapped 526 * to a regular file. in this case, the unmapping will need 527 * to invoke file system routines that need the global lock. 528 */ 529 SYSCALL_DEFINE1(brk, unsigned long, brk) 530 { 531 struct mm_struct *mm = current->mm; 532 533 if (brk < mm->start_brk || brk > mm->context.end_brk) 534 return mm->brk; 535 536 if (mm->brk == brk) 537 return mm->brk; 538 539 /* 540 * Always allow shrinking brk 541 */ 542 if (brk <= mm->brk) { 543 mm->brk = brk; 544 return brk; 545 } 546 547 /* 548 * Ok, looks good - let it rip. 549 */ 550 flush_icache_range(mm->brk, brk); 551 return mm->brk = brk; 552 } 553 554 /* 555 * initialise the VMA and region record slabs 556 */ 557 void __init mmap_init(void) 558 { 559 int ret; 560 561 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 562 VM_BUG_ON(ret); 563 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); 564 } 565 566 /* 567 * validate the region tree 568 * - the caller must hold the region lock 569 */ 570 #ifdef CONFIG_DEBUG_NOMMU_REGIONS 571 static noinline void validate_nommu_regions(void) 572 { 573 struct vm_region *region, *last; 574 struct rb_node *p, *lastp; 575 576 lastp = rb_first(&nommu_region_tree); 577 if (!lastp) 578 return; 579 580 last = rb_entry(lastp, struct vm_region, vm_rb); 581 BUG_ON(last->vm_end <= last->vm_start); 582 BUG_ON(last->vm_top < last->vm_end); 583 584 while ((p = rb_next(lastp))) { 585 region = rb_entry(p, struct vm_region, vm_rb); 586 last = rb_entry(lastp, struct vm_region, vm_rb); 587 588 BUG_ON(region->vm_end <= region->vm_start); 589 BUG_ON(region->vm_top < region->vm_end); 590 BUG_ON(region->vm_start < last->vm_top); 591 592 lastp = p; 593 } 594 } 595 #else 596 static void validate_nommu_regions(void) 597 { 598 } 599 #endif 600 601 /* 602 * add a region into the global tree 603 */ 604 static void add_nommu_region(struct vm_region *region) 605 { 606 struct vm_region *pregion; 607 struct rb_node **p, *parent; 608 609 validate_nommu_regions(); 610 611 parent = NULL; 612 p = &nommu_region_tree.rb_node; 613 while (*p) { 614 parent = *p; 615 pregion = rb_entry(parent, struct vm_region, vm_rb); 616 if (region->vm_start < pregion->vm_start) 617 p = &(*p)->rb_left; 618 else if (region->vm_start > pregion->vm_start) 619 p = &(*p)->rb_right; 620 else if (pregion == region) 621 return; 622 else 623 BUG(); 624 } 625 626 rb_link_node(®ion->vm_rb, parent, p); 627 rb_insert_color(®ion->vm_rb, &nommu_region_tree); 628 629 validate_nommu_regions(); 630 } 631 632 /* 633 * delete a region from the global tree 634 */ 635 static void delete_nommu_region(struct vm_region *region) 636 { 637 BUG_ON(!nommu_region_tree.rb_node); 638 639 validate_nommu_regions(); 640 rb_erase(®ion->vm_rb, &nommu_region_tree); 641 validate_nommu_regions(); 642 } 643 644 /* 645 * free a contiguous series of pages 646 */ 647 static void free_page_series(unsigned long from, unsigned long to) 648 { 649 for (; from < to; from += PAGE_SIZE) { 650 struct page *page = virt_to_page(from); 651 652 atomic_long_dec(&mmap_pages_allocated); 653 put_page(page); 654 } 655 } 656 657 /* 658 * release a reference to a region 659 * - the caller must hold the region semaphore for writing, which this releases 660 * - the region may not have been added to the tree yet, in which case vm_top 661 * will equal vm_start 662 */ 663 static void __put_nommu_region(struct vm_region *region) 664 __releases(nommu_region_sem) 665 { 666 BUG_ON(!nommu_region_tree.rb_node); 667 668 if (--region->vm_usage == 0) { 669 if (region->vm_top > region->vm_start) 670 delete_nommu_region(region); 671 up_write(&nommu_region_sem); 672 673 if (region->vm_file) 674 fput(region->vm_file); 675 676 /* IO memory and memory shared directly out of the pagecache 677 * from ramfs/tmpfs mustn't be released here */ 678 if (region->vm_flags & VM_MAPPED_COPY) 679 free_page_series(region->vm_start, region->vm_top); 680 kmem_cache_free(vm_region_jar, region); 681 } else { 682 up_write(&nommu_region_sem); 683 } 684 } 685 686 /* 687 * release a reference to a region 688 */ 689 static void put_nommu_region(struct vm_region *region) 690 { 691 down_write(&nommu_region_sem); 692 __put_nommu_region(region); 693 } 694 695 /* 696 * update protection on a vma 697 */ 698 static void protect_vma(struct vm_area_struct *vma, unsigned long flags) 699 { 700 #ifdef CONFIG_MPU 701 struct mm_struct *mm = vma->vm_mm; 702 long start = vma->vm_start & PAGE_MASK; 703 while (start < vma->vm_end) { 704 protect_page(mm, start, flags); 705 start += PAGE_SIZE; 706 } 707 update_protections(mm); 708 #endif 709 } 710 711 /* 712 * add a VMA into a process's mm_struct in the appropriate place in the list 713 * and tree and add to the address space's page tree also if not an anonymous 714 * page 715 * - should be called with mm->mmap_sem held writelocked 716 */ 717 static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 718 { 719 struct vm_area_struct *pvma, *prev; 720 struct address_space *mapping; 721 struct rb_node **p, *parent, *rb_prev; 722 723 BUG_ON(!vma->vm_region); 724 725 mm->map_count++; 726 vma->vm_mm = mm; 727 728 protect_vma(vma, vma->vm_flags); 729 730 /* add the VMA to the mapping */ 731 if (vma->vm_file) { 732 mapping = vma->vm_file->f_mapping; 733 734 i_mmap_lock_write(mapping); 735 flush_dcache_mmap_lock(mapping); 736 vma_interval_tree_insert(vma, &mapping->i_mmap); 737 flush_dcache_mmap_unlock(mapping); 738 i_mmap_unlock_write(mapping); 739 } 740 741 /* add the VMA to the tree */ 742 parent = rb_prev = NULL; 743 p = &mm->mm_rb.rb_node; 744 while (*p) { 745 parent = *p; 746 pvma = rb_entry(parent, struct vm_area_struct, vm_rb); 747 748 /* sort by: start addr, end addr, VMA struct addr in that order 749 * (the latter is necessary as we may get identical VMAs) */ 750 if (vma->vm_start < pvma->vm_start) 751 p = &(*p)->rb_left; 752 else if (vma->vm_start > pvma->vm_start) { 753 rb_prev = parent; 754 p = &(*p)->rb_right; 755 } else if (vma->vm_end < pvma->vm_end) 756 p = &(*p)->rb_left; 757 else if (vma->vm_end > pvma->vm_end) { 758 rb_prev = parent; 759 p = &(*p)->rb_right; 760 } else if (vma < pvma) 761 p = &(*p)->rb_left; 762 else if (vma > pvma) { 763 rb_prev = parent; 764 p = &(*p)->rb_right; 765 } else 766 BUG(); 767 } 768 769 rb_link_node(&vma->vm_rb, parent, p); 770 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 771 772 /* add VMA to the VMA list also */ 773 prev = NULL; 774 if (rb_prev) 775 prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 776 777 __vma_link_list(mm, vma, prev, parent); 778 } 779 780 /* 781 * delete a VMA from its owning mm_struct and address space 782 */ 783 static void delete_vma_from_mm(struct vm_area_struct *vma) 784 { 785 int i; 786 struct address_space *mapping; 787 struct mm_struct *mm = vma->vm_mm; 788 struct task_struct *curr = current; 789 790 protect_vma(vma, 0); 791 792 mm->map_count--; 793 for (i = 0; i < VMACACHE_SIZE; i++) { 794 /* if the vma is cached, invalidate the entire cache */ 795 if (curr->vmacache[i] == vma) { 796 vmacache_invalidate(mm); 797 break; 798 } 799 } 800 801 /* remove the VMA from the mapping */ 802 if (vma->vm_file) { 803 mapping = vma->vm_file->f_mapping; 804 805 i_mmap_lock_write(mapping); 806 flush_dcache_mmap_lock(mapping); 807 vma_interval_tree_remove(vma, &mapping->i_mmap); 808 flush_dcache_mmap_unlock(mapping); 809 i_mmap_unlock_write(mapping); 810 } 811 812 /* remove from the MM's tree and list */ 813 rb_erase(&vma->vm_rb, &mm->mm_rb); 814 815 if (vma->vm_prev) 816 vma->vm_prev->vm_next = vma->vm_next; 817 else 818 mm->mmap = vma->vm_next; 819 820 if (vma->vm_next) 821 vma->vm_next->vm_prev = vma->vm_prev; 822 } 823 824 /* 825 * destroy a VMA record 826 */ 827 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) 828 { 829 if (vma->vm_ops && vma->vm_ops->close) 830 vma->vm_ops->close(vma); 831 if (vma->vm_file) 832 fput(vma->vm_file); 833 put_nommu_region(vma->vm_region); 834 kmem_cache_free(vm_area_cachep, vma); 835 } 836 837 /* 838 * look up the first VMA in which addr resides, NULL if none 839 * - should be called with mm->mmap_sem at least held readlocked 840 */ 841 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 842 { 843 struct vm_area_struct *vma; 844 845 /* check the cache first */ 846 vma = vmacache_find(mm, addr); 847 if (likely(vma)) 848 return vma; 849 850 /* trawl the list (there may be multiple mappings in which addr 851 * resides) */ 852 for (vma = mm->mmap; vma; vma = vma->vm_next) { 853 if (vma->vm_start > addr) 854 return NULL; 855 if (vma->vm_end > addr) { 856 vmacache_update(addr, vma); 857 return vma; 858 } 859 } 860 861 return NULL; 862 } 863 EXPORT_SYMBOL(find_vma); 864 865 /* 866 * find a VMA 867 * - we don't extend stack VMAs under NOMMU conditions 868 */ 869 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 870 { 871 return find_vma(mm, addr); 872 } 873 874 /* 875 * expand a stack to a given address 876 * - not supported under NOMMU conditions 877 */ 878 int expand_stack(struct vm_area_struct *vma, unsigned long address) 879 { 880 return -ENOMEM; 881 } 882 883 /* 884 * look up the first VMA exactly that exactly matches addr 885 * - should be called with mm->mmap_sem at least held readlocked 886 */ 887 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, 888 unsigned long addr, 889 unsigned long len) 890 { 891 struct vm_area_struct *vma; 892 unsigned long end = addr + len; 893 894 /* check the cache first */ 895 vma = vmacache_find_exact(mm, addr, end); 896 if (vma) 897 return vma; 898 899 /* trawl the list (there may be multiple mappings in which addr 900 * resides) */ 901 for (vma = mm->mmap; vma; vma = vma->vm_next) { 902 if (vma->vm_start < addr) 903 continue; 904 if (vma->vm_start > addr) 905 return NULL; 906 if (vma->vm_end == end) { 907 vmacache_update(addr, vma); 908 return vma; 909 } 910 } 911 912 return NULL; 913 } 914 915 /* 916 * determine whether a mapping should be permitted and, if so, what sort of 917 * mapping we're capable of supporting 918 */ 919 static int validate_mmap_request(struct file *file, 920 unsigned long addr, 921 unsigned long len, 922 unsigned long prot, 923 unsigned long flags, 924 unsigned long pgoff, 925 unsigned long *_capabilities) 926 { 927 unsigned long capabilities, rlen; 928 int ret; 929 930 /* do the simple checks first */ 931 if (flags & MAP_FIXED) 932 return -EINVAL; 933 934 if ((flags & MAP_TYPE) != MAP_PRIVATE && 935 (flags & MAP_TYPE) != MAP_SHARED) 936 return -EINVAL; 937 938 if (!len) 939 return -EINVAL; 940 941 /* Careful about overflows.. */ 942 rlen = PAGE_ALIGN(len); 943 if (!rlen || rlen > TASK_SIZE) 944 return -ENOMEM; 945 946 /* offset overflow? */ 947 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) 948 return -EOVERFLOW; 949 950 if (file) { 951 /* files must support mmap */ 952 if (!file->f_op->mmap) 953 return -ENODEV; 954 955 /* work out if what we've got could possibly be shared 956 * - we support chardevs that provide their own "memory" 957 * - we support files/blockdevs that are memory backed 958 */ 959 if (file->f_op->mmap_capabilities) { 960 capabilities = file->f_op->mmap_capabilities(file); 961 } else { 962 /* no explicit capabilities set, so assume some 963 * defaults */ 964 switch (file_inode(file)->i_mode & S_IFMT) { 965 case S_IFREG: 966 case S_IFBLK: 967 capabilities = NOMMU_MAP_COPY; 968 break; 969 970 case S_IFCHR: 971 capabilities = 972 NOMMU_MAP_DIRECT | 973 NOMMU_MAP_READ | 974 NOMMU_MAP_WRITE; 975 break; 976 977 default: 978 return -EINVAL; 979 } 980 } 981 982 /* eliminate any capabilities that we can't support on this 983 * device */ 984 if (!file->f_op->get_unmapped_area) 985 capabilities &= ~NOMMU_MAP_DIRECT; 986 if (!(file->f_mode & FMODE_CAN_READ)) 987 capabilities &= ~NOMMU_MAP_COPY; 988 989 /* The file shall have been opened with read permission. */ 990 if (!(file->f_mode & FMODE_READ)) 991 return -EACCES; 992 993 if (flags & MAP_SHARED) { 994 /* do checks for writing, appending and locking */ 995 if ((prot & PROT_WRITE) && 996 !(file->f_mode & FMODE_WRITE)) 997 return -EACCES; 998 999 if (IS_APPEND(file_inode(file)) && 1000 (file->f_mode & FMODE_WRITE)) 1001 return -EACCES; 1002 1003 if (locks_verify_locked(file)) 1004 return -EAGAIN; 1005 1006 if (!(capabilities & NOMMU_MAP_DIRECT)) 1007 return -ENODEV; 1008 1009 /* we mustn't privatise shared mappings */ 1010 capabilities &= ~NOMMU_MAP_COPY; 1011 } else { 1012 /* we're going to read the file into private memory we 1013 * allocate */ 1014 if (!(capabilities & NOMMU_MAP_COPY)) 1015 return -ENODEV; 1016 1017 /* we don't permit a private writable mapping to be 1018 * shared with the backing device */ 1019 if (prot & PROT_WRITE) 1020 capabilities &= ~NOMMU_MAP_DIRECT; 1021 } 1022 1023 if (capabilities & NOMMU_MAP_DIRECT) { 1024 if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || 1025 ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || 1026 ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) 1027 ) { 1028 capabilities &= ~NOMMU_MAP_DIRECT; 1029 if (flags & MAP_SHARED) { 1030 pr_warn("MAP_SHARED not completely supported on !MMU\n"); 1031 return -EINVAL; 1032 } 1033 } 1034 } 1035 1036 /* handle executable mappings and implied executable 1037 * mappings */ 1038 if (path_noexec(&file->f_path)) { 1039 if (prot & PROT_EXEC) 1040 return -EPERM; 1041 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { 1042 /* handle implication of PROT_EXEC by PROT_READ */ 1043 if (current->personality & READ_IMPLIES_EXEC) { 1044 if (capabilities & NOMMU_MAP_EXEC) 1045 prot |= PROT_EXEC; 1046 } 1047 } else if ((prot & PROT_READ) && 1048 (prot & PROT_EXEC) && 1049 !(capabilities & NOMMU_MAP_EXEC) 1050 ) { 1051 /* backing file is not executable, try to copy */ 1052 capabilities &= ~NOMMU_MAP_DIRECT; 1053 } 1054 } else { 1055 /* anonymous mappings are always memory backed and can be 1056 * privately mapped 1057 */ 1058 capabilities = NOMMU_MAP_COPY; 1059 1060 /* handle PROT_EXEC implication by PROT_READ */ 1061 if ((prot & PROT_READ) && 1062 (current->personality & READ_IMPLIES_EXEC)) 1063 prot |= PROT_EXEC; 1064 } 1065 1066 /* allow the security API to have its say */ 1067 ret = security_mmap_addr(addr); 1068 if (ret < 0) 1069 return ret; 1070 1071 /* looks okay */ 1072 *_capabilities = capabilities; 1073 return 0; 1074 } 1075 1076 /* 1077 * we've determined that we can make the mapping, now translate what we 1078 * now know into VMA flags 1079 */ 1080 static unsigned long determine_vm_flags(struct file *file, 1081 unsigned long prot, 1082 unsigned long flags, 1083 unsigned long capabilities) 1084 { 1085 unsigned long vm_flags; 1086 1087 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); 1088 /* vm_flags |= mm->def_flags; */ 1089 1090 if (!(capabilities & NOMMU_MAP_DIRECT)) { 1091 /* attempt to share read-only copies of mapped file chunks */ 1092 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1093 if (file && !(prot & PROT_WRITE)) 1094 vm_flags |= VM_MAYSHARE; 1095 } else { 1096 /* overlay a shareable mapping on the backing device or inode 1097 * if possible - used for chardevs, ramfs/tmpfs/shmfs and 1098 * romfs/cramfs */ 1099 vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); 1100 if (flags & MAP_SHARED) 1101 vm_flags |= VM_SHARED; 1102 } 1103 1104 /* refuse to let anyone share private mappings with this process if 1105 * it's being traced - otherwise breakpoints set in it may interfere 1106 * with another untraced process 1107 */ 1108 if ((flags & MAP_PRIVATE) && current->ptrace) 1109 vm_flags &= ~VM_MAYSHARE; 1110 1111 return vm_flags; 1112 } 1113 1114 /* 1115 * set up a shared mapping on a file (the driver or filesystem provides and 1116 * pins the storage) 1117 */ 1118 static int do_mmap_shared_file(struct vm_area_struct *vma) 1119 { 1120 int ret; 1121 1122 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1123 if (ret == 0) { 1124 vma->vm_region->vm_top = vma->vm_region->vm_end; 1125 return 0; 1126 } 1127 if (ret != -ENOSYS) 1128 return ret; 1129 1130 /* getting -ENOSYS indicates that direct mmap isn't possible (as 1131 * opposed to tried but failed) so we can only give a suitable error as 1132 * it's not possible to make a private copy if MAP_SHARED was given */ 1133 return -ENODEV; 1134 } 1135 1136 /* 1137 * set up a private mapping or an anonymous shared mapping 1138 */ 1139 static int do_mmap_private(struct vm_area_struct *vma, 1140 struct vm_region *region, 1141 unsigned long len, 1142 unsigned long capabilities) 1143 { 1144 unsigned long total, point; 1145 void *base; 1146 int ret, order; 1147 1148 /* invoke the file's mapping function so that it can keep track of 1149 * shared mappings on devices or memory 1150 * - VM_MAYSHARE will be set if it may attempt to share 1151 */ 1152 if (capabilities & NOMMU_MAP_DIRECT) { 1153 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1154 if (ret == 0) { 1155 /* shouldn't return success if we're not sharing */ 1156 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1157 vma->vm_region->vm_top = vma->vm_region->vm_end; 1158 return 0; 1159 } 1160 if (ret != -ENOSYS) 1161 return ret; 1162 1163 /* getting an ENOSYS error indicates that direct mmap isn't 1164 * possible (as opposed to tried but failed) so we'll try to 1165 * make a private copy of the data and map that instead */ 1166 } 1167 1168 1169 /* allocate some memory to hold the mapping 1170 * - note that this may not return a page-aligned address if the object 1171 * we're allocating is smaller than a page 1172 */ 1173 order = get_order(len); 1174 total = 1 << order; 1175 point = len >> PAGE_SHIFT; 1176 1177 /* we don't want to allocate a power-of-2 sized page set */ 1178 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) 1179 total = point; 1180 1181 base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); 1182 if (!base) 1183 goto enomem; 1184 1185 atomic_long_add(total, &mmap_pages_allocated); 1186 1187 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1188 region->vm_start = (unsigned long) base; 1189 region->vm_end = region->vm_start + len; 1190 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1191 1192 vma->vm_start = region->vm_start; 1193 vma->vm_end = region->vm_start + len; 1194 1195 if (vma->vm_file) { 1196 /* read the contents of a file into the copy */ 1197 mm_segment_t old_fs; 1198 loff_t fpos; 1199 1200 fpos = vma->vm_pgoff; 1201 fpos <<= PAGE_SHIFT; 1202 1203 old_fs = get_fs(); 1204 set_fs(KERNEL_DS); 1205 ret = __vfs_read(vma->vm_file, base, len, &fpos); 1206 set_fs(old_fs); 1207 1208 if (ret < 0) 1209 goto error_free; 1210 1211 /* clear the last little bit */ 1212 if (ret < len) 1213 memset(base + ret, 0, len - ret); 1214 1215 } 1216 1217 return 0; 1218 1219 error_free: 1220 free_page_series(region->vm_start, region->vm_top); 1221 region->vm_start = vma->vm_start = 0; 1222 region->vm_end = vma->vm_end = 0; 1223 region->vm_top = 0; 1224 return ret; 1225 1226 enomem: 1227 pr_err("Allocation of length %lu from process %d (%s) failed\n", 1228 len, current->pid, current->comm); 1229 show_free_areas(0); 1230 return -ENOMEM; 1231 } 1232 1233 /* 1234 * handle mapping creation for uClinux 1235 */ 1236 unsigned long do_mmap(struct file *file, 1237 unsigned long addr, 1238 unsigned long len, 1239 unsigned long prot, 1240 unsigned long flags, 1241 vm_flags_t vm_flags, 1242 unsigned long pgoff, 1243 unsigned long *populate) 1244 { 1245 struct vm_area_struct *vma; 1246 struct vm_region *region; 1247 struct rb_node *rb; 1248 unsigned long capabilities, result; 1249 int ret; 1250 1251 *populate = 0; 1252 1253 /* decide whether we should attempt the mapping, and if so what sort of 1254 * mapping */ 1255 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1256 &capabilities); 1257 if (ret < 0) 1258 return ret; 1259 1260 /* we ignore the address hint */ 1261 addr = 0; 1262 len = PAGE_ALIGN(len); 1263 1264 /* we've determined that we can make the mapping, now translate what we 1265 * now know into VMA flags */ 1266 vm_flags |= determine_vm_flags(file, prot, flags, capabilities); 1267 1268 /* we're going to need to record the mapping */ 1269 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); 1270 if (!region) 1271 goto error_getting_region; 1272 1273 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1274 if (!vma) 1275 goto error_getting_vma; 1276 1277 region->vm_usage = 1; 1278 region->vm_flags = vm_flags; 1279 region->vm_pgoff = pgoff; 1280 1281 INIT_LIST_HEAD(&vma->anon_vma_chain); 1282 vma->vm_flags = vm_flags; 1283 vma->vm_pgoff = pgoff; 1284 1285 if (file) { 1286 region->vm_file = get_file(file); 1287 vma->vm_file = get_file(file); 1288 } 1289 1290 down_write(&nommu_region_sem); 1291 1292 /* if we want to share, we need to check for regions created by other 1293 * mmap() calls that overlap with our proposed mapping 1294 * - we can only share with a superset match on most regular files 1295 * - shared mappings on character devices and memory backed files are 1296 * permitted to overlap inexactly as far as we are concerned for in 1297 * these cases, sharing is handled in the driver or filesystem rather 1298 * than here 1299 */ 1300 if (vm_flags & VM_MAYSHARE) { 1301 struct vm_region *pregion; 1302 unsigned long pglen, rpglen, pgend, rpgend, start; 1303 1304 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1305 pgend = pgoff + pglen; 1306 1307 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { 1308 pregion = rb_entry(rb, struct vm_region, vm_rb); 1309 1310 if (!(pregion->vm_flags & VM_MAYSHARE)) 1311 continue; 1312 1313 /* search for overlapping mappings on the same file */ 1314 if (file_inode(pregion->vm_file) != 1315 file_inode(file)) 1316 continue; 1317 1318 if (pregion->vm_pgoff >= pgend) 1319 continue; 1320 1321 rpglen = pregion->vm_end - pregion->vm_start; 1322 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; 1323 rpgend = pregion->vm_pgoff + rpglen; 1324 if (pgoff >= rpgend) 1325 continue; 1326 1327 /* handle inexactly overlapping matches between 1328 * mappings */ 1329 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && 1330 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { 1331 /* new mapping is not a subset of the region */ 1332 if (!(capabilities & NOMMU_MAP_DIRECT)) 1333 goto sharing_violation; 1334 continue; 1335 } 1336 1337 /* we've found a region we can share */ 1338 pregion->vm_usage++; 1339 vma->vm_region = pregion; 1340 start = pregion->vm_start; 1341 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1342 vma->vm_start = start; 1343 vma->vm_end = start + len; 1344 1345 if (pregion->vm_flags & VM_MAPPED_COPY) 1346 vma->vm_flags |= VM_MAPPED_COPY; 1347 else { 1348 ret = do_mmap_shared_file(vma); 1349 if (ret < 0) { 1350 vma->vm_region = NULL; 1351 vma->vm_start = 0; 1352 vma->vm_end = 0; 1353 pregion->vm_usage--; 1354 pregion = NULL; 1355 goto error_just_free; 1356 } 1357 } 1358 fput(region->vm_file); 1359 kmem_cache_free(vm_region_jar, region); 1360 region = pregion; 1361 result = start; 1362 goto share; 1363 } 1364 1365 /* obtain the address at which to make a shared mapping 1366 * - this is the hook for quasi-memory character devices to 1367 * tell us the location of a shared mapping 1368 */ 1369 if (capabilities & NOMMU_MAP_DIRECT) { 1370 addr = file->f_op->get_unmapped_area(file, addr, len, 1371 pgoff, flags); 1372 if (IS_ERR_VALUE(addr)) { 1373 ret = addr; 1374 if (ret != -ENOSYS) 1375 goto error_just_free; 1376 1377 /* the driver refused to tell us where to site 1378 * the mapping so we'll have to attempt to copy 1379 * it */ 1380 ret = -ENODEV; 1381 if (!(capabilities & NOMMU_MAP_COPY)) 1382 goto error_just_free; 1383 1384 capabilities &= ~NOMMU_MAP_DIRECT; 1385 } else { 1386 vma->vm_start = region->vm_start = addr; 1387 vma->vm_end = region->vm_end = addr + len; 1388 } 1389 } 1390 } 1391 1392 vma->vm_region = region; 1393 1394 /* set up the mapping 1395 * - the region is filled in if NOMMU_MAP_DIRECT is still set 1396 */ 1397 if (file && vma->vm_flags & VM_SHARED) 1398 ret = do_mmap_shared_file(vma); 1399 else 1400 ret = do_mmap_private(vma, region, len, capabilities); 1401 if (ret < 0) 1402 goto error_just_free; 1403 add_nommu_region(region); 1404 1405 /* clear anonymous mappings that don't ask for uninitialized data */ 1406 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) 1407 memset((void *)region->vm_start, 0, 1408 region->vm_end - region->vm_start); 1409 1410 /* okay... we have a mapping; now we have to register it */ 1411 result = vma->vm_start; 1412 1413 current->mm->total_vm += len >> PAGE_SHIFT; 1414 1415 share: 1416 add_vma_to_mm(current->mm, vma); 1417 1418 /* we flush the region from the icache only when the first executable 1419 * mapping of it is made */ 1420 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { 1421 flush_icache_range(region->vm_start, region->vm_end); 1422 region->vm_icache_flushed = true; 1423 } 1424 1425 up_write(&nommu_region_sem); 1426 1427 return result; 1428 1429 error_just_free: 1430 up_write(&nommu_region_sem); 1431 error: 1432 if (region->vm_file) 1433 fput(region->vm_file); 1434 kmem_cache_free(vm_region_jar, region); 1435 if (vma->vm_file) 1436 fput(vma->vm_file); 1437 kmem_cache_free(vm_area_cachep, vma); 1438 return ret; 1439 1440 sharing_violation: 1441 up_write(&nommu_region_sem); 1442 pr_warn("Attempt to share mismatched mappings\n"); 1443 ret = -EINVAL; 1444 goto error; 1445 1446 error_getting_vma: 1447 kmem_cache_free(vm_region_jar, region); 1448 pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", 1449 len, current->pid); 1450 show_free_areas(0); 1451 return -ENOMEM; 1452 1453 error_getting_region: 1454 pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", 1455 len, current->pid); 1456 show_free_areas(0); 1457 return -ENOMEM; 1458 } 1459 1460 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1461 unsigned long, prot, unsigned long, flags, 1462 unsigned long, fd, unsigned long, pgoff) 1463 { 1464 struct file *file = NULL; 1465 unsigned long retval = -EBADF; 1466 1467 audit_mmap_fd(fd, flags); 1468 if (!(flags & MAP_ANONYMOUS)) { 1469 file = fget(fd); 1470 if (!file) 1471 goto out; 1472 } 1473 1474 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1475 1476 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1477 1478 if (file) 1479 fput(file); 1480 out: 1481 return retval; 1482 } 1483 1484 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1485 struct mmap_arg_struct { 1486 unsigned long addr; 1487 unsigned long len; 1488 unsigned long prot; 1489 unsigned long flags; 1490 unsigned long fd; 1491 unsigned long offset; 1492 }; 1493 1494 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1495 { 1496 struct mmap_arg_struct a; 1497 1498 if (copy_from_user(&a, arg, sizeof(a))) 1499 return -EFAULT; 1500 if (offset_in_page(a.offset)) 1501 return -EINVAL; 1502 1503 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1504 a.offset >> PAGE_SHIFT); 1505 } 1506 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1507 1508 /* 1509 * split a vma into two pieces at address 'addr', a new vma is allocated either 1510 * for the first part or the tail. 1511 */ 1512 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 1513 unsigned long addr, int new_below) 1514 { 1515 struct vm_area_struct *new; 1516 struct vm_region *region; 1517 unsigned long npages; 1518 1519 /* we're only permitted to split anonymous regions (these should have 1520 * only a single usage on the region) */ 1521 if (vma->vm_file) 1522 return -ENOMEM; 1523 1524 if (mm->map_count >= sysctl_max_map_count) 1525 return -ENOMEM; 1526 1527 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); 1528 if (!region) 1529 return -ENOMEM; 1530 1531 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1532 if (!new) { 1533 kmem_cache_free(vm_region_jar, region); 1534 return -ENOMEM; 1535 } 1536 1537 /* most fields are the same, copy all, and then fixup */ 1538 *new = *vma; 1539 *region = *vma->vm_region; 1540 new->vm_region = region; 1541 1542 npages = (addr - vma->vm_start) >> PAGE_SHIFT; 1543 1544 if (new_below) { 1545 region->vm_top = region->vm_end = new->vm_end = addr; 1546 } else { 1547 region->vm_start = new->vm_start = addr; 1548 region->vm_pgoff = new->vm_pgoff += npages; 1549 } 1550 1551 if (new->vm_ops && new->vm_ops->open) 1552 new->vm_ops->open(new); 1553 1554 delete_vma_from_mm(vma); 1555 down_write(&nommu_region_sem); 1556 delete_nommu_region(vma->vm_region); 1557 if (new_below) { 1558 vma->vm_region->vm_start = vma->vm_start = addr; 1559 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; 1560 } else { 1561 vma->vm_region->vm_end = vma->vm_end = addr; 1562 vma->vm_region->vm_top = addr; 1563 } 1564 add_nommu_region(vma->vm_region); 1565 add_nommu_region(new->vm_region); 1566 up_write(&nommu_region_sem); 1567 add_vma_to_mm(mm, vma); 1568 add_vma_to_mm(mm, new); 1569 return 0; 1570 } 1571 1572 /* 1573 * shrink a VMA by removing the specified chunk from either the beginning or 1574 * the end 1575 */ 1576 static int shrink_vma(struct mm_struct *mm, 1577 struct vm_area_struct *vma, 1578 unsigned long from, unsigned long to) 1579 { 1580 struct vm_region *region; 1581 1582 /* adjust the VMA's pointers, which may reposition it in the MM's tree 1583 * and list */ 1584 delete_vma_from_mm(vma); 1585 if (from > vma->vm_start) 1586 vma->vm_end = from; 1587 else 1588 vma->vm_start = to; 1589 add_vma_to_mm(mm, vma); 1590 1591 /* cut the backing region down to size */ 1592 region = vma->vm_region; 1593 BUG_ON(region->vm_usage != 1); 1594 1595 down_write(&nommu_region_sem); 1596 delete_nommu_region(region); 1597 if (from > region->vm_start) { 1598 to = region->vm_top; 1599 region->vm_top = region->vm_end = from; 1600 } else { 1601 region->vm_start = to; 1602 } 1603 add_nommu_region(region); 1604 up_write(&nommu_region_sem); 1605 1606 free_page_series(from, to); 1607 return 0; 1608 } 1609 1610 /* 1611 * release a mapping 1612 * - under NOMMU conditions the chunk to be unmapped must be backed by a single 1613 * VMA, though it need not cover the whole VMA 1614 */ 1615 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1616 { 1617 struct vm_area_struct *vma; 1618 unsigned long end; 1619 int ret; 1620 1621 len = PAGE_ALIGN(len); 1622 if (len == 0) 1623 return -EINVAL; 1624 1625 end = start + len; 1626 1627 /* find the first potentially overlapping VMA */ 1628 vma = find_vma(mm, start); 1629 if (!vma) { 1630 static int limit; 1631 if (limit < 5) { 1632 pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", 1633 current->pid, current->comm, 1634 start, start + len - 1); 1635 limit++; 1636 } 1637 return -EINVAL; 1638 } 1639 1640 /* we're allowed to split an anonymous VMA but not a file-backed one */ 1641 if (vma->vm_file) { 1642 do { 1643 if (start > vma->vm_start) 1644 return -EINVAL; 1645 if (end == vma->vm_end) 1646 goto erase_whole_vma; 1647 vma = vma->vm_next; 1648 } while (vma); 1649 return -EINVAL; 1650 } else { 1651 /* the chunk must be a subset of the VMA found */ 1652 if (start == vma->vm_start && end == vma->vm_end) 1653 goto erase_whole_vma; 1654 if (start < vma->vm_start || end > vma->vm_end) 1655 return -EINVAL; 1656 if (offset_in_page(start)) 1657 return -EINVAL; 1658 if (end != vma->vm_end && offset_in_page(end)) 1659 return -EINVAL; 1660 if (start != vma->vm_start && end != vma->vm_end) { 1661 ret = split_vma(mm, vma, start, 1); 1662 if (ret < 0) 1663 return ret; 1664 } 1665 return shrink_vma(mm, vma, start, end); 1666 } 1667 1668 erase_whole_vma: 1669 delete_vma_from_mm(vma); 1670 delete_vma(mm, vma); 1671 return 0; 1672 } 1673 EXPORT_SYMBOL(do_munmap); 1674 1675 int vm_munmap(unsigned long addr, size_t len) 1676 { 1677 struct mm_struct *mm = current->mm; 1678 int ret; 1679 1680 down_write(&mm->mmap_sem); 1681 ret = do_munmap(mm, addr, len); 1682 up_write(&mm->mmap_sem); 1683 return ret; 1684 } 1685 EXPORT_SYMBOL(vm_munmap); 1686 1687 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 1688 { 1689 return vm_munmap(addr, len); 1690 } 1691 1692 /* 1693 * release all the mappings made in a process's VM space 1694 */ 1695 void exit_mmap(struct mm_struct *mm) 1696 { 1697 struct vm_area_struct *vma; 1698 1699 if (!mm) 1700 return; 1701 1702 mm->total_vm = 0; 1703 1704 while ((vma = mm->mmap)) { 1705 mm->mmap = vma->vm_next; 1706 delete_vma_from_mm(vma); 1707 delete_vma(mm, vma); 1708 cond_resched(); 1709 } 1710 } 1711 1712 unsigned long vm_brk(unsigned long addr, unsigned long len) 1713 { 1714 return -ENOMEM; 1715 } 1716 1717 /* 1718 * expand (or shrink) an existing mapping, potentially moving it at the same 1719 * time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1720 * 1721 * under NOMMU conditions, we only permit changing a mapping's size, and only 1722 * as long as it stays within the region allocated by do_mmap_private() and the 1723 * block is not shareable 1724 * 1725 * MREMAP_FIXED is not supported under NOMMU conditions 1726 */ 1727 static unsigned long do_mremap(unsigned long addr, 1728 unsigned long old_len, unsigned long new_len, 1729 unsigned long flags, unsigned long new_addr) 1730 { 1731 struct vm_area_struct *vma; 1732 1733 /* insanity checks first */ 1734 old_len = PAGE_ALIGN(old_len); 1735 new_len = PAGE_ALIGN(new_len); 1736 if (old_len == 0 || new_len == 0) 1737 return (unsigned long) -EINVAL; 1738 1739 if (offset_in_page(addr)) 1740 return -EINVAL; 1741 1742 if (flags & MREMAP_FIXED && new_addr != addr) 1743 return (unsigned long) -EINVAL; 1744 1745 vma = find_vma_exact(current->mm, addr, old_len); 1746 if (!vma) 1747 return (unsigned long) -EINVAL; 1748 1749 if (vma->vm_end != vma->vm_start + old_len) 1750 return (unsigned long) -EFAULT; 1751 1752 if (vma->vm_flags & VM_MAYSHARE) 1753 return (unsigned long) -EPERM; 1754 1755 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) 1756 return (unsigned long) -ENOMEM; 1757 1758 /* all checks complete - do it */ 1759 vma->vm_end = vma->vm_start + new_len; 1760 return vma->vm_start; 1761 } 1762 1763 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, 1764 unsigned long, new_len, unsigned long, flags, 1765 unsigned long, new_addr) 1766 { 1767 unsigned long ret; 1768 1769 down_write(¤t->mm->mmap_sem); 1770 ret = do_mremap(addr, old_len, new_len, flags, new_addr); 1771 up_write(¤t->mm->mmap_sem); 1772 return ret; 1773 } 1774 1775 struct page *follow_page_mask(struct vm_area_struct *vma, 1776 unsigned long address, unsigned int flags, 1777 unsigned int *page_mask) 1778 { 1779 *page_mask = 0; 1780 return NULL; 1781 } 1782 1783 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1784 unsigned long pfn, unsigned long size, pgprot_t prot) 1785 { 1786 if (addr != (pfn << PAGE_SHIFT)) 1787 return -EINVAL; 1788 1789 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1790 return 0; 1791 } 1792 EXPORT_SYMBOL(remap_pfn_range); 1793 1794 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 1795 { 1796 unsigned long pfn = start >> PAGE_SHIFT; 1797 unsigned long vm_len = vma->vm_end - vma->vm_start; 1798 1799 pfn += vma->vm_pgoff; 1800 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); 1801 } 1802 EXPORT_SYMBOL(vm_iomap_memory); 1803 1804 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 1805 unsigned long pgoff) 1806 { 1807 unsigned int size = vma->vm_end - vma->vm_start; 1808 1809 if (!(vma->vm_flags & VM_USERMAP)) 1810 return -EINVAL; 1811 1812 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); 1813 vma->vm_end = vma->vm_start + size; 1814 1815 return 0; 1816 } 1817 EXPORT_SYMBOL(remap_vmalloc_range); 1818 1819 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1820 unsigned long len, unsigned long pgoff, unsigned long flags) 1821 { 1822 return -ENOMEM; 1823 } 1824 1825 void unmap_mapping_range(struct address_space *mapping, 1826 loff_t const holebegin, loff_t const holelen, 1827 int even_cows) 1828 { 1829 } 1830 EXPORT_SYMBOL(unmap_mapping_range); 1831 1832 /* 1833 * Check that a process has enough memory to allocate a new virtual 1834 * mapping. 0 means there is enough memory for the allocation to 1835 * succeed and -ENOMEM implies there is not. 1836 * 1837 * We currently support three overcommit policies, which are set via the 1838 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 1839 * 1840 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 1841 * Additional code 2002 Jul 20 by Robert Love. 1842 * 1843 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 1844 * 1845 * Note this is a helper function intended to be used by LSMs which 1846 * wish to use this logic. 1847 */ 1848 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1849 { 1850 long free, allowed, reserve; 1851 1852 vm_acct_memory(pages); 1853 1854 /* 1855 * Sometimes we want to use more memory than we have 1856 */ 1857 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 1858 return 0; 1859 1860 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1861 free = global_page_state(NR_FREE_PAGES); 1862 free += global_page_state(NR_FILE_PAGES); 1863 1864 /* 1865 * shmem pages shouldn't be counted as free in this 1866 * case, they can't be purged, only swapped out, and 1867 * that won't affect the overall amount of available 1868 * memory in the system. 1869 */ 1870 free -= global_page_state(NR_SHMEM); 1871 1872 free += get_nr_swap_pages(); 1873 1874 /* 1875 * Any slabs which are created with the 1876 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 1877 * which are reclaimable, under pressure. The dentry 1878 * cache and most inode caches should fall into this 1879 */ 1880 free += global_page_state(NR_SLAB_RECLAIMABLE); 1881 1882 /* 1883 * Leave reserved pages. The pages are not for anonymous pages. 1884 */ 1885 if (free <= totalreserve_pages) 1886 goto error; 1887 else 1888 free -= totalreserve_pages; 1889 1890 /* 1891 * Reserve some for root 1892 */ 1893 if (!cap_sys_admin) 1894 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 1895 1896 if (free > pages) 1897 return 0; 1898 1899 goto error; 1900 } 1901 1902 allowed = vm_commit_limit(); 1903 /* 1904 * Reserve some 3% for root 1905 */ 1906 if (!cap_sys_admin) 1907 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 1908 1909 /* 1910 * Don't let a single process grow so big a user can't recover 1911 */ 1912 if (mm) { 1913 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 1914 allowed -= min_t(long, mm->total_vm / 32, reserve); 1915 } 1916 1917 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1918 return 0; 1919 1920 error: 1921 vm_unacct_memory(pages); 1922 1923 return -ENOMEM; 1924 } 1925 1926 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1927 { 1928 BUG(); 1929 return 0; 1930 } 1931 EXPORT_SYMBOL(filemap_fault); 1932 1933 void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 1934 { 1935 BUG(); 1936 } 1937 EXPORT_SYMBOL(filemap_map_pages); 1938 1939 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1940 unsigned long addr, void *buf, int len, int write) 1941 { 1942 struct vm_area_struct *vma; 1943 1944 down_read(&mm->mmap_sem); 1945 1946 /* the access must start within one of the target process's mappings */ 1947 vma = find_vma(mm, addr); 1948 if (vma) { 1949 /* don't overrun this mapping */ 1950 if (addr + len >= vma->vm_end) 1951 len = vma->vm_end - addr; 1952 1953 /* only read or write mappings where it is permitted */ 1954 if (write && vma->vm_flags & VM_MAYWRITE) 1955 copy_to_user_page(vma, NULL, addr, 1956 (void *) addr, buf, len); 1957 else if (!write && vma->vm_flags & VM_MAYREAD) 1958 copy_from_user_page(vma, NULL, addr, 1959 buf, (void *) addr, len); 1960 else 1961 len = 0; 1962 } else { 1963 len = 0; 1964 } 1965 1966 up_read(&mm->mmap_sem); 1967 1968 return len; 1969 } 1970 1971 /** 1972 * @access_remote_vm - access another process' address space 1973 * @mm: the mm_struct of the target address space 1974 * @addr: start address to access 1975 * @buf: source or destination buffer 1976 * @len: number of bytes to transfer 1977 * @write: whether the access is a write 1978 * 1979 * The caller must hold a reference on @mm. 1980 */ 1981 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 1982 void *buf, int len, int write) 1983 { 1984 return __access_remote_vm(NULL, mm, addr, buf, len, write); 1985 } 1986 1987 /* 1988 * Access another process' address space. 1989 * - source/target buffer must be kernel space 1990 */ 1991 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 1992 { 1993 struct mm_struct *mm; 1994 1995 if (addr + len < addr) 1996 return 0; 1997 1998 mm = get_task_mm(tsk); 1999 if (!mm) 2000 return 0; 2001 2002 len = __access_remote_vm(tsk, mm, addr, buf, len, write); 2003 2004 mmput(mm); 2005 return len; 2006 } 2007 2008 /** 2009 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode 2010 * @inode: The inode to check 2011 * @size: The current filesize of the inode 2012 * @newsize: The proposed filesize of the inode 2013 * 2014 * Check the shared mappings on an inode on behalf of a shrinking truncate to 2015 * make sure that that any outstanding VMAs aren't broken and then shrink the 2016 * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't 2017 * automatically grant mappings that are too large. 2018 */ 2019 int nommu_shrink_inode_mappings(struct inode *inode, size_t size, 2020 size_t newsize) 2021 { 2022 struct vm_area_struct *vma; 2023 struct vm_region *region; 2024 pgoff_t low, high; 2025 size_t r_size, r_top; 2026 2027 low = newsize >> PAGE_SHIFT; 2028 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2029 2030 down_write(&nommu_region_sem); 2031 i_mmap_lock_read(inode->i_mapping); 2032 2033 /* search for VMAs that fall within the dead zone */ 2034 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { 2035 /* found one - only interested if it's shared out of the page 2036 * cache */ 2037 if (vma->vm_flags & VM_SHARED) { 2038 i_mmap_unlock_read(inode->i_mapping); 2039 up_write(&nommu_region_sem); 2040 return -ETXTBSY; /* not quite true, but near enough */ 2041 } 2042 } 2043 2044 /* reduce any regions that overlap the dead zone - if in existence, 2045 * these will be pointed to by VMAs that don't overlap the dead zone 2046 * 2047 * we don't check for any regions that start beyond the EOF as there 2048 * shouldn't be any 2049 */ 2050 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { 2051 if (!(vma->vm_flags & VM_SHARED)) 2052 continue; 2053 2054 region = vma->vm_region; 2055 r_size = region->vm_top - region->vm_start; 2056 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; 2057 2058 if (r_top > newsize) { 2059 region->vm_top -= r_top - newsize; 2060 if (region->vm_end > region->vm_top) 2061 region->vm_end = region->vm_top; 2062 } 2063 } 2064 2065 i_mmap_unlock_read(inode->i_mapping); 2066 up_write(&nommu_region_sem); 2067 return 0; 2068 } 2069 2070 /* 2071 * Initialise sysctl_user_reserve_kbytes. 2072 * 2073 * This is intended to prevent a user from starting a single memory hogging 2074 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER 2075 * mode. 2076 * 2077 * The default value is min(3% of free memory, 128MB) 2078 * 128MB is enough to recover with sshd/login, bash, and top/kill. 2079 */ 2080 static int __meminit init_user_reserve(void) 2081 { 2082 unsigned long free_kbytes; 2083 2084 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 2085 2086 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 2087 return 0; 2088 } 2089 subsys_initcall(init_user_reserve); 2090 2091 /* 2092 * Initialise sysctl_admin_reserve_kbytes. 2093 * 2094 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin 2095 * to log in and kill a memory hogging process. 2096 * 2097 * Systems with more than 256MB will reserve 8MB, enough to recover 2098 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will 2099 * only reserve 3% of free pages by default. 2100 */ 2101 static int __meminit init_admin_reserve(void) 2102 { 2103 unsigned long free_kbytes; 2104 2105 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 2106 2107 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 2108 return 0; 2109 } 2110 subsys_initcall(init_admin_reserve); 2111