1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/uio.h> 22 #include <linux/ksm.h> 23 #include <linux/fs.h> 24 #include <linux/file.h> 25 #include <linux/blkdev.h> 26 #include <linux/backing-dev.h> 27 #include <linux/pagewalk.h> 28 #include <linux/swap.h> 29 #include <linux/swapops.h> 30 #include <linux/shmem_fs.h> 31 #include <linux/mmu_notifier.h> 32 33 #include <asm/tlb.h> 34 35 #include "internal.h" 36 37 struct madvise_walk_private { 38 struct mmu_gather *tlb; 39 bool pageout; 40 }; 41 42 /* 43 * Any behaviour which results in changes to the vma->vm_flags needs to 44 * take mmap_lock for writing. Others, which simply traverse vmas, need 45 * to only take it for reading. 46 */ 47 static int madvise_need_mmap_write(int behavior) 48 { 49 switch (behavior) { 50 case MADV_REMOVE: 51 case MADV_WILLNEED: 52 case MADV_DONTNEED: 53 case MADV_COLD: 54 case MADV_PAGEOUT: 55 case MADV_FREE: 56 return 0; 57 default: 58 /* be safe, default to 1. list exceptions explicitly */ 59 return 1; 60 } 61 } 62 63 /* 64 * We can potentially split a vm area into separate 65 * areas, each area with its own behavior. 66 */ 67 static long madvise_behavior(struct vm_area_struct *vma, 68 struct vm_area_struct **prev, 69 unsigned long start, unsigned long end, int behavior) 70 { 71 struct mm_struct *mm = vma->vm_mm; 72 int error = 0; 73 pgoff_t pgoff; 74 unsigned long new_flags = vma->vm_flags; 75 76 switch (behavior) { 77 case MADV_NORMAL: 78 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 79 break; 80 case MADV_SEQUENTIAL: 81 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 82 break; 83 case MADV_RANDOM: 84 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 85 break; 86 case MADV_DONTFORK: 87 new_flags |= VM_DONTCOPY; 88 break; 89 case MADV_DOFORK: 90 if (vma->vm_flags & VM_IO) { 91 error = -EINVAL; 92 goto out; 93 } 94 new_flags &= ~VM_DONTCOPY; 95 break; 96 case MADV_WIPEONFORK: 97 /* MADV_WIPEONFORK is only supported on anonymous memory. */ 98 if (vma->vm_file || vma->vm_flags & VM_SHARED) { 99 error = -EINVAL; 100 goto out; 101 } 102 new_flags |= VM_WIPEONFORK; 103 break; 104 case MADV_KEEPONFORK: 105 new_flags &= ~VM_WIPEONFORK; 106 break; 107 case MADV_DONTDUMP: 108 new_flags |= VM_DONTDUMP; 109 break; 110 case MADV_DODUMP: 111 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { 112 error = -EINVAL; 113 goto out; 114 } 115 new_flags &= ~VM_DONTDUMP; 116 break; 117 case MADV_MERGEABLE: 118 case MADV_UNMERGEABLE: 119 error = ksm_madvise(vma, start, end, behavior, &new_flags); 120 if (error) 121 goto out_convert_errno; 122 break; 123 case MADV_HUGEPAGE: 124 case MADV_NOHUGEPAGE: 125 error = hugepage_madvise(vma, &new_flags, behavior); 126 if (error) 127 goto out_convert_errno; 128 break; 129 } 130 131 if (new_flags == vma->vm_flags) { 132 *prev = vma; 133 goto out; 134 } 135 136 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 137 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 138 vma->vm_file, pgoff, vma_policy(vma), 139 vma->vm_userfaultfd_ctx); 140 if (*prev) { 141 vma = *prev; 142 goto success; 143 } 144 145 *prev = vma; 146 147 if (start != vma->vm_start) { 148 if (unlikely(mm->map_count >= sysctl_max_map_count)) { 149 error = -ENOMEM; 150 goto out; 151 } 152 error = __split_vma(mm, vma, start, 1); 153 if (error) 154 goto out_convert_errno; 155 } 156 157 if (end != vma->vm_end) { 158 if (unlikely(mm->map_count >= sysctl_max_map_count)) { 159 error = -ENOMEM; 160 goto out; 161 } 162 error = __split_vma(mm, vma, end, 0); 163 if (error) 164 goto out_convert_errno; 165 } 166 167 success: 168 /* 169 * vm_flags is protected by the mmap_lock held in write mode. 170 */ 171 vma->vm_flags = new_flags; 172 173 out_convert_errno: 174 /* 175 * madvise() returns EAGAIN if kernel resources, such as 176 * slab, are temporarily unavailable. 177 */ 178 if (error == -ENOMEM) 179 error = -EAGAIN; 180 out: 181 return error; 182 } 183 184 #ifdef CONFIG_SWAP 185 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 186 unsigned long end, struct mm_walk *walk) 187 { 188 pte_t *orig_pte; 189 struct vm_area_struct *vma = walk->private; 190 unsigned long index; 191 192 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 193 return 0; 194 195 for (index = start; index != end; index += PAGE_SIZE) { 196 pte_t pte; 197 swp_entry_t entry; 198 struct page *page; 199 spinlock_t *ptl; 200 201 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 202 pte = *(orig_pte + ((index - start) / PAGE_SIZE)); 203 pte_unmap_unlock(orig_pte, ptl); 204 205 if (pte_present(pte) || pte_none(pte)) 206 continue; 207 entry = pte_to_swp_entry(pte); 208 if (unlikely(non_swap_entry(entry))) 209 continue; 210 211 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 212 vma, index, false); 213 if (page) 214 put_page(page); 215 } 216 217 return 0; 218 } 219 220 static const struct mm_walk_ops swapin_walk_ops = { 221 .pmd_entry = swapin_walk_pmd_entry, 222 }; 223 224 static void force_shm_swapin_readahead(struct vm_area_struct *vma, 225 unsigned long start, unsigned long end, 226 struct address_space *mapping) 227 { 228 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 229 pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); 230 struct page *page; 231 232 rcu_read_lock(); 233 xas_for_each(&xas, page, end_index) { 234 swp_entry_t swap; 235 236 if (!xa_is_value(page)) 237 continue; 238 xas_pause(&xas); 239 rcu_read_unlock(); 240 241 swap = radix_to_swp_entry(page); 242 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, 243 NULL, 0, false); 244 if (page) 245 put_page(page); 246 247 rcu_read_lock(); 248 } 249 rcu_read_unlock(); 250 251 lru_add_drain(); /* Push any new pages onto the LRU now */ 252 } 253 #endif /* CONFIG_SWAP */ 254 255 /* 256 * Schedule all required I/O operations. Do not wait for completion. 257 */ 258 static long madvise_willneed(struct vm_area_struct *vma, 259 struct vm_area_struct **prev, 260 unsigned long start, unsigned long end) 261 { 262 struct mm_struct *mm = vma->vm_mm; 263 struct file *file = vma->vm_file; 264 loff_t offset; 265 266 *prev = vma; 267 #ifdef CONFIG_SWAP 268 if (!file) { 269 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); 270 lru_add_drain(); /* Push any new pages onto the LRU now */ 271 return 0; 272 } 273 274 if (shmem_mapping(file->f_mapping)) { 275 force_shm_swapin_readahead(vma, start, end, 276 file->f_mapping); 277 return 0; 278 } 279 #else 280 if (!file) 281 return -EBADF; 282 #endif 283 284 if (IS_DAX(file_inode(file))) { 285 /* no bad return value, but ignore advice */ 286 return 0; 287 } 288 289 /* 290 * Filesystem's fadvise may need to take various locks. We need to 291 * explicitly grab a reference because the vma (and hence the 292 * vma's reference to the file) can go away as soon as we drop 293 * mmap_lock. 294 */ 295 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 296 get_file(file); 297 offset = (loff_t)(start - vma->vm_start) 298 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 299 mmap_read_unlock(mm); 300 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 301 fput(file); 302 mmap_read_lock(mm); 303 return 0; 304 } 305 306 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 307 unsigned long addr, unsigned long end, 308 struct mm_walk *walk) 309 { 310 struct madvise_walk_private *private = walk->private; 311 struct mmu_gather *tlb = private->tlb; 312 bool pageout = private->pageout; 313 struct mm_struct *mm = tlb->mm; 314 struct vm_area_struct *vma = walk->vma; 315 pte_t *orig_pte, *pte, ptent; 316 spinlock_t *ptl; 317 struct page *page = NULL; 318 LIST_HEAD(page_list); 319 320 if (fatal_signal_pending(current)) 321 return -EINTR; 322 323 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 324 if (pmd_trans_huge(*pmd)) { 325 pmd_t orig_pmd; 326 unsigned long next = pmd_addr_end(addr, end); 327 328 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 329 ptl = pmd_trans_huge_lock(pmd, vma); 330 if (!ptl) 331 return 0; 332 333 orig_pmd = *pmd; 334 if (is_huge_zero_pmd(orig_pmd)) 335 goto huge_unlock; 336 337 if (unlikely(!pmd_present(orig_pmd))) { 338 VM_BUG_ON(thp_migration_supported() && 339 !is_pmd_migration_entry(orig_pmd)); 340 goto huge_unlock; 341 } 342 343 page = pmd_page(orig_pmd); 344 345 /* Do not interfere with other mappings of this page */ 346 if (page_mapcount(page) != 1) 347 goto huge_unlock; 348 349 if (next - addr != HPAGE_PMD_SIZE) { 350 int err; 351 352 get_page(page); 353 spin_unlock(ptl); 354 lock_page(page); 355 err = split_huge_page(page); 356 unlock_page(page); 357 put_page(page); 358 if (!err) 359 goto regular_page; 360 return 0; 361 } 362 363 if (pmd_young(orig_pmd)) { 364 pmdp_invalidate(vma, addr, pmd); 365 orig_pmd = pmd_mkold(orig_pmd); 366 367 set_pmd_at(mm, addr, pmd, orig_pmd); 368 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 369 } 370 371 ClearPageReferenced(page); 372 test_and_clear_page_young(page); 373 if (pageout) { 374 if (!isolate_lru_page(page)) { 375 if (PageUnevictable(page)) 376 putback_lru_page(page); 377 else 378 list_add(&page->lru, &page_list); 379 } 380 } else 381 deactivate_page(page); 382 huge_unlock: 383 spin_unlock(ptl); 384 if (pageout) 385 reclaim_pages(&page_list); 386 return 0; 387 } 388 389 regular_page: 390 if (pmd_trans_unstable(pmd)) 391 return 0; 392 #endif 393 tlb_change_page_size(tlb, PAGE_SIZE); 394 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 395 flush_tlb_batched_pending(mm); 396 arch_enter_lazy_mmu_mode(); 397 for (; addr < end; pte++, addr += PAGE_SIZE) { 398 ptent = *pte; 399 400 if (pte_none(ptent)) 401 continue; 402 403 if (!pte_present(ptent)) 404 continue; 405 406 page = vm_normal_page(vma, addr, ptent); 407 if (!page) 408 continue; 409 410 /* 411 * Creating a THP page is expensive so split it only if we 412 * are sure it's worth. Split it if we are only owner. 413 */ 414 if (PageTransCompound(page)) { 415 if (page_mapcount(page) != 1) 416 break; 417 get_page(page); 418 if (!trylock_page(page)) { 419 put_page(page); 420 break; 421 } 422 pte_unmap_unlock(orig_pte, ptl); 423 if (split_huge_page(page)) { 424 unlock_page(page); 425 put_page(page); 426 pte_offset_map_lock(mm, pmd, addr, &ptl); 427 break; 428 } 429 unlock_page(page); 430 put_page(page); 431 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 432 pte--; 433 addr -= PAGE_SIZE; 434 continue; 435 } 436 437 /* Do not interfere with other mappings of this page */ 438 if (page_mapcount(page) != 1) 439 continue; 440 441 VM_BUG_ON_PAGE(PageTransCompound(page), page); 442 443 if (pte_young(ptent)) { 444 ptent = ptep_get_and_clear_full(mm, addr, pte, 445 tlb->fullmm); 446 ptent = pte_mkold(ptent); 447 set_pte_at(mm, addr, pte, ptent); 448 tlb_remove_tlb_entry(tlb, pte, addr); 449 } 450 451 /* 452 * We are deactivating a page for accelerating reclaiming. 453 * VM couldn't reclaim the page unless we clear PG_young. 454 * As a side effect, it makes confuse idle-page tracking 455 * because they will miss recent referenced history. 456 */ 457 ClearPageReferenced(page); 458 test_and_clear_page_young(page); 459 if (pageout) { 460 if (!isolate_lru_page(page)) { 461 if (PageUnevictable(page)) 462 putback_lru_page(page); 463 else 464 list_add(&page->lru, &page_list); 465 } 466 } else 467 deactivate_page(page); 468 } 469 470 arch_leave_lazy_mmu_mode(); 471 pte_unmap_unlock(orig_pte, ptl); 472 if (pageout) 473 reclaim_pages(&page_list); 474 cond_resched(); 475 476 return 0; 477 } 478 479 static const struct mm_walk_ops cold_walk_ops = { 480 .pmd_entry = madvise_cold_or_pageout_pte_range, 481 }; 482 483 static void madvise_cold_page_range(struct mmu_gather *tlb, 484 struct vm_area_struct *vma, 485 unsigned long addr, unsigned long end) 486 { 487 struct madvise_walk_private walk_private = { 488 .pageout = false, 489 .tlb = tlb, 490 }; 491 492 tlb_start_vma(tlb, vma); 493 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 494 tlb_end_vma(tlb, vma); 495 } 496 497 static long madvise_cold(struct vm_area_struct *vma, 498 struct vm_area_struct **prev, 499 unsigned long start_addr, unsigned long end_addr) 500 { 501 struct mm_struct *mm = vma->vm_mm; 502 struct mmu_gather tlb; 503 504 *prev = vma; 505 if (!can_madv_lru_vma(vma)) 506 return -EINVAL; 507 508 lru_add_drain(); 509 tlb_gather_mmu(&tlb, mm); 510 madvise_cold_page_range(&tlb, vma, start_addr, end_addr); 511 tlb_finish_mmu(&tlb); 512 513 return 0; 514 } 515 516 static void madvise_pageout_page_range(struct mmu_gather *tlb, 517 struct vm_area_struct *vma, 518 unsigned long addr, unsigned long end) 519 { 520 struct madvise_walk_private walk_private = { 521 .pageout = true, 522 .tlb = tlb, 523 }; 524 525 tlb_start_vma(tlb, vma); 526 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 527 tlb_end_vma(tlb, vma); 528 } 529 530 static inline bool can_do_pageout(struct vm_area_struct *vma) 531 { 532 if (vma_is_anonymous(vma)) 533 return true; 534 if (!vma->vm_file) 535 return false; 536 /* 537 * paging out pagecache only for non-anonymous mappings that correspond 538 * to the files the calling process could (if tried) open for writing; 539 * otherwise we'd be including shared non-exclusive mappings, which 540 * opens a side channel. 541 */ 542 return inode_owner_or_capable(&init_user_ns, 543 file_inode(vma->vm_file)) || 544 file_permission(vma->vm_file, MAY_WRITE) == 0; 545 } 546 547 static long madvise_pageout(struct vm_area_struct *vma, 548 struct vm_area_struct **prev, 549 unsigned long start_addr, unsigned long end_addr) 550 { 551 struct mm_struct *mm = vma->vm_mm; 552 struct mmu_gather tlb; 553 554 *prev = vma; 555 if (!can_madv_lru_vma(vma)) 556 return -EINVAL; 557 558 if (!can_do_pageout(vma)) 559 return 0; 560 561 lru_add_drain(); 562 tlb_gather_mmu(&tlb, mm); 563 madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); 564 tlb_finish_mmu(&tlb); 565 566 return 0; 567 } 568 569 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 570 unsigned long end, struct mm_walk *walk) 571 572 { 573 struct mmu_gather *tlb = walk->private; 574 struct mm_struct *mm = tlb->mm; 575 struct vm_area_struct *vma = walk->vma; 576 spinlock_t *ptl; 577 pte_t *orig_pte, *pte, ptent; 578 struct page *page; 579 int nr_swap = 0; 580 unsigned long next; 581 582 next = pmd_addr_end(addr, end); 583 if (pmd_trans_huge(*pmd)) 584 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 585 goto next; 586 587 if (pmd_trans_unstable(pmd)) 588 return 0; 589 590 tlb_change_page_size(tlb, PAGE_SIZE); 591 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 592 flush_tlb_batched_pending(mm); 593 arch_enter_lazy_mmu_mode(); 594 for (; addr != end; pte++, addr += PAGE_SIZE) { 595 ptent = *pte; 596 597 if (pte_none(ptent)) 598 continue; 599 /* 600 * If the pte has swp_entry, just clear page table to 601 * prevent swap-in which is more expensive rather than 602 * (page allocation + zeroing). 603 */ 604 if (!pte_present(ptent)) { 605 swp_entry_t entry; 606 607 entry = pte_to_swp_entry(ptent); 608 if (non_swap_entry(entry)) 609 continue; 610 nr_swap--; 611 free_swap_and_cache(entry); 612 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 613 continue; 614 } 615 616 page = vm_normal_page(vma, addr, ptent); 617 if (!page) 618 continue; 619 620 /* 621 * If pmd isn't transhuge but the page is THP and 622 * is owned by only this process, split it and 623 * deactivate all pages. 624 */ 625 if (PageTransCompound(page)) { 626 if (page_mapcount(page) != 1) 627 goto out; 628 get_page(page); 629 if (!trylock_page(page)) { 630 put_page(page); 631 goto out; 632 } 633 pte_unmap_unlock(orig_pte, ptl); 634 if (split_huge_page(page)) { 635 unlock_page(page); 636 put_page(page); 637 pte_offset_map_lock(mm, pmd, addr, &ptl); 638 goto out; 639 } 640 unlock_page(page); 641 put_page(page); 642 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 643 pte--; 644 addr -= PAGE_SIZE; 645 continue; 646 } 647 648 VM_BUG_ON_PAGE(PageTransCompound(page), page); 649 650 if (PageSwapCache(page) || PageDirty(page)) { 651 if (!trylock_page(page)) 652 continue; 653 /* 654 * If page is shared with others, we couldn't clear 655 * PG_dirty of the page. 656 */ 657 if (page_mapcount(page) != 1) { 658 unlock_page(page); 659 continue; 660 } 661 662 if (PageSwapCache(page) && !try_to_free_swap(page)) { 663 unlock_page(page); 664 continue; 665 } 666 667 ClearPageDirty(page); 668 unlock_page(page); 669 } 670 671 if (pte_young(ptent) || pte_dirty(ptent)) { 672 /* 673 * Some of architecture(ex, PPC) don't update TLB 674 * with set_pte_at and tlb_remove_tlb_entry so for 675 * the portability, remap the pte with old|clean 676 * after pte clearing. 677 */ 678 ptent = ptep_get_and_clear_full(mm, addr, pte, 679 tlb->fullmm); 680 681 ptent = pte_mkold(ptent); 682 ptent = pte_mkclean(ptent); 683 set_pte_at(mm, addr, pte, ptent); 684 tlb_remove_tlb_entry(tlb, pte, addr); 685 } 686 mark_page_lazyfree(page); 687 } 688 out: 689 if (nr_swap) { 690 if (current->mm == mm) 691 sync_mm_rss(mm); 692 693 add_mm_counter(mm, MM_SWAPENTS, nr_swap); 694 } 695 arch_leave_lazy_mmu_mode(); 696 pte_unmap_unlock(orig_pte, ptl); 697 cond_resched(); 698 next: 699 return 0; 700 } 701 702 static const struct mm_walk_ops madvise_free_walk_ops = { 703 .pmd_entry = madvise_free_pte_range, 704 }; 705 706 static int madvise_free_single_vma(struct vm_area_struct *vma, 707 unsigned long start_addr, unsigned long end_addr) 708 { 709 struct mm_struct *mm = vma->vm_mm; 710 struct mmu_notifier_range range; 711 struct mmu_gather tlb; 712 713 /* MADV_FREE works for only anon vma at the moment */ 714 if (!vma_is_anonymous(vma)) 715 return -EINVAL; 716 717 range.start = max(vma->vm_start, start_addr); 718 if (range.start >= vma->vm_end) 719 return -EINVAL; 720 range.end = min(vma->vm_end, end_addr); 721 if (range.end <= vma->vm_start) 722 return -EINVAL; 723 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 724 range.start, range.end); 725 726 lru_add_drain(); 727 tlb_gather_mmu(&tlb, mm); 728 update_hiwater_rss(mm); 729 730 mmu_notifier_invalidate_range_start(&range); 731 tlb_start_vma(&tlb, vma); 732 walk_page_range(vma->vm_mm, range.start, range.end, 733 &madvise_free_walk_ops, &tlb); 734 tlb_end_vma(&tlb, vma); 735 mmu_notifier_invalidate_range_end(&range); 736 tlb_finish_mmu(&tlb); 737 738 return 0; 739 } 740 741 /* 742 * Application no longer needs these pages. If the pages are dirty, 743 * it's OK to just throw them away. The app will be more careful about 744 * data it wants to keep. Be sure to free swap resources too. The 745 * zap_page_range call sets things up for shrink_active_list to actually free 746 * these pages later if no one else has touched them in the meantime, 747 * although we could add these pages to a global reuse list for 748 * shrink_active_list to pick up before reclaiming other pages. 749 * 750 * NB: This interface discards data rather than pushes it out to swap, 751 * as some implementations do. This has performance implications for 752 * applications like large transactional databases which want to discard 753 * pages in anonymous maps after committing to backing store the data 754 * that was kept in them. There is no reason to write this data out to 755 * the swap area if the application is discarding it. 756 * 757 * An interface that causes the system to free clean pages and flush 758 * dirty pages is already available as msync(MS_INVALIDATE). 759 */ 760 static long madvise_dontneed_single_vma(struct vm_area_struct *vma, 761 unsigned long start, unsigned long end) 762 { 763 zap_page_range(vma, start, end - start); 764 return 0; 765 } 766 767 static long madvise_dontneed_free(struct vm_area_struct *vma, 768 struct vm_area_struct **prev, 769 unsigned long start, unsigned long end, 770 int behavior) 771 { 772 struct mm_struct *mm = vma->vm_mm; 773 774 *prev = vma; 775 if (!can_madv_lru_vma(vma)) 776 return -EINVAL; 777 778 if (!userfaultfd_remove(vma, start, end)) { 779 *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 780 781 mmap_read_lock(mm); 782 vma = find_vma(mm, start); 783 if (!vma) 784 return -ENOMEM; 785 if (start < vma->vm_start) { 786 /* 787 * This "vma" under revalidation is the one 788 * with the lowest vma->vm_start where start 789 * is also < vma->vm_end. If start < 790 * vma->vm_start it means an hole materialized 791 * in the user address space within the 792 * virtual range passed to MADV_DONTNEED 793 * or MADV_FREE. 794 */ 795 return -ENOMEM; 796 } 797 if (!can_madv_lru_vma(vma)) 798 return -EINVAL; 799 if (end > vma->vm_end) { 800 /* 801 * Don't fail if end > vma->vm_end. If the old 802 * vma was split while the mmap_lock was 803 * released the effect of the concurrent 804 * operation may not cause madvise() to 805 * have an undefined result. There may be an 806 * adjacent next vma that we'll walk 807 * next. userfaultfd_remove() will generate an 808 * UFFD_EVENT_REMOVE repetition on the 809 * end-vma->vm_end range, but the manager can 810 * handle a repetition fine. 811 */ 812 end = vma->vm_end; 813 } 814 VM_WARN_ON(start >= end); 815 } 816 817 if (behavior == MADV_DONTNEED) 818 return madvise_dontneed_single_vma(vma, start, end); 819 else if (behavior == MADV_FREE) 820 return madvise_free_single_vma(vma, start, end); 821 else 822 return -EINVAL; 823 } 824 825 /* 826 * Application wants to free up the pages and associated backing store. 827 * This is effectively punching a hole into the middle of a file. 828 */ 829 static long madvise_remove(struct vm_area_struct *vma, 830 struct vm_area_struct **prev, 831 unsigned long start, unsigned long end) 832 { 833 loff_t offset; 834 int error; 835 struct file *f; 836 struct mm_struct *mm = vma->vm_mm; 837 838 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 839 840 if (vma->vm_flags & VM_LOCKED) 841 return -EINVAL; 842 843 f = vma->vm_file; 844 845 if (!f || !f->f_mapping || !f->f_mapping->host) { 846 return -EINVAL; 847 } 848 849 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 850 return -EACCES; 851 852 offset = (loff_t)(start - vma->vm_start) 853 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 854 855 /* 856 * Filesystem's fallocate may need to take i_mutex. We need to 857 * explicitly grab a reference because the vma (and hence the 858 * vma's reference to the file) can go away as soon as we drop 859 * mmap_lock. 860 */ 861 get_file(f); 862 if (userfaultfd_remove(vma, start, end)) { 863 /* mmap_lock was not released by userfaultfd_remove() */ 864 mmap_read_unlock(mm); 865 } 866 error = vfs_fallocate(f, 867 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 868 offset, end - start); 869 fput(f); 870 mmap_read_lock(mm); 871 return error; 872 } 873 874 #ifdef CONFIG_MEMORY_FAILURE 875 /* 876 * Error injection support for memory error handling. 877 */ 878 static int madvise_inject_error(int behavior, 879 unsigned long start, unsigned long end) 880 { 881 unsigned long size; 882 883 if (!capable(CAP_SYS_ADMIN)) 884 return -EPERM; 885 886 887 for (; start < end; start += size) { 888 unsigned long pfn; 889 struct page *page; 890 int ret; 891 892 ret = get_user_pages_fast(start, 1, 0, &page); 893 if (ret != 1) 894 return ret; 895 pfn = page_to_pfn(page); 896 897 /* 898 * When soft offlining hugepages, after migrating the page 899 * we dissolve it, therefore in the second loop "page" will 900 * no longer be a compound page. 901 */ 902 size = page_size(compound_head(page)); 903 904 if (behavior == MADV_SOFT_OFFLINE) { 905 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 906 pfn, start); 907 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 908 } else { 909 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 910 pfn, start); 911 ret = memory_failure(pfn, MF_COUNT_INCREASED); 912 } 913 914 if (ret) 915 return ret; 916 } 917 918 return 0; 919 } 920 #endif 921 922 static long 923 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 924 unsigned long start, unsigned long end, int behavior) 925 { 926 switch (behavior) { 927 case MADV_REMOVE: 928 return madvise_remove(vma, prev, start, end); 929 case MADV_WILLNEED: 930 return madvise_willneed(vma, prev, start, end); 931 case MADV_COLD: 932 return madvise_cold(vma, prev, start, end); 933 case MADV_PAGEOUT: 934 return madvise_pageout(vma, prev, start, end); 935 case MADV_FREE: 936 case MADV_DONTNEED: 937 return madvise_dontneed_free(vma, prev, start, end, behavior); 938 default: 939 return madvise_behavior(vma, prev, start, end, behavior); 940 } 941 } 942 943 static bool 944 madvise_behavior_valid(int behavior) 945 { 946 switch (behavior) { 947 case MADV_DOFORK: 948 case MADV_DONTFORK: 949 case MADV_NORMAL: 950 case MADV_SEQUENTIAL: 951 case MADV_RANDOM: 952 case MADV_REMOVE: 953 case MADV_WILLNEED: 954 case MADV_DONTNEED: 955 case MADV_FREE: 956 case MADV_COLD: 957 case MADV_PAGEOUT: 958 #ifdef CONFIG_KSM 959 case MADV_MERGEABLE: 960 case MADV_UNMERGEABLE: 961 #endif 962 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 963 case MADV_HUGEPAGE: 964 case MADV_NOHUGEPAGE: 965 #endif 966 case MADV_DONTDUMP: 967 case MADV_DODUMP: 968 case MADV_WIPEONFORK: 969 case MADV_KEEPONFORK: 970 #ifdef CONFIG_MEMORY_FAILURE 971 case MADV_SOFT_OFFLINE: 972 case MADV_HWPOISON: 973 #endif 974 return true; 975 976 default: 977 return false; 978 } 979 } 980 981 static bool 982 process_madvise_behavior_valid(int behavior) 983 { 984 switch (behavior) { 985 case MADV_COLD: 986 case MADV_PAGEOUT: 987 return true; 988 default: 989 return false; 990 } 991 } 992 993 /* 994 * The madvise(2) system call. 995 * 996 * Applications can use madvise() to advise the kernel how it should 997 * handle paging I/O in this VM area. The idea is to help the kernel 998 * use appropriate read-ahead and caching techniques. The information 999 * provided is advisory only, and can be safely disregarded by the 1000 * kernel without affecting the correct operation of the application. 1001 * 1002 * behavior values: 1003 * MADV_NORMAL - the default behavior is to read clusters. This 1004 * results in some read-ahead and read-behind. 1005 * MADV_RANDOM - the system should read the minimum amount of data 1006 * on any access, since it is unlikely that the appli- 1007 * cation will need more than what it asks for. 1008 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1009 * once, so they can be aggressively read ahead, and 1010 * can be freed soon after they are accessed. 1011 * MADV_WILLNEED - the application is notifying the system to read 1012 * some pages ahead. 1013 * MADV_DONTNEED - the application is finished with the given range, 1014 * so the kernel can free resources associated with it. 1015 * MADV_FREE - the application marks pages in the given range as lazy free, 1016 * where actual purges are postponed until memory pressure happens. 1017 * MADV_REMOVE - the application wants to free up the given range of 1018 * pages and associated backing store. 1019 * MADV_DONTFORK - omit this area from child's address space when forking: 1020 * typically, to avoid COWing pages pinned by get_user_pages(). 1021 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1022 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1023 * range after a fork. 1024 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 1025 * MADV_HWPOISON - trigger memory error handler as if the given memory range 1026 * were corrupted by unrecoverable hardware memory failure. 1027 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 1028 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 1029 * this area with pages of identical content from other such areas. 1030 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 1031 * MADV_HUGEPAGE - the application wants to back the given range by transparent 1032 * huge pages in the future. Existing pages might be coalesced and 1033 * new pages might be allocated as THP. 1034 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 1035 * transparent huge pages so the existing pages will not be 1036 * coalesced into THP and new pages will not be allocated as THP. 1037 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1038 * from being included in its core dump. 1039 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1040 * MADV_COLD - the application is not expected to use this memory soon, 1041 * deactivate pages in this range so that they can be reclaimed 1042 * easily if memory pressure happens. 1043 * MADV_PAGEOUT - the application is not expected to use this memory soon, 1044 * page out the pages in this range immediately. 1045 * 1046 * return values: 1047 * zero - success 1048 * -EINVAL - start + len < 0, start is not page-aligned, 1049 * "behavior" is not a valid value, or application 1050 * is attempting to release locked or shared pages, 1051 * or the specified address range includes file, Huge TLB, 1052 * MAP_SHARED or VMPFNMAP range. 1053 * -ENOMEM - addresses in the specified range are not currently 1054 * mapped, or are outside the AS of the process. 1055 * -EIO - an I/O error occurred while paging in data. 1056 * -EBADF - map exists, but area maps something that isn't a file. 1057 * -EAGAIN - a kernel resource was temporarily unavailable. 1058 */ 1059 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1060 { 1061 unsigned long end, tmp; 1062 struct vm_area_struct *vma, *prev; 1063 int unmapped_error = 0; 1064 int error = -EINVAL; 1065 int write; 1066 size_t len; 1067 struct blk_plug plug; 1068 1069 start = untagged_addr(start); 1070 1071 if (!madvise_behavior_valid(behavior)) 1072 return error; 1073 1074 if (!PAGE_ALIGNED(start)) 1075 return error; 1076 len = PAGE_ALIGN(len_in); 1077 1078 /* Check to see whether len was rounded up from small -ve to zero */ 1079 if (len_in && !len) 1080 return error; 1081 1082 end = start + len; 1083 if (end < start) 1084 return error; 1085 1086 error = 0; 1087 if (end == start) 1088 return error; 1089 1090 #ifdef CONFIG_MEMORY_FAILURE 1091 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 1092 return madvise_inject_error(behavior, start, start + len_in); 1093 #endif 1094 1095 write = madvise_need_mmap_write(behavior); 1096 if (write) { 1097 if (mmap_write_lock_killable(mm)) 1098 return -EINTR; 1099 } else { 1100 mmap_read_lock(mm); 1101 } 1102 1103 /* 1104 * If the interval [start,end) covers some unmapped address 1105 * ranges, just ignore them, but return -ENOMEM at the end. 1106 * - different from the way of handling in mlock etc. 1107 */ 1108 vma = find_vma_prev(mm, start, &prev); 1109 if (vma && start > vma->vm_start) 1110 prev = vma; 1111 1112 blk_start_plug(&plug); 1113 for (;;) { 1114 /* Still start < end. */ 1115 error = -ENOMEM; 1116 if (!vma) 1117 goto out; 1118 1119 /* Here start < (end|vma->vm_end). */ 1120 if (start < vma->vm_start) { 1121 unmapped_error = -ENOMEM; 1122 start = vma->vm_start; 1123 if (start >= end) 1124 goto out; 1125 } 1126 1127 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 1128 tmp = vma->vm_end; 1129 if (end < tmp) 1130 tmp = end; 1131 1132 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 1133 error = madvise_vma(vma, &prev, start, tmp, behavior); 1134 if (error) 1135 goto out; 1136 start = tmp; 1137 if (prev && start < prev->vm_end) 1138 start = prev->vm_end; 1139 error = unmapped_error; 1140 if (start >= end) 1141 goto out; 1142 if (prev) 1143 vma = prev->vm_next; 1144 else /* madvise_remove dropped mmap_lock */ 1145 vma = find_vma(mm, start); 1146 } 1147 out: 1148 blk_finish_plug(&plug); 1149 if (write) 1150 mmap_write_unlock(mm); 1151 else 1152 mmap_read_unlock(mm); 1153 1154 return error; 1155 } 1156 1157 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1158 { 1159 return do_madvise(current->mm, start, len_in, behavior); 1160 } 1161 1162 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1163 size_t, vlen, int, behavior, unsigned int, flags) 1164 { 1165 ssize_t ret; 1166 struct iovec iovstack[UIO_FASTIOV], iovec; 1167 struct iovec *iov = iovstack; 1168 struct iov_iter iter; 1169 struct pid *pid; 1170 struct task_struct *task; 1171 struct mm_struct *mm; 1172 size_t total_len; 1173 unsigned int f_flags; 1174 1175 if (flags != 0) { 1176 ret = -EINVAL; 1177 goto out; 1178 } 1179 1180 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1181 if (ret < 0) 1182 goto out; 1183 1184 pid = pidfd_get_pid(pidfd, &f_flags); 1185 if (IS_ERR(pid)) { 1186 ret = PTR_ERR(pid); 1187 goto free_iov; 1188 } 1189 1190 task = get_pid_task(pid, PIDTYPE_PID); 1191 if (!task) { 1192 ret = -ESRCH; 1193 goto put_pid; 1194 } 1195 1196 if (!process_madvise_behavior_valid(behavior)) { 1197 ret = -EINVAL; 1198 goto release_task; 1199 } 1200 1201 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 1202 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 1203 if (IS_ERR_OR_NULL(mm)) { 1204 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1205 goto release_task; 1206 } 1207 1208 /* 1209 * Require CAP_SYS_NICE for influencing process performance. Note that 1210 * only non-destructive hints are currently supported. 1211 */ 1212 if (!capable(CAP_SYS_NICE)) { 1213 ret = -EPERM; 1214 goto release_mm; 1215 } 1216 1217 total_len = iov_iter_count(&iter); 1218 1219 while (iov_iter_count(&iter)) { 1220 iovec = iov_iter_iovec(&iter); 1221 ret = do_madvise(mm, (unsigned long)iovec.iov_base, 1222 iovec.iov_len, behavior); 1223 if (ret < 0) 1224 break; 1225 iov_iter_advance(&iter, iovec.iov_len); 1226 } 1227 1228 if (ret == 0) 1229 ret = total_len - iov_iter_count(&iter); 1230 1231 release_mm: 1232 mmput(mm); 1233 release_task: 1234 put_task_struct(task); 1235 put_pid: 1236 put_pid(pid); 1237 free_iov: 1238 kfree(iov); 1239 out: 1240 return ret; 1241 } 1242