1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/string.h> 23 #include <linux/uio.h> 24 #include <linux/ksm.h> 25 #include <linux/fs.h> 26 #include <linux/file.h> 27 #include <linux/blkdev.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagewalk.h> 30 #include <linux/swap.h> 31 #include <linux/swapops.h> 32 #include <linux/shmem_fs.h> 33 #include <linux/mmu_notifier.h> 34 35 #include <asm/tlb.h> 36 37 #include "internal.h" 38 #include "swap.h" 39 40 struct madvise_walk_private { 41 struct mmu_gather *tlb; 42 bool pageout; 43 }; 44 45 /* 46 * Any behaviour which results in changes to the vma->vm_flags needs to 47 * take mmap_lock for writing. Others, which simply traverse vmas, need 48 * to only take it for reading. 49 */ 50 static int madvise_need_mmap_write(int behavior) 51 { 52 switch (behavior) { 53 case MADV_REMOVE: 54 case MADV_WILLNEED: 55 case MADV_DONTNEED: 56 case MADV_DONTNEED_LOCKED: 57 case MADV_COLD: 58 case MADV_PAGEOUT: 59 case MADV_FREE: 60 case MADV_POPULATE_READ: 61 case MADV_POPULATE_WRITE: 62 case MADV_COLLAPSE: 63 return 0; 64 default: 65 /* be safe, default to 1. 
list exceptions explicitly */ 66 return 1; 67 } 68 } 69 70 #ifdef CONFIG_ANON_VMA_NAME 71 struct anon_vma_name *anon_vma_name_alloc(const char *name) 72 { 73 struct anon_vma_name *anon_name; 74 size_t count; 75 76 /* Add 1 for NUL terminator at the end of the anon_name->name */ 77 count = strlen(name) + 1; 78 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 79 if (anon_name) { 80 kref_init(&anon_name->kref); 81 memcpy(anon_name->name, name, count); 82 } 83 84 return anon_name; 85 } 86 87 void anon_vma_name_free(struct kref *kref) 88 { 89 struct anon_vma_name *anon_name = 90 container_of(kref, struct anon_vma_name, kref); 91 kfree(anon_name); 92 } 93 94 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 95 { 96 mmap_assert_locked(vma->vm_mm); 97 98 return vma->anon_name; 99 } 100 101 /* mmap_lock should be write-locked */ 102 static int replace_anon_vma_name(struct vm_area_struct *vma, 103 struct anon_vma_name *anon_name) 104 { 105 struct anon_vma_name *orig_name = anon_vma_name(vma); 106 107 if (!anon_name) { 108 vma->anon_name = NULL; 109 anon_vma_name_put(orig_name); 110 return 0; 111 } 112 113 if (anon_vma_name_eq(orig_name, anon_name)) 114 return 0; 115 116 vma->anon_name = anon_vma_name_reuse(anon_name); 117 anon_vma_name_put(orig_name); 118 119 return 0; 120 } 121 #else /* CONFIG_ANON_VMA_NAME */ 122 static int replace_anon_vma_name(struct vm_area_struct *vma, 123 struct anon_vma_name *anon_name) 124 { 125 if (anon_name) 126 return -EINVAL; 127 128 return 0; 129 } 130 #endif /* CONFIG_ANON_VMA_NAME */ 131 /* 132 * Update the vm_flags on region of a vma, splitting it or merging it as 133 * necessary. Must be called with mmap_sem held for writing; 134 * Caller should ensure anon_name stability by raising its refcount even when 135 * anon_name belongs to a valid vma because this function might free that vma. 
*/
/*
 * madvise_update_vma() - give [@start, @end) of @vma the flags @new_flags
 * and the name @anon_name.
 *
 * @vma:       vma containing the range.
 * @prev:      in: the vma handed to vma_merge() as predecessor;
 *             out: the vma that now covers the range.
 * @start:     first address of the range.
 * @end:       end address of the range (exclusive).
 * @new_flags: value to store in vma->vm_flags.
 * @anon_name: name to attach; only applied when the vma has no file or is
 *             anonymous shmem.
 *
 * Returns 0 on success, -ENOMEM when a needed split would exceed
 * sysctl_max_map_count, or the error returned by __split_vma() or
 * replace_anon_vma_name().
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;

	/* Flags and name are already what was asked for: nothing to do. */
	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	/* Page offset that corresponds to @start inside this vma. */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		/* vma_merge() handed back a vma: carry on with that one. */
		vma = *prev;
		goto success;
	}

	*prev = vma;

	/* No merge: split off whatever lies before @start ... */
	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, start, 1);
		if (error)
			return error;
	}

	/* ... and whatever lies after @end. */
	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
/*
 * pmd_entry callback of swapin_walk_ops.  walk->private is the vma being
 * advised.  For every PAGE_SIZE step in [@start, @end) whose pte holds a
 * real swap entry, start an asynchronous read into the swap cache.
 * Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	unsigned long index;
	struct swap_iocb *splug = NULL;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;
		pte_t *ptep;

		/* Snapshot the pte under its lock, then drop the lock at once. */
		ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
		pte = *ptep;
		pte_unmap_unlock(ptep, ptl);

		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false, &splug);
		/* Only the read was wanted; drop the reference handed back. */
		if (page)
			put_page(page);
	}
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

/* Page-walk operations used by madvise_willneed() for vmas without a file. */
static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};

/*
 * Start swap-in for the swapped-out pages of a shmem @mapping that back
 * [@start, @end) of @vma.  Walks mapping->i_pages and issues an
 * asynchronous swap read for every value entry that decodes to a real swap
 * entry.
 */
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		/* Entries that are not xarray values are skipped. */
		if (!xa_is_value(page))
			continue;
		swap = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(swap))
			continue;
		/* Pause the iteration and leave the RCU section around the read. */
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false, &splug);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 *
 * With CONFIG_SWAP, vmas without a file and shmem mappings are handled by
 * starting swap-in; without it, a vma without a file yields -EBADF.  DAX
 * files are ignored.  For any other file the advice is forwarded to
 * vfs_fadvise() with mmap_lock dropped; *prev is set to NULL in that case
 * and the lock is re-taken for reading before returning.  Apart from the
 * -EBADF case this always returns 0; the result of vfs_fadvise() is not
 * propagated.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	/* Byte offset in the file that corresponds to @start. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

/*
 * Decide whether the caller may page out page cache belonging to @vma's
 * file.  Returns false for a vma without a file.
 */
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * Page out page cache only for non-anonymous mappings of files that
	 * the calling process could (if it tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * pmd_entry callback shared by MADV_COLD and MADV_PAGEOUT.
 *
 * walk->private is a struct madvise_walk_private.  For every present page
 * in [@addr, @end) that is on the LRU and mapped exactly once, the young
 * bit of its mapping is cleared, PG_referenced and the page-young state are
 * cleared, and the page is then either isolated and handed to
 * reclaim_pages() (pageout) or passed to deactivate_page() (cold).
 *
 * Returns -EINTR when a fatal signal is pending for current, otherwise 0.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);
	bool pageout_anon_only_filter;

	if (fatal_signal_pending(current))
		return -EINTR;

	/*
	 * When paging out a file-backed vma whose file the caller may not
	 * page out, restrict the work to the anonymous pages of the vma.
	 */
	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !PageAnon(page))
			goto huge_unlock;

		/*
		 * The range covers only part of the huge page: split it and,
		 * if that worked, handle the range through the pte loop below.
		 */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page || is_zone_device_page(page))
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it is worth it: split it if we are the only owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			if (pageout_anon_only_filter && !PageAnon(page))
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			/* The page-table lock is dropped around the split. */
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				/* Re-take the lock so the unlock after the loop is balanced. */
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			/*
			 * Step back by one entry so that the loop increment
			 * lands on the same address again, now seen as a
			 * regular page.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this page and
		 * non-LRU page.
		 */
		if (!PageLRU(page) || page_mapcount(page) != 1)
			continue;

		if (pageout_anon_only_filter && !PageAnon(page))
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating the page to accelerate its reclaim.
		 * The VM could not reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking, which
		 * will miss the recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

/* Page-walk operations shared by MADV_COLD and MADV_PAGEOUT. */
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

/* Run the cold walk (pageout == false) over [@addr, @end) of @vma. */
static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

/* MADV_COLD/MADV_PAGEOUT refuse locked, PFN-mapped and hugetlb vmas. */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

/*
 * MADV_COLD: deactivate the pages of [@start_addr, @end_addr) in @vma.
 * Returns -EINVAL for vmas rejected by can_madv_lru_vma(), otherwise 0.
 */
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

/* Run the pageout walk (pageout == true) over [@addr, @end) of @vma. */
static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

/*
 * MADV_PAGEOUT: reclaim the pages of [@start_addr, @end_addr) in @vma.
 * Returns -EINVAL for vmas rejected by can_madv_lru_vma(), otherwise 0.
 */
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of the file nor able to write to it.  Such private file
	 * mappings are still walked so that their dirty anon pages can be
	 * paged out; only shared mappings of such files are skipped here.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * pmd_entry callback for MADV_FREE.  walk->private is the mmu_gather.
 *
 * Swap entries found in [@addr, @end) are freed and their ptes cleared;
 * present pages that pass the checks below get an old and clean pte and
 * are passed to mark_page_lazyfree().  Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct folio *folio;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				/* One swap entry less; accounted after the loop. */
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_swapin_error_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page || is_zone_device_page(page))
			continue;
		folio = page_folio(page);

		/*
		 * If pmd isn't transhuge but the folio is large and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			if (folio_mapcount(folio) != 1)
				goto out;
			folio_get(folio);
			if (!folio_trylock(folio)) {
				folio_put(folio);
				goto out;
			}
			/* The page-table lock is dropped around the split. */
			pte_unmap_unlock(orig_pte, ptl);
			if (split_folio(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				/* Re-take the lock so the unlock at "out" is balanced. */
				orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			folio_unlock(folio);
			folio_put(folio);
			orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			/*
			 * Step back by one entry so that the loop increment
			 * revisits the same address after the split.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If folio is shared with others, we mustn't clear
			 * the folio's dirty flag.
			 */
			if (folio_mapcount(folio) != 1) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at and tlb_remove_tlb_entry, so for
			 * portability, remap the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(&folio->page);
	}
out:
	/* nr_swap is zero or negative: apply the delta to MM_SWAPENTS. */
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

/* Page-walk operations used by MADV_FREE. */
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};

/*
 * MADV_FREE on one vma: clamp [@start_addr, @end_addr) to @vma and walk it
 * with madvise_free_walk_ops between the mmu-notifier start/end calls.
 *
 * Returns -EINVAL when @vma is not anonymous or the clamped range is empty,
 * otherwise 0.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.
* The zap_page_range_single call sets things up for shrink_active_list to
 * actually free these pages later if no one else has touched them in the
 * meantime, although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	/* zap_page_range_single() takes a length, not an end address. */
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}

/*
 * Check whether @vma may be the target of MADV_DONTNEED, MADV_DONTNEED_LOCKED
 * or MADV_FREE, as selected by @behavior.
 *
 * Non-hugetlb vmas are rejected when VM_PFNMAP is set, and also when
 * VM_LOCKED is set unless @behavior is MADV_DONTNEED_LOCKED.  Hugetlb vmas
 * accept only the two DONTNEED variants, require @start to be aligned to
 * the huge page size, and get *@end rounded down to a huge page boundary.
 *
 * Returns true when the vma is acceptable; *@end may have been modified.
 */
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}

/*
 * Common entry point for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE
 * on [@start, @end) of @vma.
 *
 * When userfaultfd_remove() returns false, mmap_lock was dropped: *prev is
 * set to NULL, the lock is re-taken for reading and the vma is looked up
 * and validated again before the operation is carried out.
 *
 * Returns 0 on success (including an empty range), -EINVAL for an
 * unsuitable vma or unknown @behavior, -ENOMEM when the range is no longer
 * mapped after the lock was re-taken, or the result of the per-vma helper.
 */
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	/* Rounding @end down for a hugetlb vma can leave nothing to do. */
	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

/*
 * MADV_POPULATE_READ / MADV_POPULATE_WRITE: fault in the pages of
 * [@start, @end), one vma at a time.
 *
 * faultin_vma_page_range() may drop mmap_lock (reported through "locked");
 * in that case the lock is re-taken for reading, *prev is set to NULL and
 * the vma is looked up again on the next iteration.
 *
 * Returns 0 on success, or one of -EINTR, -EINVAL, -EHWPOISON, -EFAULT and
 * -ENOMEM; any other error from faultin_vma_page_range() is logged once and
 * reported as -ENOMEM.  -ENOMEM is also returned when no vma contains the
 * current address.
 */
static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = vma_lookup(mm, start);
			if (!vma)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		/* Advance past the pages that were just faulted in. */
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
968 */ 969 static long madvise_remove(struct vm_area_struct *vma, 970 struct vm_area_struct **prev, 971 unsigned long start, unsigned long end) 972 { 973 loff_t offset; 974 int error; 975 struct file *f; 976 struct mm_struct *mm = vma->vm_mm; 977 978 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 979 980 if (vma->vm_flags & VM_LOCKED) 981 return -EINVAL; 982 983 f = vma->vm_file; 984 985 if (!f || !f->f_mapping || !f->f_mapping->host) { 986 return -EINVAL; 987 } 988 989 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 990 return -EACCES; 991 992 offset = (loff_t)(start - vma->vm_start) 993 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 994 995 /* 996 * Filesystem's fallocate may need to take i_rwsem. We need to 997 * explicitly grab a reference because the vma (and hence the 998 * vma's reference to the file) can go away as soon as we drop 999 * mmap_lock. 1000 */ 1001 get_file(f); 1002 if (userfaultfd_remove(vma, start, end)) { 1003 /* mmap_lock was not released by userfaultfd_remove() */ 1004 mmap_read_unlock(mm); 1005 } 1006 error = vfs_fallocate(f, 1007 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1008 offset, end - start); 1009 fput(f); 1010 mmap_read_lock(mm); 1011 return error; 1012 } 1013 1014 /* 1015 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1016 * will handle splitting a vm area into separate areas, each area with its own 1017 * behavior. 
1018 */ 1019 static int madvise_vma_behavior(struct vm_area_struct *vma, 1020 struct vm_area_struct **prev, 1021 unsigned long start, unsigned long end, 1022 unsigned long behavior) 1023 { 1024 int error; 1025 struct anon_vma_name *anon_name; 1026 unsigned long new_flags = vma->vm_flags; 1027 1028 switch (behavior) { 1029 case MADV_REMOVE: 1030 return madvise_remove(vma, prev, start, end); 1031 case MADV_WILLNEED: 1032 return madvise_willneed(vma, prev, start, end); 1033 case MADV_COLD: 1034 return madvise_cold(vma, prev, start, end); 1035 case MADV_PAGEOUT: 1036 return madvise_pageout(vma, prev, start, end); 1037 case MADV_FREE: 1038 case MADV_DONTNEED: 1039 case MADV_DONTNEED_LOCKED: 1040 return madvise_dontneed_free(vma, prev, start, end, behavior); 1041 case MADV_POPULATE_READ: 1042 case MADV_POPULATE_WRITE: 1043 return madvise_populate(vma, prev, start, end, behavior); 1044 case MADV_NORMAL: 1045 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 1046 break; 1047 case MADV_SEQUENTIAL: 1048 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 1049 break; 1050 case MADV_RANDOM: 1051 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 1052 break; 1053 case MADV_DONTFORK: 1054 new_flags |= VM_DONTCOPY; 1055 break; 1056 case MADV_DOFORK: 1057 if (vma->vm_flags & VM_IO) 1058 return -EINVAL; 1059 new_flags &= ~VM_DONTCOPY; 1060 break; 1061 case MADV_WIPEONFORK: 1062 /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ 1063 if (vma->vm_file || vma->vm_flags & VM_SHARED) 1064 return -EINVAL; 1065 new_flags |= VM_WIPEONFORK; 1066 break; 1067 case MADV_KEEPONFORK: 1068 new_flags &= ~VM_WIPEONFORK; 1069 break; 1070 case MADV_DONTDUMP: 1071 new_flags |= VM_DONTDUMP; 1072 break; 1073 case MADV_DODUMP: 1074 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 1075 return -EINVAL; 1076 new_flags &= ~VM_DONTDUMP; 1077 break; 1078 case MADV_MERGEABLE: 1079 case MADV_UNMERGEABLE: 1080 error = ksm_madvise(vma, start, end, behavior, &new_flags); 1081 if (error) 1082 goto out; 1083 break; 1084 case MADV_HUGEPAGE: 1085 case MADV_NOHUGEPAGE: 1086 error = hugepage_madvise(vma, &new_flags, behavior); 1087 if (error) 1088 goto out; 1089 break; 1090 case MADV_COLLAPSE: 1091 return madvise_collapse(vma, prev, start, end); 1092 } 1093 1094 anon_name = anon_vma_name(vma); 1095 anon_vma_name_get(anon_name); 1096 error = madvise_update_vma(vma, prev, start, end, new_flags, 1097 anon_name); 1098 anon_vma_name_put(anon_name); 1099 1100 out: 1101 /* 1102 * madvise() returns EAGAIN if kernel resources, such as 1103 * slab, are temporarily unavailable. 1104 */ 1105 if (error == -ENOMEM) 1106 error = -EAGAIN; 1107 return error; 1108 } 1109 1110 #ifdef CONFIG_MEMORY_FAILURE 1111 /* 1112 * Error injection support for memory error handling. 1113 */ 1114 static int madvise_inject_error(int behavior, 1115 unsigned long start, unsigned long end) 1116 { 1117 unsigned long size; 1118 1119 if (!capable(CAP_SYS_ADMIN)) 1120 return -EPERM; 1121 1122 1123 for (; start < end; start += size) { 1124 unsigned long pfn; 1125 struct page *page; 1126 int ret; 1127 1128 ret = get_user_pages_fast(start, 1, 0, &page); 1129 if (ret != 1) 1130 return ret; 1131 pfn = page_to_pfn(page); 1132 1133 /* 1134 * When soft offlining hugepages, after migrating the page 1135 * we dissolve it, therefore in the second loop "page" will 1136 * no longer be a compound page. 
1137 */ 1138 size = page_size(compound_head(page)); 1139 1140 if (behavior == MADV_SOFT_OFFLINE) { 1141 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1142 pfn, start); 1143 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1144 } else { 1145 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1146 pfn, start); 1147 ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); 1148 if (ret == -EOPNOTSUPP) 1149 ret = 0; 1150 } 1151 1152 if (ret) 1153 return ret; 1154 } 1155 1156 return 0; 1157 } 1158 #endif 1159 1160 static bool 1161 madvise_behavior_valid(int behavior) 1162 { 1163 switch (behavior) { 1164 case MADV_DOFORK: 1165 case MADV_DONTFORK: 1166 case MADV_NORMAL: 1167 case MADV_SEQUENTIAL: 1168 case MADV_RANDOM: 1169 case MADV_REMOVE: 1170 case MADV_WILLNEED: 1171 case MADV_DONTNEED: 1172 case MADV_DONTNEED_LOCKED: 1173 case MADV_FREE: 1174 case MADV_COLD: 1175 case MADV_PAGEOUT: 1176 case MADV_POPULATE_READ: 1177 case MADV_POPULATE_WRITE: 1178 #ifdef CONFIG_KSM 1179 case MADV_MERGEABLE: 1180 case MADV_UNMERGEABLE: 1181 #endif 1182 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1183 case MADV_HUGEPAGE: 1184 case MADV_NOHUGEPAGE: 1185 case MADV_COLLAPSE: 1186 #endif 1187 case MADV_DONTDUMP: 1188 case MADV_DODUMP: 1189 case MADV_WIPEONFORK: 1190 case MADV_KEEPONFORK: 1191 #ifdef CONFIG_MEMORY_FAILURE 1192 case MADV_SOFT_OFFLINE: 1193 case MADV_HWPOISON: 1194 #endif 1195 return true; 1196 1197 default: 1198 return false; 1199 } 1200 } 1201 1202 static bool process_madvise_behavior_valid(int behavior) 1203 { 1204 switch (behavior) { 1205 case MADV_COLD: 1206 case MADV_PAGEOUT: 1207 case MADV_WILLNEED: 1208 case MADV_COLLAPSE: 1209 return true; 1210 default: 1211 return false; 1212 } 1213 } 1214 1215 /* 1216 * Walk the vmas in range [start,end), and call the visit function on each one. 
1217 * The visit function will get start and end parameters that cover the overlap 1218 * between the current vma and the original range. Any unmapped regions in the 1219 * original range will result in this function returning -ENOMEM while still 1220 * calling the visit function on all of the existing vmas in the range. 1221 * Must be called with the mmap_lock held for reading or writing. 1222 */ 1223 static 1224 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, 1225 unsigned long end, unsigned long arg, 1226 int (*visit)(struct vm_area_struct *vma, 1227 struct vm_area_struct **prev, unsigned long start, 1228 unsigned long end, unsigned long arg)) 1229 { 1230 struct vm_area_struct *vma; 1231 struct vm_area_struct *prev; 1232 unsigned long tmp; 1233 int unmapped_error = 0; 1234 1235 /* 1236 * If the interval [start,end) covers some unmapped address 1237 * ranges, just ignore them, but return -ENOMEM at the end. 1238 * - different from the way of handling in mlock etc. 1239 */ 1240 vma = find_vma_prev(mm, start, &prev); 1241 if (vma && start > vma->vm_start) 1242 prev = vma; 1243 1244 for (;;) { 1245 int error; 1246 1247 /* Still start < end. */ 1248 if (!vma) 1249 return -ENOMEM; 1250 1251 /* Here start < (end|vma->vm_end). */ 1252 if (start < vma->vm_start) { 1253 unmapped_error = -ENOMEM; 1254 start = vma->vm_start; 1255 if (start >= end) 1256 break; 1257 } 1258 1259 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 1260 tmp = vma->vm_end; 1261 if (end < tmp) 1262 tmp = end; 1263 1264 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ 1265 error = visit(vma, &prev, start, tmp, arg); 1266 if (error) 1267 return error; 1268 start = tmp; 1269 if (prev && start < prev->vm_end) 1270 start = prev->vm_end; 1271 if (start >= end) 1272 break; 1273 if (prev) 1274 vma = find_vma(mm, prev->vm_end); 1275 else /* madvise_remove dropped mmap_lock */ 1276 vma = find_vma(mm, start); 1277 } 1278 1279 return unmapped_error; 1280 } 1281 1282 #ifdef CONFIG_ANON_VMA_NAME 1283 static int madvise_vma_anon_name(struct vm_area_struct *vma, 1284 struct vm_area_struct **prev, 1285 unsigned long start, unsigned long end, 1286 unsigned long anon_name) 1287 { 1288 int error; 1289 1290 /* Only anonymous mappings can be named */ 1291 if (vma->vm_file && !vma_is_anon_shmem(vma)) 1292 return -EBADF; 1293 1294 error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, 1295 (struct anon_vma_name *)anon_name); 1296 1297 /* 1298 * madvise() returns EAGAIN if kernel resources, such as 1299 * slab, are temporarily unavailable. 1300 */ 1301 if (error == -ENOMEM) 1302 error = -EAGAIN; 1303 return error; 1304 } 1305 1306 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 1307 unsigned long len_in, struct anon_vma_name *anon_name) 1308 { 1309 unsigned long end; 1310 unsigned long len; 1311 1312 if (start & ~PAGE_MASK) 1313 return -EINVAL; 1314 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 1315 1316 /* Check to see whether len was rounded up from small -ve to zero */ 1317 if (len_in && !len) 1318 return -EINVAL; 1319 1320 end = start + len; 1321 if (end < start) 1322 return -EINVAL; 1323 1324 if (end == start) 1325 return 0; 1326 1327 return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, 1328 madvise_vma_anon_name); 1329 } 1330 #endif /* CONFIG_ANON_VMA_NAME */ 1331 /* 1332 * The madvise(2) system call. 1333 * 1334 * Applications can use madvise() to advise the kernel how it should 1335 * handle paging I/O in this VM area. 
* The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
*  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
1399 * -EBADF - map exists, but area maps something that isn't a file. 1400 * -EAGAIN - a kernel resource was temporarily unavailable. 1401 */ 1402 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1403 { 1404 unsigned long end; 1405 int error; 1406 int write; 1407 size_t len; 1408 struct blk_plug plug; 1409 1410 start = untagged_addr(start); 1411 1412 if (!madvise_behavior_valid(behavior)) 1413 return -EINVAL; 1414 1415 if (!PAGE_ALIGNED(start)) 1416 return -EINVAL; 1417 len = PAGE_ALIGN(len_in); 1418 1419 /* Check to see whether len was rounded up from small -ve to zero */ 1420 if (len_in && !len) 1421 return -EINVAL; 1422 1423 end = start + len; 1424 if (end < start) 1425 return -EINVAL; 1426 1427 if (end == start) 1428 return 0; 1429 1430 #ifdef CONFIG_MEMORY_FAILURE 1431 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 1432 return madvise_inject_error(behavior, start, start + len_in); 1433 #endif 1434 1435 write = madvise_need_mmap_write(behavior); 1436 if (write) { 1437 if (mmap_write_lock_killable(mm)) 1438 return -EINTR; 1439 } else { 1440 mmap_read_lock(mm); 1441 } 1442 1443 blk_start_plug(&plug); 1444 error = madvise_walk_vmas(mm, start, end, behavior, 1445 madvise_vma_behavior); 1446 blk_finish_plug(&plug); 1447 if (write) 1448 mmap_write_unlock(mm); 1449 else 1450 mmap_read_unlock(mm); 1451 1452 return error; 1453 } 1454 1455 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1456 { 1457 return do_madvise(current->mm, start, len_in, behavior); 1458 } 1459 1460 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1461 size_t, vlen, int, behavior, unsigned int, flags) 1462 { 1463 ssize_t ret; 1464 struct iovec iovstack[UIO_FASTIOV], iovec; 1465 struct iovec *iov = iovstack; 1466 struct iov_iter iter; 1467 struct task_struct *task; 1468 struct mm_struct *mm; 1469 size_t total_len; 1470 unsigned int f_flags; 1471 1472 if (flags != 0) { 1473 ret = 
-EINVAL; 1474 goto out; 1475 } 1476 1477 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1478 if (ret < 0) 1479 goto out; 1480 1481 task = pidfd_get_task(pidfd, &f_flags); 1482 if (IS_ERR(task)) { 1483 ret = PTR_ERR(task); 1484 goto free_iov; 1485 } 1486 1487 if (!process_madvise_behavior_valid(behavior)) { 1488 ret = -EINVAL; 1489 goto release_task; 1490 } 1491 1492 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 1493 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 1494 if (IS_ERR_OR_NULL(mm)) { 1495 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1496 goto release_task; 1497 } 1498 1499 /* 1500 * Require CAP_SYS_NICE for influencing process performance. Note that 1501 * only non-destructive hints are currently supported. 1502 */ 1503 if (!capable(CAP_SYS_NICE)) { 1504 ret = -EPERM; 1505 goto release_mm; 1506 } 1507 1508 total_len = iov_iter_count(&iter); 1509 1510 while (iov_iter_count(&iter)) { 1511 iovec = iov_iter_iovec(&iter); 1512 ret = do_madvise(mm, (unsigned long)iovec.iov_base, 1513 iovec.iov_len, behavior); 1514 if (ret < 0) 1515 break; 1516 iov_iter_advance(&iter, iovec.iov_len); 1517 } 1518 1519 ret = (total_len - iov_iter_count(&iter)) ? : ret; 1520 1521 release_mm: 1522 mmput(mm); 1523 release_task: 1524 put_task_struct(task); 1525 free_iov: 1526 kfree(iov); 1527 out: 1528 return ret; 1529 } 1530