// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

/*
 * Maximum number of attempts we make to install guard pages before we give up
 * and return -ERESTARTNOINTR to have userspace try again.
 */
#define MAX_MADVISE_GUARD_RETRIES 3

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

struct madvise_behavior {
	int behavior;
	struct mmu_gather *tlb;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary. Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct folio *folio;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
					  struct folio *folio, pte_t *ptep,
					  pte_t pte, bool *any_young,
					  bool *any_dirty)
{
	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
	int max_nr = (end - addr) / PAGE_SIZE;

	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
			       any_young, any_dirty);
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_maybe_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, NULL);
			if (any_young)
				ptent = pte_mkyoung(ptent);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio for accelerating reclaiming.
		 * VM couldn't reclaim the folio unless we clear PG_young.
		 * As a side effect, it confuses idle-page tracking
		 * because it will miss recent referenced history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out if even this process is neither
	 * owner nor write capable of the file. We allow private file mappings
	 * further to pageout dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				nr_swap -= nr;
				free_swap_and_cache_nr(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			bool any_young, any_dirty;

			nr = madvise_folio_pte_batch(addr, end, folio, pte,
						     ptent, &any_young, &any_dirty);

			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				start_pte = pte;
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				arch_enter_lazy_mmu_mode();
				if (!err)
					nr = 0;
				continue;
			}

			if (any_young)
				ptent = pte_mkyoung(ptent);
			if (any_dirty)
				ptent = pte_mkdirty(ptent);
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
		struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather *tlb = madv_behavior->tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, tlb);
	tlb_end_vma(tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
					struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	struct zap_details details = {
		.reclaim_pt = true,
		.even_cows = true,
	};

	zap_page_range_single_batched(
			madv_behavior->tlb, vma, start, end - start, &details);
	return 0;
}

static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  struct madvise_behavior *madv_behavior)
{
	int behavior = madv_behavior->behavior;
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		/*
		 * If the memory region between start and end was
		 * originally backed by 4kB pages and then remapped to
		 * be backed by hugepages while mmap_lock was dropped,
		 * the adjustment for hugetlb vma above may have rounded
		 * end down to the start address.
		 */
		if (start == end)
			return 0;
		VM_WARN_ON(start > end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(
				madv_behavior, vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(madv_behavior, vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct mm_struct *mm, unsigned long start,
		unsigned long end, int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	int locked = 1;
	long pages;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
			return -EINVAL;
	}

	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
{
	vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;

	/*
	 * A user could lock after setting a guard range but that's fine, as
	 * they'd not be able to fault in. The issue arises when we try to zap
	 * existing locked VMAs. We don't want to do that.
	 */
	if (!allow_locked)
		disallowed |= VM_LOCKED;

	return !(vma->vm_flags & disallowed);
}

static bool is_guard_pte_marker(pte_t ptent)
{
	return is_pte_marker(ptent) &&
		is_guard_swp_entry(pte_to_swp_entry(ptent));
}

static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);

	/* If huge return >0 so we abort the operation + zap. */
	return pud_trans_huge(pudval) || pud_devmap(pudval);
}

static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pmd_t pmdval = pmdp_get(pmd);

	/* If huge return >0 so we abort the operation + zap. */
	return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
}

static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pte_t pteval = ptep_get(pte);
	unsigned long *nr_pages = (unsigned long *)walk->private;

	/* If there is already a guard page marker, we have nothing to do. */
	if (is_guard_pte_marker(pteval)) {
		(*nr_pages)++;

		return 0;
	}

	/* If populated return >0 so we abort the operation + zap. */
	return 1;
}

static int guard_install_set_pte(unsigned long addr, unsigned long next,
				 pte_t *ptep, struct mm_walk *walk)
{
	unsigned long *nr_pages = (unsigned long *)walk->private;

	/* Simply install a PTE marker, this causes segfault on access. */
	*ptep = make_pte_marker(PTE_MARKER_GUARD);
	(*nr_pages)++;

	return 0;
}

static const struct mm_walk_ops guard_install_walk_ops = {
	.pud_entry = guard_install_pud_entry,
	.pmd_entry = guard_install_pmd_entry,
	.pte_entry = guard_install_pte_entry,
	.install_pte = guard_install_set_pte,
	.walk_lock = PGWALK_RDLOCK,
};

static long madvise_guard_install(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end)
{
	long err;
	int i;

	*prev = vma;
	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
		return -EINVAL;

	/*
	 * If we install guard markers, then the range is no longer
	 * empty from a page table perspective and therefore it's
	 * appropriate to have an anon_vma.
	 *
	 * This ensures that on fork, we copy page tables correctly.
	 */
	err = anon_vma_prepare(vma);
	if (err)
		return err;

	/*
	 * Optimistically try to install the guard marker pages first. If any
	 * non-guard pages are encountered, give up and zap the range before
	 * trying again.
	 *
	 * We try a few times before giving up and releasing back to userland to
	 * loop around, releasing locks in the process to avoid contention. This
	 * would only happen if there was a great many racing page faults.
	 *
	 * In most cases we should simply install the guard markers immediately
	 * with no zap or looping.
	 */
	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		unsigned long nr_pages = 0;

		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
		err = walk_page_range_mm(vma->vm_mm, start, end,
					 &guard_install_walk_ops, &nr_pages);
		if (err < 0)
			return err;

		if (err == 0) {
			unsigned long nr_expected_pages = PHYS_PFN(end - start);

			VM_WARN_ON(nr_pages != nr_expected_pages);
			return 0;
		}

		/*
		 * OK some of the range have non-guard pages mapped, zap
		 * them. This leaves existing guard pages in place.
		 */
		zap_page_range_single(vma, start, end - start, NULL);
	}

	/*
	 * We were unable to install the guard pages due to being raced by page
	 * faults. This should not happen ordinarily. We return to userspace and
	 * immediately retry, relieving lock contention.
	 */
	return restart_syscall();
}

static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pud_t pudval = pudp_get(pud);

	/* If huge, cannot have guard pages present, so no-op - skip. */
	if (pud_trans_huge(pudval) || pud_devmap(pudval))
		walk->action = ACTION_CONTINUE;

	return 0;
}

static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pmd_t pmdval = pmdp_get(pmd);

	/* If huge, cannot have guard pages present, so no-op - skip. */
	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
		walk->action = ACTION_CONTINUE;

	return 0;
}

static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
				  unsigned long next, struct mm_walk *walk)
{
	pte_t ptent = ptep_get(pte);

	if (is_guard_pte_marker(ptent)) {
		/* Simply clear the PTE marker. */
		pte_clear_not_present_full(walk->mm, addr, pte, false);
		update_mmu_cache(walk->vma, addr, pte);
	}

	return 0;
}

static const struct mm_walk_ops guard_remove_walk_ops = {
	.pud_entry = guard_remove_pud_entry,
	.pmd_entry = guard_remove_pmd_entry,
	.pte_entry = guard_remove_pte_entry,
	.walk_lock = PGWALK_RDLOCK,
};

static long madvise_guard_remove(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end)
{
	*prev = vma;
	/*
	 * We're ok with removing guards in mlock()'d ranges, as this is a
	 * non-destructive action.
	 */
	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
		return -EINVAL;

	return walk_page_range(vma->vm_mm, start, end,
			       &guard_remove_walk_ops, NULL);
}

/*
 * Apply an madvise behavior to a region of a vma. madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				void *behavior_arg)
{
	struct madvise_behavior *arg = behavior_arg;
	int behavior = arg->behavior;
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	if (unlikely(!can_modify_vma_madv(vma, behavior)))
		return -EPERM;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, arg);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		if (vma->vm_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
		    (vma->vm_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	case MADV_GUARD_INSTALL:
		return madvise_guard_install(vma, prev, start, end);
	case MADV_GUARD_REMOVE:
		return madvise_guard_remove(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;


	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}

static bool is_memory_failure(int behavior)
{
	switch (behavior) {
	case MADV_HWPOISON:
	case MADV_SOFT_OFFLINE:
		return true;
	default:
		return false;
	}
}

#else

static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	return 0;
}

static bool is_memory_failure(int behavior)
{
	return false;
}

#endif	/* CONFIG_MEMORY_FAILURE */

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
static bool process_madvise_remote_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
	case MADV_COLLAPSE:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range. Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, void *arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, void *arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 void *anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file && !vma_is_anon_shmem(vma))
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */

static int madvise_lock(struct mm_struct *mm, int behavior)
{
	if (is_memory_failure(behavior))
		return 0;

	if (madvise_need_mmap_write(behavior)) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}
	return 0;
}

static void madvise_unlock(struct mm_struct *mm, int behavior)
{
	if (is_memory_failure(behavior))
		return;

	if (madvise_need_mmap_write(behavior))
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);
}

static bool madvise_batch_tlb_flush(int behavior)
{
	switch (behavior) {
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
		return true;
	default:
		return false;
	}
}

static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
		struct mm_struct *mm)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_gather_mmu(madv_behavior->tlb, mm);
}

static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_finish_mmu(madv_behavior->tlb);
}

static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
{
	size_t len;

	if (!madvise_behavior_valid(behavior))
		return false;

	if (!PAGE_ALIGNED(start))
		return false;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return false;

	if (start + len < start)
		return false;

	return true;
}

/*
 * madvise_should_skip() - Return if the request is invalid or nothing.
 * @start:	Start address of madvise-requested address range.
 * @len_in:	Length of madvise-requested address range.
 * @behavior:	Requested madvise behavior.
 * @err:	Pointer to store an error code from the check.
 *
 * If the specified behaviour is invalid or nothing would occur, we skip the
 * operation. This function returns true in the cases, otherwise false. In
 * the former case we store an error on @err.
 */
static bool madvise_should_skip(unsigned long start, size_t len_in,
		int behavior, int *err)
{
	if (!is_valid_madvise(start, len_in, behavior)) {
		*err = -EINVAL;
		return true;
	}
	if (start + PAGE_ALIGN(len_in) == start) {
		*err = 0;
		return true;
	}
	return false;
}

static bool is_madvise_populate(int behavior)
{
	switch (behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return true;
	default:
		return false;
	}
}

static int madvise_do_behavior(struct mm_struct *mm,
		unsigned long start, size_t len_in,
		struct madvise_behavior *madv_behavior)
{
	int behavior = madv_behavior->behavior;
	struct blk_plug plug;
	unsigned long end;
	int error;

	if (is_memory_failure(behavior))
		return madvise_inject_error(behavior, start, start + len_in);
	start = untagged_addr_remote(mm, start);
	end = start + PAGE_ALIGN(len_in);

	blk_start_plug(&plug);
	if (is_madvise_populate(behavior))
		error = madvise_populate(mm, start, end, behavior);
	else
		error = madvise_walk_vmas(mm, start, end, madv_behavior,
					  madvise_vma_behavior);
	blk_finish_plug(&plug);
	return error;
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	int error;
	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.behavior = behavior,
		.tlb = &tlb,
	};

	if (madvise_should_skip(start, len_in, behavior, &error))
		return error;
	error = madvise_lock(mm, behavior);
	if (error)
		return error;
	madvise_init_tlb(&madv_behavior, mm);
	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
	madvise_finish_tlb(&madv_behavior);
	madvise_unlock(mm, behavior);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

/* Perform an madvise operation over a vector of addresses and lengths. */
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
			      int behavior)
{
	ssize_t ret = 0;
	size_t total_len;
	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.behavior = behavior,
		.tlb = &tlb,
	};

	total_len = iov_iter_count(iter);

	ret = madvise_lock(mm, behavior);
	if (ret)
		return ret;
	madvise_init_tlb(&madv_behavior, mm);

	while (iov_iter_count(iter)) {
		unsigned long start = (unsigned long)iter_iov_addr(iter);
		size_t len_in = iter_iov_len(iter);
		int error;

		if (madvise_should_skip(start, len_in, behavior, &error))
			ret = error;
		else
			ret = madvise_do_behavior(mm, start, len_in,
						  &madv_behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * We drop and reacquire locks so it is safe to just loop and
		 * try again. We check for fatal signals in case we need exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(mm, behavior);
			ret = madvise_lock(mm, behavior);
			if (ret)
				goto out;
			madvise_init_tlb(&madv_behavior, mm);
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}
	madvise_finish_tlb(&madv_behavior);
	madvise_unlock(mm, behavior);

out:
	ret = (total_len - iov_iter_count(iter)) ? : ret;

	return ret;
}

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR(mm)) {
		ret = PTR_ERR(mm);
		goto release_task;
	}

	/*
	 * We need only perform this check if we are attempting to manipulate a
	 * remote process's address space.
	 */
	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
		ret = -EINVAL;
		goto release_mm;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported for remote
	 * processes.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	ret = vector_madvise(mm, &iter, behavior);

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}