1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/mmu_context.h> 23 #include <linux/string.h> 24 #include <linux/uio.h> 25 #include <linux/ksm.h> 26 #include <linux/fs.h> 27 #include <linux/file.h> 28 #include <linux/blkdev.h> 29 #include <linux/backing-dev.h> 30 #include <linux/pagewalk.h> 31 #include <linux/swap.h> 32 #include <linux/leafops.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/mmu_notifier.h> 35 36 #include <asm/tlb.h> 37 38 #include "internal.h" 39 #include "swap.h" 40 41 #define __MADV_SET_ANON_VMA_NAME (-1) 42 43 /* 44 * Maximum number of attempts we make to install guard pages before we give up 45 * and return -ERESTARTNOINTR to have userspace try again. 46 */ 47 #define MAX_MADVISE_GUARD_RETRIES 3 48 49 struct madvise_walk_private { 50 struct mmu_gather *tlb; 51 bool pageout; 52 }; 53 54 enum madvise_lock_mode { 55 MADVISE_NO_LOCK, 56 MADVISE_MMAP_READ_LOCK, 57 MADVISE_MMAP_WRITE_LOCK, 58 MADVISE_VMA_READ_LOCK, 59 }; 60 61 struct madvise_behavior_range { 62 unsigned long start; 63 unsigned long end; 64 }; 65 66 struct madvise_behavior { 67 struct mm_struct *mm; 68 int behavior; 69 struct mmu_gather *tlb; 70 enum madvise_lock_mode lock_mode; 71 struct anon_vma_name *anon_name; 72 73 /* 74 * The range over which the behaviour is currently being applied. If 75 * traversing multiple VMAs, this is updated for each. 
76 */ 77 struct madvise_behavior_range range; 78 /* The VMA and VMA preceding it (if applicable) currently targeted. */ 79 struct vm_area_struct *prev; 80 struct vm_area_struct *vma; 81 bool lock_dropped; 82 }; 83 84 #ifdef CONFIG_ANON_VMA_NAME 85 static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); 86 87 struct anon_vma_name *anon_vma_name_alloc(const char *name) 88 { 89 struct anon_vma_name *anon_name; 90 size_t count; 91 92 /* Add 1 for NUL terminator at the end of the anon_name->name */ 93 count = strlen(name) + 1; 94 anon_name = kmalloc_flex(*anon_name, name, count); 95 if (anon_name) { 96 kref_init(&anon_name->kref); 97 memcpy(anon_name->name, name, count); 98 } 99 100 return anon_name; 101 } 102 103 void anon_vma_name_free(struct kref *kref) 104 { 105 struct anon_vma_name *anon_name = 106 container_of(kref, struct anon_vma_name, kref); 107 kfree(anon_name); 108 } 109 110 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 111 { 112 vma_assert_stabilised(vma); 113 return vma->anon_name; 114 } 115 116 /* mmap_lock should be write-locked */ 117 static int replace_anon_vma_name(struct vm_area_struct *vma, 118 struct anon_vma_name *anon_name) 119 { 120 struct anon_vma_name *orig_name = anon_vma_name(vma); 121 122 if (!anon_name) { 123 vma->anon_name = NULL; 124 anon_vma_name_put(orig_name); 125 return 0; 126 } 127 128 if (anon_vma_name_eq(orig_name, anon_name)) 129 return 0; 130 131 vma->anon_name = anon_vma_name_reuse(anon_name); 132 anon_vma_name_put(orig_name); 133 134 return 0; 135 } 136 #else /* CONFIG_ANON_VMA_NAME */ 137 static int replace_anon_vma_name(struct vm_area_struct *vma, 138 struct anon_vma_name *anon_name) 139 { 140 if (anon_name) 141 return -EINVAL; 142 143 return 0; 144 } 145 #endif /* CONFIG_ANON_VMA_NAME */ 146 /* 147 * Update the vm_flags or anon_name on region of a vma, splitting it or merging 148 * it as necessary. Must be called with mmap_lock held for writing. 
149 */ 150 static int madvise_update_vma(vm_flags_t new_flags, 151 struct madvise_behavior *madv_behavior) 152 { 153 struct vm_area_struct *vma = madv_behavior->vma; 154 struct madvise_behavior_range *range = &madv_behavior->range; 155 struct anon_vma_name *anon_name = madv_behavior->anon_name; 156 bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; 157 VMA_ITERATOR(vmi, madv_behavior->mm, range->start); 158 159 if (new_flags == vma->vm_flags && (!set_new_anon_name || 160 anon_vma_name_eq(anon_vma_name(vma), anon_name))) 161 return 0; 162 163 if (set_new_anon_name) 164 vma = vma_modify_name(&vmi, madv_behavior->prev, vma, 165 range->start, range->end, anon_name); 166 else 167 vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, 168 range->start, range->end, &new_flags); 169 170 if (IS_ERR(vma)) 171 return PTR_ERR(vma); 172 173 madv_behavior->vma = vma; 174 175 /* vm_flags is protected by the mmap_lock held in write mode. */ 176 vma_start_write(vma); 177 vm_flags_reset(vma, new_flags); 178 if (set_new_anon_name) 179 return replace_anon_vma_name(vma, anon_name); 180 181 return 0; 182 } 183 184 #ifdef CONFIG_SWAP 185 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 186 unsigned long end, struct mm_walk *walk) 187 { 188 struct vm_area_struct *vma = walk->private; 189 struct swap_iocb *splug = NULL; 190 pte_t *ptep = NULL; 191 spinlock_t *ptl; 192 unsigned long addr; 193 194 for (addr = start; addr < end; addr += PAGE_SIZE) { 195 pte_t pte; 196 softleaf_t entry; 197 struct folio *folio; 198 199 if (!ptep++) { 200 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 201 if (!ptep) 202 break; 203 } 204 205 pte = ptep_get(ptep); 206 entry = softleaf_from_pte(pte); 207 if (unlikely(!softleaf_is_swap(entry))) 208 continue; 209 210 pte_unmap_unlock(ptep, ptl); 211 ptep = NULL; 212 213 folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 214 vma, addr, &splug); 215 if (folio) 216 folio_put(folio); 217 } 218 219 if (ptep) 
220 pte_unmap_unlock(ptep, ptl); 221 swap_read_unplug(splug); 222 cond_resched(); 223 224 return 0; 225 } 226 227 static const struct mm_walk_ops swapin_walk_ops = { 228 .pmd_entry = swapin_walk_pmd_entry, 229 .walk_lock = PGWALK_RDLOCK, 230 }; 231 232 static void shmem_swapin_range(struct vm_area_struct *vma, 233 unsigned long start, unsigned long end, 234 struct address_space *mapping) 235 { 236 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 237 pgoff_t end_index = linear_page_index(vma, end) - 1; 238 struct folio *folio; 239 struct swap_iocb *splug = NULL; 240 241 rcu_read_lock(); 242 xas_for_each(&xas, folio, end_index) { 243 unsigned long addr; 244 swp_entry_t entry; 245 246 if (!xa_is_value(folio)) 247 continue; 248 entry = radix_to_swp_entry(folio); 249 /* There might be swapin error entries in shmem mapping. */ 250 if (!softleaf_is_swap(entry)) 251 continue; 252 253 addr = vma->vm_start + 254 ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); 255 xas_pause(&xas); 256 rcu_read_unlock(); 257 258 folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), 259 vma, addr, &splug); 260 if (folio) 261 folio_put(folio); 262 263 rcu_read_lock(); 264 } 265 rcu_read_unlock(); 266 swap_read_unplug(splug); 267 } 268 #endif /* CONFIG_SWAP */ 269 270 static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) 271 { 272 VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); 273 madv_behavior->lock_dropped = true; 274 } 275 276 /* 277 * Schedule all required I/O operations. Do not wait for completion. 
 */
static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mm_struct *mm = madv_behavior->mm;
	struct file *file = vma->vm_file;
	unsigned long start = madv_behavior->range.start;
	unsigned long end = madv_behavior->range.end;
	loff_t offset;

#ifdef CONFIG_SWAP
	/* No backing file: walk the page tables and read swap entries in. */
	if (!file) {
		walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	/* shmem: the swap entries are found in the mapping instead. */
	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	mark_mmap_lock_dropped(madv_behavior);
	get_file(file);
	/* File offset of 'start'; computed before the lock is dropped. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

/*
 * Decide whether pagecache pages of this VMA's file may be paged out on
 * behalf of the caller. Always false for a VMA without a file.
 */
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * Wrapper around folio_pte_batch_flags() for the PTEs starting at ptep,
 * limited to the pages remaining in [addr, end) and using
 * FPB_MERGE_YOUNG_DIRTY. Returns that helper's result.
 */
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
					  struct folio *folio, pte_t *ptep,
					  pte_t *ptentp)
{
	int max_nr = (end - addr) / PAGE_SIZE;

	return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
				     FPB_MERGE_YOUNG_DIRTY);
}

/*
 * Page walk pmd_entry callback shared by the cold and pageout paths.
 *
 * walk->private is a struct madvise_walk_private. When its pageout flag is
 * set, eligible folios are isolated from the LRU and handed to
 * reclaim_pages(); otherwise they are deactivated with folio_deactivate().
 *
 * Returns -EINTR if a fatal signal is pending, otherwise 0.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	/*
	 * For a file-backed VMA whose pagecache the caller may not page out,
	 * restrict pageout to anon folios only.
	 */
	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!pmd_is_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_maybe_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		/*
		 * The range covers only part of the huge PMD: split the folio
		 * and, if that succeeds, handle it through the PTE path below.
		 */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		/* Cold path only: clear the young bit in the PMD. */
		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	lazy_mmu_mode_enable();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		/*
		 * Every SWAP_CLUSTER_MAX entries, check whether we should
		 * reschedule; if so drop the PTE lock, reschedule and resume
		 * from the current address.
		 */
		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				lazy_mmu_mode_enable();
				/*
				 * After a successful split, nr = 0 makes the
				 * loop revisit the same pte.
				 */
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM cannot reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking, because
		 * it will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	/* start_pte is NULL if re-taking the PTE lock after a split failed. */
	if (start_pte) {
		lazy_mmu_mode_disable();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

/* Walk the current range of the current VMA in cold (deactivate) mode. */
static void madvise_cold_page_range(struct mmu_gather *tlb,
		struct madvise_behavior *madv_behavior)

{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct madvise_behavior_range *range = &madv_behavior->range;
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
			&walk_private);
	tlb_end_vma(tlb, vma);
}

/* The cold and pageout paths refuse VM_LOCKED, VM_PFNMAP and VM_HUGETLB VMAs. */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

/*
 * Deactivate the pages in the current range of the current VMA.
 * Returns -EINVAL if can_madv_lru_vma() rejects the VMA, otherwise 0.
 */
static long madvise_cold(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mmu_gather tlb;

	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, madv_behavior->mm);
	madvise_cold_page_range(&tlb, madv_behavior);
	tlb_finish_mmu(&tlb);

	return 0;
}

/* Walk [range->start, range->end) of @vma in pageout (reclaim) mode. */
static void madvise_pageout_page_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma,
		struct madvise_behavior_range *range)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
			    &walk_private);
	tlb_end_vma(tlb, vma);
}

/*
 * Page out the pages in the current range of the current VMA.
 * Returns -EINVAL if can_madv_lru_vma() rejects the VMA, otherwise 0.
 */
static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma = madv_behavior->vma;

	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of the file nor able to write to it. We therefore still
	 * allow private file mappings to page out their dirty anon pages;
	 * only VM_MAYSHARE mappings the caller may not page out are skipped.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, madv_behavior->mm);
	madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Page walk pmd_entry callback for MADV_FREE. walk->private is the
 * mmu_gather to batch TLB work into. Swap entries in the range are released,
 * and eligible folios have their young/dirty state cleared and are marked
 * lazyfree. Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	/* Negative count of swap entries released, applied to MM_SWAPENTS. */
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	lazy_mmu_mode_enable();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			softleaf_t entry = softleaf_from_pte(ptent);

			if (softleaf_is_swap(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				nr_swap -= nr;
				swap_put_entries_direct(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (softleaf_is_hwpoison(entry) ||
				   softleaf_is_poison_marker(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				start_pte = pte;
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				lazy_mmu_mode_enable();
				/*
				 * After a successful split, nr = 0 makes the
				 * loop revisit the same pte.
				 */
				if (!err)
					nr = 0;
				continue;
			}
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	/* start_pte is NULL if re-taking the PTE lock after a split failed. */
	if (start_pte) {
		lazy_mmu_mode_disable();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

/*
 * Translate a madvise lock mode into the page walk lock type to verify.
 * Only the two read-lock modes are expected here; any other mode triggers a
 * one-time warning and falls back to PGWALK_RDLOCK.
 */
static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
{
	switch (mode) {
	case MADVISE_VMA_READ_LOCK:
		return PGWALK_VMA_RDLOCK_VERIFY;
	case MADVISE_MMAP_READ_LOCK:
		return PGWALK_RDLOCK;
	default:
		/* Other modes don't require fixing up the walk_lock */
		WARN_ON_ONCE(1);
		return PGWALK_RDLOCK;
	}
}

/*
 * Apply MADV_FREE to the current range of the current VMA, bracketing the
 * page walk with MMU notifier invalidation and TLB gather start/end calls.
 * Returns -EINVAL for a non-anonymous VMA, otherwise 0.
 */
static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mmu_notifier_range range = {
		.start = madv_behavior->range.start,
		.end = madv_behavior->range.end,
	};
	struct mmu_gather *tlb = madv_behavior->tlb;
	struct mm_walk_ops walk_ops = {
		.pmd_entry		= madvise_free_pte_range,
	};

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(tlb, vma);
	/* The walk lock type depends on the lock mode we were called under. */
	walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
	walk_page_range_vma(vma, range.start, range.end,
			&walk_ops, tlb);
	tlb_end_vma(tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
849 */ 850 static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) 851 852 { 853 struct madvise_behavior_range *range = &madv_behavior->range; 854 struct zap_details details = { 855 .reclaim_pt = true, 856 }; 857 858 zap_page_range_single_batched( 859 madv_behavior->tlb, madv_behavior->vma, range->start, 860 range->end - range->start, &details); 861 return 0; 862 } 863 864 static 865 bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) 866 { 867 struct vm_area_struct *vma = madv_behavior->vma; 868 int behavior = madv_behavior->behavior; 869 struct madvise_behavior_range *range = &madv_behavior->range; 870 871 if (!is_vm_hugetlb_page(vma)) { 872 unsigned int forbidden = VM_PFNMAP; 873 874 if (behavior != MADV_DONTNEED_LOCKED) 875 forbidden |= VM_LOCKED; 876 877 return !(vma->vm_flags & forbidden); 878 } 879 880 if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) 881 return false; 882 if (range->start & ~huge_page_mask(hstate_vma(vma))) 883 return false; 884 885 /* 886 * Madvise callers expect the length to be rounded up to PAGE_SIZE 887 * boundaries, and may be unaware that this VMA uses huge pages. 888 * Avoid unexpected data loss by rounding down the number of 889 * huge pages freed. 
890 */ 891 range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); 892 893 return true; 894 } 895 896 static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) 897 { 898 struct mm_struct *mm = madv_behavior->mm; 899 struct madvise_behavior_range *range = &madv_behavior->range; 900 int behavior = madv_behavior->behavior; 901 902 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 903 return -EINVAL; 904 905 if (range->start == range->end) 906 return 0; 907 908 if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { 909 struct vm_area_struct *vma; 910 911 mark_mmap_lock_dropped(madv_behavior); 912 mmap_read_lock(mm); 913 madv_behavior->vma = vma = vma_lookup(mm, range->start); 914 if (!vma) 915 return -ENOMEM; 916 /* 917 * Potential end adjustment for hugetlb vma is OK as 918 * the check below keeps end within vma. 919 */ 920 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 921 return -EINVAL; 922 if (range->end > vma->vm_end) { 923 /* 924 * Don't fail if end > vma->vm_end. If the old 925 * vma was split while the mmap_lock was 926 * released the effect of the concurrent 927 * operation may not cause madvise() to 928 * have an undefined result. There may be an 929 * adjacent next vma that we'll walk 930 * next. userfaultfd_remove() will generate an 931 * UFFD_EVENT_REMOVE repetition on the 932 * end-vma->vm_end range, but the manager can 933 * handle a repetition fine. 934 */ 935 range->end = vma->vm_end; 936 } 937 /* 938 * If the memory region between start and end was 939 * originally backed by 4kB pages and then remapped to 940 * be backed by hugepages while mmap_lock was dropped, 941 * the adjustment for hugetlb vma above may have rounded 942 * end down to the start address. 
943 */ 944 if (range->start == range->end) 945 return 0; 946 VM_WARN_ON(range->start > range->end); 947 } 948 949 if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) 950 return madvise_dontneed_single_vma(madv_behavior); 951 else if (behavior == MADV_FREE) 952 return madvise_free_single_vma(madv_behavior); 953 else 954 return -EINVAL; 955 } 956 957 static long madvise_populate(struct madvise_behavior *madv_behavior) 958 { 959 struct mm_struct *mm = madv_behavior->mm; 960 const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; 961 int locked = 1; 962 unsigned long start = madv_behavior->range.start; 963 unsigned long end = madv_behavior->range.end; 964 long pages; 965 966 while (start < end) { 967 /* Populate (prefault) page tables readable/writable. */ 968 pages = faultin_page_range(mm, start, end, write, &locked); 969 if (!locked) { 970 mmap_read_lock(mm); 971 locked = 1; 972 } 973 if (pages < 0) { 974 switch (pages) { 975 case -EINTR: 976 return -EINTR; 977 case -EINVAL: /* Incompatible mappings / permissions. */ 978 return -EINVAL; 979 case -EHWPOISON: 980 return -EHWPOISON; 981 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 982 return -EFAULT; 983 default: 984 pr_warn_once("%s: unhandled return value: %ld\n", 985 __func__, pages); 986 fallthrough; 987 case -ENOMEM: /* No VMA or out of memory. */ 988 return -ENOMEM; 989 } 990 } 991 start += pages * PAGE_SIZE; 992 } 993 return 0; 994 } 995 996 /* 997 * Application wants to free up the pages and associated backing store. 998 * This is effectively punching a hole into the middle of a file. 
999 */ 1000 static long madvise_remove(struct madvise_behavior *madv_behavior) 1001 { 1002 loff_t offset; 1003 int error; 1004 struct file *f; 1005 struct mm_struct *mm = madv_behavior->mm; 1006 struct vm_area_struct *vma = madv_behavior->vma; 1007 unsigned long start = madv_behavior->range.start; 1008 unsigned long end = madv_behavior->range.end; 1009 1010 mark_mmap_lock_dropped(madv_behavior); 1011 1012 if (vma->vm_flags & VM_LOCKED) 1013 return -EINVAL; 1014 1015 f = vma->vm_file; 1016 1017 if (!f || !f->f_mapping || !f->f_mapping->host) { 1018 return -EINVAL; 1019 } 1020 1021 if (!vma_is_shared_maywrite(vma)) 1022 return -EACCES; 1023 1024 offset = (loff_t)(start - vma->vm_start) 1025 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 1026 1027 /* 1028 * Filesystem's fallocate may need to take i_rwsem. We need to 1029 * explicitly grab a reference because the vma (and hence the 1030 * vma's reference to the file) can go away as soon as we drop 1031 * mmap_lock. 1032 */ 1033 get_file(f); 1034 if (userfaultfd_remove(vma, start, end)) { 1035 /* mmap_lock was not released by userfaultfd_remove() */ 1036 mmap_read_unlock(mm); 1037 } 1038 error = vfs_fallocate(f, 1039 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1040 offset, end - start); 1041 fput(f); 1042 mmap_read_lock(mm); 1043 return error; 1044 } 1045 1046 static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) 1047 { 1048 vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; 1049 1050 /* 1051 * A user could lock after setting a guard range but that's fine, as 1052 * they'd not be able to fault in. The issue arises when we try to zap 1053 * existing locked VMAs. We don't want to do that. 
1054 */ 1055 if (!allow_locked) 1056 disallowed |= VM_LOCKED; 1057 1058 return !(vma->vm_flags & disallowed); 1059 } 1060 1061 static bool is_guard_pte_marker(pte_t ptent) 1062 { 1063 const softleaf_t entry = softleaf_from_pte(ptent); 1064 1065 return softleaf_is_guard_marker(entry); 1066 } 1067 1068 static int guard_install_pud_entry(pud_t *pud, unsigned long addr, 1069 unsigned long next, struct mm_walk *walk) 1070 { 1071 pud_t pudval = pudp_get(pud); 1072 1073 /* If huge return >0 so we abort the operation + zap. */ 1074 return pud_trans_huge(pudval); 1075 } 1076 1077 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, 1078 unsigned long next, struct mm_walk *walk) 1079 { 1080 pmd_t pmdval = pmdp_get(pmd); 1081 1082 /* If huge return >0 so we abort the operation + zap. */ 1083 return pmd_trans_huge(pmdval); 1084 } 1085 1086 static int guard_install_pte_entry(pte_t *pte, unsigned long addr, 1087 unsigned long next, struct mm_walk *walk) 1088 { 1089 pte_t pteval = ptep_get(pte); 1090 unsigned long *nr_pages = (unsigned long *)walk->private; 1091 1092 /* If there is already a guard page marker, we have nothing to do. */ 1093 if (is_guard_pte_marker(pteval)) { 1094 (*nr_pages)++; 1095 1096 return 0; 1097 } 1098 1099 /* If populated return >0 so we abort the operation + zap. */ 1100 return 1; 1101 } 1102 1103 static int guard_install_set_pte(unsigned long addr, unsigned long next, 1104 pte_t *ptep, struct mm_walk *walk) 1105 { 1106 unsigned long *nr_pages = (unsigned long *)walk->private; 1107 1108 /* Simply install a PTE marker, this causes segfault on access. 
*/ 1109 *ptep = make_pte_marker(PTE_MARKER_GUARD); 1110 (*nr_pages)++; 1111 1112 return 0; 1113 } 1114 1115 static long madvise_guard_install(struct madvise_behavior *madv_behavior) 1116 { 1117 struct vm_area_struct *vma = madv_behavior->vma; 1118 struct madvise_behavior_range *range = &madv_behavior->range; 1119 struct mm_walk_ops walk_ops = { 1120 .pud_entry = guard_install_pud_entry, 1121 .pmd_entry = guard_install_pmd_entry, 1122 .pte_entry = guard_install_pte_entry, 1123 .install_pte = guard_install_set_pte, 1124 .walk_lock = get_walk_lock(madv_behavior->lock_mode), 1125 }; 1126 long err; 1127 int i; 1128 1129 if (!is_valid_guard_vma(vma, /* allow_locked = */false)) 1130 return -EINVAL; 1131 1132 /* 1133 * Set atomically under read lock. All pertinent readers will need to 1134 * acquire an mmap/VMA write lock to read it. All remaining readers may 1135 * or may not see the flag set, but we don't care. 1136 */ 1137 vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT); 1138 1139 /* 1140 * If anonymous and we are establishing page tables the VMA ought to 1141 * have an anon_vma associated with it. 1142 * 1143 * We will hold an mmap read lock if this is necessary, this is checked 1144 * as part of the VMA lock logic. 1145 */ 1146 if (vma_is_anonymous(vma)) { 1147 VM_WARN_ON_ONCE(!vma->anon_vma && 1148 madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK); 1149 1150 err = anon_vma_prepare(vma); 1151 if (err) 1152 return err; 1153 } 1154 1155 /* 1156 * Optimistically try to install the guard marker pages first. If any 1157 * non-guard pages or THP huge pages are encountered, give up and zap 1158 * the range before trying again. 1159 * 1160 * We try a few times before giving up and releasing back to userland to 1161 * loop around, releasing locks in the process to avoid contention. 1162 * 1163 * This would only happen due to races with e.g. page faults or 1164 * khugepaged. 
1165 * 1166 * In most cases we should simply install the guard markers immediately 1167 * with no zap or looping. 1168 */ 1169 for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { 1170 unsigned long nr_pages = 0; 1171 1172 /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ 1173 if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) 1174 err = walk_page_range_vma_unsafe(madv_behavior->vma, 1175 range->start, range->end, &walk_ops, 1176 &nr_pages); 1177 else 1178 err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, 1179 range->end, &walk_ops, &nr_pages); 1180 if (err < 0) 1181 return err; 1182 1183 if (err == 0) { 1184 unsigned long nr_expected_pages = 1185 PHYS_PFN(range->end - range->start); 1186 1187 VM_WARN_ON(nr_pages != nr_expected_pages); 1188 return 0; 1189 } 1190 1191 /* 1192 * OK some of the range have non-guard pages mapped, zap 1193 * them. This leaves existing guard pages in place. 1194 */ 1195 zap_page_range_single(vma, range->start, range->end - range->start); 1196 } 1197 1198 /* 1199 * We were unable to install the guard pages, return to userspace and 1200 * immediately retry, relieving lock contention. 1201 */ 1202 return restart_syscall(); 1203 } 1204 1205 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, 1206 unsigned long next, struct mm_walk *walk) 1207 { 1208 pud_t pudval = pudp_get(pud); 1209 1210 /* If huge, cannot have guard pages present, so no-op - skip. */ 1211 if (pud_trans_huge(pudval)) 1212 walk->action = ACTION_CONTINUE; 1213 1214 return 0; 1215 } 1216 1217 static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, 1218 unsigned long next, struct mm_walk *walk) 1219 { 1220 pmd_t pmdval = pmdp_get(pmd); 1221 1222 /* If huge, cannot have guard pages present, so no-op - skip. 
*/ 1223 if (pmd_trans_huge(pmdval)) 1224 walk->action = ACTION_CONTINUE; 1225 1226 return 0; 1227 } 1228 1229 static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, 1230 unsigned long next, struct mm_walk *walk) 1231 { 1232 pte_t ptent = ptep_get(pte); 1233 1234 if (is_guard_pte_marker(ptent)) { 1235 /* Simply clear the PTE marker. */ 1236 pte_clear_not_present_full(walk->mm, addr, pte, false); 1237 update_mmu_cache(walk->vma, addr, pte); 1238 } 1239 1240 return 0; 1241 } 1242 1243 static long madvise_guard_remove(struct madvise_behavior *madv_behavior) 1244 { 1245 struct vm_area_struct *vma = madv_behavior->vma; 1246 struct madvise_behavior_range *range = &madv_behavior->range; 1247 struct mm_walk_ops wallk_ops = { 1248 .pud_entry = guard_remove_pud_entry, 1249 .pmd_entry = guard_remove_pmd_entry, 1250 .pte_entry = guard_remove_pte_entry, 1251 .walk_lock = get_walk_lock(madv_behavior->lock_mode), 1252 }; 1253 1254 /* 1255 * We're ok with removing guards in mlock()'d ranges, as this is a 1256 * non-destructive action. 1257 */ 1258 if (!is_valid_guard_vma(vma, /* allow_locked = */true)) 1259 return -EINVAL; 1260 1261 return walk_page_range_vma(vma, range->start, range->end, 1262 &wallk_ops, NULL); 1263 } 1264 1265 #ifdef CONFIG_64BIT 1266 /* Does the madvise operation result in discarding of mapped data? */ 1267 static bool is_discard(int behavior) 1268 { 1269 switch (behavior) { 1270 case MADV_FREE: 1271 case MADV_DONTNEED: 1272 case MADV_DONTNEED_LOCKED: 1273 case MADV_REMOVE: 1274 case MADV_DONTFORK: 1275 case MADV_WIPEONFORK: 1276 case MADV_GUARD_INSTALL: 1277 return true; 1278 } 1279 1280 return false; 1281 } 1282 1283 /* 1284 * We are restricted from madvise()'ing mseal()'d VMAs only in very particular 1285 * circumstances - discarding of data from read-only anonymous SEALED mappings. 1286 * 1287 * This is because users cannot trivally discard data from these VMAs, and may 1288 * only do so via an appropriate madvise() call. 
1289 */ 1290 static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1291 { 1292 struct vm_area_struct *vma = madv_behavior->vma; 1293 1294 /* If the VMA isn't sealed we're good. */ 1295 if (!vma_is_sealed(vma)) 1296 return true; 1297 1298 /* For a sealed VMA, we only care about discard operations. */ 1299 if (!is_discard(madv_behavior->behavior)) 1300 return true; 1301 1302 /* 1303 * We explicitly permit all file-backed mappings, whether MAP_SHARED or 1304 * MAP_PRIVATE. 1305 * 1306 * The latter causes some complications. Because now, one can mmap() 1307 * read/write a MAP_PRIVATE mapping, write to it, then mprotect() 1308 * read-only, mseal() and a discard will be permitted. 1309 * 1310 * However, in order to avoid issues with potential use of madvise(..., 1311 * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, 1312 * permit this. 1313 */ 1314 if (!vma_is_anonymous(vma)) 1315 return true; 1316 1317 /* If the user could write to the mapping anyway, then this is fine. */ 1318 if ((vma->vm_flags & VM_WRITE) && 1319 arch_vma_access_permitted(vma, /* write= */ true, 1320 /* execute= */ false, /* foreign= */ false)) 1321 return true; 1322 1323 /* Otherwise, we are not permitted to perform this operation. */ 1324 return false; 1325 } 1326 #else 1327 static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1328 { 1329 return true; 1330 } 1331 #endif 1332 1333 /* 1334 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1335 * will handle splitting a vm area into separate areas, each area with its own 1336 * behavior. 
1337 */ 1338 static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) 1339 { 1340 int behavior = madv_behavior->behavior; 1341 struct vm_area_struct *vma = madv_behavior->vma; 1342 vm_flags_t new_flags = vma->vm_flags; 1343 struct madvise_behavior_range *range = &madv_behavior->range; 1344 int error; 1345 1346 if (unlikely(!can_madvise_modify(madv_behavior))) 1347 return -EPERM; 1348 1349 switch (behavior) { 1350 case MADV_REMOVE: 1351 return madvise_remove(madv_behavior); 1352 case MADV_WILLNEED: 1353 return madvise_willneed(madv_behavior); 1354 case MADV_COLD: 1355 return madvise_cold(madv_behavior); 1356 case MADV_PAGEOUT: 1357 return madvise_pageout(madv_behavior); 1358 case MADV_FREE: 1359 case MADV_DONTNEED: 1360 case MADV_DONTNEED_LOCKED: 1361 return madvise_dontneed_free(madv_behavior); 1362 case MADV_COLLAPSE: 1363 return madvise_collapse(vma, range->start, range->end, 1364 &madv_behavior->lock_dropped); 1365 case MADV_GUARD_INSTALL: 1366 return madvise_guard_install(madv_behavior); 1367 case MADV_GUARD_REMOVE: 1368 return madvise_guard_remove(madv_behavior); 1369 1370 /* The below behaviours update VMAs via madvise_update_vma(). */ 1371 1372 case MADV_NORMAL: 1373 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 1374 break; 1375 case MADV_SEQUENTIAL: 1376 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 1377 break; 1378 case MADV_RANDOM: 1379 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 1380 break; 1381 case MADV_DONTFORK: 1382 new_flags |= VM_DONTCOPY; 1383 break; 1384 case MADV_DOFORK: 1385 if (new_flags & VM_SPECIAL) 1386 return -EINVAL; 1387 new_flags &= ~VM_DONTCOPY; 1388 break; 1389 case MADV_WIPEONFORK: 1390 /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ 1391 if (vma->vm_file || new_flags & VM_SHARED) 1392 return -EINVAL; 1393 new_flags |= VM_WIPEONFORK; 1394 break; 1395 case MADV_KEEPONFORK: 1396 if (new_flags & VM_DROPPABLE) 1397 return -EINVAL; 1398 new_flags &= ~VM_WIPEONFORK; 1399 break; 1400 case MADV_DONTDUMP: 1401 new_flags |= VM_DONTDUMP; 1402 break; 1403 case MADV_DODUMP: 1404 if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || 1405 (new_flags & VM_DROPPABLE)) 1406 return -EINVAL; 1407 new_flags &= ~VM_DONTDUMP; 1408 break; 1409 case MADV_MERGEABLE: 1410 case MADV_UNMERGEABLE: 1411 error = ksm_madvise(vma, range->start, range->end, 1412 behavior, &new_flags); 1413 if (error) 1414 goto out; 1415 break; 1416 case MADV_HUGEPAGE: 1417 case MADV_NOHUGEPAGE: 1418 error = hugepage_madvise(vma, &new_flags, behavior); 1419 if (error) 1420 goto out; 1421 break; 1422 case __MADV_SET_ANON_VMA_NAME: 1423 /* Only anonymous mappings can be named */ 1424 if (vma->vm_file && !vma_is_anon_shmem(vma)) 1425 return -EBADF; 1426 break; 1427 } 1428 1429 /* This is a write operation.*/ 1430 VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); 1431 1432 error = madvise_update_vma(new_flags, madv_behavior); 1433 out: 1434 /* 1435 * madvise() returns EAGAIN if kernel resources, such as 1436 * slab, are temporarily unavailable. 1437 */ 1438 if (error == -ENOMEM) 1439 error = -EAGAIN; 1440 return error; 1441 } 1442 1443 #ifdef CONFIG_MEMORY_FAILURE 1444 /* 1445 * Error injection support for memory error handling. 
1446 */ 1447 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1448 { 1449 unsigned long size; 1450 unsigned long start = madv_behavior->range.start; 1451 unsigned long end = madv_behavior->range.end; 1452 1453 if (!capable(CAP_SYS_ADMIN)) 1454 return -EPERM; 1455 1456 for (; start < end; start += size) { 1457 unsigned long pfn; 1458 struct page *page; 1459 int ret; 1460 1461 ret = get_user_pages_fast(start, 1, 0, &page); 1462 if (ret != 1) 1463 return ret; 1464 pfn = page_to_pfn(page); 1465 1466 /* 1467 * When soft offlining hugepages, after migrating the page 1468 * we dissolve it, therefore in the second loop "page" will 1469 * no longer be a compound page. 1470 */ 1471 size = page_size(compound_head(page)); 1472 1473 if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { 1474 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1475 pfn, start); 1476 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1477 } else { 1478 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1479 pfn, start); 1480 ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); 1481 if (ret == -EOPNOTSUPP) 1482 ret = 0; 1483 } 1484 1485 if (ret) 1486 return ret; 1487 } 1488 1489 return 0; 1490 } 1491 1492 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1493 { 1494 switch (madv_behavior->behavior) { 1495 case MADV_HWPOISON: 1496 case MADV_SOFT_OFFLINE: 1497 return true; 1498 default: 1499 return false; 1500 } 1501 } 1502 1503 #else 1504 1505 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1506 { 1507 return 0; 1508 } 1509 1510 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1511 { 1512 return false; 1513 } 1514 1515 #endif /* CONFIG_MEMORY_FAILURE */ 1516 1517 static bool 1518 madvise_behavior_valid(int behavior) 1519 { 1520 switch (behavior) { 1521 case MADV_DOFORK: 1522 case MADV_DONTFORK: 1523 case MADV_NORMAL: 1524 case 
MADV_SEQUENTIAL: 1525 case MADV_RANDOM: 1526 case MADV_REMOVE: 1527 case MADV_WILLNEED: 1528 case MADV_DONTNEED: 1529 case MADV_DONTNEED_LOCKED: 1530 case MADV_FREE: 1531 case MADV_COLD: 1532 case MADV_PAGEOUT: 1533 case MADV_POPULATE_READ: 1534 case MADV_POPULATE_WRITE: 1535 #ifdef CONFIG_KSM 1536 case MADV_MERGEABLE: 1537 case MADV_UNMERGEABLE: 1538 #endif 1539 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1540 case MADV_HUGEPAGE: 1541 case MADV_NOHUGEPAGE: 1542 case MADV_COLLAPSE: 1543 #endif 1544 case MADV_DONTDUMP: 1545 case MADV_DODUMP: 1546 case MADV_WIPEONFORK: 1547 case MADV_KEEPONFORK: 1548 case MADV_GUARD_INSTALL: 1549 case MADV_GUARD_REMOVE: 1550 #ifdef CONFIG_MEMORY_FAILURE 1551 case MADV_SOFT_OFFLINE: 1552 case MADV_HWPOISON: 1553 #endif 1554 return true; 1555 1556 default: 1557 return false; 1558 } 1559 } 1560 1561 /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ 1562 static bool process_madvise_remote_valid(int behavior) 1563 { 1564 switch (behavior) { 1565 case MADV_COLD: 1566 case MADV_PAGEOUT: 1567 case MADV_WILLNEED: 1568 case MADV_COLLAPSE: 1569 return true; 1570 default: 1571 return false; 1572 } 1573 } 1574 1575 /* Does this operation invoke anon_vma_prepare()? */ 1576 static bool prepares_anon_vma(int behavior) 1577 { 1578 switch (behavior) { 1579 case MADV_GUARD_INSTALL: 1580 return true; 1581 default: 1582 return false; 1583 } 1584 } 1585 1586 /* 1587 * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA 1588 * read lock only now we have a VMA to examine? 1589 */ 1590 static bool is_vma_lock_sufficient(struct vm_area_struct *vma, 1591 struct madvise_behavior *madv_behavior) 1592 { 1593 /* Must span only a single VMA.*/ 1594 if (madv_behavior->range.end > vma->vm_end) 1595 return false; 1596 /* Remote processes unsupported. */ 1597 if (current->mm != vma->vm_mm) 1598 return false; 1599 /* Userfaultfd unsupported. 
*/ 1600 if (userfaultfd_armed(vma)) 1601 return false; 1602 /* 1603 * anon_vma_prepare() explicitly requires an mmap lock for 1604 * serialisation, so we cannot use a VMA lock in this case. 1605 * 1606 * Note we might race with anon_vma being set, however this makes this 1607 * check overly paranoid which is safe. 1608 */ 1609 if (vma_is_anonymous(vma) && 1610 prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma) 1611 return false; 1612 1613 return true; 1614 } 1615 1616 /* 1617 * Try to acquire a VMA read lock if possible. 1618 * 1619 * We only support this lock over a single VMA, which the input range must 1620 * span either partially or fully. 1621 * 1622 * This function always returns with an appropriate lock held. If a VMA read 1623 * lock could be acquired, we return true and set madv_behavior state 1624 * accordingly. 1625 * 1626 * If a VMA read lock could not be acquired, we return false and expect caller to 1627 * fallback to mmap lock behaviour. 1628 */ 1629 static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) 1630 { 1631 struct mm_struct *mm = madv_behavior->mm; 1632 struct vm_area_struct *vma; 1633 1634 vma = lock_vma_under_rcu(mm, madv_behavior->range.start); 1635 if (!vma) 1636 goto take_mmap_read_lock; 1637 1638 if (!is_vma_lock_sufficient(vma, madv_behavior)) { 1639 vma_end_read(vma); 1640 goto take_mmap_read_lock; 1641 } 1642 1643 madv_behavior->vma = vma; 1644 return true; 1645 1646 take_mmap_read_lock: 1647 mmap_read_lock(mm); 1648 madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; 1649 return false; 1650 } 1651 1652 /* 1653 * Walk the vmas in range [start,end), and call the madvise_vma_behavior 1654 * function on each one. The function will get start and end parameters that 1655 * cover the overlap between the current vma and the original range. 
Any 1656 * unmapped regions in the original range will result in this function returning 1657 * -ENOMEM while still calling the madvise_vma_behavior function on all of the 1658 * existing vmas in the range. Must be called with the mmap_lock held for 1659 * reading or writing. 1660 */ 1661 static 1662 int madvise_walk_vmas(struct madvise_behavior *madv_behavior) 1663 { 1664 struct mm_struct *mm = madv_behavior->mm; 1665 struct madvise_behavior_range *range = &madv_behavior->range; 1666 /* range is updated to span each VMA, so store end of entire range. */ 1667 unsigned long last_end = range->end; 1668 int unmapped_error = 0; 1669 int error; 1670 struct vm_area_struct *prev, *vma; 1671 1672 /* 1673 * If VMA read lock is supported, apply madvise to a single VMA 1674 * tentatively, avoiding walking VMAs. 1675 */ 1676 if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && 1677 try_vma_read_lock(madv_behavior)) { 1678 error = madvise_vma_behavior(madv_behavior); 1679 vma_end_read(madv_behavior->vma); 1680 return error; 1681 } 1682 1683 vma = find_vma_prev(mm, range->start, &prev); 1684 if (vma && range->start > vma->vm_start) 1685 prev = vma; 1686 1687 for (;;) { 1688 /* Still start < end. */ 1689 if (!vma) 1690 return -ENOMEM; 1691 1692 /* Here start < (last_end|vma->vm_end). */ 1693 if (range->start < vma->vm_start) { 1694 /* 1695 * This indicates a gap between VMAs in the input 1696 * range. This does not cause the operation to abort, 1697 * rather we simply return -ENOMEM to indicate that this 1698 * has happened, but carry on. 1699 */ 1700 unmapped_error = -ENOMEM; 1701 range->start = vma->vm_start; 1702 if (range->start >= last_end) 1703 break; 1704 } 1705 1706 /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ 1707 range->end = min(vma->vm_end, last_end); 1708 1709 /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). 
*/ 1710 madv_behavior->prev = prev; 1711 madv_behavior->vma = vma; 1712 error = madvise_vma_behavior(madv_behavior); 1713 if (error) 1714 return error; 1715 if (madv_behavior->lock_dropped) { 1716 /* We dropped the mmap lock, we can't ref the VMA. */ 1717 prev = NULL; 1718 vma = NULL; 1719 madv_behavior->lock_dropped = false; 1720 } else { 1721 vma = madv_behavior->vma; 1722 prev = vma; 1723 } 1724 1725 if (vma && range->end < vma->vm_end) 1726 range->end = vma->vm_end; 1727 if (range->end >= last_end) 1728 break; 1729 1730 vma = find_vma(mm, vma ? vma->vm_end : range->end); 1731 range->start = range->end; 1732 } 1733 1734 return unmapped_error; 1735 } 1736 1737 /* 1738 * Any behaviour which results in changes to the vma->vm_flags needs to 1739 * take mmap_lock for writing. Others, which simply traverse vmas, need 1740 * to only take it for reading. 1741 */ 1742 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) 1743 { 1744 if (is_memory_failure(madv_behavior)) 1745 return MADVISE_NO_LOCK; 1746 1747 switch (madv_behavior->behavior) { 1748 case MADV_REMOVE: 1749 case MADV_WILLNEED: 1750 case MADV_COLD: 1751 case MADV_PAGEOUT: 1752 case MADV_POPULATE_READ: 1753 case MADV_POPULATE_WRITE: 1754 case MADV_COLLAPSE: 1755 return MADVISE_MMAP_READ_LOCK; 1756 case MADV_GUARD_INSTALL: 1757 case MADV_GUARD_REMOVE: 1758 case MADV_DONTNEED: 1759 case MADV_DONTNEED_LOCKED: 1760 case MADV_FREE: 1761 return MADVISE_VMA_READ_LOCK; 1762 default: 1763 return MADVISE_MMAP_WRITE_LOCK; 1764 } 1765 } 1766 1767 static int madvise_lock(struct madvise_behavior *madv_behavior) 1768 { 1769 struct mm_struct *mm = madv_behavior->mm; 1770 enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); 1771 1772 switch (lock_mode) { 1773 case MADVISE_NO_LOCK: 1774 break; 1775 case MADVISE_MMAP_WRITE_LOCK: 1776 if (mmap_write_lock_killable(mm)) 1777 return -EINTR; 1778 break; 1779 case MADVISE_MMAP_READ_LOCK: 1780 mmap_read_lock(mm); 1781 break; 1782 case 
MADVISE_VMA_READ_LOCK: 1783 /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ 1784 break; 1785 } 1786 1787 madv_behavior->lock_mode = lock_mode; 1788 return 0; 1789 } 1790 1791 static void madvise_unlock(struct madvise_behavior *madv_behavior) 1792 { 1793 struct mm_struct *mm = madv_behavior->mm; 1794 1795 switch (madv_behavior->lock_mode) { 1796 case MADVISE_NO_LOCK: 1797 return; 1798 case MADVISE_MMAP_WRITE_LOCK: 1799 mmap_write_unlock(mm); 1800 break; 1801 case MADVISE_MMAP_READ_LOCK: 1802 mmap_read_unlock(mm); 1803 break; 1804 case MADVISE_VMA_READ_LOCK: 1805 /* We will drop the lock per-VMA in madvise_walk_vmas(). */ 1806 break; 1807 } 1808 1809 madv_behavior->lock_mode = MADVISE_NO_LOCK; 1810 } 1811 1812 static bool madvise_batch_tlb_flush(int behavior) 1813 { 1814 switch (behavior) { 1815 case MADV_DONTNEED: 1816 case MADV_DONTNEED_LOCKED: 1817 case MADV_FREE: 1818 return true; 1819 default: 1820 return false; 1821 } 1822 } 1823 1824 static void madvise_init_tlb(struct madvise_behavior *madv_behavior) 1825 { 1826 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1827 tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); 1828 } 1829 1830 static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) 1831 { 1832 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1833 tlb_finish_mmu(madv_behavior->tlb); 1834 } 1835 1836 static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) 1837 { 1838 size_t len; 1839 1840 if (!madvise_behavior_valid(behavior)) 1841 return false; 1842 1843 if (!PAGE_ALIGNED(start)) 1844 return false; 1845 len = PAGE_ALIGN(len_in); 1846 1847 /* Check to see whether len was rounded up from small -ve to zero */ 1848 if (len_in && !len) 1849 return false; 1850 1851 if (start + len < start) 1852 return false; 1853 1854 return true; 1855 } 1856 1857 /* 1858 * madvise_should_skip() - Return if the request is invalid or nothing. 1859 * @start: Start address of madvise-requested address range. 
1860 * @len_in: Length of madvise-requested address range. 1861 * @behavior: Requested madvise behavior. 1862 * @err: Pointer to store an error code from the check. 1863 * 1864 * If the specified behaviour is invalid or nothing would occur, we skip the 1865 * operation. This function returns true in the cases, otherwise false. In 1866 * the former case we store an error on @err. 1867 */ 1868 static bool madvise_should_skip(unsigned long start, size_t len_in, 1869 int behavior, int *err) 1870 { 1871 if (!is_valid_madvise(start, len_in, behavior)) { 1872 *err = -EINVAL; 1873 return true; 1874 } 1875 if (start + PAGE_ALIGN(len_in) == start) { 1876 *err = 0; 1877 return true; 1878 } 1879 return false; 1880 } 1881 1882 static bool is_madvise_populate(struct madvise_behavior *madv_behavior) 1883 { 1884 switch (madv_behavior->behavior) { 1885 case MADV_POPULATE_READ: 1886 case MADV_POPULATE_WRITE: 1887 return true; 1888 default: 1889 return false; 1890 } 1891 } 1892 1893 /* 1894 * untagged_addr_remote() assumes mmap_lock is already held. On 1895 * architectures like x86 and RISC-V, tagging is tricky because each 1896 * mm may have a different tagging mask. However, we might only hold 1897 * the per-VMA lock (currently only local processes are supported), 1898 * so untagged_addr is used to avoid the mmap_lock assertion for 1899 * local processes. 1900 */ 1901 static inline unsigned long get_untagged_addr(struct mm_struct *mm, 1902 unsigned long start) 1903 { 1904 return current->mm == mm ? 
untagged_addr(start) : 1905 untagged_addr_remote(mm, start); 1906 } 1907 1908 static int madvise_do_behavior(unsigned long start, size_t len_in, 1909 struct madvise_behavior *madv_behavior) 1910 { 1911 struct blk_plug plug; 1912 int error; 1913 struct madvise_behavior_range *range = &madv_behavior->range; 1914 1915 if (is_memory_failure(madv_behavior)) { 1916 range->start = start; 1917 range->end = start + len_in; 1918 return madvise_inject_error(madv_behavior); 1919 } 1920 1921 range->start = get_untagged_addr(madv_behavior->mm, start); 1922 range->end = range->start + PAGE_ALIGN(len_in); 1923 1924 blk_start_plug(&plug); 1925 if (is_madvise_populate(madv_behavior)) 1926 error = madvise_populate(madv_behavior); 1927 else 1928 error = madvise_walk_vmas(madv_behavior); 1929 blk_finish_plug(&plug); 1930 return error; 1931 } 1932 1933 /* 1934 * The madvise(2) system call. 1935 * 1936 * Applications can use madvise() to advise the kernel how it should 1937 * handle paging I/O in this VM area. The idea is to help the kernel 1938 * use appropriate read-ahead and caching techniques. The information 1939 * provided is advisory only, and can be safely disregarded by the 1940 * kernel without affecting the correct operation of the application. 1941 * 1942 * behavior values: 1943 * MADV_NORMAL - the default behavior is to read clusters. This 1944 * results in some read-ahead and read-behind. 1945 * MADV_RANDOM - the system should read the minimum amount of data 1946 * on any access, since it is unlikely that the appli- 1947 * cation will need more than what it asks for. 1948 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1949 * once, so they can be aggressively read ahead, and 1950 * can be freed soon after they are accessed. 1951 * MADV_WILLNEED - the application is notifying the system to read 1952 * some pages ahead. 1953 * MADV_DONTNEED - the application is finished with the given range, 1954 * so the kernel can free resources associated with it. 
1955 * MADV_FREE - the application marks pages in the given range as lazy free, 1956 * where actual purges are postponed until memory pressure happens. 1957 * MADV_REMOVE - the application wants to free up the given range of 1958 * pages and associated backing store. 1959 * MADV_DONTFORK - omit this area from child's address space when forking: 1960 * typically, to avoid COWing pages pinned by get_user_pages(). 1961 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1962 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1963 * range after a fork. 1964 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 1965 * MADV_HWPOISON - trigger memory error handler as if the given memory range 1966 * were corrupted by unrecoverable hardware memory failure. 1967 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 1968 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 1969 * this area with pages of identical content from other such areas. 1970 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 1971 * MADV_HUGEPAGE - the application wants to back the given range by transparent 1972 * huge pages in the future. Existing pages might be coalesced and 1973 * new pages might be allocated as THP. 1974 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 1975 * transparent huge pages so the existing pages will not be 1976 * coalesced into THP and new pages will not be allocated as THP. 1977 * MADV_COLLAPSE - synchronously coalesce pages into new THP. 1978 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1979 * from being included in its core dump. 1980 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1981 * MADV_COLD - the application is not expected to use this memory soon, 1982 * deactivate pages in this range so that they can be reclaimed 1983 * easily if memory pressure happens. 
1984 * MADV_PAGEOUT - the application is not expected to use this memory soon, 1985 * page out the pages in this range immediately. 1986 * MADV_POPULATE_READ - populate (prefault) page tables readable by 1987 * triggering read faults if required 1988 * MADV_POPULATE_WRITE - populate (prefault) page tables writable by 1989 * triggering write faults if required 1990 * 1991 * return values: 1992 * zero - success 1993 * -EINVAL - start + len < 0, start is not page-aligned, 1994 * "behavior" is not a valid value, or application 1995 * is attempting to release locked or shared pages, 1996 * or the specified address range includes file, Huge TLB, 1997 * MAP_SHARED or VMPFNMAP range. 1998 * -ENOMEM - addresses in the specified range are not currently 1999 * mapped, or are outside the AS of the process. 2000 * -EIO - an I/O error occurred while paging in data. 2001 * -EBADF - map exists, but area maps something that isn't a file. 2002 * -EAGAIN - a kernel resource was temporarily unavailable. 2003 * -EPERM - memory is sealed. 2004 */ 2005 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 2006 { 2007 int error; 2008 struct mmu_gather tlb; 2009 struct madvise_behavior madv_behavior = { 2010 .mm = mm, 2011 .behavior = behavior, 2012 .tlb = &tlb, 2013 }; 2014 2015 if (madvise_should_skip(start, len_in, behavior, &error)) 2016 return error; 2017 error = madvise_lock(&madv_behavior); 2018 if (error) 2019 return error; 2020 madvise_init_tlb(&madv_behavior); 2021 error = madvise_do_behavior(start, len_in, &madv_behavior); 2022 madvise_finish_tlb(&madv_behavior); 2023 madvise_unlock(&madv_behavior); 2024 2025 return error; 2026 } 2027 2028 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 2029 { 2030 return do_madvise(current->mm, start, len_in, behavior); 2031 } 2032 2033 /* Perform an madvise operation over a vector of addresses and lengths. 
*/ 2034 static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, 2035 int behavior) 2036 { 2037 ssize_t ret = 0; 2038 size_t total_len; 2039 struct mmu_gather tlb; 2040 struct madvise_behavior madv_behavior = { 2041 .mm = mm, 2042 .behavior = behavior, 2043 .tlb = &tlb, 2044 }; 2045 2046 total_len = iov_iter_count(iter); 2047 2048 ret = madvise_lock(&madv_behavior); 2049 if (ret) 2050 return ret; 2051 madvise_init_tlb(&madv_behavior); 2052 2053 while (iov_iter_count(iter)) { 2054 unsigned long start = (unsigned long)iter_iov_addr(iter); 2055 size_t len_in = iter_iov_len(iter); 2056 int error; 2057 2058 if (madvise_should_skip(start, len_in, behavior, &error)) 2059 ret = error; 2060 else 2061 ret = madvise_do_behavior(start, len_in, &madv_behavior); 2062 /* 2063 * An madvise operation is attempting to restart the syscall, 2064 * but we cannot proceed as it would not be correct to repeat 2065 * the operation in aggregate, and would be surprising to the 2066 * user. 2067 * 2068 * We drop and reacquire locks so it is safe to just loop and 2069 * try again. We check for fatal signals in case we need exit 2070 * early anyway. 2071 */ 2072 if (ret == -ERESTARTNOINTR) { 2073 if (fatal_signal_pending(current)) { 2074 ret = -EINTR; 2075 break; 2076 } 2077 2078 /* Drop and reacquire lock to unwind race. */ 2079 madvise_finish_tlb(&madv_behavior); 2080 madvise_unlock(&madv_behavior); 2081 ret = madvise_lock(&madv_behavior); 2082 if (ret) 2083 goto out; 2084 madvise_init_tlb(&madv_behavior); 2085 continue; 2086 } 2087 if (ret < 0) 2088 break; 2089 iov_iter_advance(iter, iter_iov_len(iter)); 2090 } 2091 madvise_finish_tlb(&madv_behavior); 2092 madvise_unlock(&madv_behavior); 2093 2094 out: 2095 ret = (total_len - iov_iter_count(iter)) ? 
: ret; 2096 2097 return ret; 2098 } 2099 2100 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 2101 size_t, vlen, int, behavior, unsigned int, flags) 2102 { 2103 ssize_t ret; 2104 struct iovec iovstack[UIO_FASTIOV]; 2105 struct iovec *iov = iovstack; 2106 struct iov_iter iter; 2107 struct task_struct *task; 2108 struct mm_struct *mm; 2109 unsigned int f_flags; 2110 2111 if (flags != 0) { 2112 ret = -EINVAL; 2113 goto out; 2114 } 2115 2116 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 2117 if (ret < 0) 2118 goto out; 2119 2120 task = pidfd_get_task(pidfd, &f_flags); 2121 if (IS_ERR(task)) { 2122 ret = PTR_ERR(task); 2123 goto free_iov; 2124 } 2125 2126 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 2127 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 2128 if (IS_ERR(mm)) { 2129 ret = PTR_ERR(mm); 2130 goto release_task; 2131 } 2132 2133 /* 2134 * We need only perform this check if we are attempting to manipulate a 2135 * remote process's address space. 2136 */ 2137 if (mm != current->mm && !process_madvise_remote_valid(behavior)) { 2138 ret = -EINVAL; 2139 goto release_mm; 2140 } 2141 2142 /* 2143 * Require CAP_SYS_NICE for influencing process performance. Note that 2144 * only non-destructive hints are currently supported for remote 2145 * processes. 
2146 */ 2147 if (mm != current->mm && !capable(CAP_SYS_NICE)) { 2148 ret = -EPERM; 2149 goto release_mm; 2150 } 2151 2152 ret = vector_madvise(mm, &iter, behavior); 2153 2154 release_mm: 2155 mmput(mm); 2156 release_task: 2157 put_task_struct(task); 2158 free_iov: 2159 kfree(iov); 2160 out: 2161 return ret; 2162 } 2163 2164 #ifdef CONFIG_ANON_VMA_NAME 2165 2166 #define ANON_VMA_NAME_MAX_LEN 80 2167 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" 2168 2169 static inline bool is_valid_name_char(char ch) 2170 { 2171 /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ 2172 return ch > 0x1f && ch < 0x7f && 2173 !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); 2174 } 2175 2176 static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 2177 unsigned long len_in, struct anon_vma_name *anon_name) 2178 { 2179 unsigned long end; 2180 unsigned long len; 2181 int error; 2182 struct madvise_behavior madv_behavior = { 2183 .mm = mm, 2184 .behavior = __MADV_SET_ANON_VMA_NAME, 2185 .anon_name = anon_name, 2186 }; 2187 2188 if (start & ~PAGE_MASK) 2189 return -EINVAL; 2190 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 2191 2192 /* Check to see whether len was rounded up from small -ve to zero */ 2193 if (len_in && !len) 2194 return -EINVAL; 2195 2196 end = start + len; 2197 if (end < start) 2198 return -EINVAL; 2199 2200 if (end == start) 2201 return 0; 2202 2203 madv_behavior.range.start = start; 2204 madv_behavior.range.end = end; 2205 2206 error = madvise_lock(&madv_behavior); 2207 if (error) 2208 return error; 2209 error = madvise_walk_vmas(&madv_behavior); 2210 madvise_unlock(&madv_behavior); 2211 2212 return error; 2213 } 2214 2215 int set_anon_vma_name(unsigned long addr, unsigned long size, 2216 const char __user *uname) 2217 { 2218 struct anon_vma_name *anon_name = NULL; 2219 struct mm_struct *mm = current->mm; 2220 int error; 2221 2222 if (uname) { 2223 char *name, *pch; 2224 2225 name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); 2226 if 
(IS_ERR(name)) 2227 return PTR_ERR(name); 2228 2229 for (pch = name; *pch != '\0'; pch++) { 2230 if (!is_valid_name_char(*pch)) { 2231 kfree(name); 2232 return -EINVAL; 2233 } 2234 } 2235 /* anon_vma has its own copy */ 2236 anon_name = anon_vma_name_alloc(name); 2237 kfree(name); 2238 if (!anon_name) 2239 return -ENOMEM; 2240 } 2241 2242 error = madvise_set_anon_name(mm, addr, size, anon_name); 2243 anon_vma_name_put(anon_name); 2244 2245 return error; 2246 } 2247 #endif 2248