1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/mmu_context.h> 23 #include <linux/string.h> 24 #include <linux/uio.h> 25 #include <linux/ksm.h> 26 #include <linux/fs.h> 27 #include <linux/file.h> 28 #include <linux/blkdev.h> 29 #include <linux/backing-dev.h> 30 #include <linux/pagewalk.h> 31 #include <linux/swap.h> 32 #include <linux/leafops.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/mmu_notifier.h> 35 36 #include <asm/tlb.h> 37 38 #include "internal.h" 39 #include "swap.h" 40 41 #define __MADV_SET_ANON_VMA_NAME (-1) 42 43 /* 44 * Maximum number of attempts we make to install guard pages before we give up 45 * and return -ERESTARTNOINTR to have userspace try again. 46 */ 47 #define MAX_MADVISE_GUARD_RETRIES 3 48 49 struct madvise_walk_private { 50 struct mmu_gather *tlb; 51 bool pageout; 52 }; 53 54 enum madvise_lock_mode { 55 MADVISE_NO_LOCK, 56 MADVISE_MMAP_READ_LOCK, 57 MADVISE_MMAP_WRITE_LOCK, 58 MADVISE_VMA_READ_LOCK, 59 }; 60 61 struct madvise_behavior_range { 62 unsigned long start; 63 unsigned long end; 64 }; 65 66 struct madvise_behavior { 67 struct mm_struct *mm; 68 int behavior; 69 struct mmu_gather *tlb; 70 enum madvise_lock_mode lock_mode; 71 struct anon_vma_name *anon_name; 72 73 /* 74 * The range over which the behaviour is currently being applied. If 75 * traversing multiple VMAs, this is updated for each. 76 */ 77 struct madvise_behavior_range range; 78 /* The VMA and VMA preceding it (if applicable) currently targeted. */ 79 struct vm_area_struct *prev; 80 struct vm_area_struct *vma; 81 bool lock_dropped; 82 }; 83 84 #ifdef CONFIG_ANON_VMA_NAME 85 static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); 86 87 struct anon_vma_name *anon_vma_name_alloc(const char *name) 88 { 89 struct anon_vma_name *anon_name; 90 size_t count; 91 92 /* Add 1 for NUL terminator at the end of the anon_name->name */ 93 count = strlen(name) + 1; 94 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 95 if (anon_name) { 96 kref_init(&anon_name->kref); 97 memcpy(anon_name->name, name, count); 98 } 99 100 return anon_name; 101 } 102 103 void anon_vma_name_free(struct kref *kref) 104 { 105 struct anon_vma_name *anon_name = 106 container_of(kref, struct anon_vma_name, kref); 107 kfree(anon_name); 108 } 109 110 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 111 { 112 vma_assert_stabilised(vma); 113 return vma->anon_name; 114 } 115 116 /* mmap_lock should be write-locked */ 117 static int replace_anon_vma_name(struct vm_area_struct *vma, 118 struct anon_vma_name *anon_name) 119 { 120 struct anon_vma_name *orig_name = anon_vma_name(vma); 121 122 if (!anon_name) { 123 vma->anon_name = NULL; 124 anon_vma_name_put(orig_name); 125 return 0; 126 } 127 128 if (anon_vma_name_eq(orig_name, anon_name)) 129 return 0; 130 131 vma->anon_name = anon_vma_name_reuse(anon_name); 132 anon_vma_name_put(orig_name); 133 134 return 0; 135 } 136 #else /* CONFIG_ANON_VMA_NAME */ 137 static int replace_anon_vma_name(struct vm_area_struct *vma, 138 struct anon_vma_name *anon_name) 139 { 140 if (anon_name) 141 return -EINVAL; 142 143 return 0; 144 } 145 #endif /* CONFIG_ANON_VMA_NAME */ 146 /* 147 * Update the vm_flags or anon_name on region of a vma, splitting it or merging 148 * it as necessary. Must be called with mmap_lock held for writing. 149 */ 150 static int madvise_update_vma(vm_flags_t new_flags, 151 struct madvise_behavior *madv_behavior) 152 { 153 struct vm_area_struct *vma = madv_behavior->vma; 154 struct madvise_behavior_range *range = &madv_behavior->range; 155 struct anon_vma_name *anon_name = madv_behavior->anon_name; 156 bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; 157 VMA_ITERATOR(vmi, madv_behavior->mm, range->start); 158 159 if (new_flags == vma->vm_flags && (!set_new_anon_name || 160 anon_vma_name_eq(anon_vma_name(vma), anon_name))) 161 return 0; 162 163 if (set_new_anon_name) 164 vma = vma_modify_name(&vmi, madv_behavior->prev, vma, 165 range->start, range->end, anon_name); 166 else 167 vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, 168 range->start, range->end, &new_flags); 169 170 if (IS_ERR(vma)) 171 return PTR_ERR(vma); 172 173 madv_behavior->vma = vma; 174 175 /* vm_flags is protected by the mmap_lock held in write mode. */ 176 vma_start_write(vma); 177 vm_flags_reset(vma, new_flags); 178 if (set_new_anon_name) 179 return replace_anon_vma_name(vma, anon_name); 180 181 return 0; 182 } 183 184 #ifdef CONFIG_SWAP 185 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 186 unsigned long end, struct mm_walk *walk) 187 { 188 struct vm_area_struct *vma = walk->private; 189 struct swap_iocb *splug = NULL; 190 pte_t *ptep = NULL; 191 spinlock_t *ptl; 192 unsigned long addr; 193 194 for (addr = start; addr < end; addr += PAGE_SIZE) { 195 pte_t pte; 196 softleaf_t entry; 197 struct folio *folio; 198 199 if (!ptep++) { 200 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 201 if (!ptep) 202 break; 203 } 204 205 pte = ptep_get(ptep); 206 entry = softleaf_from_pte(pte); 207 if (unlikely(!softleaf_is_swap(entry))) 208 continue; 209 210 pte_unmap_unlock(ptep, ptl); 211 ptep = NULL; 212 213 folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 214 vma, addr, &splug); 215 if (folio) 216 folio_put(folio); 217 } 218 219 if (ptep) 220 pte_unmap_unlock(ptep, ptl); 221 swap_read_unplug(splug); 222 cond_resched(); 223 224 return 0; 225 } 226 227 static const struct mm_walk_ops swapin_walk_ops = { 228 .pmd_entry = swapin_walk_pmd_entry, 229 .walk_lock = PGWALK_RDLOCK, 230 }; 231 232 static void shmem_swapin_range(struct vm_area_struct *vma, 233 unsigned long start, unsigned long end, 234 struct address_space *mapping) 235 { 236 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 237 pgoff_t end_index = linear_page_index(vma, end) - 1; 238 struct folio *folio; 239 struct swap_iocb *splug = NULL; 240 241 rcu_read_lock(); 242 xas_for_each(&xas, folio, end_index) { 243 unsigned long addr; 244 swp_entry_t entry; 245 246 if (!xa_is_value(folio)) 247 continue; 248 entry = radix_to_swp_entry(folio); 249 /* There might be swapin error entries in shmem mapping. */ 250 if (!softleaf_is_swap(entry)) 251 continue; 252 253 addr = vma->vm_start + 254 ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); 255 xas_pause(&xas); 256 rcu_read_unlock(); 257 258 folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), 259 vma, addr, &splug); 260 if (folio) 261 folio_put(folio); 262 263 rcu_read_lock(); 264 } 265 rcu_read_unlock(); 266 swap_read_unplug(splug); 267 } 268 #endif /* CONFIG_SWAP */ 269 270 static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) 271 { 272 VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); 273 madv_behavior->lock_dropped = true; 274 } 275 276 /* 277 * Schedule all required I/O operations. Do not wait for completion. 278 */ 279 static long madvise_willneed(struct madvise_behavior *madv_behavior) 280 { 281 struct vm_area_struct *vma = madv_behavior->vma; 282 struct mm_struct *mm = madv_behavior->mm; 283 struct file *file = vma->vm_file; 284 unsigned long start = madv_behavior->range.start; 285 unsigned long end = madv_behavior->range.end; 286 loff_t offset; 287 288 #ifdef CONFIG_SWAP 289 if (!file) { 290 walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma); 291 lru_add_drain(); /* Push any new pages onto the LRU now */ 292 return 0; 293 } 294 295 if (shmem_mapping(file->f_mapping)) { 296 shmem_swapin_range(vma, start, end, file->f_mapping); 297 lru_add_drain(); /* Push any new pages onto the LRU now */ 298 return 0; 299 } 300 #else 301 if (!file) 302 return -EBADF; 303 #endif 304 305 if (IS_DAX(file_inode(file))) { 306 /* no bad return value, but ignore advice */ 307 return 0; 308 } 309 310 /* 311 * Filesystem's fadvise may need to take various locks. We need to 312 * explicitly grab a reference because the vma (and hence the 313 * vma's reference to the file) can go away as soon as we drop 314 * mmap_lock. 315 */ 316 mark_mmap_lock_dropped(madv_behavior); 317 get_file(file); 318 offset = (loff_t)(start - vma->vm_start) 319 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 320 mmap_read_unlock(mm); 321 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 322 fput(file); 323 mmap_read_lock(mm); 324 return 0; 325 } 326 327 static inline bool can_do_file_pageout(struct vm_area_struct *vma) 328 { 329 if (!vma->vm_file) 330 return false; 331 /* 332 * paging out pagecache only for non-anonymous mappings that correspond 333 * to the files the calling process could (if tried) open for writing; 334 * otherwise we'd be including shared non-exclusive mappings, which 335 * opens a side channel. 336 */ 337 return inode_owner_or_capable(&nop_mnt_idmap, 338 file_inode(vma->vm_file)) || 339 file_permission(vma->vm_file, MAY_WRITE) == 0; 340 } 341 342 static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, 343 struct folio *folio, pte_t *ptep, 344 pte_t *ptentp) 345 { 346 int max_nr = (end - addr) / PAGE_SIZE; 347 348 return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, 349 FPB_MERGE_YOUNG_DIRTY); 350 } 351 352 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 353 unsigned long addr, unsigned long end, 354 struct mm_walk *walk) 355 { 356 struct madvise_walk_private *private = walk->private; 357 struct mmu_gather *tlb = private->tlb; 358 bool pageout = private->pageout; 359 struct mm_struct *mm = tlb->mm; 360 struct vm_area_struct *vma = walk->vma; 361 pte_t *start_pte, *pte, ptent; 362 spinlock_t *ptl; 363 struct folio *folio = NULL; 364 LIST_HEAD(folio_list); 365 bool pageout_anon_only_filter; 366 unsigned int batch_count = 0; 367 int nr; 368 369 if (fatal_signal_pending(current)) 370 return -EINTR; 371 372 pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && 373 !can_do_file_pageout(vma); 374 375 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 376 if (pmd_trans_huge(*pmd)) { 377 pmd_t orig_pmd; 378 unsigned long next = pmd_addr_end(addr, end); 379 380 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 381 ptl = pmd_trans_huge_lock(pmd, vma); 382 if (!ptl) 383 return 0; 384 385 orig_pmd = *pmd; 386 if (is_huge_zero_pmd(orig_pmd)) 387 goto huge_unlock; 388 389 if (unlikely(!pmd_present(orig_pmd))) { 390 VM_BUG_ON(thp_migration_supported() && 391 !pmd_is_migration_entry(orig_pmd)); 392 goto huge_unlock; 393 } 394 395 folio = pmd_folio(orig_pmd); 396 397 /* Do not interfere with other mappings of this folio */ 398 if (folio_maybe_mapped_shared(folio)) 399 goto huge_unlock; 400 401 if (pageout_anon_only_filter && !folio_test_anon(folio)) 402 goto huge_unlock; 403 404 if (next - addr != HPAGE_PMD_SIZE) { 405 int err; 406 407 folio_get(folio); 408 spin_unlock(ptl); 409 folio_lock(folio); 410 err = split_folio(folio); 411 folio_unlock(folio); 412 folio_put(folio); 413 if (!err) 414 goto regular_folio; 415 return 0; 416 } 417 418 if (!pageout && pmd_young(orig_pmd)) { 419 pmdp_invalidate(vma, addr, pmd); 420 orig_pmd = pmd_mkold(orig_pmd); 421 422 set_pmd_at(mm, addr, pmd, orig_pmd); 423 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 424 } 425 426 folio_clear_referenced(folio); 427 folio_test_clear_young(folio); 428 if (folio_test_active(folio)) 429 folio_set_workingset(folio); 430 if (pageout) { 431 if (folio_isolate_lru(folio)) { 432 if (folio_test_unevictable(folio)) 433 folio_putback_lru(folio); 434 else 435 list_add(&folio->lru, &folio_list); 436 } 437 } else 438 folio_deactivate(folio); 439 huge_unlock: 440 spin_unlock(ptl); 441 if (pageout) 442 reclaim_pages(&folio_list); 443 return 0; 444 } 445 446 regular_folio: 447 #endif 448 tlb_change_page_size(tlb, PAGE_SIZE); 449 restart: 450 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 451 if (!start_pte) 452 return 0; 453 flush_tlb_batched_pending(mm); 454 lazy_mmu_mode_enable(); 455 for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { 456 nr = 1; 457 ptent = ptep_get(pte); 458 459 if (++batch_count == SWAP_CLUSTER_MAX) { 460 batch_count = 0; 461 if (need_resched()) { 462 lazy_mmu_mode_disable(); 463 pte_unmap_unlock(start_pte, ptl); 464 cond_resched(); 465 goto restart; 466 } 467 } 468 469 if (pte_none(ptent)) 470 continue; 471 472 if (!pte_present(ptent)) 473 continue; 474 475 folio = vm_normal_folio(vma, addr, ptent); 476 if (!folio || folio_is_zone_device(folio)) 477 continue; 478 479 /* 480 * If we encounter a large folio, only split it if it is not 481 * fully mapped within the range we are operating on. Otherwise 482 * leave it as is so that it can be swapped out whole. If we 483 * fail to split a folio, leave it in place and advance to the 484 * next pte in the range. 485 */ 486 if (folio_test_large(folio)) { 487 nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); 488 if (nr < folio_nr_pages(folio)) { 489 int err; 490 491 if (folio_maybe_mapped_shared(folio)) 492 continue; 493 if (pageout_anon_only_filter && !folio_test_anon(folio)) 494 continue; 495 if (!folio_trylock(folio)) 496 continue; 497 folio_get(folio); 498 lazy_mmu_mode_disable(); 499 pte_unmap_unlock(start_pte, ptl); 500 start_pte = NULL; 501 err = split_folio(folio); 502 folio_unlock(folio); 503 folio_put(folio); 504 start_pte = pte = 505 pte_offset_map_lock(mm, pmd, addr, &ptl); 506 if (!start_pte) 507 break; 508 flush_tlb_batched_pending(mm); 509 lazy_mmu_mode_enable(); 510 if (!err) 511 nr = 0; 512 continue; 513 } 514 } 515 516 /* 517 * Do not interfere with other mappings of this folio and 518 * non-LRU folio. If we have a large folio at this point, we 519 * know it is fully mapped so if its mapcount is the same as its 520 * number of pages, it must be exclusive. 521 */ 522 if (!folio_test_lru(folio) || 523 folio_mapcount(folio) != folio_nr_pages(folio)) 524 continue; 525 526 if (pageout_anon_only_filter && !folio_test_anon(folio)) 527 continue; 528 529 if (!pageout && pte_young(ptent)) { 530 clear_young_dirty_ptes(vma, addr, pte, nr, 531 CYDP_CLEAR_YOUNG); 532 tlb_remove_tlb_entries(tlb, pte, nr, addr); 533 } 534 535 /* 536 * We are deactivating a folio for accelerating reclaiming. 537 * VM couldn't reclaim the folio unless we clear PG_young. 538 * As a side effect, it makes confuse idle-page tracking 539 * because they will miss recent referenced history. 540 */ 541 folio_clear_referenced(folio); 542 folio_test_clear_young(folio); 543 if (folio_test_active(folio)) 544 folio_set_workingset(folio); 545 if (pageout) { 546 if (folio_isolate_lru(folio)) { 547 if (folio_test_unevictable(folio)) 548 folio_putback_lru(folio); 549 else 550 list_add(&folio->lru, &folio_list); 551 } 552 } else 553 folio_deactivate(folio); 554 } 555 556 if (start_pte) { 557 lazy_mmu_mode_disable(); 558 pte_unmap_unlock(start_pte, ptl); 559 } 560 if (pageout) 561 reclaim_pages(&folio_list); 562 cond_resched(); 563 564 return 0; 565 } 566 567 static const struct mm_walk_ops cold_walk_ops = { 568 .pmd_entry = madvise_cold_or_pageout_pte_range, 569 .walk_lock = PGWALK_RDLOCK, 570 }; 571 572 static void madvise_cold_page_range(struct mmu_gather *tlb, 573 struct madvise_behavior *madv_behavior) 574 575 { 576 struct vm_area_struct *vma = madv_behavior->vma; 577 struct madvise_behavior_range *range = &madv_behavior->range; 578 struct madvise_walk_private walk_private = { 579 .pageout = false, 580 .tlb = tlb, 581 }; 582 583 tlb_start_vma(tlb, vma); 584 walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, 585 &walk_private); 586 tlb_end_vma(tlb, vma); 587 } 588 589 static inline bool can_madv_lru_vma(struct vm_area_struct *vma) 590 { 591 return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); 592 } 593 594 static long madvise_cold(struct madvise_behavior *madv_behavior) 595 { 596 struct vm_area_struct *vma = madv_behavior->vma; 597 struct mmu_gather tlb; 598 599 if (!can_madv_lru_vma(vma)) 600 return -EINVAL; 601 602 lru_add_drain(); 603 tlb_gather_mmu(&tlb, madv_behavior->mm); 604 madvise_cold_page_range(&tlb, madv_behavior); 605 tlb_finish_mmu(&tlb); 606 607 return 0; 608 } 609 610 static void madvise_pageout_page_range(struct mmu_gather *tlb, 611 struct vm_area_struct *vma, 612 struct madvise_behavior_range *range) 613 { 614 struct madvise_walk_private walk_private = { 615 .pageout = true, 616 .tlb = tlb, 617 }; 618 619 tlb_start_vma(tlb, vma); 620 walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, 621 &walk_private); 622 tlb_end_vma(tlb, vma); 623 } 624 625 static long madvise_pageout(struct madvise_behavior *madv_behavior) 626 { 627 struct mmu_gather tlb; 628 struct vm_area_struct *vma = madv_behavior->vma; 629 630 if (!can_madv_lru_vma(vma)) 631 return -EINVAL; 632 633 /* 634 * If the VMA belongs to a private file mapping, there can be private 635 * dirty pages which can be paged out if even this process is neither 636 * owner nor write capable of the file. We allow private file mappings 637 * further to pageout dirty anon pages. 638 */ 639 if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && 640 (vma->vm_flags & VM_MAYSHARE))) 641 return 0; 642 643 lru_add_drain(); 644 tlb_gather_mmu(&tlb, madv_behavior->mm); 645 madvise_pageout_page_range(&tlb, vma, &madv_behavior->range); 646 tlb_finish_mmu(&tlb); 647 648 return 0; 649 } 650 651 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 652 unsigned long end, struct mm_walk *walk) 653 654 { 655 const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; 656 struct mmu_gather *tlb = walk->private; 657 struct mm_struct *mm = tlb->mm; 658 struct vm_area_struct *vma = walk->vma; 659 spinlock_t *ptl; 660 pte_t *start_pte, *pte, ptent; 661 struct folio *folio; 662 int nr_swap = 0; 663 unsigned long next; 664 int nr, max_nr; 665 666 next = pmd_addr_end(addr, end); 667 if (pmd_trans_huge(*pmd)) 668 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 669 return 0; 670 671 tlb_change_page_size(tlb, PAGE_SIZE); 672 start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 673 if (!start_pte) 674 return 0; 675 flush_tlb_batched_pending(mm); 676 lazy_mmu_mode_enable(); 677 for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { 678 nr = 1; 679 ptent = ptep_get(pte); 680 681 if (pte_none(ptent)) 682 continue; 683 /* 684 * If the pte has swp_entry, just clear page table to 685 * prevent swap-in which is more expensive rather than 686 * (page allocation + zeroing). 687 */ 688 if (!pte_present(ptent)) { 689 softleaf_t entry = softleaf_from_pte(ptent); 690 691 if (softleaf_is_swap(entry)) { 692 max_nr = (end - addr) / PAGE_SIZE; 693 nr = swap_pte_batch(pte, max_nr, ptent); 694 nr_swap -= nr; 695 swap_put_entries_direct(entry, nr); 696 clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); 697 } else if (softleaf_is_hwpoison(entry) || 698 softleaf_is_poison_marker(entry)) { 699 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 700 } 701 continue; 702 } 703 704 folio = vm_normal_folio(vma, addr, ptent); 705 if (!folio || folio_is_zone_device(folio)) 706 continue; 707 708 /* 709 * If we encounter a large folio, only split it if it is not 710 * fully mapped within the range we are operating on. Otherwise 711 * leave it as is so that it can be marked as lazyfree. If we 712 * fail to split a folio, leave it in place and advance to the 713 * next pte in the range. 714 */ 715 if (folio_test_large(folio)) { 716 nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); 717 if (nr < folio_nr_pages(folio)) { 718 int err; 719 720 if (folio_maybe_mapped_shared(folio)) 721 continue; 722 if (!folio_trylock(folio)) 723 continue; 724 folio_get(folio); 725 lazy_mmu_mode_disable(); 726 pte_unmap_unlock(start_pte, ptl); 727 start_pte = NULL; 728 err = split_folio(folio); 729 folio_unlock(folio); 730 folio_put(folio); 731 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 732 start_pte = pte; 733 if (!start_pte) 734 break; 735 flush_tlb_batched_pending(mm); 736 lazy_mmu_mode_enable(); 737 if (!err) 738 nr = 0; 739 continue; 740 } 741 } 742 743 if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { 744 if (!folio_trylock(folio)) 745 continue; 746 /* 747 * If we have a large folio at this point, we know it is 748 * fully mapped so if its mapcount is the same as its 749 * number of pages, it must be exclusive. 750 */ 751 if (folio_mapcount(folio) != folio_nr_pages(folio)) { 752 folio_unlock(folio); 753 continue; 754 } 755 756 if (folio_test_swapcache(folio) && 757 !folio_free_swap(folio)) { 758 folio_unlock(folio); 759 continue; 760 } 761 762 folio_clear_dirty(folio); 763 folio_unlock(folio); 764 } 765 766 if (pte_young(ptent) || pte_dirty(ptent)) { 767 clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); 768 tlb_remove_tlb_entries(tlb, pte, nr, addr); 769 } 770 folio_mark_lazyfree(folio); 771 } 772 773 if (nr_swap) 774 add_mm_counter(mm, MM_SWAPENTS, nr_swap); 775 if (start_pte) { 776 lazy_mmu_mode_disable(); 777 pte_unmap_unlock(start_pte, ptl); 778 } 779 cond_resched(); 780 781 return 0; 782 } 783 784 static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode) 785 { 786 switch (mode) { 787 case MADVISE_VMA_READ_LOCK: 788 return PGWALK_VMA_RDLOCK_VERIFY; 789 case MADVISE_MMAP_READ_LOCK: 790 return PGWALK_RDLOCK; 791 default: 792 /* Other modes don't require fixing up the walk_lock */ 793 WARN_ON_ONCE(1); 794 return PGWALK_RDLOCK; 795 } 796 } 797 798 static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) 799 { 800 struct mm_struct *mm = madv_behavior->mm; 801 struct vm_area_struct *vma = madv_behavior->vma; 802 unsigned long start_addr = madv_behavior->range.start; 803 unsigned long end_addr = madv_behavior->range.end; 804 struct mmu_notifier_range range; 805 struct mmu_gather *tlb = madv_behavior->tlb; 806 struct mm_walk_ops walk_ops = { 807 .pmd_entry = madvise_free_pte_range, 808 }; 809 810 /* MADV_FREE works for only anon vma at the moment */ 811 if (!vma_is_anonymous(vma)) 812 return -EINVAL; 813 814 range.start = max(vma->vm_start, start_addr); 815 if (range.start >= vma->vm_end) 816 return -EINVAL; 817 range.end = min(vma->vm_end, end_addr); 818 if (range.end <= vma->vm_start) 819 return -EINVAL; 820 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 821 range.start, range.end); 822 823 lru_add_drain(); 824 update_hiwater_rss(mm); 825 826 mmu_notifier_invalidate_range_start(&range); 827 tlb_start_vma(tlb, vma); 828 walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode); 829 walk_page_range_vma(vma, range.start, range.end, 830 &walk_ops, tlb); 831 tlb_end_vma(tlb, vma); 832 mmu_notifier_invalidate_range_end(&range); 833 return 0; 834 } 835 836 /* 837 * Application no longer needs these pages. If the pages are dirty, 838 * it's OK to just throw them away. The app will be more careful about 839 * data it wants to keep. Be sure to free swap resources too. The 840 * zap_page_range_single call sets things up for shrink_active_list to actually 841 * free these pages later if no one else has touched them in the meantime, 842 * although we could add these pages to a global reuse list for 843 * shrink_active_list to pick up before reclaiming other pages. 844 * 845 * NB: This interface discards data rather than pushes it out to swap, 846 * as some implementations do. This has performance implications for 847 * applications like large transactional databases which want to discard 848 * pages in anonymous maps after committing to backing store the data 849 * that was kept in them. There is no reason to write this data out to 850 * the swap area if the application is discarding it. 851 * 852 * An interface that causes the system to free clean pages and flush 853 * dirty pages is already available as msync(MS_INVALIDATE). 854 */ 855 static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) 856 857 { 858 struct madvise_behavior_range *range = &madv_behavior->range; 859 struct zap_details details = { 860 .reclaim_pt = true, 861 .even_cows = true, 862 }; 863 864 zap_page_range_single_batched( 865 madv_behavior->tlb, madv_behavior->vma, range->start, 866 range->end - range->start, &details); 867 return 0; 868 } 869 870 static 871 bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) 872 { 873 struct vm_area_struct *vma = madv_behavior->vma; 874 int behavior = madv_behavior->behavior; 875 struct madvise_behavior_range *range = &madv_behavior->range; 876 877 if (!is_vm_hugetlb_page(vma)) { 878 unsigned int forbidden = VM_PFNMAP; 879 880 if (behavior != MADV_DONTNEED_LOCKED) 881 forbidden |= VM_LOCKED; 882 883 return !(vma->vm_flags & forbidden); 884 } 885 886 if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) 887 return false; 888 if (range->start & ~huge_page_mask(hstate_vma(vma))) 889 return false; 890 891 /* 892 * Madvise callers expect the length to be rounded up to PAGE_SIZE 893 * boundaries, and may be unaware that this VMA uses huge pages. 894 * Avoid unexpected data loss by rounding down the number of 895 * huge pages freed. 896 */ 897 range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); 898 899 return true; 900 } 901 902 static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) 903 { 904 struct mm_struct *mm = madv_behavior->mm; 905 struct madvise_behavior_range *range = &madv_behavior->range; 906 int behavior = madv_behavior->behavior; 907 908 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 909 return -EINVAL; 910 911 if (range->start == range->end) 912 return 0; 913 914 if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { 915 struct vm_area_struct *vma; 916 917 mark_mmap_lock_dropped(madv_behavior); 918 mmap_read_lock(mm); 919 madv_behavior->vma = vma = vma_lookup(mm, range->start); 920 if (!vma) 921 return -ENOMEM; 922 /* 923 * Potential end adjustment for hugetlb vma is OK as 924 * the check below keeps end within vma. 925 */ 926 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 927 return -EINVAL; 928 if (range->end > vma->vm_end) { 929 /* 930 * Don't fail if end > vma->vm_end. If the old 931 * vma was split while the mmap_lock was 932 * released the effect of the concurrent 933 * operation may not cause madvise() to 934 * have an undefined result. There may be an 935 * adjacent next vma that we'll walk 936 * next. userfaultfd_remove() will generate an 937 * UFFD_EVENT_REMOVE repetition on the 938 * end-vma->vm_end range, but the manager can 939 * handle a repetition fine. 940 */ 941 range->end = vma->vm_end; 942 } 943 /* 944 * If the memory region between start and end was 945 * originally backed by 4kB pages and then remapped to 946 * be backed by hugepages while mmap_lock was dropped, 947 * the adjustment for hugetlb vma above may have rounded 948 * end down to the start address. 949 */ 950 if (range->start == range->end) 951 return 0; 952 VM_WARN_ON(range->start > range->end); 953 } 954 955 if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) 956 return madvise_dontneed_single_vma(madv_behavior); 957 else if (behavior == MADV_FREE) 958 return madvise_free_single_vma(madv_behavior); 959 else 960 return -EINVAL; 961 } 962 963 static long madvise_populate(struct madvise_behavior *madv_behavior) 964 { 965 struct mm_struct *mm = madv_behavior->mm; 966 const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; 967 int locked = 1; 968 unsigned long start = madv_behavior->range.start; 969 unsigned long end = madv_behavior->range.end; 970 long pages; 971 972 while (start < end) { 973 /* Populate (prefault) page tables readable/writable. */ 974 pages = faultin_page_range(mm, start, end, write, &locked); 975 if (!locked) { 976 mmap_read_lock(mm); 977 locked = 1; 978 } 979 if (pages < 0) { 980 switch (pages) { 981 case -EINTR: 982 return -EINTR; 983 case -EINVAL: /* Incompatible mappings / permissions. */ 984 return -EINVAL; 985 case -EHWPOISON: 986 return -EHWPOISON; 987 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 988 return -EFAULT; 989 default: 990 pr_warn_once("%s: unhandled return value: %ld\n", 991 __func__, pages); 992 fallthrough; 993 case -ENOMEM: /* No VMA or out of memory. */ 994 return -ENOMEM; 995 } 996 } 997 start += pages * PAGE_SIZE; 998 } 999 return 0; 1000 } 1001 1002 /* 1003 * Application wants to free up the pages and associated backing store. 1004 * This is effectively punching a hole into the middle of a file. 1005 */ 1006 static long madvise_remove(struct madvise_behavior *madv_behavior) 1007 { 1008 loff_t offset; 1009 int error; 1010 struct file *f; 1011 struct mm_struct *mm = madv_behavior->mm; 1012 struct vm_area_struct *vma = madv_behavior->vma; 1013 unsigned long start = madv_behavior->range.start; 1014 unsigned long end = madv_behavior->range.end; 1015 1016 mark_mmap_lock_dropped(madv_behavior); 1017 1018 if (vma->vm_flags & VM_LOCKED) 1019 return -EINVAL; 1020 1021 f = vma->vm_file; 1022 1023 if (!f || !f->f_mapping || !f->f_mapping->host) { 1024 return -EINVAL; 1025 } 1026 1027 if (!vma_is_shared_maywrite(vma)) 1028 return -EACCES; 1029 1030 offset = (loff_t)(start - vma->vm_start) 1031 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 1032 1033 /* 1034 * Filesystem's fallocate may need to take i_rwsem. We need to 1035 * explicitly grab a reference because the vma (and hence the 1036 * vma's reference to the file) can go away as soon as we drop 1037 * mmap_lock. 1038 */ 1039 get_file(f); 1040 if (userfaultfd_remove(vma, start, end)) { 1041 /* mmap_lock was not released by userfaultfd_remove() */ 1042 mmap_read_unlock(mm); 1043 } 1044 error = vfs_fallocate(f, 1045 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1046 offset, end - start); 1047 fput(f); 1048 mmap_read_lock(mm); 1049 return error; 1050 } 1051 1052 static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) 1053 { 1054 vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; 1055 1056 /* 1057 * A user could lock after setting a guard range but that's fine, as 1058 * they'd not be able to fault in. The issue arises when we try to zap 1059 * existing locked VMAs. We don't want to do that. 1060 */ 1061 if (!allow_locked) 1062 disallowed |= VM_LOCKED; 1063 1064 return !(vma->vm_flags & disallowed); 1065 } 1066 1067 static bool is_guard_pte_marker(pte_t ptent) 1068 { 1069 const softleaf_t entry = softleaf_from_pte(ptent); 1070 1071 return softleaf_is_guard_marker(entry); 1072 } 1073 1074 static int guard_install_pud_entry(pud_t *pud, unsigned long addr, 1075 unsigned long next, struct mm_walk *walk) 1076 { 1077 pud_t pudval = pudp_get(pud); 1078 1079 /* If huge return >0 so we abort the operation + zap. */ 1080 return pud_trans_huge(pudval); 1081 } 1082 1083 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, 1084 unsigned long next, struct mm_walk *walk) 1085 { 1086 pmd_t pmdval = pmdp_get(pmd); 1087 1088 /* If huge return >0 so we abort the operation + zap. */ 1089 return pmd_trans_huge(pmdval); 1090 } 1091 1092 static int guard_install_pte_entry(pte_t *pte, unsigned long addr, 1093 unsigned long next, struct mm_walk *walk) 1094 { 1095 pte_t pteval = ptep_get(pte); 1096 unsigned long *nr_pages = (unsigned long *)walk->private; 1097 1098 /* If there is already a guard page marker, we have nothing to do. */ 1099 if (is_guard_pte_marker(pteval)) { 1100 (*nr_pages)++; 1101 1102 return 0; 1103 } 1104 1105 /* If populated return >0 so we abort the operation + zap. */ 1106 return 1; 1107 } 1108 1109 static int guard_install_set_pte(unsigned long addr, unsigned long next, 1110 pte_t *ptep, struct mm_walk *walk) 1111 { 1112 unsigned long *nr_pages = (unsigned long *)walk->private; 1113 1114 /* Simply install a PTE marker, this causes segfault on access. */ 1115 *ptep = make_pte_marker(PTE_MARKER_GUARD); 1116 (*nr_pages)++; 1117 1118 return 0; 1119 } 1120 1121 static long madvise_guard_install(struct madvise_behavior *madv_behavior) 1122 { 1123 struct vm_area_struct *vma = madv_behavior->vma; 1124 struct madvise_behavior_range *range = &madv_behavior->range; 1125 struct mm_walk_ops walk_ops = { 1126 .pud_entry = guard_install_pud_entry, 1127 .pmd_entry = guard_install_pmd_entry, 1128 .pte_entry = guard_install_pte_entry, 1129 .install_pte = guard_install_set_pte, 1130 .walk_lock = get_walk_lock(madv_behavior->lock_mode), 1131 }; 1132 long err; 1133 int i; 1134 1135 if (!is_valid_guard_vma(vma, /* allow_locked = */false)) 1136 return -EINVAL; 1137 1138 /* 1139 * Set atomically under read lock. All pertinent readers will need to 1140 * acquire an mmap/VMA write lock to read it. All remaining readers may 1141 * or may not see the flag set, but we don't care. 1142 */ 1143 vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT); 1144 1145 /* 1146 * If anonymous and we are establishing page tables the VMA ought to 1147 * have an anon_vma associated with it. 1148 * 1149 * We will hold an mmap read lock if this is necessary, this is checked 1150 * as part of the VMA lock logic. 1151 */ 1152 if (vma_is_anonymous(vma)) { 1153 VM_WARN_ON_ONCE(!vma->anon_vma && 1154 madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK); 1155 1156 err = anon_vma_prepare(vma); 1157 if (err) 1158 return err; 1159 } 1160 1161 /* 1162 * Optimistically try to install the guard marker pages first. If any 1163 * non-guard pages or THP huge pages are encountered, give up and zap 1164 * the range before trying again. 1165 * 1166 * We try a few times before giving up and releasing back to userland to 1167 * loop around, releasing locks in the process to avoid contention. 1168 * 1169 * This would only happen due to races with e.g. page faults or 1170 * khugepaged. 1171 * 1172 * In most cases we should simply install the guard markers immediately 1173 * with no zap or looping. 1174 */ 1175 for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { 1176 unsigned long nr_pages = 0; 1177 1178 /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ 1179 if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) 1180 err = walk_page_range_vma_unsafe(madv_behavior->vma, 1181 range->start, range->end, &walk_ops, 1182 &nr_pages); 1183 else 1184 err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, 1185 range->end, &walk_ops, &nr_pages); 1186 if (err < 0) 1187 return err; 1188 1189 if (err == 0) { 1190 unsigned long nr_expected_pages = 1191 PHYS_PFN(range->end - range->start); 1192 1193 VM_WARN_ON(nr_pages != nr_expected_pages); 1194 return 0; 1195 } 1196 1197 /* 1198 * OK some of the range have non-guard pages mapped, zap 1199 * them. This leaves existing guard pages in place. 1200 */ 1201 zap_page_range_single(vma, range->start, 1202 range->end - range->start, NULL); 1203 } 1204 1205 /* 1206 * We were unable to install the guard pages, return to userspace and 1207 * immediately retry, relieving lock contention. 1208 */ 1209 return restart_syscall(); 1210 } 1211 1212 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, 1213 unsigned long next, struct mm_walk *walk) 1214 { 1215 pud_t pudval = pudp_get(pud); 1216 1217 /* If huge, cannot have guard pages present, so no-op - skip. */ 1218 if (pud_trans_huge(pudval)) 1219 walk->action = ACTION_CONTINUE; 1220 1221 return 0; 1222 } 1223 1224 static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, 1225 unsigned long next, struct mm_walk *walk) 1226 { 1227 pmd_t pmdval = pmdp_get(pmd); 1228 1229 /* If huge, cannot have guard pages present, so no-op - skip. */ 1230 if (pmd_trans_huge(pmdval)) 1231 walk->action = ACTION_CONTINUE; 1232 1233 return 0; 1234 } 1235 1236 static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, 1237 unsigned long next, struct mm_walk *walk) 1238 { 1239 pte_t ptent = ptep_get(pte); 1240 1241 if (is_guard_pte_marker(ptent)) { 1242 /* Simply clear the PTE marker. */ 1243 pte_clear_not_present_full(walk->mm, addr, pte, false); 1244 update_mmu_cache(walk->vma, addr, pte); 1245 } 1246 1247 return 0; 1248 } 1249 1250 static long madvise_guard_remove(struct madvise_behavior *madv_behavior) 1251 { 1252 struct vm_area_struct *vma = madv_behavior->vma; 1253 struct madvise_behavior_range *range = &madv_behavior->range; 1254 struct mm_walk_ops wallk_ops = { 1255 .pud_entry = guard_remove_pud_entry, 1256 .pmd_entry = guard_remove_pmd_entry, 1257 .pte_entry = guard_remove_pte_entry, 1258 .walk_lock = get_walk_lock(madv_behavior->lock_mode), 1259 }; 1260 1261 /* 1262 * We're ok with removing guards in mlock()'d ranges, as this is a 1263 * non-destructive action. 1264 */ 1265 if (!is_valid_guard_vma(vma, /* allow_locked = */true)) 1266 return -EINVAL; 1267 1268 return walk_page_range_vma(vma, range->start, range->end, 1269 &wallk_ops, NULL); 1270 } 1271 1272 #ifdef CONFIG_64BIT 1273 /* Does the madvise operation result in discarding of mapped data? */ 1274 static bool is_discard(int behavior) 1275 { 1276 switch (behavior) { 1277 case MADV_FREE: 1278 case MADV_DONTNEED: 1279 case MADV_DONTNEED_LOCKED: 1280 case MADV_REMOVE: 1281 case MADV_DONTFORK: 1282 case MADV_WIPEONFORK: 1283 case MADV_GUARD_INSTALL: 1284 return true; 1285 } 1286 1287 return false; 1288 } 1289 1290 /* 1291 * We are restricted from madvise()'ing mseal()'d VMAs only in very particular 1292 * circumstances - discarding of data from read-only anonymous SEALED mappings. 1293 * 1294 * This is because users cannot trivally discard data from these VMAs, and may 1295 * only do so via an appropriate madvise() call. 1296 */ 1297 static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1298 { 1299 struct vm_area_struct *vma = madv_behavior->vma; 1300 1301 /* If the VMA isn't sealed we're good. */ 1302 if (!vma_is_sealed(vma)) 1303 return true; 1304 1305 /* For a sealed VMA, we only care about discard operations. */ 1306 if (!is_discard(madv_behavior->behavior)) 1307 return true; 1308 1309 /* 1310 * We explicitly permit all file-backed mappings, whether MAP_SHARED or 1311 * MAP_PRIVATE. 1312 * 1313 * The latter causes some complications. Because now, one can mmap() 1314 * read/write a MAP_PRIVATE mapping, write to it, then mprotect() 1315 * read-only, mseal() and a discard will be permitted. 1316 * 1317 * However, in order to avoid issues with potential use of madvise(..., 1318 * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, 1319 * permit this. 1320 */ 1321 if (!vma_is_anonymous(vma)) 1322 return true; 1323 1324 /* If the user could write to the mapping anyway, then this is fine. */ 1325 if ((vma->vm_flags & VM_WRITE) && 1326 arch_vma_access_permitted(vma, /* write= */ true, 1327 /* execute= */ false, /* foreign= */ false)) 1328 return true; 1329 1330 /* Otherwise, we are not permitted to perform this operation. */ 1331 return false; 1332 } 1333 #else 1334 static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1335 { 1336 return true; 1337 } 1338 #endif 1339 1340 /* 1341 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1342 * will handle splitting a vm area into separate areas, each area with its own 1343 * behavior. 1344 */ 1345 static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) 1346 { 1347 int behavior = madv_behavior->behavior; 1348 struct vm_area_struct *vma = madv_behavior->vma; 1349 vm_flags_t new_flags = vma->vm_flags; 1350 struct madvise_behavior_range *range = &madv_behavior->range; 1351 int error; 1352 1353 if (unlikely(!can_madvise_modify(madv_behavior))) 1354 return -EPERM; 1355 1356 switch (behavior) { 1357 case MADV_REMOVE: 1358 return madvise_remove(madv_behavior); 1359 case MADV_WILLNEED: 1360 return madvise_willneed(madv_behavior); 1361 case MADV_COLD: 1362 return madvise_cold(madv_behavior); 1363 case MADV_PAGEOUT: 1364 return madvise_pageout(madv_behavior); 1365 case MADV_FREE: 1366 case MADV_DONTNEED: 1367 case MADV_DONTNEED_LOCKED: 1368 return madvise_dontneed_free(madv_behavior); 1369 case MADV_COLLAPSE: 1370 return madvise_collapse(vma, range->start, range->end, 1371 &madv_behavior->lock_dropped); 1372 case MADV_GUARD_INSTALL: 1373 return madvise_guard_install(madv_behavior); 1374 case MADV_GUARD_REMOVE: 1375 return madvise_guard_remove(madv_behavior); 1376 1377 /* The below behaviours update VMAs via madvise_update_vma(). */ 1378 1379 case MADV_NORMAL: 1380 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 1381 break; 1382 case MADV_SEQUENTIAL: 1383 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 1384 break; 1385 case MADV_RANDOM: 1386 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 1387 break; 1388 case MADV_DONTFORK: 1389 new_flags |= VM_DONTCOPY; 1390 break; 1391 case MADV_DOFORK: 1392 if (new_flags & VM_IO) 1393 return -EINVAL; 1394 new_flags &= ~VM_DONTCOPY; 1395 break; 1396 case MADV_WIPEONFORK: 1397 /* MADV_WIPEONFORK is only supported on anonymous memory. */ 1398 if (vma->vm_file || new_flags & VM_SHARED) 1399 return -EINVAL; 1400 new_flags |= VM_WIPEONFORK; 1401 break; 1402 case MADV_KEEPONFORK: 1403 if (new_flags & VM_DROPPABLE) 1404 return -EINVAL; 1405 new_flags &= ~VM_WIPEONFORK; 1406 break; 1407 case MADV_DONTDUMP: 1408 new_flags |= VM_DONTDUMP; 1409 break; 1410 case MADV_DODUMP: 1411 if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || 1412 (new_flags & VM_DROPPABLE)) 1413 return -EINVAL; 1414 new_flags &= ~VM_DONTDUMP; 1415 break; 1416 case MADV_MERGEABLE: 1417 case MADV_UNMERGEABLE: 1418 error = ksm_madvise(vma, range->start, range->end, 1419 behavior, &new_flags); 1420 if (error) 1421 goto out; 1422 break; 1423 case MADV_HUGEPAGE: 1424 case MADV_NOHUGEPAGE: 1425 error = hugepage_madvise(vma, &new_flags, behavior); 1426 if (error) 1427 goto out; 1428 break; 1429 case __MADV_SET_ANON_VMA_NAME: 1430 /* Only anonymous mappings can be named */ 1431 if (vma->vm_file && !vma_is_anon_shmem(vma)) 1432 return -EBADF; 1433 break; 1434 } 1435 1436 /* This is a write operation.*/ 1437 VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); 1438 1439 error = madvise_update_vma(new_flags, madv_behavior); 1440 out: 1441 /* 1442 * madvise() returns EAGAIN if kernel resources, such as 1443 * slab, are temporarily unavailable. 1444 */ 1445 if (error == -ENOMEM) 1446 error = -EAGAIN; 1447 return error; 1448 } 1449 1450 #ifdef CONFIG_MEMORY_FAILURE 1451 /* 1452 * Error injection support for memory error handling. 1453 */ 1454 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1455 { 1456 unsigned long size; 1457 unsigned long start = madv_behavior->range.start; 1458 unsigned long end = madv_behavior->range.end; 1459 1460 if (!capable(CAP_SYS_ADMIN)) 1461 return -EPERM; 1462 1463 for (; start < end; start += size) { 1464 unsigned long pfn; 1465 struct page *page; 1466 int ret; 1467 1468 ret = get_user_pages_fast(start, 1, 0, &page); 1469 if (ret != 1) 1470 return ret; 1471 pfn = page_to_pfn(page); 1472 1473 /* 1474 * When soft offlining hugepages, after migrating the page 1475 * we dissolve it, therefore in the second loop "page" will 1476 * no longer be a compound page. 1477 */ 1478 size = page_size(compound_head(page)); 1479 1480 if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { 1481 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1482 pfn, start); 1483 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1484 } else { 1485 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1486 pfn, start); 1487 ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); 1488 if (ret == -EOPNOTSUPP) 1489 ret = 0; 1490 } 1491 1492 if (ret) 1493 return ret; 1494 } 1495 1496 return 0; 1497 } 1498 1499 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1500 { 1501 switch (madv_behavior->behavior) { 1502 case MADV_HWPOISON: 1503 case MADV_SOFT_OFFLINE: 1504 return true; 1505 default: 1506 return false; 1507 } 1508 } 1509 1510 #else 1511 1512 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1513 { 1514 return 0; 1515 } 1516 1517 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1518 { 1519 return false; 1520 } 1521 1522 #endif /* CONFIG_MEMORY_FAILURE */ 1523 1524 static bool 1525 madvise_behavior_valid(int behavior) 1526 { 1527 switch (behavior) { 1528 case MADV_DOFORK: 1529 case MADV_DONTFORK: 1530 case MADV_NORMAL: 1531 case MADV_SEQUENTIAL: 1532 case MADV_RANDOM: 1533 case MADV_REMOVE: 1534 case MADV_WILLNEED: 1535 case MADV_DONTNEED: 1536 case MADV_DONTNEED_LOCKED: 1537 case MADV_FREE: 1538 case MADV_COLD: 1539 case MADV_PAGEOUT: 1540 case MADV_POPULATE_READ: 1541 case MADV_POPULATE_WRITE: 1542 #ifdef CONFIG_KSM 1543 case MADV_MERGEABLE: 1544 case MADV_UNMERGEABLE: 1545 #endif 1546 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1547 case MADV_HUGEPAGE: 1548 case MADV_NOHUGEPAGE: 1549 case MADV_COLLAPSE: 1550 #endif 1551 case MADV_DONTDUMP: 1552 case MADV_DODUMP: 1553 case MADV_WIPEONFORK: 1554 case MADV_KEEPONFORK: 1555 case MADV_GUARD_INSTALL: 1556 case MADV_GUARD_REMOVE: 1557 #ifdef CONFIG_MEMORY_FAILURE 1558 case MADV_SOFT_OFFLINE: 1559 case MADV_HWPOISON: 1560 #endif 1561 return true; 1562 1563 default: 1564 return false; 1565 } 1566 } 1567 1568 /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ 1569 static bool process_madvise_remote_valid(int behavior) 1570 { 1571 switch (behavior) { 1572 case MADV_COLD: 1573 case MADV_PAGEOUT: 1574 case MADV_WILLNEED: 1575 case MADV_COLLAPSE: 1576 return true; 1577 default: 1578 return false; 1579 } 1580 } 1581 1582 /* Does this operation invoke anon_vma_prepare()? */ 1583 static bool prepares_anon_vma(int behavior) 1584 { 1585 switch (behavior) { 1586 case MADV_GUARD_INSTALL: 1587 return true; 1588 default: 1589 return false; 1590 } 1591 } 1592 1593 /* 1594 * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA 1595 * read lock only now we have a VMA to examine? 1596 */ 1597 static bool is_vma_lock_sufficient(struct vm_area_struct *vma, 1598 struct madvise_behavior *madv_behavior) 1599 { 1600 /* Must span only a single VMA.*/ 1601 if (madv_behavior->range.end > vma->vm_end) 1602 return false; 1603 /* Remote processes unsupported. */ 1604 if (current->mm != vma->vm_mm) 1605 return false; 1606 /* Userfaultfd unsupported. */ 1607 if (userfaultfd_armed(vma)) 1608 return false; 1609 /* 1610 * anon_vma_prepare() explicitly requires an mmap lock for 1611 * serialisation, so we cannot use a VMA lock in this case. 1612 * 1613 * Note we might race with anon_vma being set, however this makes this 1614 * check overly paranoid which is safe. 1615 */ 1616 if (vma_is_anonymous(vma) && 1617 prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma) 1618 return false; 1619 1620 return true; 1621 } 1622 1623 /* 1624 * Try to acquire a VMA read lock if possible. 1625 * 1626 * We only support this lock over a single VMA, which the input range must 1627 * span either partially or fully. 1628 * 1629 * This function always returns with an appropriate lock held. If a VMA read 1630 * lock could be acquired, we return true and set madv_behavior state 1631 * accordingly. 1632 * 1633 * If a VMA read lock could not be acquired, we return false and expect caller to 1634 * fallback to mmap lock behaviour. 1635 */ 1636 static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) 1637 { 1638 struct mm_struct *mm = madv_behavior->mm; 1639 struct vm_area_struct *vma; 1640 1641 vma = lock_vma_under_rcu(mm, madv_behavior->range.start); 1642 if (!vma) 1643 goto take_mmap_read_lock; 1644 1645 if (!is_vma_lock_sufficient(vma, madv_behavior)) { 1646 vma_end_read(vma); 1647 goto take_mmap_read_lock; 1648 } 1649 1650 madv_behavior->vma = vma; 1651 return true; 1652 1653 take_mmap_read_lock: 1654 mmap_read_lock(mm); 1655 madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; 1656 return false; 1657 } 1658 1659 /* 1660 * Walk the vmas in range [start,end), and call the madvise_vma_behavior 1661 * function on each one. The function will get start and end parameters that 1662 * cover the overlap between the current vma and the original range. Any 1663 * unmapped regions in the original range will result in this function returning 1664 * -ENOMEM while still calling the madvise_vma_behavior function on all of the 1665 * existing vmas in the range. Must be called with the mmap_lock held for 1666 * reading or writing. 1667 */ 1668 static 1669 int madvise_walk_vmas(struct madvise_behavior *madv_behavior) 1670 { 1671 struct mm_struct *mm = madv_behavior->mm; 1672 struct madvise_behavior_range *range = &madv_behavior->range; 1673 /* range is updated to span each VMA, so store end of entire range. */ 1674 unsigned long last_end = range->end; 1675 int unmapped_error = 0; 1676 int error; 1677 struct vm_area_struct *prev, *vma; 1678 1679 /* 1680 * If VMA read lock is supported, apply madvise to a single VMA 1681 * tentatively, avoiding walking VMAs. 1682 */ 1683 if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && 1684 try_vma_read_lock(madv_behavior)) { 1685 error = madvise_vma_behavior(madv_behavior); 1686 vma_end_read(madv_behavior->vma); 1687 return error; 1688 } 1689 1690 vma = find_vma_prev(mm, range->start, &prev); 1691 if (vma && range->start > vma->vm_start) 1692 prev = vma; 1693 1694 for (;;) { 1695 /* Still start < end. */ 1696 if (!vma) 1697 return -ENOMEM; 1698 1699 /* Here start < (last_end|vma->vm_end). */ 1700 if (range->start < vma->vm_start) { 1701 /* 1702 * This indicates a gap between VMAs in the input 1703 * range. This does not cause the operation to abort, 1704 * rather we simply return -ENOMEM to indicate that this 1705 * has happened, but carry on. 1706 */ 1707 unmapped_error = -ENOMEM; 1708 range->start = vma->vm_start; 1709 if (range->start >= last_end) 1710 break; 1711 } 1712 1713 /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ 1714 range->end = min(vma->vm_end, last_end); 1715 1716 /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */ 1717 madv_behavior->prev = prev; 1718 madv_behavior->vma = vma; 1719 error = madvise_vma_behavior(madv_behavior); 1720 if (error) 1721 return error; 1722 if (madv_behavior->lock_dropped) { 1723 /* We dropped the mmap lock, we can't ref the VMA. */ 1724 prev = NULL; 1725 vma = NULL; 1726 madv_behavior->lock_dropped = false; 1727 } else { 1728 vma = madv_behavior->vma; 1729 prev = vma; 1730 } 1731 1732 if (vma && range->end < vma->vm_end) 1733 range->end = vma->vm_end; 1734 if (range->end >= last_end) 1735 break; 1736 1737 vma = find_vma(mm, vma ? vma->vm_end : range->end); 1738 range->start = range->end; 1739 } 1740 1741 return unmapped_error; 1742 } 1743 1744 /* 1745 * Any behaviour which results in changes to the vma->vm_flags needs to 1746 * take mmap_lock for writing. Others, which simply traverse vmas, need 1747 * to only take it for reading. 1748 */ 1749 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) 1750 { 1751 if (is_memory_failure(madv_behavior)) 1752 return MADVISE_NO_LOCK; 1753 1754 switch (madv_behavior->behavior) { 1755 case MADV_REMOVE: 1756 case MADV_WILLNEED: 1757 case MADV_COLD: 1758 case MADV_PAGEOUT: 1759 case MADV_POPULATE_READ: 1760 case MADV_POPULATE_WRITE: 1761 case MADV_COLLAPSE: 1762 return MADVISE_MMAP_READ_LOCK; 1763 case MADV_GUARD_INSTALL: 1764 case MADV_GUARD_REMOVE: 1765 case MADV_DONTNEED: 1766 case MADV_DONTNEED_LOCKED: 1767 case MADV_FREE: 1768 return MADVISE_VMA_READ_LOCK; 1769 default: 1770 return MADVISE_MMAP_WRITE_LOCK; 1771 } 1772 } 1773 1774 static int madvise_lock(struct madvise_behavior *madv_behavior) 1775 { 1776 struct mm_struct *mm = madv_behavior->mm; 1777 enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); 1778 1779 switch (lock_mode) { 1780 case MADVISE_NO_LOCK: 1781 break; 1782 case MADVISE_MMAP_WRITE_LOCK: 1783 if (mmap_write_lock_killable(mm)) 1784 return -EINTR; 1785 break; 1786 case MADVISE_MMAP_READ_LOCK: 1787 mmap_read_lock(mm); 1788 break; 1789 case MADVISE_VMA_READ_LOCK: 1790 /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ 1791 break; 1792 } 1793 1794 madv_behavior->lock_mode = lock_mode; 1795 return 0; 1796 } 1797 1798 static void madvise_unlock(struct madvise_behavior *madv_behavior) 1799 { 1800 struct mm_struct *mm = madv_behavior->mm; 1801 1802 switch (madv_behavior->lock_mode) { 1803 case MADVISE_NO_LOCK: 1804 return; 1805 case MADVISE_MMAP_WRITE_LOCK: 1806 mmap_write_unlock(mm); 1807 break; 1808 case MADVISE_MMAP_READ_LOCK: 1809 mmap_read_unlock(mm); 1810 break; 1811 case MADVISE_VMA_READ_LOCK: 1812 /* We will drop the lock per-VMA in madvise_walk_vmas(). */ 1813 break; 1814 } 1815 1816 madv_behavior->lock_mode = MADVISE_NO_LOCK; 1817 } 1818 1819 static bool madvise_batch_tlb_flush(int behavior) 1820 { 1821 switch (behavior) { 1822 case MADV_DONTNEED: 1823 case MADV_DONTNEED_LOCKED: 1824 case MADV_FREE: 1825 return true; 1826 default: 1827 return false; 1828 } 1829 } 1830 1831 static void madvise_init_tlb(struct madvise_behavior *madv_behavior) 1832 { 1833 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1834 tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); 1835 } 1836 1837 static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) 1838 { 1839 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1840 tlb_finish_mmu(madv_behavior->tlb); 1841 } 1842 1843 static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) 1844 { 1845 size_t len; 1846 1847 if (!madvise_behavior_valid(behavior)) 1848 return false; 1849 1850 if (!PAGE_ALIGNED(start)) 1851 return false; 1852 len = PAGE_ALIGN(len_in); 1853 1854 /* Check to see whether len was rounded up from small -ve to zero */ 1855 if (len_in && !len) 1856 return false; 1857 1858 if (start + len < start) 1859 return false; 1860 1861 return true; 1862 } 1863 1864 /* 1865 * madvise_should_skip() - Return if the request is invalid or nothing. 1866 * @start: Start address of madvise-requested address range. 1867 * @len_in: Length of madvise-requested address range. 1868 * @behavior: Requested madvise behavior. 1869 * @err: Pointer to store an error code from the check. 1870 * 1871 * If the specified behaviour is invalid or nothing would occur, we skip the 1872 * operation. This function returns true in the cases, otherwise false. In 1873 * the former case we store an error on @err. 1874 */ 1875 static bool madvise_should_skip(unsigned long start, size_t len_in, 1876 int behavior, int *err) 1877 { 1878 if (!is_valid_madvise(start, len_in, behavior)) { 1879 *err = -EINVAL; 1880 return true; 1881 } 1882 if (start + PAGE_ALIGN(len_in) == start) { 1883 *err = 0; 1884 return true; 1885 } 1886 return false; 1887 } 1888 1889 static bool is_madvise_populate(struct madvise_behavior *madv_behavior) 1890 { 1891 switch (madv_behavior->behavior) { 1892 case MADV_POPULATE_READ: 1893 case MADV_POPULATE_WRITE: 1894 return true; 1895 default: 1896 return false; 1897 } 1898 } 1899 1900 /* 1901 * untagged_addr_remote() assumes mmap_lock is already held. On 1902 * architectures like x86 and RISC-V, tagging is tricky because each 1903 * mm may have a different tagging mask. However, we might only hold 1904 * the per-VMA lock (currently only local processes are supported), 1905 * so untagged_addr is used to avoid the mmap_lock assertion for 1906 * local processes. 1907 */ 1908 static inline unsigned long get_untagged_addr(struct mm_struct *mm, 1909 unsigned long start) 1910 { 1911 return current->mm == mm ? untagged_addr(start) : 1912 untagged_addr_remote(mm, start); 1913 } 1914 1915 static int madvise_do_behavior(unsigned long start, size_t len_in, 1916 struct madvise_behavior *madv_behavior) 1917 { 1918 struct blk_plug plug; 1919 int error; 1920 struct madvise_behavior_range *range = &madv_behavior->range; 1921 1922 if (is_memory_failure(madv_behavior)) { 1923 range->start = start; 1924 range->end = start + len_in; 1925 return madvise_inject_error(madv_behavior); 1926 } 1927 1928 range->start = get_untagged_addr(madv_behavior->mm, start); 1929 range->end = range->start + PAGE_ALIGN(len_in); 1930 1931 blk_start_plug(&plug); 1932 if (is_madvise_populate(madv_behavior)) 1933 error = madvise_populate(madv_behavior); 1934 else 1935 error = madvise_walk_vmas(madv_behavior); 1936 blk_finish_plug(&plug); 1937 return error; 1938 } 1939 1940 /* 1941 * The madvise(2) system call. 1942 * 1943 * Applications can use madvise() to advise the kernel how it should 1944 * handle paging I/O in this VM area. The idea is to help the kernel 1945 * use appropriate read-ahead and caching techniques. The information 1946 * provided is advisory only, and can be safely disregarded by the 1947 * kernel without affecting the correct operation of the application. 1948 * 1949 * behavior values: 1950 * MADV_NORMAL - the default behavior is to read clusters. This 1951 * results in some read-ahead and read-behind. 1952 * MADV_RANDOM - the system should read the minimum amount of data 1953 * on any access, since it is unlikely that the appli- 1954 * cation will need more than what it asks for. 1955 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1956 * once, so they can be aggressively read ahead, and 1957 * can be freed soon after they are accessed. 1958 * MADV_WILLNEED - the application is notifying the system to read 1959 * some pages ahead. 1960 * MADV_DONTNEED - the application is finished with the given range, 1961 * so the kernel can free resources associated with it. 1962 * MADV_FREE - the application marks pages in the given range as lazy free, 1963 * where actual purges are postponed until memory pressure happens. 1964 * MADV_REMOVE - the application wants to free up the given range of 1965 * pages and associated backing store. 1966 * MADV_DONTFORK - omit this area from child's address space when forking: 1967 * typically, to avoid COWing pages pinned by get_user_pages(). 1968 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1969 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1970 * range after a fork. 1971 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 1972 * MADV_HWPOISON - trigger memory error handler as if the given memory range 1973 * were corrupted by unrecoverable hardware memory failure. 1974 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 1975 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 1976 * this area with pages of identical content from other such areas. 1977 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 1978 * MADV_HUGEPAGE - the application wants to back the given range by transparent 1979 * huge pages in the future. Existing pages might be coalesced and 1980 * new pages might be allocated as THP. 1981 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 1982 * transparent huge pages so the existing pages will not be 1983 * coalesced into THP and new pages will not be allocated as THP. 1984 * MADV_COLLAPSE - synchronously coalesce pages into new THP. 1985 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1986 * from being included in its core dump. 1987 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1988 * MADV_COLD - the application is not expected to use this memory soon, 1989 * deactivate pages in this range so that they can be reclaimed 1990 * easily if memory pressure happens. 1991 * MADV_PAGEOUT - the application is not expected to use this memory soon, 1992 * page out the pages in this range immediately. 1993 * MADV_POPULATE_READ - populate (prefault) page tables readable by 1994 * triggering read faults if required 1995 * MADV_POPULATE_WRITE - populate (prefault) page tables writable by 1996 * triggering write faults if required 1997 * 1998 * return values: 1999 * zero - success 2000 * -EINVAL - start + len < 0, start is not page-aligned, 2001 * "behavior" is not a valid value, or application 2002 * is attempting to release locked or shared pages, 2003 * or the specified address range includes file, Huge TLB, 2004 * MAP_SHARED or VMPFNMAP range. 2005 * -ENOMEM - addresses in the specified range are not currently 2006 * mapped, or are outside the AS of the process. 2007 * -EIO - an I/O error occurred while paging in data. 2008 * -EBADF - map exists, but area maps something that isn't a file. 2009 * -EAGAIN - a kernel resource was temporarily unavailable. 2010 * -EPERM - memory is sealed. 2011 */ 2012 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 2013 { 2014 int error; 2015 struct mmu_gather tlb; 2016 struct madvise_behavior madv_behavior = { 2017 .mm = mm, 2018 .behavior = behavior, 2019 .tlb = &tlb, 2020 }; 2021 2022 if (madvise_should_skip(start, len_in, behavior, &error)) 2023 return error; 2024 error = madvise_lock(&madv_behavior); 2025 if (error) 2026 return error; 2027 madvise_init_tlb(&madv_behavior); 2028 error = madvise_do_behavior(start, len_in, &madv_behavior); 2029 madvise_finish_tlb(&madv_behavior); 2030 madvise_unlock(&madv_behavior); 2031 2032 return error; 2033 } 2034 2035 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 2036 { 2037 return do_madvise(current->mm, start, len_in, behavior); 2038 } 2039 2040 /* Perform an madvise operation over a vector of addresses and lengths. */ 2041 static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, 2042 int behavior) 2043 { 2044 ssize_t ret = 0; 2045 size_t total_len; 2046 struct mmu_gather tlb; 2047 struct madvise_behavior madv_behavior = { 2048 .mm = mm, 2049 .behavior = behavior, 2050 .tlb = &tlb, 2051 }; 2052 2053 total_len = iov_iter_count(iter); 2054 2055 ret = madvise_lock(&madv_behavior); 2056 if (ret) 2057 return ret; 2058 madvise_init_tlb(&madv_behavior); 2059 2060 while (iov_iter_count(iter)) { 2061 unsigned long start = (unsigned long)iter_iov_addr(iter); 2062 size_t len_in = iter_iov_len(iter); 2063 int error; 2064 2065 if (madvise_should_skip(start, len_in, behavior, &error)) 2066 ret = error; 2067 else 2068 ret = madvise_do_behavior(start, len_in, &madv_behavior); 2069 /* 2070 * An madvise operation is attempting to restart the syscall, 2071 * but we cannot proceed as it would not be correct to repeat 2072 * the operation in aggregate, and would be surprising to the 2073 * user. 2074 * 2075 * We drop and reacquire locks so it is safe to just loop and 2076 * try again. We check for fatal signals in case we need exit 2077 * early anyway. 2078 */ 2079 if (ret == -ERESTARTNOINTR) { 2080 if (fatal_signal_pending(current)) { 2081 ret = -EINTR; 2082 break; 2083 } 2084 2085 /* Drop and reacquire lock to unwind race. */ 2086 madvise_finish_tlb(&madv_behavior); 2087 madvise_unlock(&madv_behavior); 2088 ret = madvise_lock(&madv_behavior); 2089 if (ret) 2090 goto out; 2091 madvise_init_tlb(&madv_behavior); 2092 continue; 2093 } 2094 if (ret < 0) 2095 break; 2096 iov_iter_advance(iter, iter_iov_len(iter)); 2097 } 2098 madvise_finish_tlb(&madv_behavior); 2099 madvise_unlock(&madv_behavior); 2100 2101 out: 2102 ret = (total_len - iov_iter_count(iter)) ? : ret; 2103 2104 return ret; 2105 } 2106 2107 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 2108 size_t, vlen, int, behavior, unsigned int, flags) 2109 { 2110 ssize_t ret; 2111 struct iovec iovstack[UIO_FASTIOV]; 2112 struct iovec *iov = iovstack; 2113 struct iov_iter iter; 2114 struct task_struct *task; 2115 struct mm_struct *mm; 2116 unsigned int f_flags; 2117 2118 if (flags != 0) { 2119 ret = -EINVAL; 2120 goto out; 2121 } 2122 2123 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 2124 if (ret < 0) 2125 goto out; 2126 2127 task = pidfd_get_task(pidfd, &f_flags); 2128 if (IS_ERR(task)) { 2129 ret = PTR_ERR(task); 2130 goto free_iov; 2131 } 2132 2133 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 2134 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 2135 if (IS_ERR(mm)) { 2136 ret = PTR_ERR(mm); 2137 goto release_task; 2138 } 2139 2140 /* 2141 * We need only perform this check if we are attempting to manipulate a 2142 * remote process's address space. 2143 */ 2144 if (mm != current->mm && !process_madvise_remote_valid(behavior)) { 2145 ret = -EINVAL; 2146 goto release_mm; 2147 } 2148 2149 /* 2150 * Require CAP_SYS_NICE for influencing process performance. Note that 2151 * only non-destructive hints are currently supported for remote 2152 * processes. 2153 */ 2154 if (mm != current->mm && !capable(CAP_SYS_NICE)) { 2155 ret = -EPERM; 2156 goto release_mm; 2157 } 2158 2159 ret = vector_madvise(mm, &iter, behavior); 2160 2161 release_mm: 2162 mmput(mm); 2163 release_task: 2164 put_task_struct(task); 2165 free_iov: 2166 kfree(iov); 2167 out: 2168 return ret; 2169 } 2170 2171 #ifdef CONFIG_ANON_VMA_NAME 2172 2173 #define ANON_VMA_NAME_MAX_LEN 80 2174 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" 2175 2176 static inline bool is_valid_name_char(char ch) 2177 { 2178 /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ 2179 return ch > 0x1f && ch < 0x7f && 2180 !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); 2181 } 2182 2183 static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 2184 unsigned long len_in, struct anon_vma_name *anon_name) 2185 { 2186 unsigned long end; 2187 unsigned long len; 2188 int error; 2189 struct madvise_behavior madv_behavior = { 2190 .mm = mm, 2191 .behavior = __MADV_SET_ANON_VMA_NAME, 2192 .anon_name = anon_name, 2193 }; 2194 2195 if (start & ~PAGE_MASK) 2196 return -EINVAL; 2197 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 2198 2199 /* Check to see whether len was rounded up from small -ve to zero */ 2200 if (len_in && !len) 2201 return -EINVAL; 2202 2203 end = start + len; 2204 if (end < start) 2205 return -EINVAL; 2206 2207 if (end == start) 2208 return 0; 2209 2210 madv_behavior.range.start = start; 2211 madv_behavior.range.end = end; 2212 2213 error = madvise_lock(&madv_behavior); 2214 if (error) 2215 return error; 2216 error = madvise_walk_vmas(&madv_behavior); 2217 madvise_unlock(&madv_behavior); 2218 2219 return error; 2220 } 2221 2222 int set_anon_vma_name(unsigned long addr, unsigned long size, 2223 const char __user *uname) 2224 { 2225 struct anon_vma_name *anon_name = NULL; 2226 struct mm_struct *mm = current->mm; 2227 int error; 2228 2229 if (uname) { 2230 char *name, *pch; 2231 2232 name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); 2233 if (IS_ERR(name)) 2234 return PTR_ERR(name); 2235 2236 for (pch = name; *pch != '\0'; pch++) { 2237 if (!is_valid_name_char(*pch)) { 2238 kfree(name); 2239 return -EINVAL; 2240 } 2241 } 2242 /* anon_vma has its own copy */ 2243 anon_name = anon_vma_name_alloc(name); 2244 kfree(name); 2245 if (!anon_name) 2246 return -ENOMEM; 2247 } 2248 2249 error = madvise_set_anon_name(mm, addr, size, anon_name); 2250 anon_vma_name_put(anon_name); 2251 2252 return error; 2253 } 2254 #endif 2255