1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/string.h> 23 #include <linux/uio.h> 24 #include <linux/ksm.h> 25 #include <linux/fs.h> 26 #include <linux/file.h> 27 #include <linux/blkdev.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagewalk.h> 30 #include <linux/swap.h> 31 #include <linux/swapops.h> 32 #include <linux/shmem_fs.h> 33 #include <linux/mmu_notifier.h> 34 35 #include <asm/tlb.h> 36 37 #include "internal.h" 38 #include "swap.h" 39 40 #define __MADV_SET_ANON_VMA_NAME (-1) 41 42 /* 43 * Maximum number of attempts we make to install guard pages before we give up 44 * and return -ERESTARTNOINTR to have userspace try again. 45 */ 46 #define MAX_MADVISE_GUARD_RETRIES 3 47 48 struct madvise_walk_private { 49 struct mmu_gather *tlb; 50 bool pageout; 51 }; 52 53 enum madvise_lock_mode { 54 MADVISE_NO_LOCK, 55 MADVISE_MMAP_READ_LOCK, 56 MADVISE_MMAP_WRITE_LOCK, 57 MADVISE_VMA_READ_LOCK, 58 }; 59 60 struct madvise_behavior_range { 61 unsigned long start; 62 unsigned long end; 63 }; 64 65 struct madvise_behavior { 66 struct mm_struct *mm; 67 int behavior; 68 struct mmu_gather *tlb; 69 enum madvise_lock_mode lock_mode; 70 struct anon_vma_name *anon_name; 71 72 /* 73 * The range over which the behaviour is currently being applied. If 74 * traversing multiple VMAs, this is updated for each. 75 */ 76 struct madvise_behavior_range range; 77 /* The VMA and VMA preceding it (if applicable) currently targeted. 
*/ 78 struct vm_area_struct *prev; 79 struct vm_area_struct *vma; 80 bool lock_dropped; 81 }; 82 83 #ifdef CONFIG_ANON_VMA_NAME 84 static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); 85 86 struct anon_vma_name *anon_vma_name_alloc(const char *name) 87 { 88 struct anon_vma_name *anon_name; 89 size_t count; 90 91 /* Add 1 for NUL terminator at the end of the anon_name->name */ 92 count = strlen(name) + 1; 93 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 94 if (anon_name) { 95 kref_init(&anon_name->kref); 96 memcpy(anon_name->name, name, count); 97 } 98 99 return anon_name; 100 } 101 102 void anon_vma_name_free(struct kref *kref) 103 { 104 struct anon_vma_name *anon_name = 105 container_of(kref, struct anon_vma_name, kref); 106 kfree(anon_name); 107 } 108 109 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 110 { 111 if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) 112 vma_assert_locked(vma); 113 114 return vma->anon_name; 115 } 116 117 /* mmap_lock should be write-locked */ 118 static int replace_anon_vma_name(struct vm_area_struct *vma, 119 struct anon_vma_name *anon_name) 120 { 121 struct anon_vma_name *orig_name = anon_vma_name(vma); 122 123 if (!anon_name) { 124 vma->anon_name = NULL; 125 anon_vma_name_put(orig_name); 126 return 0; 127 } 128 129 if (anon_vma_name_eq(orig_name, anon_name)) 130 return 0; 131 132 vma->anon_name = anon_vma_name_reuse(anon_name); 133 anon_vma_name_put(orig_name); 134 135 return 0; 136 } 137 #else /* CONFIG_ANON_VMA_NAME */ 138 static int replace_anon_vma_name(struct vm_area_struct *vma, 139 struct anon_vma_name *anon_name) 140 { 141 if (anon_name) 142 return -EINVAL; 143 144 return 0; 145 } 146 #endif /* CONFIG_ANON_VMA_NAME */ 147 /* 148 * Update the vm_flags or anon_name on region of a vma, splitting it or merging 149 * it as necessary. Must be called with mmap_lock held for writing. 150 */ 151 static int madvise_update_vma(vm_flags_t new_flags, 152 struct madvise_behavior *madv_behavior) 153 { 154 struct vm_area_struct *vma = madv_behavior->vma; 155 struct madvise_behavior_range *range = &madv_behavior->range; 156 struct anon_vma_name *anon_name = madv_behavior->anon_name; 157 bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; 158 VMA_ITERATOR(vmi, madv_behavior->mm, range->start); 159 160 if (new_flags == vma->vm_flags && (!set_new_anon_name || 161 anon_vma_name_eq(anon_vma_name(vma), anon_name))) 162 return 0; 163 164 if (set_new_anon_name) 165 vma = vma_modify_name(&vmi, madv_behavior->prev, vma, 166 range->start, range->end, anon_name); 167 else 168 vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, 169 range->start, range->end, new_flags); 170 171 if (IS_ERR(vma)) 172 return PTR_ERR(vma); 173 174 madv_behavior->vma = vma; 175 176 /* vm_flags is protected by the mmap_lock held in write mode. 
*/ 177 vma_start_write(vma); 178 vm_flags_reset(vma, new_flags); 179 if (set_new_anon_name) 180 return replace_anon_vma_name(vma, anon_name); 181 182 return 0; 183 } 184 185 #ifdef CONFIG_SWAP 186 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 187 unsigned long end, struct mm_walk *walk) 188 { 189 struct vm_area_struct *vma = walk->private; 190 struct swap_iocb *splug = NULL; 191 pte_t *ptep = NULL; 192 spinlock_t *ptl; 193 unsigned long addr; 194 195 for (addr = start; addr < end; addr += PAGE_SIZE) { 196 pte_t pte; 197 swp_entry_t entry; 198 struct folio *folio; 199 200 if (!ptep++) { 201 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 202 if (!ptep) 203 break; 204 } 205 206 pte = ptep_get(ptep); 207 if (!is_swap_pte(pte)) 208 continue; 209 entry = pte_to_swp_entry(pte); 210 if (unlikely(non_swap_entry(entry))) 211 continue; 212 213 pte_unmap_unlock(ptep, ptl); 214 ptep = NULL; 215 216 folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 217 vma, addr, &splug); 218 if (folio) 219 folio_put(folio); 220 } 221 222 if (ptep) 223 pte_unmap_unlock(ptep, ptl); 224 swap_read_unplug(splug); 225 cond_resched(); 226 227 return 0; 228 } 229 230 static const struct mm_walk_ops swapin_walk_ops = { 231 .pmd_entry = swapin_walk_pmd_entry, 232 .walk_lock = PGWALK_RDLOCK, 233 }; 234 235 static void shmem_swapin_range(struct vm_area_struct *vma, 236 unsigned long start, unsigned long end, 237 struct address_space *mapping) 238 { 239 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 240 pgoff_t end_index = linear_page_index(vma, end) - 1; 241 struct folio *folio; 242 struct swap_iocb *splug = NULL; 243 244 rcu_read_lock(); 245 xas_for_each(&xas, folio, end_index) { 246 unsigned long addr; 247 swp_entry_t entry; 248 249 if (!xa_is_value(folio)) 250 continue; 251 entry = radix_to_swp_entry(folio); 252 /* There might be swapin error entries in shmem mapping. */ 253 if (non_swap_entry(entry)) 254 continue; 255 256 addr = vma->vm_start + 257 ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); 258 xas_pause(&xas); 259 rcu_read_unlock(); 260 261 folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), 262 vma, addr, &splug); 263 if (folio) 264 folio_put(folio); 265 266 rcu_read_lock(); 267 } 268 rcu_read_unlock(); 269 swap_read_unplug(splug); 270 } 271 #endif /* CONFIG_SWAP */ 272 273 static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) 274 { 275 VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); 276 madv_behavior->lock_dropped = true; 277 } 278 279 /* 280 * Schedule all required I/O operations. Do not wait for completion. 
281 */ 282 static long madvise_willneed(struct madvise_behavior *madv_behavior) 283 { 284 struct vm_area_struct *vma = madv_behavior->vma; 285 struct mm_struct *mm = madv_behavior->mm; 286 struct file *file = vma->vm_file; 287 unsigned long start = madv_behavior->range.start; 288 unsigned long end = madv_behavior->range.end; 289 loff_t offset; 290 291 #ifdef CONFIG_SWAP 292 if (!file) { 293 walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma); 294 lru_add_drain(); /* Push any new pages onto the LRU now */ 295 return 0; 296 } 297 298 if (shmem_mapping(file->f_mapping)) { 299 shmem_swapin_range(vma, start, end, file->f_mapping); 300 lru_add_drain(); /* Push any new pages onto the LRU now */ 301 return 0; 302 } 303 #else 304 if (!file) 305 return -EBADF; 306 #endif 307 308 if (IS_DAX(file_inode(file))) { 309 /* no bad return value, but ignore advice */ 310 return 0; 311 } 312 313 /* 314 * Filesystem's fadvise may need to take various locks. We need to 315 * explicitly grab a reference because the vma (and hence the 316 * vma's reference to the file) can go away as soon as we drop 317 * mmap_lock. 318 */ 319 mark_mmap_lock_dropped(madv_behavior); 320 get_file(file); 321 offset = (loff_t)(start - vma->vm_start) 322 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 323 mmap_read_unlock(mm); 324 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 325 fput(file); 326 mmap_read_lock(mm); 327 return 0; 328 } 329 330 static inline bool can_do_file_pageout(struct vm_area_struct *vma) 331 { 332 if (!vma->vm_file) 333 return false; 334 /* 335 * paging out pagecache only for non-anonymous mappings that correspond 336 * to the files the calling process could (if tried) open for writing; 337 * otherwise we'd be including shared non-exclusive mappings, which 338 * opens a side channel. 
339 */ 340 return inode_owner_or_capable(&nop_mnt_idmap, 341 file_inode(vma->vm_file)) || 342 file_permission(vma->vm_file, MAY_WRITE) == 0; 343 } 344 345 static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, 346 struct folio *folio, pte_t *ptep, 347 pte_t *ptentp) 348 { 349 int max_nr = (end - addr) / PAGE_SIZE; 350 351 return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, 352 FPB_MERGE_YOUNG_DIRTY); 353 } 354 355 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 356 unsigned long addr, unsigned long end, 357 struct mm_walk *walk) 358 { 359 struct madvise_walk_private *private = walk->private; 360 struct mmu_gather *tlb = private->tlb; 361 bool pageout = private->pageout; 362 struct mm_struct *mm = tlb->mm; 363 struct vm_area_struct *vma = walk->vma; 364 pte_t *start_pte, *pte, ptent; 365 spinlock_t *ptl; 366 struct folio *folio = NULL; 367 LIST_HEAD(folio_list); 368 bool pageout_anon_only_filter; 369 unsigned int batch_count = 0; 370 int nr; 371 372 if (fatal_signal_pending(current)) 373 return -EINTR; 374 375 pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && 376 !can_do_file_pageout(vma); 377 378 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 379 if (pmd_trans_huge(*pmd)) { 380 pmd_t orig_pmd; 381 unsigned long next = pmd_addr_end(addr, end); 382 383 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 384 ptl = pmd_trans_huge_lock(pmd, vma); 385 if (!ptl) 386 return 0; 387 388 orig_pmd = *pmd; 389 if (is_huge_zero_pmd(orig_pmd)) 390 goto huge_unlock; 391 392 if (unlikely(!pmd_present(orig_pmd))) { 393 VM_BUG_ON(thp_migration_supported() && 394 !is_pmd_migration_entry(orig_pmd)); 395 goto huge_unlock; 396 } 397 398 folio = pmd_folio(orig_pmd); 399 400 /* Do not interfere with other mappings of this folio */ 401 if (folio_maybe_mapped_shared(folio)) 402 goto huge_unlock; 403 404 if (pageout_anon_only_filter && !folio_test_anon(folio)) 405 goto huge_unlock; 406 407 if (next - addr != HPAGE_PMD_SIZE) { 408 int err; 409 410 folio_get(folio); 411 spin_unlock(ptl); 412 folio_lock(folio); 413 err = split_folio(folio); 414 folio_unlock(folio); 415 folio_put(folio); 416 if (!err) 417 goto regular_folio; 418 return 0; 419 } 420 421 if (!pageout && pmd_young(orig_pmd)) { 422 pmdp_invalidate(vma, addr, pmd); 423 orig_pmd = pmd_mkold(orig_pmd); 424 425 set_pmd_at(mm, addr, pmd, orig_pmd); 426 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 427 } 428 429 folio_clear_referenced(folio); 430 folio_test_clear_young(folio); 431 if (folio_test_active(folio)) 432 folio_set_workingset(folio); 433 if (pageout) { 434 if (folio_isolate_lru(folio)) { 435 if (folio_test_unevictable(folio)) 436 folio_putback_lru(folio); 437 else 438 list_add(&folio->lru, &folio_list); 439 } 440 } else 441 folio_deactivate(folio); 442 huge_unlock: 443 spin_unlock(ptl); 444 if (pageout) 445 reclaim_pages(&folio_list); 446 return 0; 447 } 448 449 regular_folio: 450 #endif 451 tlb_change_page_size(tlb, PAGE_SIZE); 452 restart: 453 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 454 if (!start_pte) 455 return 0; 456 flush_tlb_batched_pending(mm); 457 arch_enter_lazy_mmu_mode(); 458 for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { 459 nr = 1; 460 ptent = ptep_get(pte); 461 462 if (++batch_count == SWAP_CLUSTER_MAX) { 463 batch_count = 0; 464 if (need_resched()) { 465 arch_leave_lazy_mmu_mode(); 466 pte_unmap_unlock(start_pte, ptl); 467 cond_resched(); 468 goto restart; 469 } 470 } 471 472 if (pte_none(ptent)) 473 continue; 474 475 if (!pte_present(ptent)) 476 continue; 477 478 
folio = vm_normal_folio(vma, addr, ptent); 479 if (!folio || folio_is_zone_device(folio)) 480 continue; 481 482 /* 483 * If we encounter a large folio, only split it if it is not 484 * fully mapped within the range we are operating on. Otherwise 485 * leave it as is so that it can be swapped out whole. If we 486 * fail to split a folio, leave it in place and advance to the 487 * next pte in the range. 488 */ 489 if (folio_test_large(folio)) { 490 nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); 491 if (nr < folio_nr_pages(folio)) { 492 int err; 493 494 if (folio_maybe_mapped_shared(folio)) 495 continue; 496 if (pageout_anon_only_filter && !folio_test_anon(folio)) 497 continue; 498 if (!folio_trylock(folio)) 499 continue; 500 folio_get(folio); 501 arch_leave_lazy_mmu_mode(); 502 pte_unmap_unlock(start_pte, ptl); 503 start_pte = NULL; 504 err = split_folio(folio); 505 folio_unlock(folio); 506 folio_put(folio); 507 start_pte = pte = 508 pte_offset_map_lock(mm, pmd, addr, &ptl); 509 if (!start_pte) 510 break; 511 flush_tlb_batched_pending(mm); 512 arch_enter_lazy_mmu_mode(); 513 if (!err) 514 nr = 0; 515 continue; 516 } 517 } 518 519 /* 520 * Do not interfere with other mappings of this folio, and skip 521 * non-LRU folios. If we have a large folio at this point, we 522 * know it is fully mapped, so if its mapcount is the same as its 523 * number of pages, it must be exclusive. 524 */ 525 if (!folio_test_lru(folio) || 526 folio_mapcount(folio) != folio_nr_pages(folio)) 527 continue; 528 529 if (pageout_anon_only_filter && !folio_test_anon(folio)) 530 continue; 531 532 if (!pageout && pte_young(ptent)) { 533 clear_young_dirty_ptes(vma, addr, pte, nr, 534 CYDP_CLEAR_YOUNG); 535 tlb_remove_tlb_entries(tlb, pte, nr, addr); 536 } 537 538 /* 539 * We are deactivating the folio to accelerate its reclaim. The 540 * VM cannot reclaim the folio unless we clear PG_young. 541 * As a side effect, this confuses idle-page tracking, which 542 * will miss the recent reference history.
543 */ 544 folio_clear_referenced(folio); 545 folio_test_clear_young(folio); 546 if (folio_test_active(folio)) 547 folio_set_workingset(folio); 548 if (pageout) { 549 if (folio_isolate_lru(folio)) { 550 if (folio_test_unevictable(folio)) 551 folio_putback_lru(folio); 552 else 553 list_add(&folio->lru, &folio_list); 554 } 555 } else 556 folio_deactivate(folio); 557 } 558 559 if (start_pte) { 560 arch_leave_lazy_mmu_mode(); 561 pte_unmap_unlock(start_pte, ptl); 562 } 563 if (pageout) 564 reclaim_pages(&folio_list); 565 cond_resched(); 566 567 return 0; 568 } 569 570 static const struct mm_walk_ops cold_walk_ops = { 571 .pmd_entry = madvise_cold_or_pageout_pte_range, 572 .walk_lock = PGWALK_RDLOCK, 573 }; 574 575 static void madvise_cold_page_range(struct mmu_gather *tlb, 576 struct madvise_behavior *madv_behavior) 577 578 { 579 struct vm_area_struct *vma = madv_behavior->vma; 580 struct madvise_behavior_range *range = &madv_behavior->range; 581 struct madvise_walk_private walk_private = { 582 .pageout = false, 583 .tlb = tlb, 584 }; 585 586 tlb_start_vma(tlb, vma); 587 walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, 588 &walk_private); 589 tlb_end_vma(tlb, vma); 590 } 591 592 static inline bool can_madv_lru_vma(struct vm_area_struct *vma) 593 { 594 return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); 595 } 596 597 static long madvise_cold(struct madvise_behavior *madv_behavior) 598 { 599 struct vm_area_struct *vma = madv_behavior->vma; 600 struct mmu_gather tlb; 601 602 if (!can_madv_lru_vma(vma)) 603 return -EINVAL; 604 605 lru_add_drain(); 606 tlb_gather_mmu(&tlb, madv_behavior->mm); 607 madvise_cold_page_range(&tlb, madv_behavior); 608 tlb_finish_mmu(&tlb); 609 610 return 0; 611 } 612 613 static void madvise_pageout_page_range(struct mmu_gather *tlb, 614 struct vm_area_struct *vma, 615 struct madvise_behavior_range *range) 616 { 617 struct madvise_walk_private walk_private = { 618 .pageout = true, 619 .tlb = tlb, 620 }; 621 622 tlb_start_vma(tlb, vma); 623 walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, 624 &walk_private); 625 tlb_end_vma(tlb, vma); 626 } 627 628 static long madvise_pageout(struct madvise_behavior *madv_behavior) 629 { 630 struct mmu_gather tlb; 631 struct vm_area_struct *vma = madv_behavior->vma; 632 633 if (!can_madv_lru_vma(vma)) 634 return -EINVAL; 635 636 /* 637 * If the VMA belongs to a private file mapping, there can be private 638 * dirty pages which could be paged out even if this process is neither 639 * the owner of the file nor write-capable on it. We still allow private 640 * file mappings to proceed so that their dirty anon pages can be paged out.
641 */ 642 if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && 643 (vma->vm_flags & VM_MAYSHARE))) 644 return 0; 645 646 lru_add_drain(); 647 tlb_gather_mmu(&tlb, madv_behavior->mm); 648 madvise_pageout_page_range(&tlb, vma, &madv_behavior->range); 649 tlb_finish_mmu(&tlb); 650 651 return 0; 652 } 653 654 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 655 unsigned long end, struct mm_walk *walk) 656 657 { 658 const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; 659 struct mmu_gather *tlb = walk->private; 660 struct mm_struct *mm = tlb->mm; 661 struct vm_area_struct *vma = walk->vma; 662 spinlock_t *ptl; 663 pte_t *start_pte, *pte, ptent; 664 struct folio *folio; 665 int nr_swap = 0; 666 unsigned long next; 667 int nr, max_nr; 668 669 next = pmd_addr_end(addr, end); 670 if (pmd_trans_huge(*pmd)) 671 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 672 return 0; 673 674 tlb_change_page_size(tlb, PAGE_SIZE); 675 start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 676 if (!start_pte) 677 return 0; 678 flush_tlb_batched_pending(mm); 679 arch_enter_lazy_mmu_mode(); 680 for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { 681 nr = 1; 682 ptent = ptep_get(pte); 683 684 if (pte_none(ptent)) 685 continue; 686 /* 687 * If the pte has swp_entry, just clear page table to 688 * prevent swap-in which is more expensive rather than 689 * (page allocation + zeroing). 690 */ 691 if (!pte_present(ptent)) { 692 swp_entry_t entry; 693 694 entry = pte_to_swp_entry(ptent); 695 if (!non_swap_entry(entry)) { 696 max_nr = (end - addr) / PAGE_SIZE; 697 nr = swap_pte_batch(pte, max_nr, ptent); 698 nr_swap -= nr; 699 free_swap_and_cache_nr(entry, nr); 700 clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); 701 } else if (is_hwpoison_entry(entry) || 702 is_poisoned_swp_entry(entry)) { 703 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 704 } 705 continue; 706 } 707 708 folio = vm_normal_folio(vma, addr, ptent); 709 if (!folio || folio_is_zone_device(folio)) 710 continue; 711 712 /* 713 * If we encounter a large folio, only split it if it is not 714 * fully mapped within the range we are operating on. Otherwise 715 * leave it as is so that it can be marked as lazyfree. If we 716 * fail to split a folio, leave it in place and advance to the 717 * next pte in the range. 718 */ 719 if (folio_test_large(folio)) { 720 nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); 721 if (nr < folio_nr_pages(folio)) { 722 int err; 723 724 if (folio_maybe_mapped_shared(folio)) 725 continue; 726 if (!folio_trylock(folio)) 727 continue; 728 folio_get(folio); 729 arch_leave_lazy_mmu_mode(); 730 pte_unmap_unlock(start_pte, ptl); 731 start_pte = NULL; 732 err = split_folio(folio); 733 folio_unlock(folio); 734 folio_put(folio); 735 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 736 start_pte = pte; 737 if (!start_pte) 738 break; 739 flush_tlb_batched_pending(mm); 740 arch_enter_lazy_mmu_mode(); 741 if (!err) 742 nr = 0; 743 continue; 744 } 745 } 746 747 if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { 748 if (!folio_trylock(folio)) 749 continue; 750 /* 751 * If we have a large folio at this point, we know it is 752 * fully mapped so if its mapcount is the same as its 753 * number of pages, it must be exclusive. 
754 */ 755 if (folio_mapcount(folio) != folio_nr_pages(folio)) { 756 folio_unlock(folio); 757 continue; 758 } 759 760 if (folio_test_swapcache(folio) && 761 !folio_free_swap(folio)) { 762 folio_unlock(folio); 763 continue; 764 } 765 766 folio_clear_dirty(folio); 767 folio_unlock(folio); 768 } 769 770 if (pte_young(ptent) || pte_dirty(ptent)) { 771 clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); 772 tlb_remove_tlb_entries(tlb, pte, nr, addr); 773 } 774 folio_mark_lazyfree(folio); 775 } 776 777 if (nr_swap) 778 add_mm_counter(mm, MM_SWAPENTS, nr_swap); 779 if (start_pte) { 780 arch_leave_lazy_mmu_mode(); 781 pte_unmap_unlock(start_pte, ptl); 782 } 783 cond_resched(); 784 785 return 0; 786 } 787 788 static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode) 789 { 790 switch (mode) { 791 case MADVISE_VMA_READ_LOCK: 792 return PGWALK_VMA_RDLOCK_VERIFY; 793 case MADVISE_MMAP_READ_LOCK: 794 return PGWALK_RDLOCK; 795 default: 796 /* Other modes don't require fixing up the walk_lock */ 797 WARN_ON_ONCE(1); 798 return PGWALK_RDLOCK; 799 } 800 } 801 802 static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) 803 { 804 struct mm_struct *mm = madv_behavior->mm; 805 struct vm_area_struct *vma = madv_behavior->vma; 806 unsigned long start_addr = madv_behavior->range.start; 807 unsigned long end_addr = madv_behavior->range.end; 808 struct mmu_notifier_range range; 809 struct mmu_gather *tlb = madv_behavior->tlb; 810 struct mm_walk_ops walk_ops = { 811 .pmd_entry = madvise_free_pte_range, 812 }; 813 814 /* MADV_FREE works for only anon vma at the moment */ 815 if (!vma_is_anonymous(vma)) 816 return -EINVAL; 817 818 range.start = max(vma->vm_start, start_addr); 819 if (range.start >= vma->vm_end) 820 return -EINVAL; 821 range.end = min(vma->vm_end, end_addr); 822 if (range.end <= vma->vm_start) 823 return -EINVAL; 824 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 825 range.start, range.end); 826 827 lru_add_drain(); 828 update_hiwater_rss(mm); 829 830 mmu_notifier_invalidate_range_start(&range); 831 tlb_start_vma(tlb, vma); 832 walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode); 833 walk_page_range_vma(vma, range.start, range.end, 834 &walk_ops, tlb); 835 tlb_end_vma(tlb, vma); 836 mmu_notifier_invalidate_range_end(&range); 837 return 0; 838 } 839 840 /* 841 * Application no longer needs these pages. If the pages are dirty, 842 * it's OK to just throw them away. The app will be more careful about 843 * data it wants to keep. Be sure to free swap resources too. The 844 * zap_page_range_single call sets things up for shrink_active_list to actually 845 * free these pages later if no one else has touched them in the meantime, 846 * although we could add these pages to a global reuse list for 847 * shrink_active_list to pick up before reclaiming other pages. 848 * 849 * NB: This interface discards data rather than pushes it out to swap, 850 * as some implementations do. This has performance implications for 851 * applications like large transactional databases which want to discard 852 * pages in anonymous maps after committing to backing store the data 853 * that was kept in them. There is no reason to write this data out to 854 * the swap area if the application is discarding it. 855 * 856 * An interface that causes the system to free clean pages and flush 857 * dirty pages is already available as msync(MS_INVALIDATE). 
858 */ 859 static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) 860 861 { 862 struct madvise_behavior_range *range = &madv_behavior->range; 863 struct zap_details details = { 864 .reclaim_pt = true, 865 .even_cows = true, 866 }; 867 868 zap_page_range_single_batched( 869 madv_behavior->tlb, madv_behavior->vma, range->start, 870 range->end - range->start, &details); 871 return 0; 872 } 873 874 static 875 bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) 876 { 877 struct vm_area_struct *vma = madv_behavior->vma; 878 int behavior = madv_behavior->behavior; 879 struct madvise_behavior_range *range = &madv_behavior->range; 880 881 if (!is_vm_hugetlb_page(vma)) { 882 unsigned int forbidden = VM_PFNMAP; 883 884 if (behavior != MADV_DONTNEED_LOCKED) 885 forbidden |= VM_LOCKED; 886 887 return !(vma->vm_flags & forbidden); 888 } 889 890 if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) 891 return false; 892 if (range->start & ~huge_page_mask(hstate_vma(vma))) 893 return false; 894 895 /* 896 * Madvise callers expect the length to be rounded up to PAGE_SIZE 897 * boundaries, and may be unaware that this VMA uses huge pages. 898 * Avoid unexpected data loss by rounding down the number of 899 * huge pages freed. 900 */ 901 range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); 902 903 return true; 904 } 905 906 static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) 907 { 908 struct mm_struct *mm = madv_behavior->mm; 909 struct madvise_behavior_range *range = &madv_behavior->range; 910 int behavior = madv_behavior->behavior; 911 912 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 913 return -EINVAL; 914 915 if (range->start == range->end) 916 return 0; 917 918 if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { 919 struct vm_area_struct *vma; 920 921 mark_mmap_lock_dropped(madv_behavior); 922 mmap_read_lock(mm); 923 madv_behavior->vma = vma = vma_lookup(mm, range->start); 924 if (!vma) 925 return -ENOMEM; 926 /* 927 * Potential end adjustment for hugetlb vma is OK as 928 * the check below keeps end within vma. 929 */ 930 if (!madvise_dontneed_free_valid_vma(madv_behavior)) 931 return -EINVAL; 932 if (range->end > vma->vm_end) { 933 /* 934 * Don't fail if end > vma->vm_end. If the old 935 * vma was split while the mmap_lock was 936 * released the effect of the concurrent 937 * operation may not cause madvise() to 938 * have an undefined result. There may be an 939 * adjacent next vma that we'll walk 940 * next. userfaultfd_remove() will generate an 941 * UFFD_EVENT_REMOVE repetition on the 942 * end-vma->vm_end range, but the manager can 943 * handle a repetition fine. 944 */ 945 range->end = vma->vm_end; 946 } 947 /* 948 * If the memory region between start and end was 949 * originally backed by 4kB pages and then remapped to 950 * be backed by hugepages while mmap_lock was dropped, 951 * the adjustment for hugetlb vma above may have rounded 952 * end down to the start address. 
953 */ 954 if (range->start == range->end) 955 return 0; 956 VM_WARN_ON(range->start > range->end); 957 } 958 959 if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) 960 return madvise_dontneed_single_vma(madv_behavior); 961 else if (behavior == MADV_FREE) 962 return madvise_free_single_vma(madv_behavior); 963 else 964 return -EINVAL; 965 } 966 967 static long madvise_populate(struct madvise_behavior *madv_behavior) 968 { 969 struct mm_struct *mm = madv_behavior->mm; 970 const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; 971 int locked = 1; 972 unsigned long start = madv_behavior->range.start; 973 unsigned long end = madv_behavior->range.end; 974 long pages; 975 976 while (start < end) { 977 /* Populate (prefault) page tables readable/writable. */ 978 pages = faultin_page_range(mm, start, end, write, &locked); 979 if (!locked) { 980 mmap_read_lock(mm); 981 locked = 1; 982 } 983 if (pages < 0) { 984 switch (pages) { 985 case -EINTR: 986 return -EINTR; 987 case -EINVAL: /* Incompatible mappings / permissions. */ 988 return -EINVAL; 989 case -EHWPOISON: 990 return -EHWPOISON; 991 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 992 return -EFAULT; 993 default: 994 pr_warn_once("%s: unhandled return value: %ld\n", 995 __func__, pages); 996 fallthrough; 997 case -ENOMEM: /* No VMA or out of memory. */ 998 return -ENOMEM; 999 } 1000 } 1001 start += pages * PAGE_SIZE; 1002 } 1003 return 0; 1004 } 1005 1006 /* 1007 * Application wants to free up the pages and associated backing store. 1008 * This is effectively punching a hole into the middle of a file. 1009 */ 1010 static long madvise_remove(struct madvise_behavior *madv_behavior) 1011 { 1012 loff_t offset; 1013 int error; 1014 struct file *f; 1015 struct mm_struct *mm = madv_behavior->mm; 1016 struct vm_area_struct *vma = madv_behavior->vma; 1017 unsigned long start = madv_behavior->range.start; 1018 unsigned long end = madv_behavior->range.end; 1019 1020 mark_mmap_lock_dropped(madv_behavior); 1021 1022 if (vma->vm_flags & VM_LOCKED) 1023 return -EINVAL; 1024 1025 f = vma->vm_file; 1026 1027 if (!f || !f->f_mapping || !f->f_mapping->host) { 1028 return -EINVAL; 1029 } 1030 1031 if (!vma_is_shared_maywrite(vma)) 1032 return -EACCES; 1033 1034 offset = (loff_t)(start - vma->vm_start) 1035 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 1036 1037 /* 1038 * Filesystem's fallocate may need to take i_rwsem. We need to 1039 * explicitly grab a reference because the vma (and hence the 1040 * vma's reference to the file) can go away as soon as we drop 1041 * mmap_lock. 1042 */ 1043 get_file(f); 1044 if (userfaultfd_remove(vma, start, end)) { 1045 /* mmap_lock was not released by userfaultfd_remove() */ 1046 mmap_read_unlock(mm); 1047 } 1048 error = vfs_fallocate(f, 1049 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1050 offset, end - start); 1051 fput(f); 1052 mmap_read_lock(mm); 1053 return error; 1054 } 1055 1056 static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) 1057 { 1058 vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; 1059 1060 /* 1061 * A user could lock after setting a guard range but that's fine, as 1062 * they'd not be able to fault in. The issue arises when we try to zap 1063 * existing locked VMAs. We don't want to do that. 
1064 */ 1065 if (!allow_locked) 1066 disallowed |= VM_LOCKED; 1067 1068 return !(vma->vm_flags & disallowed); 1069 } 1070 1071 static bool is_guard_pte_marker(pte_t ptent) 1072 { 1073 return is_pte_marker(ptent) && 1074 is_guard_swp_entry(pte_to_swp_entry(ptent)); 1075 } 1076 1077 static int guard_install_pud_entry(pud_t *pud, unsigned long addr, 1078 unsigned long next, struct mm_walk *walk) 1079 { 1080 pud_t pudval = pudp_get(pud); 1081 1082 /* If huge return >0 so we abort the operation + zap. */ 1083 return pud_trans_huge(pudval); 1084 } 1085 1086 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, 1087 unsigned long next, struct mm_walk *walk) 1088 { 1089 pmd_t pmdval = pmdp_get(pmd); 1090 1091 /* If huge return >0 so we abort the operation + zap. */ 1092 return pmd_trans_huge(pmdval); 1093 } 1094 1095 static int guard_install_pte_entry(pte_t *pte, unsigned long addr, 1096 unsigned long next, struct mm_walk *walk) 1097 { 1098 pte_t pteval = ptep_get(pte); 1099 unsigned long *nr_pages = (unsigned long *)walk->private; 1100 1101 /* If there is already a guard page marker, we have nothing to do. */ 1102 if (is_guard_pte_marker(pteval)) { 1103 (*nr_pages)++; 1104 1105 return 0; 1106 } 1107 1108 /* If populated return >0 so we abort the operation + zap. */ 1109 return 1; 1110 } 1111 1112 static int guard_install_set_pte(unsigned long addr, unsigned long next, 1113 pte_t *ptep, struct mm_walk *walk) 1114 { 1115 unsigned long *nr_pages = (unsigned long *)walk->private; 1116 1117 /* Simply install a PTE marker, this causes segfault on access. */ 1118 *ptep = make_pte_marker(PTE_MARKER_GUARD); 1119 (*nr_pages)++; 1120 1121 return 0; 1122 } 1123 1124 static const struct mm_walk_ops guard_install_walk_ops = { 1125 .pud_entry = guard_install_pud_entry, 1126 .pmd_entry = guard_install_pmd_entry, 1127 .pte_entry = guard_install_pte_entry, 1128 .install_pte = guard_install_set_pte, 1129 .walk_lock = PGWALK_RDLOCK, 1130 }; 1131 1132 static long madvise_guard_install(struct madvise_behavior *madv_behavior) 1133 { 1134 struct vm_area_struct *vma = madv_behavior->vma; 1135 struct madvise_behavior_range *range = &madv_behavior->range; 1136 long err; 1137 int i; 1138 1139 if (!is_valid_guard_vma(vma, /* allow_locked = */false)) 1140 return -EINVAL; 1141 1142 /* 1143 * If we install guard markers, then the range is no longer 1144 * empty from a page table perspective and therefore it's 1145 * appropriate to have an anon_vma. 1146 * 1147 * This ensures that on fork, we copy page tables correctly. 1148 */ 1149 err = anon_vma_prepare(vma); 1150 if (err) 1151 return err; 1152 1153 /* 1154 * Optimistically try to install the guard marker pages first. If any 1155 * non-guard pages are encountered, give up and zap the range before 1156 * trying again. 1157 * 1158 * We try a few times before giving up and releasing back to userland to 1159 * loop around, releasing locks in the process to avoid contention. This 1160 * would only happen if there was a great many racing page faults. 1161 * 1162 * In most cases we should simply install the guard markers immediately 1163 * with no zap or looping. 1164 */ 1165 for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { 1166 unsigned long nr_pages = 0; 1167 1168 /* Returns < 0 on error, == 0 if success, > 0 if zap needed. 
*/ 1169 err = walk_page_range_mm(vma->vm_mm, range->start, range->end, 1170 &guard_install_walk_ops, &nr_pages); 1171 if (err < 0) 1172 return err; 1173 1174 if (err == 0) { 1175 unsigned long nr_expected_pages = 1176 PHYS_PFN(range->end - range->start); 1177 1178 VM_WARN_ON(nr_pages != nr_expected_pages); 1179 return 0; 1180 } 1181 1182 /* 1183 * OK some of the range have non-guard pages mapped, zap 1184 * them. This leaves existing guard pages in place. 1185 */ 1186 zap_page_range_single(vma, range->start, 1187 range->end - range->start, NULL); 1188 } 1189 1190 /* 1191 * We were unable to install the guard pages due to being raced by page 1192 * faults. This should not happen ordinarily. We return to userspace and 1193 * immediately retry, relieving lock contention. 1194 */ 1195 return restart_syscall(); 1196 } 1197 1198 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, 1199 unsigned long next, struct mm_walk *walk) 1200 { 1201 pud_t pudval = pudp_get(pud); 1202 1203 /* If huge, cannot have guard pages present, so no-op - skip. */ 1204 if (pud_trans_huge(pudval)) 1205 walk->action = ACTION_CONTINUE; 1206 1207 return 0; 1208 } 1209 1210 static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, 1211 unsigned long next, struct mm_walk *walk) 1212 { 1213 pmd_t pmdval = pmdp_get(pmd); 1214 1215 /* If huge, cannot have guard pages present, so no-op - skip. */ 1216 if (pmd_trans_huge(pmdval)) 1217 walk->action = ACTION_CONTINUE; 1218 1219 return 0; 1220 } 1221 1222 static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, 1223 unsigned long next, struct mm_walk *walk) 1224 { 1225 pte_t ptent = ptep_get(pte); 1226 1227 if (is_guard_pte_marker(ptent)) { 1228 /* Simply clear the PTE marker. */ 1229 pte_clear_not_present_full(walk->mm, addr, pte, false); 1230 update_mmu_cache(walk->vma, addr, pte); 1231 } 1232 1233 return 0; 1234 } 1235 1236 static const struct mm_walk_ops guard_remove_walk_ops = { 1237 .pud_entry = guard_remove_pud_entry, 1238 .pmd_entry = guard_remove_pmd_entry, 1239 .pte_entry = guard_remove_pte_entry, 1240 .walk_lock = PGWALK_RDLOCK, 1241 }; 1242 1243 static long madvise_guard_remove(struct madvise_behavior *madv_behavior) 1244 { 1245 struct vm_area_struct *vma = madv_behavior->vma; 1246 struct madvise_behavior_range *range = &madv_behavior->range; 1247 1248 /* 1249 * We're ok with removing guards in mlock()'d ranges, as this is a 1250 * non-destructive action. 1251 */ 1252 if (!is_valid_guard_vma(vma, /* allow_locked = */true)) 1253 return -EINVAL; 1254 1255 return walk_page_range_vma(vma, range->start, range->end, 1256 &guard_remove_walk_ops, NULL); 1257 } 1258 1259 /* 1260 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1261 * will handle splitting a vm area into separate areas, each area with its own 1262 * behavior. 
1263 */ 1264 static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) 1265 { 1266 int behavior = madv_behavior->behavior; 1267 struct vm_area_struct *vma = madv_behavior->vma; 1268 vm_flags_t new_flags = vma->vm_flags; 1269 struct madvise_behavior_range *range = &madv_behavior->range; 1270 int error; 1271 1272 if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) 1273 return -EPERM; 1274 1275 switch (behavior) { 1276 case MADV_REMOVE: 1277 return madvise_remove(madv_behavior); 1278 case MADV_WILLNEED: 1279 return madvise_willneed(madv_behavior); 1280 case MADV_COLD: 1281 return madvise_cold(madv_behavior); 1282 case MADV_PAGEOUT: 1283 return madvise_pageout(madv_behavior); 1284 case MADV_FREE: 1285 case MADV_DONTNEED: 1286 case MADV_DONTNEED_LOCKED: 1287 return madvise_dontneed_free(madv_behavior); 1288 case MADV_COLLAPSE: 1289 return madvise_collapse(vma, range->start, range->end, 1290 &madv_behavior->lock_dropped); 1291 case MADV_GUARD_INSTALL: 1292 return madvise_guard_install(madv_behavior); 1293 case MADV_GUARD_REMOVE: 1294 return madvise_guard_remove(madv_behavior); 1295 1296 /* The below behaviours update VMAs via madvise_update_vma(). */ 1297 1298 case MADV_NORMAL: 1299 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 1300 break; 1301 case MADV_SEQUENTIAL: 1302 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 1303 break; 1304 case MADV_RANDOM: 1305 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 1306 break; 1307 case MADV_DONTFORK: 1308 new_flags |= VM_DONTCOPY; 1309 break; 1310 case MADV_DOFORK: 1311 if (new_flags & VM_IO) 1312 return -EINVAL; 1313 new_flags &= ~VM_DONTCOPY; 1314 break; 1315 case MADV_WIPEONFORK: 1316 /* MADV_WIPEONFORK is only supported on anonymous memory. */ 1317 if (vma->vm_file || new_flags & VM_SHARED) 1318 return -EINVAL; 1319 new_flags |= VM_WIPEONFORK; 1320 break; 1321 case MADV_KEEPONFORK: 1322 if (new_flags & VM_DROPPABLE) 1323 return -EINVAL; 1324 new_flags &= ~VM_WIPEONFORK; 1325 break; 1326 case MADV_DONTDUMP: 1327 new_flags |= VM_DONTDUMP; 1328 break; 1329 case MADV_DODUMP: 1330 if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || 1331 (new_flags & VM_DROPPABLE)) 1332 return -EINVAL; 1333 new_flags &= ~VM_DONTDUMP; 1334 break; 1335 case MADV_MERGEABLE: 1336 case MADV_UNMERGEABLE: 1337 error = ksm_madvise(vma, range->start, range->end, 1338 behavior, &new_flags); 1339 if (error) 1340 goto out; 1341 break; 1342 case MADV_HUGEPAGE: 1343 case MADV_NOHUGEPAGE: 1344 error = hugepage_madvise(vma, &new_flags, behavior); 1345 if (error) 1346 goto out; 1347 break; 1348 case __MADV_SET_ANON_VMA_NAME: 1349 /* Only anonymous mappings can be named */ 1350 if (vma->vm_file && !vma_is_anon_shmem(vma)) 1351 return -EBADF; 1352 break; 1353 } 1354 1355 /* This is a write operation.*/ 1356 VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); 1357 1358 error = madvise_update_vma(new_flags, madv_behavior); 1359 out: 1360 /* 1361 * madvise() returns EAGAIN if kernel resources, such as 1362 * slab, are temporarily unavailable. 1363 */ 1364 if (error == -ENOMEM) 1365 error = -EAGAIN; 1366 return error; 1367 } 1368 1369 #ifdef CONFIG_MEMORY_FAILURE 1370 /* 1371 * Error injection support for memory error handling. 
1372 */ 1373 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1374 { 1375 unsigned long size; 1376 unsigned long start = madv_behavior->range.start; 1377 unsigned long end = madv_behavior->range.end; 1378 1379 if (!capable(CAP_SYS_ADMIN)) 1380 return -EPERM; 1381 1382 for (; start < end; start += size) { 1383 unsigned long pfn; 1384 struct page *page; 1385 int ret; 1386 1387 ret = get_user_pages_fast(start, 1, 0, &page); 1388 if (ret != 1) 1389 return ret; 1390 pfn = page_to_pfn(page); 1391 1392 /* 1393 * When soft offlining hugepages, after migrating the page 1394 * we dissolve it, therefore in the second loop "page" will 1395 * no longer be a compound page. 1396 */ 1397 size = page_size(compound_head(page)); 1398 1399 if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { 1400 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1401 pfn, start); 1402 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1403 } else { 1404 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1405 pfn, start); 1406 ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); 1407 if (ret == -EOPNOTSUPP) 1408 ret = 0; 1409 } 1410 1411 if (ret) 1412 return ret; 1413 } 1414 1415 return 0; 1416 } 1417 1418 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1419 { 1420 switch (madv_behavior->behavior) { 1421 case MADV_HWPOISON: 1422 case MADV_SOFT_OFFLINE: 1423 return true; 1424 default: 1425 return false; 1426 } 1427 } 1428 1429 #else 1430 1431 static int madvise_inject_error(struct madvise_behavior *madv_behavior) 1432 { 1433 return 0; 1434 } 1435 1436 static bool is_memory_failure(struct madvise_behavior *madv_behavior) 1437 { 1438 return false; 1439 } 1440 1441 #endif /* CONFIG_MEMORY_FAILURE */ 1442 1443 static bool 1444 madvise_behavior_valid(int behavior) 1445 { 1446 switch (behavior) { 1447 case MADV_DOFORK: 1448 case MADV_DONTFORK: 1449 case MADV_NORMAL: 1450 case MADV_SEQUENTIAL: 1451 case MADV_RANDOM: 1452 case MADV_REMOVE: 1453 case MADV_WILLNEED: 1454 case MADV_DONTNEED: 1455 case MADV_DONTNEED_LOCKED: 1456 case MADV_FREE: 1457 case MADV_COLD: 1458 case MADV_PAGEOUT: 1459 case MADV_POPULATE_READ: 1460 case MADV_POPULATE_WRITE: 1461 #ifdef CONFIG_KSM 1462 case MADV_MERGEABLE: 1463 case MADV_UNMERGEABLE: 1464 #endif 1465 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1466 case MADV_HUGEPAGE: 1467 case MADV_NOHUGEPAGE: 1468 case MADV_COLLAPSE: 1469 #endif 1470 case MADV_DONTDUMP: 1471 case MADV_DODUMP: 1472 case MADV_WIPEONFORK: 1473 case MADV_KEEPONFORK: 1474 case MADV_GUARD_INSTALL: 1475 case MADV_GUARD_REMOVE: 1476 #ifdef CONFIG_MEMORY_FAILURE 1477 case MADV_SOFT_OFFLINE: 1478 case MADV_HWPOISON: 1479 #endif 1480 return true; 1481 1482 default: 1483 return false; 1484 } 1485 } 1486 1487 /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ 1488 static bool process_madvise_remote_valid(int behavior) 1489 { 1490 switch (behavior) { 1491 case MADV_COLD: 1492 case MADV_PAGEOUT: 1493 case MADV_WILLNEED: 1494 case MADV_COLLAPSE: 1495 return true; 1496 default: 1497 return false; 1498 } 1499 } 1500 1501 /* 1502 * Try to acquire a VMA read lock if possible. 1503 * 1504 * We only support this lock over a single VMA, which the input range must 1505 * span either partially or fully. 1506 * 1507 * This function always returns with an appropriate lock held. If a VMA read 1508 * lock could be acquired, we return true and set madv_behavior state 1509 * accordingly. 
1510 * 1511 * If a VMA read lock could not be acquired, we return false and expect caller to 1512 * fallback to mmap lock behaviour. 1513 */ 1514 static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) 1515 { 1516 struct mm_struct *mm = madv_behavior->mm; 1517 struct vm_area_struct *vma; 1518 1519 vma = lock_vma_under_rcu(mm, madv_behavior->range.start); 1520 if (!vma) 1521 goto take_mmap_read_lock; 1522 /* 1523 * Must span only a single VMA; uffd and remote processes are 1524 * unsupported. 1525 */ 1526 if (madv_behavior->range.end > vma->vm_end || current->mm != mm || 1527 userfaultfd_armed(vma)) { 1528 vma_end_read(vma); 1529 goto take_mmap_read_lock; 1530 } 1531 madv_behavior->vma = vma; 1532 return true; 1533 1534 take_mmap_read_lock: 1535 mmap_read_lock(mm); 1536 madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; 1537 return false; 1538 } 1539 1540 /* 1541 * Walk the vmas in range [start,end), and call the madvise_vma_behavior 1542 * function on each one. The function will get start and end parameters that 1543 * cover the overlap between the current vma and the original range. Any 1544 * unmapped regions in the original range will result in this function returning 1545 * -ENOMEM while still calling the madvise_vma_behavior function on all of the 1546 * existing vmas in the range. Must be called with the mmap_lock held for 1547 * reading or writing. 1548 */ 1549 static 1550 int madvise_walk_vmas(struct madvise_behavior *madv_behavior) 1551 { 1552 struct mm_struct *mm = madv_behavior->mm; 1553 struct madvise_behavior_range *range = &madv_behavior->range; 1554 /* range is updated to span each VMA, so store end of entire range. */ 1555 unsigned long last_end = range->end; 1556 int unmapped_error = 0; 1557 int error; 1558 struct vm_area_struct *prev, *vma; 1559 1560 /* 1561 * If VMA read lock is supported, apply madvise to a single VMA 1562 * tentatively, avoiding walking VMAs. 1563 */ 1564 if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && 1565 try_vma_read_lock(madv_behavior)) { 1566 error = madvise_vma_behavior(madv_behavior); 1567 vma_end_read(madv_behavior->vma); 1568 return error; 1569 } 1570 1571 vma = find_vma_prev(mm, range->start, &prev); 1572 if (vma && range->start > vma->vm_start) 1573 prev = vma; 1574 1575 for (;;) { 1576 /* Still start < end. */ 1577 if (!vma) 1578 return -ENOMEM; 1579 1580 /* Here start < (last_end|vma->vm_end). */ 1581 if (range->start < vma->vm_start) { 1582 /* 1583 * This indicates a gap between VMAs in the input 1584 * range. This does not cause the operation to abort, 1585 * rather we simply return -ENOMEM to indicate that this 1586 * has happened, but carry on. 1587 */ 1588 unmapped_error = -ENOMEM; 1589 range->start = vma->vm_start; 1590 if (range->start >= last_end) 1591 break; 1592 } 1593 1594 /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ 1595 range->end = min(vma->vm_end, last_end); 1596 1597 /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */ 1598 madv_behavior->prev = prev; 1599 madv_behavior->vma = vma; 1600 error = madvise_vma_behavior(madv_behavior); 1601 if (error) 1602 return error; 1603 if (madv_behavior->lock_dropped) { 1604 /* We dropped the mmap lock, we can't ref the VMA. 
*/ 1605 prev = NULL; 1606 vma = NULL; 1607 madv_behavior->lock_dropped = false; 1608 } else { 1609 vma = madv_behavior->vma; 1610 prev = vma; 1611 } 1612 1613 if (vma && range->end < vma->vm_end) 1614 range->end = vma->vm_end; 1615 if (range->end >= last_end) 1616 break; 1617 1618 vma = find_vma(mm, vma ? vma->vm_end : range->end); 1619 range->start = range->end; 1620 } 1621 1622 return unmapped_error; 1623 } 1624 1625 /* 1626 * Any behaviour which results in changes to the vma->vm_flags needs to 1627 * take mmap_lock for writing. Others, which simply traverse vmas, need 1628 * to only take it for reading. 1629 */ 1630 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) 1631 { 1632 if (is_memory_failure(madv_behavior)) 1633 return MADVISE_NO_LOCK; 1634 1635 switch (madv_behavior->behavior) { 1636 case MADV_REMOVE: 1637 case MADV_WILLNEED: 1638 case MADV_COLD: 1639 case MADV_PAGEOUT: 1640 case MADV_POPULATE_READ: 1641 case MADV_POPULATE_WRITE: 1642 case MADV_COLLAPSE: 1643 case MADV_GUARD_INSTALL: 1644 case MADV_GUARD_REMOVE: 1645 return MADVISE_MMAP_READ_LOCK; 1646 case MADV_DONTNEED: 1647 case MADV_DONTNEED_LOCKED: 1648 case MADV_FREE: 1649 return MADVISE_VMA_READ_LOCK; 1650 default: 1651 return MADVISE_MMAP_WRITE_LOCK; 1652 } 1653 } 1654 1655 static int madvise_lock(struct madvise_behavior *madv_behavior) 1656 { 1657 struct mm_struct *mm = madv_behavior->mm; 1658 enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); 1659 1660 switch (lock_mode) { 1661 case MADVISE_NO_LOCK: 1662 break; 1663 case MADVISE_MMAP_WRITE_LOCK: 1664 if (mmap_write_lock_killable(mm)) 1665 return -EINTR; 1666 break; 1667 case MADVISE_MMAP_READ_LOCK: 1668 mmap_read_lock(mm); 1669 break; 1670 case MADVISE_VMA_READ_LOCK: 1671 /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ 1672 break; 1673 } 1674 1675 madv_behavior->lock_mode = lock_mode; 1676 return 0; 1677 } 1678 1679 static void madvise_unlock(struct madvise_behavior *madv_behavior) 1680 { 1681 struct mm_struct *mm = madv_behavior->mm; 1682 1683 switch (madv_behavior->lock_mode) { 1684 case MADVISE_NO_LOCK: 1685 return; 1686 case MADVISE_MMAP_WRITE_LOCK: 1687 mmap_write_unlock(mm); 1688 break; 1689 case MADVISE_MMAP_READ_LOCK: 1690 mmap_read_unlock(mm); 1691 break; 1692 case MADVISE_VMA_READ_LOCK: 1693 /* We will drop the lock per-VMA in madvise_walk_vmas(). 
*/ 1694 break; 1695 } 1696 1697 madv_behavior->lock_mode = MADVISE_NO_LOCK; 1698 } 1699 1700 static bool madvise_batch_tlb_flush(int behavior) 1701 { 1702 switch (behavior) { 1703 case MADV_DONTNEED: 1704 case MADV_DONTNEED_LOCKED: 1705 case MADV_FREE: 1706 return true; 1707 default: 1708 return false; 1709 } 1710 } 1711 1712 static void madvise_init_tlb(struct madvise_behavior *madv_behavior) 1713 { 1714 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1715 tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); 1716 } 1717 1718 static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) 1719 { 1720 if (madvise_batch_tlb_flush(madv_behavior->behavior)) 1721 tlb_finish_mmu(madv_behavior->tlb); 1722 } 1723 1724 static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) 1725 { 1726 size_t len; 1727 1728 if (!madvise_behavior_valid(behavior)) 1729 return false; 1730 1731 if (!PAGE_ALIGNED(start)) 1732 return false; 1733 len = PAGE_ALIGN(len_in); 1734 1735 /* Check to see whether len was rounded up from small -ve to zero */ 1736 if (len_in && !len) 1737 return false; 1738 1739 if (start + len < start) 1740 return false; 1741 1742 return true; 1743 } 1744 1745 /* 1746 * madvise_should_skip() - Return whether the request is invalid or would do nothing. 1747 * @start: Start address of madvise-requested address range. 1748 * @len_in: Length of madvise-requested address range. 1749 * @behavior: Requested madvise behavior. 1750 * @err: Pointer to store an error code from the check. 1751 * 1752 * If the specified behaviour is invalid or nothing would occur, we skip the 1753 * operation. This function returns true in those cases, otherwise false. In 1754 * the former case we store an error code in @err. 1755 */ 1756 static bool madvise_should_skip(unsigned long start, size_t len_in, 1757 int behavior, int *err) 1758 { 1759 if (!is_valid_madvise(start, len_in, behavior)) { 1760 *err = -EINVAL; 1761 return true; 1762 } 1763 if (start + PAGE_ALIGN(len_in) == start) { 1764 *err = 0; 1765 return true; 1766 } 1767 return false; 1768 } 1769 1770 static bool is_madvise_populate(struct madvise_behavior *madv_behavior) 1771 { 1772 switch (madv_behavior->behavior) { 1773 case MADV_POPULATE_READ: 1774 case MADV_POPULATE_WRITE: 1775 return true; 1776 default: 1777 return false; 1778 } 1779 } 1780 1781 /* 1782 * untagged_addr_remote() assumes mmap_lock is already held. On 1783 * architectures like x86 and RISC-V, tagging is tricky because each 1784 * mm may have a different tagging mask. However, we might only hold 1785 * the per-VMA lock (currently only local processes are supported), 1786 * so untagged_addr is used to avoid the mmap_lock assertion for 1787 * local processes. 1788 */ 1789 static inline unsigned long get_untagged_addr(struct mm_struct *mm, 1790 unsigned long start) 1791 { 1792 return current->mm == mm ?
untagged_addr(start) : 1793 untagged_addr_remote(mm, start); 1794 } 1795 1796 static int madvise_do_behavior(unsigned long start, size_t len_in, 1797 struct madvise_behavior *madv_behavior) 1798 { 1799 struct blk_plug plug; 1800 int error; 1801 struct madvise_behavior_range *range = &madv_behavior->range; 1802 1803 if (is_memory_failure(madv_behavior)) { 1804 range->start = start; 1805 range->end = start + len_in; 1806 return madvise_inject_error(madv_behavior); 1807 } 1808 1809 range->start = get_untagged_addr(madv_behavior->mm, start); 1810 range->end = range->start + PAGE_ALIGN(len_in); 1811 1812 blk_start_plug(&plug); 1813 if (is_madvise_populate(madv_behavior)) 1814 error = madvise_populate(madv_behavior); 1815 else 1816 error = madvise_walk_vmas(madv_behavior); 1817 blk_finish_plug(&plug); 1818 return error; 1819 } 1820 1821 /* 1822 * The madvise(2) system call. 1823 * 1824 * Applications can use madvise() to advise the kernel how it should 1825 * handle paging I/O in this VM area. The idea is to help the kernel 1826 * use appropriate read-ahead and caching techniques. The information 1827 * provided is advisory only, and can be safely disregarded by the 1828 * kernel without affecting the correct operation of the application. 1829 * 1830 * behavior values: 1831 * MADV_NORMAL - the default behavior is to read clusters. This 1832 * results in some read-ahead and read-behind. 1833 * MADV_RANDOM - the system should read the minimum amount of data 1834 * on any access, since it is unlikely that the appli- 1835 * cation will need more than what it asks for. 1836 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 1837 * once, so they can be aggressively read ahead, and 1838 * can be freed soon after they are accessed. 1839 * MADV_WILLNEED - the application is notifying the system to read 1840 * some pages ahead. 1841 * MADV_DONTNEED - the application is finished with the given range, 1842 * so the kernel can free resources associated with it. 1843 * MADV_FREE - the application marks pages in the given range as lazy free, 1844 * where actual purges are postponed until memory pressure happens. 1845 * MADV_REMOVE - the application wants to free up the given range of 1846 * pages and associated backing store. 1847 * MADV_DONTFORK - omit this area from child's address space when forking: 1848 * typically, to avoid COWing pages pinned by get_user_pages(). 1849 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 1850 * MADV_WIPEONFORK - present the child process with zero-filled memory in this 1851 * range after a fork. 1852 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 1853 * MADV_HWPOISON - trigger memory error handler as if the given memory range 1854 * were corrupted by unrecoverable hardware memory failure. 1855 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 1856 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 1857 * this area with pages of identical content from other such areas. 1858 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 1859 * MADV_HUGEPAGE - the application wants to back the given range by transparent 1860 * huge pages in the future. Existing pages might be coalesced and 1861 * new pages might be allocated as THP. 1862 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 1863 * transparent huge pages so the existing pages will not be 1864 * coalesced into THP and new pages will not be allocated as THP. 
1865 * MADV_COLLAPSE - synchronously coalesce pages into new THP. 1866 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1867 * from being included in its core dump. 1868 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1869 * MADV_COLD - the application is not expected to use this memory soon, 1870 * deactivate pages in this range so that they can be reclaimed 1871 * easily if memory pressure happens. 1872 * MADV_PAGEOUT - the application is not expected to use this memory soon, 1873 * page out the pages in this range immediately. 1874 * MADV_POPULATE_READ - populate (prefault) page tables readable by 1875 * triggering read faults if required 1876 * MADV_POPULATE_WRITE - populate (prefault) page tables writable by 1877 * triggering write faults if required 1878 * 1879 * return values: 1880 * zero - success 1881 * -EINVAL - start + len < 0, start is not page-aligned, 1882 * "behavior" is not a valid value, or application 1883 * is attempting to release locked or shared pages, 1884 * or the specified address range includes file, Huge TLB, 1885 * MAP_SHARED or VM_PFNMAP range. 1886 * -ENOMEM - addresses in the specified range are not currently 1887 * mapped, or are outside the AS of the process. 1888 * -EIO - an I/O error occurred while paging in data. 1889 * -EBADF - map exists, but area maps something that isn't a file. 1890 * -EAGAIN - a kernel resource was temporarily unavailable. 1891 * -EPERM - memory is sealed. 1892 */ 1893 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1894 { 1895 int error; 1896 struct mmu_gather tlb; 1897 struct madvise_behavior madv_behavior = { 1898 .mm = mm, 1899 .behavior = behavior, 1900 .tlb = &tlb, 1901 }; 1902 1903 if (madvise_should_skip(start, len_in, behavior, &error)) 1904 return error; 1905 error = madvise_lock(&madv_behavior); 1906 if (error) 1907 return error; 1908 madvise_init_tlb(&madv_behavior); 1909 error = madvise_do_behavior(start, len_in, &madv_behavior); 1910 madvise_finish_tlb(&madv_behavior); 1911 madvise_unlock(&madv_behavior); 1912 1913 return error; 1914 } 1915 1916 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1917 { 1918 return do_madvise(current->mm, start, len_in, behavior); 1919 } 1920 1921 /* Perform an madvise operation over a vector of addresses and lengths. */ 1922 static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, 1923 int behavior) 1924 { 1925 ssize_t ret = 0; 1926 size_t total_len; 1927 struct mmu_gather tlb; 1928 struct madvise_behavior madv_behavior = { 1929 .mm = mm, 1930 .behavior = behavior, 1931 .tlb = &tlb, 1932 }; 1933 1934 total_len = iov_iter_count(iter); 1935 1936 ret = madvise_lock(&madv_behavior); 1937 if (ret) 1938 return ret; 1939 madvise_init_tlb(&madv_behavior); 1940 1941 while (iov_iter_count(iter)) { 1942 unsigned long start = (unsigned long)iter_iov_addr(iter); 1943 size_t len_in = iter_iov_len(iter); 1944 int error; 1945 1946 if (madvise_should_skip(start, len_in, behavior, &error)) 1947 ret = error; 1948 else 1949 ret = madvise_do_behavior(start, len_in, &madv_behavior); 1950 /* 1951 * An madvise operation is attempting to restart the syscall, 1952 * but we cannot proceed as it would not be correct to repeat 1953 * the operation in aggregate, and would be surprising to the 1954 * user. 1955 * 1956 * We drop and reacquire locks so it is safe to just loop and 1957 * try again. We check for fatal signals in case we need to exit 1958 * early anyway.
1959 */ 1960 if (ret == -ERESTARTNOINTR) { 1961 if (fatal_signal_pending(current)) { 1962 ret = -EINTR; 1963 break; 1964 } 1965 1966 /* Drop and reacquire lock to unwind race. */ 1967 madvise_finish_tlb(&madv_behavior); 1968 madvise_unlock(&madv_behavior); 1969 ret = madvise_lock(&madv_behavior); 1970 if (ret) 1971 goto out; 1972 madvise_init_tlb(&madv_behavior); 1973 continue; 1974 } 1975 if (ret < 0) 1976 break; 1977 iov_iter_advance(iter, iter_iov_len(iter)); 1978 } 1979 madvise_finish_tlb(&madv_behavior); 1980 madvise_unlock(&madv_behavior); 1981 1982 out: 1983 ret = (total_len - iov_iter_count(iter)) ? : ret; 1984 1985 return ret; 1986 } 1987 1988 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1989 size_t, vlen, int, behavior, unsigned int, flags) 1990 { 1991 ssize_t ret; 1992 struct iovec iovstack[UIO_FASTIOV]; 1993 struct iovec *iov = iovstack; 1994 struct iov_iter iter; 1995 struct task_struct *task; 1996 struct mm_struct *mm; 1997 unsigned int f_flags; 1998 1999 if (flags != 0) { 2000 ret = -EINVAL; 2001 goto out; 2002 } 2003 2004 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 2005 if (ret < 0) 2006 goto out; 2007 2008 task = pidfd_get_task(pidfd, &f_flags); 2009 if (IS_ERR(task)) { 2010 ret = PTR_ERR(task); 2011 goto free_iov; 2012 } 2013 2014 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 2015 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 2016 if (IS_ERR(mm)) { 2017 ret = PTR_ERR(mm); 2018 goto release_task; 2019 } 2020 2021 /* 2022 * We need only perform this check if we are attempting to manipulate a 2023 * remote process's address space. 2024 */ 2025 if (mm != current->mm && !process_madvise_remote_valid(behavior)) { 2026 ret = -EINVAL; 2027 goto release_mm; 2028 } 2029 2030 /* 2031 * Require CAP_SYS_NICE for influencing process performance. Note that 2032 * only non-destructive hints are currently supported for remote 2033 * processes. 
2034 */ 2035 if (mm != current->mm && !capable(CAP_SYS_NICE)) { 2036 ret = -EPERM; 2037 goto release_mm; 2038 } 2039 2040 ret = vector_madvise(mm, &iter, behavior); 2041 2042 release_mm: 2043 mmput(mm); 2044 release_task: 2045 put_task_struct(task); 2046 free_iov: 2047 kfree(iov); 2048 out: 2049 return ret; 2050 } 2051 2052 #ifdef CONFIG_ANON_VMA_NAME 2053 2054 #define ANON_VMA_NAME_MAX_LEN 80 2055 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" 2056 2057 static inline bool is_valid_name_char(char ch) 2058 { 2059 /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ 2060 return ch > 0x1f && ch < 0x7f && 2061 !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); 2062 } 2063 2064 static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 2065 unsigned long len_in, struct anon_vma_name *anon_name) 2066 { 2067 unsigned long end; 2068 unsigned long len; 2069 int error; 2070 struct madvise_behavior madv_behavior = { 2071 .mm = mm, 2072 .behavior = __MADV_SET_ANON_VMA_NAME, 2073 .anon_name = anon_name, 2074 }; 2075 2076 if (start & ~PAGE_MASK) 2077 return -EINVAL; 2078 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 2079 2080 /* Check to see whether len was rounded up from small -ve to zero */ 2081 if (len_in && !len) 2082 return -EINVAL; 2083 2084 end = start + len; 2085 if (end < start) 2086 return -EINVAL; 2087 2088 if (end == start) 2089 return 0; 2090 2091 madv_behavior.range.start = start; 2092 madv_behavior.range.end = end; 2093 2094 error = madvise_lock(&madv_behavior); 2095 if (error) 2096 return error; 2097 error = madvise_walk_vmas(&madv_behavior); 2098 madvise_unlock(&madv_behavior); 2099 2100 return error; 2101 } 2102 2103 int set_anon_vma_name(unsigned long addr, unsigned long size, 2104 const char __user *uname) 2105 { 2106 struct anon_vma_name *anon_name = NULL; 2107 struct mm_struct *mm = current->mm; 2108 int error; 2109 2110 if (uname) { 2111 char *name, *pch; 2112 2113 name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); 2114 if (IS_ERR(name)) 2115 return PTR_ERR(name); 2116 2117 for (pch = name; *pch != '\0'; pch++) { 2118 if (!is_valid_name_char(*pch)) { 2119 kfree(name); 2120 return -EINVAL; 2121 } 2122 } 2123 /* anon_vma has its own copy */ 2124 anon_name = anon_vma_name_alloc(name); 2125 kfree(name); 2126 if (!anon_name) 2127 return -ENOMEM; 2128 } 2129 2130 error = madvise_set_anon_name(mm, addr, size, anon_name); 2131 anon_vma_name_put(anon_name); 2132 2133 return error; 2134 } 2135 #endif 2136
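/*
 * Editor's note: the block below is an illustrative userspace sketch, not
 * part of the kernel source and not compiled (it is guarded by #if 0). It
 * shows how the MADV_FREE and MADV_DONTNEED paths implemented above are
 * typically exercised from userspace, assuming a libc whose <sys/mman.h>
 * exposes the MADV_FREE constant.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0xaa, len);	/* fault the pages in */

	/* Mark the range lazily freeable; reclaim may discard it later. */
	if (madvise(p, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	/* Or discard immediately; later reads observe zero-filled pages. */
	if (madvise(p, len, MADV_DONTNEED))
		perror("madvise(MADV_DONTNEED)");

	printf("first byte after MADV_DONTNEED: %d\n", p[0]);
	munmap(p, len);
	return 0;
}
#endif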