1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/madvise.c 4 * 5 * Copyright (C) 1999 Linus Torvalds 6 * Copyright (C) 2002 Christoph Hellwig 7 */ 8 9 #include <linux/mman.h> 10 #include <linux/pagemap.h> 11 #include <linux/syscalls.h> 12 #include <linux/mempolicy.h> 13 #include <linux/page-isolation.h> 14 #include <linux/page_idle.h> 15 #include <linux/userfaultfd_k.h> 16 #include <linux/hugetlb.h> 17 #include <linux/falloc.h> 18 #include <linux/fadvise.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/mm_inline.h> 22 #include <linux/string.h> 23 #include <linux/uio.h> 24 #include <linux/ksm.h> 25 #include <linux/fs.h> 26 #include <linux/file.h> 27 #include <linux/blkdev.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagewalk.h> 30 #include <linux/swap.h> 31 #include <linux/swapops.h> 32 #include <linux/shmem_fs.h> 33 #include <linux/mmu_notifier.h> 34 35 #include <asm/tlb.h> 36 37 #include "internal.h" 38 39 struct madvise_walk_private { 40 struct mmu_gather *tlb; 41 bool pageout; 42 }; 43 44 /* 45 * Any behaviour which results in changes to the vma->vm_flags needs to 46 * take mmap_lock for writing. Others, which simply traverse vmas, need 47 * to only take it for reading. 48 */ 49 static int madvise_need_mmap_write(int behavior) 50 { 51 switch (behavior) { 52 case MADV_REMOVE: 53 case MADV_WILLNEED: 54 case MADV_DONTNEED: 55 case MADV_COLD: 56 case MADV_PAGEOUT: 57 case MADV_FREE: 58 case MADV_POPULATE_READ: 59 case MADV_POPULATE_WRITE: 60 return 0; 61 default: 62 /* be safe, default to 1. 
list exceptions explicitly */ 63 return 1; 64 } 65 } 66 67 #ifdef CONFIG_ANON_VMA_NAME 68 struct anon_vma_name *anon_vma_name_alloc(const char *name) 69 { 70 struct anon_vma_name *anon_name; 71 size_t count; 72 73 /* Add 1 for NUL terminator at the end of the anon_name->name */ 74 count = strlen(name) + 1; 75 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 76 if (anon_name) { 77 kref_init(&anon_name->kref); 78 memcpy(anon_name->name, name, count); 79 } 80 81 return anon_name; 82 } 83 84 void anon_vma_name_free(struct kref *kref) 85 { 86 struct anon_vma_name *anon_name = 87 container_of(kref, struct anon_vma_name, kref); 88 kfree(anon_name); 89 } 90 91 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 92 { 93 mmap_assert_locked(vma->vm_mm); 94 95 if (vma->vm_file) 96 return NULL; 97 98 return vma->anon_name; 99 } 100 101 /* mmap_lock should be write-locked */ 102 static int replace_anon_vma_name(struct vm_area_struct *vma, 103 struct anon_vma_name *anon_name) 104 { 105 struct anon_vma_name *orig_name = anon_vma_name(vma); 106 107 if (!anon_name) { 108 vma->anon_name = NULL; 109 anon_vma_name_put(orig_name); 110 return 0; 111 } 112 113 if (anon_vma_name_eq(orig_name, anon_name)) 114 return 0; 115 116 vma->anon_name = anon_vma_name_reuse(anon_name); 117 anon_vma_name_put(orig_name); 118 119 return 0; 120 } 121 #else /* CONFIG_ANON_VMA_NAME */ 122 static int replace_anon_vma_name(struct vm_area_struct *vma, 123 struct anon_vma_name *anon_name) 124 { 125 if (anon_name) 126 return -EINVAL; 127 128 return 0; 129 } 130 #endif /* CONFIG_ANON_VMA_NAME */ 131 /* 132 * Update the vm_flags on region of a vma, splitting it or merging it as 133 * necessary. Must be called with mmap_sem held for writing; 134 * Caller should ensure anon_name stability by raising its refcount even when 135 * anon_name belongs to a valid vma because this function might free that vma. 
136 */ 137 static int madvise_update_vma(struct vm_area_struct *vma, 138 struct vm_area_struct **prev, unsigned long start, 139 unsigned long end, unsigned long new_flags, 140 struct anon_vma_name *anon_name) 141 { 142 struct mm_struct *mm = vma->vm_mm; 143 int error; 144 pgoff_t pgoff; 145 146 if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { 147 *prev = vma; 148 return 0; 149 } 150 151 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 152 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 153 vma->vm_file, pgoff, vma_policy(vma), 154 vma->vm_userfaultfd_ctx, anon_name); 155 if (*prev) { 156 vma = *prev; 157 goto success; 158 } 159 160 *prev = vma; 161 162 if (start != vma->vm_start) { 163 if (unlikely(mm->map_count >= sysctl_max_map_count)) 164 return -ENOMEM; 165 error = __split_vma(mm, vma, start, 1); 166 if (error) 167 return error; 168 } 169 170 if (end != vma->vm_end) { 171 if (unlikely(mm->map_count >= sysctl_max_map_count)) 172 return -ENOMEM; 173 error = __split_vma(mm, vma, end, 0); 174 if (error) 175 return error; 176 } 177 178 success: 179 /* 180 * vm_flags is protected by the mmap_lock held in write mode. 
181 */ 182 vma->vm_flags = new_flags; 183 if (!vma->vm_file) { 184 error = replace_anon_vma_name(vma, anon_name); 185 if (error) 186 return error; 187 } 188 189 return 0; 190 } 191 192 #ifdef CONFIG_SWAP 193 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 194 unsigned long end, struct mm_walk *walk) 195 { 196 pte_t *orig_pte; 197 struct vm_area_struct *vma = walk->private; 198 unsigned long index; 199 200 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 201 return 0; 202 203 for (index = start; index != end; index += PAGE_SIZE) { 204 pte_t pte; 205 swp_entry_t entry; 206 struct page *page; 207 spinlock_t *ptl; 208 209 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); 210 pte = *(orig_pte + ((index - start) / PAGE_SIZE)); 211 pte_unmap_unlock(orig_pte, ptl); 212 213 if (pte_present(pte) || pte_none(pte)) 214 continue; 215 entry = pte_to_swp_entry(pte); 216 if (unlikely(non_swap_entry(entry))) 217 continue; 218 219 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 220 vma, index, false); 221 if (page) 222 put_page(page); 223 } 224 225 return 0; 226 } 227 228 static const struct mm_walk_ops swapin_walk_ops = { 229 .pmd_entry = swapin_walk_pmd_entry, 230 }; 231 232 static void force_shm_swapin_readahead(struct vm_area_struct *vma, 233 unsigned long start, unsigned long end, 234 struct address_space *mapping) 235 { 236 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 237 pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1); 238 struct page *page; 239 240 rcu_read_lock(); 241 xas_for_each(&xas, page, end_index) { 242 swp_entry_t swap; 243 244 if (!xa_is_value(page)) 245 continue; 246 xas_pause(&xas); 247 rcu_read_unlock(); 248 249 swap = radix_to_swp_entry(page); 250 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, 251 NULL, 0, false); 252 if (page) 253 put_page(page); 254 255 rcu_read_lock(); 256 } 257 rcu_read_unlock(); 258 259 lru_add_drain(); /* Push any new pages onto the LRU now */ 260 } 261 
#endif /* CONFIG_SWAP */ 262 263 /* 264 * Schedule all required I/O operations. Do not wait for completion. 265 */ 266 static long madvise_willneed(struct vm_area_struct *vma, 267 struct vm_area_struct **prev, 268 unsigned long start, unsigned long end) 269 { 270 struct mm_struct *mm = vma->vm_mm; 271 struct file *file = vma->vm_file; 272 loff_t offset; 273 274 *prev = vma; 275 #ifdef CONFIG_SWAP 276 if (!file) { 277 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); 278 lru_add_drain(); /* Push any new pages onto the LRU now */ 279 return 0; 280 } 281 282 if (shmem_mapping(file->f_mapping)) { 283 force_shm_swapin_readahead(vma, start, end, 284 file->f_mapping); 285 return 0; 286 } 287 #else 288 if (!file) 289 return -EBADF; 290 #endif 291 292 if (IS_DAX(file_inode(file))) { 293 /* no bad return value, but ignore advice */ 294 return 0; 295 } 296 297 /* 298 * Filesystem's fadvise may need to take various locks. We need to 299 * explicitly grab a reference because the vma (and hence the 300 * vma's reference to the file) can go away as soon as we drop 301 * mmap_lock. 
302 */ 303 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 304 get_file(file); 305 offset = (loff_t)(start - vma->vm_start) 306 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 307 mmap_read_unlock(mm); 308 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 309 fput(file); 310 mmap_read_lock(mm); 311 return 0; 312 } 313 314 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 315 unsigned long addr, unsigned long end, 316 struct mm_walk *walk) 317 { 318 struct madvise_walk_private *private = walk->private; 319 struct mmu_gather *tlb = private->tlb; 320 bool pageout = private->pageout; 321 struct mm_struct *mm = tlb->mm; 322 struct vm_area_struct *vma = walk->vma; 323 pte_t *orig_pte, *pte, ptent; 324 spinlock_t *ptl; 325 struct page *page = NULL; 326 LIST_HEAD(page_list); 327 328 if (fatal_signal_pending(current)) 329 return -EINTR; 330 331 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 332 if (pmd_trans_huge(*pmd)) { 333 pmd_t orig_pmd; 334 unsigned long next = pmd_addr_end(addr, end); 335 336 tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 337 ptl = pmd_trans_huge_lock(pmd, vma); 338 if (!ptl) 339 return 0; 340 341 orig_pmd = *pmd; 342 if (is_huge_zero_pmd(orig_pmd)) 343 goto huge_unlock; 344 345 if (unlikely(!pmd_present(orig_pmd))) { 346 VM_BUG_ON(thp_migration_supported() && 347 !is_pmd_migration_entry(orig_pmd)); 348 goto huge_unlock; 349 } 350 351 page = pmd_page(orig_pmd); 352 353 /* Do not interfere with other mappings of this page */ 354 if (page_mapcount(page) != 1) 355 goto huge_unlock; 356 357 if (next - addr != HPAGE_PMD_SIZE) { 358 int err; 359 360 get_page(page); 361 spin_unlock(ptl); 362 lock_page(page); 363 err = split_huge_page(page); 364 unlock_page(page); 365 put_page(page); 366 if (!err) 367 goto regular_page; 368 return 0; 369 } 370 371 if (pmd_young(orig_pmd)) { 372 pmdp_invalidate(vma, addr, pmd); 373 orig_pmd = pmd_mkold(orig_pmd); 374 375 set_pmd_at(mm, addr, pmd, orig_pmd); 376 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 377 } 378 379 
ClearPageReferenced(page); 380 test_and_clear_page_young(page); 381 if (pageout) { 382 if (!isolate_lru_page(page)) { 383 if (PageUnevictable(page)) 384 putback_lru_page(page); 385 else 386 list_add(&page->lru, &page_list); 387 } 388 } else 389 deactivate_page(page); 390 huge_unlock: 391 spin_unlock(ptl); 392 if (pageout) 393 reclaim_pages(&page_list); 394 return 0; 395 } 396 397 regular_page: 398 if (pmd_trans_unstable(pmd)) 399 return 0; 400 #endif 401 tlb_change_page_size(tlb, PAGE_SIZE); 402 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 403 flush_tlb_batched_pending(mm); 404 arch_enter_lazy_mmu_mode(); 405 for (; addr < end; pte++, addr += PAGE_SIZE) { 406 ptent = *pte; 407 408 if (pte_none(ptent)) 409 continue; 410 411 if (!pte_present(ptent)) 412 continue; 413 414 page = vm_normal_page(vma, addr, ptent); 415 if (!page) 416 continue; 417 418 /* 419 * Creating a THP page is expensive so split it only if we 420 * are sure it's worth. Split it if we are only owner. 421 */ 422 if (PageTransCompound(page)) { 423 if (page_mapcount(page) != 1) 424 break; 425 get_page(page); 426 if (!trylock_page(page)) { 427 put_page(page); 428 break; 429 } 430 pte_unmap_unlock(orig_pte, ptl); 431 if (split_huge_page(page)) { 432 unlock_page(page); 433 put_page(page); 434 pte_offset_map_lock(mm, pmd, addr, &ptl); 435 break; 436 } 437 unlock_page(page); 438 put_page(page); 439 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 440 pte--; 441 addr -= PAGE_SIZE; 442 continue; 443 } 444 445 /* Do not interfere with other mappings of this page */ 446 if (page_mapcount(page) != 1) 447 continue; 448 449 VM_BUG_ON_PAGE(PageTransCompound(page), page); 450 451 if (pte_young(ptent)) { 452 ptent = ptep_get_and_clear_full(mm, addr, pte, 453 tlb->fullmm); 454 ptent = pte_mkold(ptent); 455 set_pte_at(mm, addr, pte, ptent); 456 tlb_remove_tlb_entry(tlb, pte, addr); 457 } 458 459 /* 460 * We are deactivating a page for accelerating reclaiming. 
461 * VM couldn't reclaim the page unless we clear PG_young. 462 * As a side effect, it makes confuse idle-page tracking 463 * because they will miss recent referenced history. 464 */ 465 ClearPageReferenced(page); 466 test_and_clear_page_young(page); 467 if (pageout) { 468 if (!isolate_lru_page(page)) { 469 if (PageUnevictable(page)) 470 putback_lru_page(page); 471 else 472 list_add(&page->lru, &page_list); 473 } 474 } else 475 deactivate_page(page); 476 } 477 478 arch_leave_lazy_mmu_mode(); 479 pte_unmap_unlock(orig_pte, ptl); 480 if (pageout) 481 reclaim_pages(&page_list); 482 cond_resched(); 483 484 return 0; 485 } 486 487 static const struct mm_walk_ops cold_walk_ops = { 488 .pmd_entry = madvise_cold_or_pageout_pte_range, 489 }; 490 491 static void madvise_cold_page_range(struct mmu_gather *tlb, 492 struct vm_area_struct *vma, 493 unsigned long addr, unsigned long end) 494 { 495 struct madvise_walk_private walk_private = { 496 .pageout = false, 497 .tlb = tlb, 498 }; 499 500 tlb_start_vma(tlb, vma); 501 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 502 tlb_end_vma(tlb, vma); 503 } 504 505 static inline bool can_madv_lru_vma(struct vm_area_struct *vma) 506 { 507 return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); 508 } 509 510 static long madvise_cold(struct vm_area_struct *vma, 511 struct vm_area_struct **prev, 512 unsigned long start_addr, unsigned long end_addr) 513 { 514 struct mm_struct *mm = vma->vm_mm; 515 struct mmu_gather tlb; 516 517 *prev = vma; 518 if (!can_madv_lru_vma(vma)) 519 return -EINVAL; 520 521 lru_add_drain(); 522 tlb_gather_mmu(&tlb, mm); 523 madvise_cold_page_range(&tlb, vma, start_addr, end_addr); 524 tlb_finish_mmu(&tlb); 525 526 return 0; 527 } 528 529 static void madvise_pageout_page_range(struct mmu_gather *tlb, 530 struct vm_area_struct *vma, 531 unsigned long addr, unsigned long end) 532 { 533 struct madvise_walk_private walk_private = { 534 .pageout = true, 535 .tlb = tlb, 536 }; 537 538 
tlb_start_vma(tlb, vma); 539 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 540 tlb_end_vma(tlb, vma); 541 } 542 543 static inline bool can_do_pageout(struct vm_area_struct *vma) 544 { 545 if (vma_is_anonymous(vma)) 546 return true; 547 if (!vma->vm_file) 548 return false; 549 /* 550 * paging out pagecache only for non-anonymous mappings that correspond 551 * to the files the calling process could (if tried) open for writing; 552 * otherwise we'd be including shared non-exclusive mappings, which 553 * opens a side channel. 554 */ 555 return inode_owner_or_capable(&init_user_ns, 556 file_inode(vma->vm_file)) || 557 file_permission(vma->vm_file, MAY_WRITE) == 0; 558 } 559 560 static long madvise_pageout(struct vm_area_struct *vma, 561 struct vm_area_struct **prev, 562 unsigned long start_addr, unsigned long end_addr) 563 { 564 struct mm_struct *mm = vma->vm_mm; 565 struct mmu_gather tlb; 566 567 *prev = vma; 568 if (!can_madv_lru_vma(vma)) 569 return -EINVAL; 570 571 if (!can_do_pageout(vma)) 572 return 0; 573 574 lru_add_drain(); 575 tlb_gather_mmu(&tlb, mm); 576 madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); 577 tlb_finish_mmu(&tlb); 578 579 return 0; 580 } 581 582 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 583 unsigned long end, struct mm_walk *walk) 584 585 { 586 struct mmu_gather *tlb = walk->private; 587 struct mm_struct *mm = tlb->mm; 588 struct vm_area_struct *vma = walk->vma; 589 spinlock_t *ptl; 590 pte_t *orig_pte, *pte, ptent; 591 struct page *page; 592 int nr_swap = 0; 593 unsigned long next; 594 595 next = pmd_addr_end(addr, end); 596 if (pmd_trans_huge(*pmd)) 597 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 598 goto next; 599 600 if (pmd_trans_unstable(pmd)) 601 return 0; 602 603 tlb_change_page_size(tlb, PAGE_SIZE); 604 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 605 flush_tlb_batched_pending(mm); 606 arch_enter_lazy_mmu_mode(); 607 for (; addr != end; pte++, addr += 
PAGE_SIZE) { 608 ptent = *pte; 609 610 if (pte_none(ptent)) 611 continue; 612 /* 613 * If the pte has swp_entry, just clear page table to 614 * prevent swap-in which is more expensive rather than 615 * (page allocation + zeroing). 616 */ 617 if (!pte_present(ptent)) { 618 swp_entry_t entry; 619 620 entry = pte_to_swp_entry(ptent); 621 if (non_swap_entry(entry)) 622 continue; 623 nr_swap--; 624 free_swap_and_cache(entry); 625 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 626 continue; 627 } 628 629 page = vm_normal_page(vma, addr, ptent); 630 if (!page) 631 continue; 632 633 /* 634 * If pmd isn't transhuge but the page is THP and 635 * is owned by only this process, split it and 636 * deactivate all pages. 637 */ 638 if (PageTransCompound(page)) { 639 if (page_mapcount(page) != 1) 640 goto out; 641 get_page(page); 642 if (!trylock_page(page)) { 643 put_page(page); 644 goto out; 645 } 646 pte_unmap_unlock(orig_pte, ptl); 647 if (split_huge_page(page)) { 648 unlock_page(page); 649 put_page(page); 650 pte_offset_map_lock(mm, pmd, addr, &ptl); 651 goto out; 652 } 653 unlock_page(page); 654 put_page(page); 655 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 656 pte--; 657 addr -= PAGE_SIZE; 658 continue; 659 } 660 661 VM_BUG_ON_PAGE(PageTransCompound(page), page); 662 663 if (PageSwapCache(page) || PageDirty(page)) { 664 if (!trylock_page(page)) 665 continue; 666 /* 667 * If page is shared with others, we couldn't clear 668 * PG_dirty of the page. 669 */ 670 if (page_mapcount(page) != 1) { 671 unlock_page(page); 672 continue; 673 } 674 675 if (PageSwapCache(page) && !try_to_free_swap(page)) { 676 unlock_page(page); 677 continue; 678 } 679 680 ClearPageDirty(page); 681 unlock_page(page); 682 } 683 684 if (pte_young(ptent) || pte_dirty(ptent)) { 685 /* 686 * Some of architecture(ex, PPC) don't update TLB 687 * with set_pte_at and tlb_remove_tlb_entry so for 688 * the portability, remap the pte with old|clean 689 * after pte clearing. 
690 */ 691 ptent = ptep_get_and_clear_full(mm, addr, pte, 692 tlb->fullmm); 693 694 ptent = pte_mkold(ptent); 695 ptent = pte_mkclean(ptent); 696 set_pte_at(mm, addr, pte, ptent); 697 tlb_remove_tlb_entry(tlb, pte, addr); 698 } 699 mark_page_lazyfree(page); 700 } 701 out: 702 if (nr_swap) { 703 if (current->mm == mm) 704 sync_mm_rss(mm); 705 706 add_mm_counter(mm, MM_SWAPENTS, nr_swap); 707 } 708 arch_leave_lazy_mmu_mode(); 709 pte_unmap_unlock(orig_pte, ptl); 710 cond_resched(); 711 next: 712 return 0; 713 } 714 715 static const struct mm_walk_ops madvise_free_walk_ops = { 716 .pmd_entry = madvise_free_pte_range, 717 }; 718 719 static int madvise_free_single_vma(struct vm_area_struct *vma, 720 unsigned long start_addr, unsigned long end_addr) 721 { 722 struct mm_struct *mm = vma->vm_mm; 723 struct mmu_notifier_range range; 724 struct mmu_gather tlb; 725 726 /* MADV_FREE works for only anon vma at the moment */ 727 if (!vma_is_anonymous(vma)) 728 return -EINVAL; 729 730 range.start = max(vma->vm_start, start_addr); 731 if (range.start >= vma->vm_end) 732 return -EINVAL; 733 range.end = min(vma->vm_end, end_addr); 734 if (range.end <= vma->vm_start) 735 return -EINVAL; 736 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 737 range.start, range.end); 738 739 lru_add_drain(); 740 tlb_gather_mmu(&tlb, mm); 741 update_hiwater_rss(mm); 742 743 mmu_notifier_invalidate_range_start(&range); 744 tlb_start_vma(&tlb, vma); 745 walk_page_range(vma->vm_mm, range.start, range.end, 746 &madvise_free_walk_ops, &tlb); 747 tlb_end_vma(&tlb, vma); 748 mmu_notifier_invalidate_range_end(&range); 749 tlb_finish_mmu(&tlb); 750 751 return 0; 752 } 753 754 /* 755 * Application no longer needs these pages. If the pages are dirty, 756 * it's OK to just throw them away. The app will be more careful about 757 * data it wants to keep. Be sure to free swap resources too. 
The 758 * zap_page_range call sets things up for shrink_active_list to actually free 759 * these pages later if no one else has touched them in the meantime, 760 * although we could add these pages to a global reuse list for 761 * shrink_active_list to pick up before reclaiming other pages. 762 * 763 * NB: This interface discards data rather than pushes it out to swap, 764 * as some implementations do. This has performance implications for 765 * applications like large transactional databases which want to discard 766 * pages in anonymous maps after committing to backing store the data 767 * that was kept in them. There is no reason to write this data out to 768 * the swap area if the application is discarding it. 769 * 770 * An interface that causes the system to free clean pages and flush 771 * dirty pages is already available as msync(MS_INVALIDATE). 772 */ 773 static long madvise_dontneed_single_vma(struct vm_area_struct *vma, 774 unsigned long start, unsigned long end) 775 { 776 zap_page_range(vma, start, end - start); 777 return 0; 778 } 779 780 static long madvise_dontneed_free(struct vm_area_struct *vma, 781 struct vm_area_struct **prev, 782 unsigned long start, unsigned long end, 783 int behavior) 784 { 785 struct mm_struct *mm = vma->vm_mm; 786 787 *prev = vma; 788 if (!can_madv_lru_vma(vma)) 789 return -EINVAL; 790 791 if (!userfaultfd_remove(vma, start, end)) { 792 *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 793 794 mmap_read_lock(mm); 795 vma = find_vma(mm, start); 796 if (!vma) 797 return -ENOMEM; 798 if (start < vma->vm_start) { 799 /* 800 * This "vma" under revalidation is the one 801 * with the lowest vma->vm_start where start 802 * is also < vma->vm_end. If start < 803 * vma->vm_start it means an hole materialized 804 * in the user address space within the 805 * virtual range passed to MADV_DONTNEED 806 * or MADV_FREE. 
807 */ 808 return -ENOMEM; 809 } 810 if (!can_madv_lru_vma(vma)) 811 return -EINVAL; 812 if (end > vma->vm_end) { 813 /* 814 * Don't fail if end > vma->vm_end. If the old 815 * vma was split while the mmap_lock was 816 * released the effect of the concurrent 817 * operation may not cause madvise() to 818 * have an undefined result. There may be an 819 * adjacent next vma that we'll walk 820 * next. userfaultfd_remove() will generate an 821 * UFFD_EVENT_REMOVE repetition on the 822 * end-vma->vm_end range, but the manager can 823 * handle a repetition fine. 824 */ 825 end = vma->vm_end; 826 } 827 VM_WARN_ON(start >= end); 828 } 829 830 if (behavior == MADV_DONTNEED) 831 return madvise_dontneed_single_vma(vma, start, end); 832 else if (behavior == MADV_FREE) 833 return madvise_free_single_vma(vma, start, end); 834 else 835 return -EINVAL; 836 } 837 838 static long madvise_populate(struct vm_area_struct *vma, 839 struct vm_area_struct **prev, 840 unsigned long start, unsigned long end, 841 int behavior) 842 { 843 const bool write = behavior == MADV_POPULATE_WRITE; 844 struct mm_struct *mm = vma->vm_mm; 845 unsigned long tmp_end; 846 int locked = 1; 847 long pages; 848 849 *prev = vma; 850 851 while (start < end) { 852 /* 853 * We might have temporarily dropped the lock. For example, 854 * our VMA might have been split. 855 */ 856 if (!vma || start >= vma->vm_end) { 857 vma = vma_lookup(mm, start); 858 if (!vma) 859 return -ENOMEM; 860 } 861 862 tmp_end = min_t(unsigned long, end, vma->vm_end); 863 /* Populate (prefault) page tables readable/writable. */ 864 pages = faultin_vma_page_range(vma, start, tmp_end, write, 865 &locked); 866 if (!locked) { 867 mmap_read_lock(mm); 868 locked = 1; 869 *prev = NULL; 870 vma = NULL; 871 } 872 if (pages < 0) { 873 switch (pages) { 874 case -EINTR: 875 return -EINTR; 876 case -EINVAL: /* Incompatible mappings / permissions. 
*/ 877 return -EINVAL; 878 case -EHWPOISON: 879 return -EHWPOISON; 880 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 881 return -EFAULT; 882 default: 883 pr_warn_once("%s: unhandled return value: %ld\n", 884 __func__, pages); 885 fallthrough; 886 case -ENOMEM: 887 return -ENOMEM; 888 } 889 } 890 start += pages * PAGE_SIZE; 891 } 892 return 0; 893 } 894 895 /* 896 * Application wants to free up the pages and associated backing store. 897 * This is effectively punching a hole into the middle of a file. 898 */ 899 static long madvise_remove(struct vm_area_struct *vma, 900 struct vm_area_struct **prev, 901 unsigned long start, unsigned long end) 902 { 903 loff_t offset; 904 int error; 905 struct file *f; 906 struct mm_struct *mm = vma->vm_mm; 907 908 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 909 910 if (vma->vm_flags & VM_LOCKED) 911 return -EINVAL; 912 913 f = vma->vm_file; 914 915 if (!f || !f->f_mapping || !f->f_mapping->host) { 916 return -EINVAL; 917 } 918 919 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 920 return -EACCES; 921 922 offset = (loff_t)(start - vma->vm_start) 923 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 924 925 /* 926 * Filesystem's fallocate may need to take i_rwsem. We need to 927 * explicitly grab a reference because the vma (and hence the 928 * vma's reference to the file) can go away as soon as we drop 929 * mmap_lock. 930 */ 931 get_file(f); 932 if (userfaultfd_remove(vma, start, end)) { 933 /* mmap_lock was not released by userfaultfd_remove() */ 934 mmap_read_unlock(mm); 935 } 936 error = vfs_fallocate(f, 937 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 938 offset, end - start); 939 fput(f); 940 mmap_read_lock(mm); 941 return error; 942 } 943 944 /* 945 * Apply an madvise behavior to a region of a vma. madvise_update_vma 946 * will handle splitting a vm area into separate areas, each area with its own 947 * behavior. 
948 */ 949 static int madvise_vma_behavior(struct vm_area_struct *vma, 950 struct vm_area_struct **prev, 951 unsigned long start, unsigned long end, 952 unsigned long behavior) 953 { 954 int error; 955 struct anon_vma_name *anon_name; 956 unsigned long new_flags = vma->vm_flags; 957 958 switch (behavior) { 959 case MADV_REMOVE: 960 return madvise_remove(vma, prev, start, end); 961 case MADV_WILLNEED: 962 return madvise_willneed(vma, prev, start, end); 963 case MADV_COLD: 964 return madvise_cold(vma, prev, start, end); 965 case MADV_PAGEOUT: 966 return madvise_pageout(vma, prev, start, end); 967 case MADV_FREE: 968 case MADV_DONTNEED: 969 return madvise_dontneed_free(vma, prev, start, end, behavior); 970 case MADV_POPULATE_READ: 971 case MADV_POPULATE_WRITE: 972 return madvise_populate(vma, prev, start, end, behavior); 973 case MADV_NORMAL: 974 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 975 break; 976 case MADV_SEQUENTIAL: 977 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 978 break; 979 case MADV_RANDOM: 980 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 981 break; 982 case MADV_DONTFORK: 983 new_flags |= VM_DONTCOPY; 984 break; 985 case MADV_DOFORK: 986 if (vma->vm_flags & VM_IO) 987 return -EINVAL; 988 new_flags &= ~VM_DONTCOPY; 989 break; 990 case MADV_WIPEONFORK: 991 /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ 992 if (vma->vm_file || vma->vm_flags & VM_SHARED) 993 return -EINVAL; 994 new_flags |= VM_WIPEONFORK; 995 break; 996 case MADV_KEEPONFORK: 997 new_flags &= ~VM_WIPEONFORK; 998 break; 999 case MADV_DONTDUMP: 1000 new_flags |= VM_DONTDUMP; 1001 break; 1002 case MADV_DODUMP: 1003 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 1004 return -EINVAL; 1005 new_flags &= ~VM_DONTDUMP; 1006 break; 1007 case MADV_MERGEABLE: 1008 case MADV_UNMERGEABLE: 1009 error = ksm_madvise(vma, start, end, behavior, &new_flags); 1010 if (error) 1011 goto out; 1012 break; 1013 case MADV_HUGEPAGE: 1014 case MADV_NOHUGEPAGE: 1015 error = hugepage_madvise(vma, &new_flags, behavior); 1016 if (error) 1017 goto out; 1018 break; 1019 } 1020 1021 anon_name = anon_vma_name(vma); 1022 anon_vma_name_get(anon_name); 1023 error = madvise_update_vma(vma, prev, start, end, new_flags, 1024 anon_name); 1025 anon_vma_name_put(anon_name); 1026 1027 out: 1028 /* 1029 * madvise() returns EAGAIN if kernel resources, such as 1030 * slab, are temporarily unavailable. 1031 */ 1032 if (error == -ENOMEM) 1033 error = -EAGAIN; 1034 return error; 1035 } 1036 1037 #ifdef CONFIG_MEMORY_FAILURE 1038 /* 1039 * Error injection support for memory error handling. 1040 */ 1041 static int madvise_inject_error(int behavior, 1042 unsigned long start, unsigned long end) 1043 { 1044 unsigned long size; 1045 1046 if (!capable(CAP_SYS_ADMIN)) 1047 return -EPERM; 1048 1049 1050 for (; start < end; start += size) { 1051 unsigned long pfn; 1052 struct page *page; 1053 int ret; 1054 1055 ret = get_user_pages_fast(start, 1, 0, &page); 1056 if (ret != 1) 1057 return ret; 1058 pfn = page_to_pfn(page); 1059 1060 /* 1061 * When soft offlining hugepages, after migrating the page 1062 * we dissolve it, therefore in the second loop "page" will 1063 * no longer be a compound page. 
1064 */ 1065 size = page_size(compound_head(page)); 1066 1067 if (behavior == MADV_SOFT_OFFLINE) { 1068 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 1069 pfn, start); 1070 ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 1071 } else { 1072 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 1073 pfn, start); 1074 ret = memory_failure(pfn, MF_COUNT_INCREASED); 1075 if (ret == -EOPNOTSUPP) 1076 ret = 0; 1077 } 1078 1079 if (ret) 1080 return ret; 1081 } 1082 1083 return 0; 1084 } 1085 #endif 1086 1087 static bool 1088 madvise_behavior_valid(int behavior) 1089 { 1090 switch (behavior) { 1091 case MADV_DOFORK: 1092 case MADV_DONTFORK: 1093 case MADV_NORMAL: 1094 case MADV_SEQUENTIAL: 1095 case MADV_RANDOM: 1096 case MADV_REMOVE: 1097 case MADV_WILLNEED: 1098 case MADV_DONTNEED: 1099 case MADV_FREE: 1100 case MADV_COLD: 1101 case MADV_PAGEOUT: 1102 case MADV_POPULATE_READ: 1103 case MADV_POPULATE_WRITE: 1104 #ifdef CONFIG_KSM 1105 case MADV_MERGEABLE: 1106 case MADV_UNMERGEABLE: 1107 #endif 1108 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1109 case MADV_HUGEPAGE: 1110 case MADV_NOHUGEPAGE: 1111 #endif 1112 case MADV_DONTDUMP: 1113 case MADV_DODUMP: 1114 case MADV_WIPEONFORK: 1115 case MADV_KEEPONFORK: 1116 #ifdef CONFIG_MEMORY_FAILURE 1117 case MADV_SOFT_OFFLINE: 1118 case MADV_HWPOISON: 1119 #endif 1120 return true; 1121 1122 default: 1123 return false; 1124 } 1125 } 1126 1127 static bool 1128 process_madvise_behavior_valid(int behavior) 1129 { 1130 switch (behavior) { 1131 case MADV_COLD: 1132 case MADV_PAGEOUT: 1133 case MADV_WILLNEED: 1134 return true; 1135 default: 1136 return false; 1137 } 1138 } 1139 1140 /* 1141 * Walk the vmas in range [start,end), and call the visit function on each one. 1142 * The visit function will get start and end parameters that cover the overlap 1143 * between the current vma and the original range. 
Any unmapped regions in the 1144 * original range will result in this function returning -ENOMEM while still 1145 * calling the visit function on all of the existing vmas in the range. 1146 * Must be called with the mmap_lock held for reading or writing. 1147 */ 1148 static 1149 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, 1150 unsigned long end, unsigned long arg, 1151 int (*visit)(struct vm_area_struct *vma, 1152 struct vm_area_struct **prev, unsigned long start, 1153 unsigned long end, unsigned long arg)) 1154 { 1155 struct vm_area_struct *vma; 1156 struct vm_area_struct *prev; 1157 unsigned long tmp; 1158 int unmapped_error = 0; 1159 1160 /* 1161 * If the interval [start,end) covers some unmapped address 1162 * ranges, just ignore them, but return -ENOMEM at the end. 1163 * - different from the way of handling in mlock etc. 1164 */ 1165 vma = find_vma_prev(mm, start, &prev); 1166 if (vma && start > vma->vm_start) 1167 prev = vma; 1168 1169 for (;;) { 1170 int error; 1171 1172 /* Still start < end. */ 1173 if (!vma) 1174 return -ENOMEM; 1175 1176 /* Here start < (end|vma->vm_end). */ 1177 if (start < vma->vm_start) { 1178 unmapped_error = -ENOMEM; 1179 start = vma->vm_start; 1180 if (start >= end) 1181 break; 1182 } 1183 1184 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 1185 tmp = vma->vm_end; 1186 if (end < tmp) 1187 tmp = end; 1188 1189 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ 1190 error = visit(vma, &prev, start, tmp, arg); 1191 if (error) 1192 return error; 1193 start = tmp; 1194 if (prev && start < prev->vm_end) 1195 start = prev->vm_end; 1196 if (start >= end) 1197 break; 1198 if (prev) 1199 vma = prev->vm_next; 1200 else /* madvise_remove dropped mmap_lock */ 1201 vma = find_vma(mm, start); 1202 } 1203 1204 return unmapped_error; 1205 } 1206 1207 #ifdef CONFIG_ANON_VMA_NAME 1208 static int madvise_vma_anon_name(struct vm_area_struct *vma, 1209 struct vm_area_struct **prev, 1210 unsigned long start, unsigned long end, 1211 unsigned long anon_name) 1212 { 1213 int error; 1214 1215 /* Only anonymous mappings can be named */ 1216 if (vma->vm_file) 1217 return -EBADF; 1218 1219 error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, 1220 (struct anon_vma_name *)anon_name); 1221 1222 /* 1223 * madvise() returns EAGAIN if kernel resources, such as 1224 * slab, are temporarily unavailable. 1225 */ 1226 if (error == -ENOMEM) 1227 error = -EAGAIN; 1228 return error; 1229 } 1230 1231 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 1232 unsigned long len_in, struct anon_vma_name *anon_name) 1233 { 1234 unsigned long end; 1235 unsigned long len; 1236 1237 if (start & ~PAGE_MASK) 1238 return -EINVAL; 1239 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 1240 1241 /* Check to see whether len was rounded up from small -ve to zero */ 1242 if (len_in && !len) 1243 return -EINVAL; 1244 1245 end = start + len; 1246 if (end < start) 1247 return -EINVAL; 1248 1249 if (end == start) 1250 return 0; 1251 1252 return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, 1253 madvise_vma_anon_name); 1254 } 1255 #endif /* CONFIG_ANON_VMA_NAME */ 1256 /* 1257 * The madvise(2) system call. 1258 * 1259 * Applications can use madvise() to advise the kernel how it should 1260 * handle paging I/O in this VM area. The idea is to help the kernel 1261 * use appropriate read-ahead and caching techniques. 
 * The information provided is advisory only, and can be safely
 * disregarded by the kernel without affecting the correct operation
 * of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
1325 */ 1326 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1327 { 1328 unsigned long end; 1329 int error; 1330 int write; 1331 size_t len; 1332 struct blk_plug plug; 1333 1334 start = untagged_addr(start); 1335 1336 if (!madvise_behavior_valid(behavior)) 1337 return -EINVAL; 1338 1339 if (!PAGE_ALIGNED(start)) 1340 return -EINVAL; 1341 len = PAGE_ALIGN(len_in); 1342 1343 /* Check to see whether len was rounded up from small -ve to zero */ 1344 if (len_in && !len) 1345 return -EINVAL; 1346 1347 end = start + len; 1348 if (end < start) 1349 return -EINVAL; 1350 1351 if (end == start) 1352 return 0; 1353 1354 #ifdef CONFIG_MEMORY_FAILURE 1355 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 1356 return madvise_inject_error(behavior, start, start + len_in); 1357 #endif 1358 1359 write = madvise_need_mmap_write(behavior); 1360 if (write) { 1361 if (mmap_write_lock_killable(mm)) 1362 return -EINTR; 1363 } else { 1364 mmap_read_lock(mm); 1365 } 1366 1367 blk_start_plug(&plug); 1368 error = madvise_walk_vmas(mm, start, end, behavior, 1369 madvise_vma_behavior); 1370 blk_finish_plug(&plug); 1371 if (write) 1372 mmap_write_unlock(mm); 1373 else 1374 mmap_read_unlock(mm); 1375 1376 return error; 1377 } 1378 1379 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1380 { 1381 return do_madvise(current->mm, start, len_in, behavior); 1382 } 1383 1384 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1385 size_t, vlen, int, behavior, unsigned int, flags) 1386 { 1387 ssize_t ret; 1388 struct iovec iovstack[UIO_FASTIOV], iovec; 1389 struct iovec *iov = iovstack; 1390 struct iov_iter iter; 1391 struct task_struct *task; 1392 struct mm_struct *mm; 1393 size_t total_len; 1394 unsigned int f_flags; 1395 1396 if (flags != 0) { 1397 ret = -EINVAL; 1398 goto out; 1399 } 1400 1401 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1402 if (ret < 0) 1403 goto 
out; 1404 1405 task = pidfd_get_task(pidfd, &f_flags); 1406 if (IS_ERR(task)) { 1407 ret = PTR_ERR(task); 1408 goto free_iov; 1409 } 1410 1411 if (!process_madvise_behavior_valid(behavior)) { 1412 ret = -EINVAL; 1413 goto release_task; 1414 } 1415 1416 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 1417 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 1418 if (IS_ERR_OR_NULL(mm)) { 1419 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1420 goto release_task; 1421 } 1422 1423 /* 1424 * Require CAP_SYS_NICE for influencing process performance. Note that 1425 * only non-destructive hints are currently supported. 1426 */ 1427 if (!capable(CAP_SYS_NICE)) { 1428 ret = -EPERM; 1429 goto release_mm; 1430 } 1431 1432 total_len = iov_iter_count(&iter); 1433 1434 while (iov_iter_count(&iter)) { 1435 iovec = iov_iter_iovec(&iter); 1436 /* 1437 * do_madvise returns ENOMEM if unmapped holes are present 1438 * in the passed VMA. process_madvise() is expected to skip 1439 * unmapped holes passed to it in the 'struct iovec' list 1440 * and not fail because of them. Thus treat -ENOMEM return 1441 * from do_madvise as valid and continue processing. 1442 */ 1443 ret = do_madvise(mm, (unsigned long)iovec.iov_base, 1444 iovec.iov_len, behavior); 1445 if (ret < 0 && ret != -ENOMEM) 1446 break; 1447 iov_iter_advance(&iter, iovec.iov_len); 1448 } 1449 1450 ret = (total_len - iov_iter_count(&iter)) ? : ret; 1451 1452 release_mm: 1453 mmput(mm); 1454 release_task: 1455 put_task_struct(task); 1456 free_iov: 1457 kfree(iov); 1458 out: 1459 return ret; 1460 } 1461