1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * mm/userfaultfd.c 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 */ 7 8 #include <linux/mm.h> 9 #include <linux/sched/signal.h> 10 #include <linux/pagemap.h> 11 #include <linux/rmap.h> 12 #include <linux/swap.h> 13 #include <linux/swapops.h> 14 #include <linux/userfaultfd_k.h> 15 #include <linux/mmu_notifier.h> 16 #include <linux/hugetlb.h> 17 #include <linux/shmem_fs.h> 18 #include <asm/tlbflush.h> 19 #include <asm/tlb.h> 20 #include "internal.h" 21 #include "swap.h" 22 23 static __always_inline 24 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) 25 { 26 /* Make sure that the dst range is fully within dst_vma. */ 27 if (dst_end > dst_vma->vm_end) 28 return false; 29 30 /* 31 * Check the vma is registered in uffd, this is required to 32 * enforce the VM_MAYWRITE check done at uffd registration 33 * time. 34 */ 35 if (!dst_vma->vm_userfaultfd_ctx.ctx) 36 return false; 37 38 return true; 39 } 40 41 static __always_inline 42 struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, 43 unsigned long addr) 44 { 45 struct vm_area_struct *vma; 46 47 mmap_assert_locked(mm); 48 vma = vma_lookup(mm, addr); 49 if (!vma) 50 vma = ERR_PTR(-ENOENT); 51 else if (!(vma->vm_flags & VM_SHARED) && 52 unlikely(anon_vma_prepare(vma))) 53 vma = ERR_PTR(-ENOMEM); 54 55 return vma; 56 } 57 58 #ifdef CONFIG_PER_VMA_LOCK 59 /* 60 * uffd_lock_vma() - Lookup and lock vma corresponding to @address. 61 * @mm: mm to search vma in. 62 * @address: address that the vma should contain. 63 * 64 * Should be called without holding mmap_lock. 65 * 66 * Return: A locked vma containing @address, -ENOENT if no vma is found, or 67 * -ENOMEM if anon_vma couldn't be allocated. 68 */ 69 static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, 70 unsigned long address) 71 { 72 struct vm_area_struct *vma; 73 74 vma = lock_vma_under_rcu(mm, address); 75 if (vma) { 76 /* 77 * We know we're going to need to use anon_vma, so check 78 * that early. 79 */ 80 if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) 81 vma_end_read(vma); 82 else 83 return vma; 84 } 85 86 mmap_read_lock(mm); 87 vma = find_vma_and_prepare_anon(mm, address); 88 if (!IS_ERR(vma)) { 89 /* 90 * We cannot use vma_start_read() as it may fail due to 91 * false locked (see comment in vma_start_read()). We 92 * can avoid that by directly locking vm_lock under 93 * mmap_lock, which guarantees that nobody can lock the 94 * vma for write (vma_start_write()) under us. 
95 */ 96 down_read(&vma->vm_lock->lock); 97 } 98 99 mmap_read_unlock(mm); 100 return vma; 101 } 102 103 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 104 unsigned long dst_start, 105 unsigned long len) 106 { 107 struct vm_area_struct *dst_vma; 108 109 dst_vma = uffd_lock_vma(dst_mm, dst_start); 110 if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) 111 return dst_vma; 112 113 vma_end_read(dst_vma); 114 return ERR_PTR(-ENOENT); 115 } 116 117 static void uffd_mfill_unlock(struct vm_area_struct *vma) 118 { 119 vma_end_read(vma); 120 } 121 122 #else 123 124 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 125 unsigned long dst_start, 126 unsigned long len) 127 { 128 struct vm_area_struct *dst_vma; 129 130 mmap_read_lock(dst_mm); 131 dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); 132 if (IS_ERR(dst_vma)) 133 goto out_unlock; 134 135 if (validate_dst_vma(dst_vma, dst_start + len)) 136 return dst_vma; 137 138 dst_vma = ERR_PTR(-ENOENT); 139 out_unlock: 140 mmap_read_unlock(dst_mm); 141 return dst_vma; 142 } 143 144 static void uffd_mfill_unlock(struct vm_area_struct *vma) 145 { 146 mmap_read_unlock(vma->vm_mm); 147 } 148 #endif 149 150 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ 151 static bool mfill_file_over_size(struct vm_area_struct *dst_vma, 152 unsigned long dst_addr) 153 { 154 struct inode *inode; 155 pgoff_t offset, max_off; 156 157 if (!dst_vma->vm_file) 158 return false; 159 160 inode = dst_vma->vm_file->f_inode; 161 offset = linear_page_index(dst_vma, dst_addr); 162 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 163 return offset >= max_off; 164 } 165 166 /* 167 * Install PTEs, to map dst_addr (within dst_vma) to page. 168 * 169 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem 170 * and anon, and for both shared and private VMAs. 171 */ 172 int mfill_atomic_install_pte(pmd_t *dst_pmd, 173 struct vm_area_struct *dst_vma, 174 unsigned long dst_addr, struct page *page, 175 bool newly_allocated, uffd_flags_t flags) 176 { 177 int ret; 178 struct mm_struct *dst_mm = dst_vma->vm_mm; 179 pte_t _dst_pte, *dst_pte; 180 bool writable = dst_vma->vm_flags & VM_WRITE; 181 bool vm_shared = dst_vma->vm_flags & VM_SHARED; 182 spinlock_t *ptl; 183 struct folio *folio = page_folio(page); 184 bool page_in_cache = folio_mapping(folio); 185 186 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 187 _dst_pte = pte_mkdirty(_dst_pte); 188 if (page_in_cache && !vm_shared) 189 writable = false; 190 if (writable) 191 _dst_pte = pte_mkwrite(_dst_pte, dst_vma); 192 if (flags & MFILL_ATOMIC_WP) 193 _dst_pte = pte_mkuffd_wp(_dst_pte); 194 195 ret = -EAGAIN; 196 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 197 if (!dst_pte) 198 goto out; 199 200 if (mfill_file_over_size(dst_vma, dst_addr)) { 201 ret = -EFAULT; 202 goto out_unlock; 203 } 204 205 ret = -EEXIST; 206 /* 207 * We allow to overwrite a pte marker: consider when both MISSING|WP 208 * registered, we firstly wr-protect a none pte which has no page cache 209 * page backing it, then access the page. 
210 */ 211 if (!pte_none_mostly(ptep_get(dst_pte))) 212 goto out_unlock; 213 214 if (page_in_cache) { 215 /* Usually, cache pages are already added to LRU */ 216 if (newly_allocated) 217 folio_add_lru(folio); 218 folio_add_file_rmap_pte(folio, page, dst_vma); 219 } else { 220 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); 221 folio_add_lru_vma(folio, dst_vma); 222 } 223 224 /* 225 * Must happen after rmap, as mm_counter() checks mapping (via 226 * PageAnon()), which is set by __page_set_anon_rmap(). 227 */ 228 inc_mm_counter(dst_mm, mm_counter(folio)); 229 230 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 231 232 /* No need to invalidate - it was non-present before */ 233 update_mmu_cache(dst_vma, dst_addr, dst_pte); 234 ret = 0; 235 out_unlock: 236 pte_unmap_unlock(dst_pte, ptl); 237 out: 238 return ret; 239 } 240 241 static int mfill_atomic_pte_copy(pmd_t *dst_pmd, 242 struct vm_area_struct *dst_vma, 243 unsigned long dst_addr, 244 unsigned long src_addr, 245 uffd_flags_t flags, 246 struct folio **foliop) 247 { 248 void *kaddr; 249 int ret; 250 struct folio *folio; 251 252 if (!*foliop) { 253 ret = -ENOMEM; 254 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, 255 dst_addr); 256 if (!folio) 257 goto out; 258 259 kaddr = kmap_local_folio(folio, 0); 260 /* 261 * The read mmap_lock is held here. Despite the 262 * mmap_lock being read recursive a deadlock is still 263 * possible if a writer has taken a lock. For example: 264 * 265 * process A thread 1 takes read lock on own mmap_lock 266 * process A thread 2 calls mmap, blocks taking write lock 267 * process B thread 1 takes page fault, read lock on own mmap lock 268 * process B thread 2 calls mmap, blocks taking write lock 269 * process A thread 1 blocks taking read lock on process B 270 * process B thread 1 blocks taking read lock on process A 271 * 272 * Disable page faults to prevent potential deadlock 273 * and retry the copy outside the mmap_lock. 274 */ 275 pagefault_disable(); 276 ret = copy_from_user(kaddr, (const void __user *) src_addr, 277 PAGE_SIZE); 278 pagefault_enable(); 279 kunmap_local(kaddr); 280 281 /* fallback to copy_from_user outside mmap_lock */ 282 if (unlikely(ret)) { 283 ret = -ENOENT; 284 *foliop = folio; 285 /* don't free the page */ 286 goto out; 287 } 288 289 flush_dcache_folio(folio); 290 } else { 291 folio = *foliop; 292 *foliop = NULL; 293 } 294 295 /* 296 * The memory barrier inside __folio_mark_uptodate makes sure that 297 * preceding stores to the page contents become visible before 298 * the set_pte_at() write. 299 */ 300 __folio_mark_uptodate(folio); 301 302 ret = -ENOMEM; 303 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) 304 goto out_release; 305 306 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 307 &folio->page, true, flags); 308 if (ret) 309 goto out_release; 310 out: 311 return ret; 312 out_release: 313 folio_put(folio); 314 goto out; 315 } 316 317 static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, 318 struct vm_area_struct *dst_vma, 319 unsigned long dst_addr) 320 { 321 struct folio *folio; 322 int ret = -ENOMEM; 323 324 folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); 325 if (!folio) 326 return ret; 327 328 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) 329 goto out_put; 330 331 /* 332 * The memory barrier inside __folio_mark_uptodate makes sure that 333 * zeroing out the folio become visible before mapping the page 334 * using set_pte_at(). See do_anonymous_page(). 
335 */ 336 __folio_mark_uptodate(folio); 337 338 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 339 &folio->page, true, 0); 340 if (ret) 341 goto out_put; 342 343 return 0; 344 out_put: 345 folio_put(folio); 346 return ret; 347 } 348 349 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, 350 struct vm_area_struct *dst_vma, 351 unsigned long dst_addr) 352 { 353 pte_t _dst_pte, *dst_pte; 354 spinlock_t *ptl; 355 int ret; 356 357 if (mm_forbids_zeropage(dst_vma->vm_mm)) 358 return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); 359 360 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 361 dst_vma->vm_page_prot)); 362 ret = -EAGAIN; 363 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); 364 if (!dst_pte) 365 goto out; 366 if (mfill_file_over_size(dst_vma, dst_addr)) { 367 ret = -EFAULT; 368 goto out_unlock; 369 } 370 ret = -EEXIST; 371 if (!pte_none(ptep_get(dst_pte))) 372 goto out_unlock; 373 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); 374 /* No need to invalidate - it was non-present before */ 375 update_mmu_cache(dst_vma, dst_addr, dst_pte); 376 ret = 0; 377 out_unlock: 378 pte_unmap_unlock(dst_pte, ptl); 379 out: 380 return ret; 381 } 382 383 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ 384 static int mfill_atomic_pte_continue(pmd_t *dst_pmd, 385 struct vm_area_struct *dst_vma, 386 unsigned long dst_addr, 387 uffd_flags_t flags) 388 { 389 struct inode *inode = file_inode(dst_vma->vm_file); 390 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 391 struct folio *folio; 392 struct page *page; 393 int ret; 394 395 ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); 396 /* Our caller expects us to return -EFAULT if we failed to find folio */ 397 if (ret == -ENOENT) 398 ret = -EFAULT; 399 if (ret) 400 goto out; 401 if (!folio) { 402 ret = -EFAULT; 403 goto out; 404 } 405 406 page = folio_file_page(folio, pgoff); 407 if (PageHWPoison(page)) { 408 ret = -EIO; 409 goto out_release; 410 } 411 412 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 413 page, false, flags); 414 if (ret) 415 goto out_release; 416 417 folio_unlock(folio); 418 ret = 0; 419 out: 420 return ret; 421 out_release: 422 folio_unlock(folio); 423 folio_put(folio); 424 goto out; 425 } 426 427 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ 428 static int mfill_atomic_pte_poison(pmd_t *dst_pmd, 429 struct vm_area_struct *dst_vma, 430 unsigned long dst_addr, 431 uffd_flags_t flags) 432 { 433 int ret; 434 struct mm_struct *dst_mm = dst_vma->vm_mm; 435 pte_t _dst_pte, *dst_pte; 436 spinlock_t *ptl; 437 438 _dst_pte = make_pte_marker(PTE_MARKER_POISONED); 439 ret = -EAGAIN; 440 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 441 if (!dst_pte) 442 goto out; 443 444 if (mfill_file_over_size(dst_vma, dst_addr)) { 445 ret = -EFAULT; 446 goto out_unlock; 447 } 448 449 ret = -EEXIST; 450 /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). 
*/ 451 if (!pte_none(ptep_get(dst_pte))) 452 goto out_unlock; 453 454 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 455 456 /* No need to invalidate - it was non-present before */ 457 update_mmu_cache(dst_vma, dst_addr, dst_pte); 458 ret = 0; 459 out_unlock: 460 pte_unmap_unlock(dst_pte, ptl); 461 out: 462 return ret; 463 } 464 465 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 466 { 467 pgd_t *pgd; 468 p4d_t *p4d; 469 pud_t *pud; 470 471 pgd = pgd_offset(mm, address); 472 p4d = p4d_alloc(mm, pgd, address); 473 if (!p4d) 474 return NULL; 475 pud = pud_alloc(mm, p4d, address); 476 if (!pud) 477 return NULL; 478 /* 479 * Note that we didn't run this because the pmd was 480 * missing, the *pmd may be already established and in 481 * turn it may also be a trans_huge_pmd. 482 */ 483 return pmd_alloc(mm, pud, address); 484 } 485 486 #ifdef CONFIG_HUGETLB_PAGE 487 /* 488 * mfill_atomic processing for HUGETLB vmas. Note that this routine is 489 * called with either vma-lock or mmap_lock held, it will release the lock 490 * before returning. 491 */ 492 static __always_inline ssize_t mfill_atomic_hugetlb( 493 struct userfaultfd_ctx *ctx, 494 struct vm_area_struct *dst_vma, 495 unsigned long dst_start, 496 unsigned long src_start, 497 unsigned long len, 498 uffd_flags_t flags) 499 { 500 struct mm_struct *dst_mm = dst_vma->vm_mm; 501 ssize_t err; 502 pte_t *dst_pte; 503 unsigned long src_addr, dst_addr; 504 long copied; 505 struct folio *folio; 506 unsigned long vma_hpagesize; 507 pgoff_t idx; 508 u32 hash; 509 struct address_space *mapping; 510 511 /* 512 * There is no default zero huge page for all huge page sizes as 513 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 514 * by THP. Since we can not reliably insert a zero page, this 515 * feature is not supported. 516 */ 517 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { 518 up_read(&ctx->map_changing_lock); 519 uffd_mfill_unlock(dst_vma); 520 return -EINVAL; 521 } 522 523 src_addr = src_start; 524 dst_addr = dst_start; 525 copied = 0; 526 folio = NULL; 527 vma_hpagesize = vma_kernel_pagesize(dst_vma); 528 529 /* 530 * Validate alignment based on huge page size 531 */ 532 err = -EINVAL; 533 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 534 goto out_unlock; 535 536 retry: 537 /* 538 * On routine entry dst_vma is set. If we had to drop mmap_lock and 539 * retry, dst_vma will be set to NULL and we must lookup again. 540 */ 541 if (!dst_vma) { 542 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); 543 if (IS_ERR(dst_vma)) { 544 err = PTR_ERR(dst_vma); 545 goto out; 546 } 547 548 err = -ENOENT; 549 if (!is_vm_hugetlb_page(dst_vma)) 550 goto out_unlock_vma; 551 552 err = -EINVAL; 553 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 554 goto out_unlock_vma; 555 556 /* 557 * If memory mappings are changing because of non-cooperative 558 * operation (e.g. mremap) running in parallel, bail out and 559 * request the user to retry later 560 */ 561 down_read(&ctx->map_changing_lock); 562 err = -EAGAIN; 563 if (atomic_read(&ctx->mmap_changing)) 564 goto out_unlock; 565 } 566 567 while (src_addr < src_start + len) { 568 BUG_ON(dst_addr >= dst_start + len); 569 570 /* 571 * Serialize via vma_lock and hugetlb_fault_mutex. 572 * vma_lock ensures the dst_pte remains valid even 573 * in the case of shared pmds. fault mutex prevents 574 * races with other faulting threads. 
575 */ 576 idx = linear_page_index(dst_vma, dst_addr); 577 mapping = dst_vma->vm_file->f_mapping; 578 hash = hugetlb_fault_mutex_hash(mapping, idx); 579 mutex_lock(&hugetlb_fault_mutex_table[hash]); 580 hugetlb_vma_lock_read(dst_vma); 581 582 err = -ENOMEM; 583 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); 584 if (!dst_pte) { 585 hugetlb_vma_unlock_read(dst_vma); 586 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 587 goto out_unlock; 588 } 589 590 if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && 591 !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { 592 err = -EEXIST; 593 hugetlb_vma_unlock_read(dst_vma); 594 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 595 goto out_unlock; 596 } 597 598 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, 599 src_addr, flags, &folio); 600 601 hugetlb_vma_unlock_read(dst_vma); 602 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 603 604 cond_resched(); 605 606 if (unlikely(err == -ENOENT)) { 607 up_read(&ctx->map_changing_lock); 608 uffd_mfill_unlock(dst_vma); 609 BUG_ON(!folio); 610 611 err = copy_folio_from_user(folio, 612 (const void __user *)src_addr, true); 613 if (unlikely(err)) { 614 err = -EFAULT; 615 goto out; 616 } 617 618 dst_vma = NULL; 619 goto retry; 620 } else 621 BUG_ON(folio); 622 623 if (!err) { 624 dst_addr += vma_hpagesize; 625 src_addr += vma_hpagesize; 626 copied += vma_hpagesize; 627 628 if (fatal_signal_pending(current)) 629 err = -EINTR; 630 } 631 if (err) 632 break; 633 } 634 635 out_unlock: 636 up_read(&ctx->map_changing_lock); 637 out_unlock_vma: 638 uffd_mfill_unlock(dst_vma); 639 out: 640 if (folio) 641 folio_put(folio); 642 BUG_ON(copied < 0); 643 BUG_ON(err > 0); 644 BUG_ON(!copied && !err); 645 return copied ? copied : err; 646 } 647 #else /* !CONFIG_HUGETLB_PAGE */ 648 /* fail at build time if gcc attempts to use this */ 649 extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, 650 struct vm_area_struct *dst_vma, 651 unsigned long dst_start, 652 unsigned long src_start, 653 unsigned long len, 654 uffd_flags_t flags); 655 #endif /* CONFIG_HUGETLB_PAGE */ 656 657 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, 658 struct vm_area_struct *dst_vma, 659 unsigned long dst_addr, 660 unsigned long src_addr, 661 uffd_flags_t flags, 662 struct folio **foliop) 663 { 664 ssize_t err; 665 666 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { 667 return mfill_atomic_pte_continue(dst_pmd, dst_vma, 668 dst_addr, flags); 669 } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { 670 return mfill_atomic_pte_poison(dst_pmd, dst_vma, 671 dst_addr, flags); 672 } 673 674 /* 675 * The normal page fault path for a shmem will invoke the 676 * fault, fill the hole in the file and COW it right away. The 677 * result generates plain anonymous memory. So when we are 678 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll 679 * generate anonymous memory directly without actually filling 680 * the hole. For the MAP_PRIVATE case the robustness check 681 * only happens in the pagetable (to verify it's still none) 682 * and not in the radix tree. 
683 */ 684 if (!(dst_vma->vm_flags & VM_SHARED)) { 685 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) 686 err = mfill_atomic_pte_copy(dst_pmd, dst_vma, 687 dst_addr, src_addr, 688 flags, foliop); 689 else 690 err = mfill_atomic_pte_zeropage(dst_pmd, 691 dst_vma, dst_addr); 692 } else { 693 err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, 694 dst_addr, src_addr, 695 flags, foliop); 696 } 697 698 return err; 699 } 700 701 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, 702 unsigned long dst_start, 703 unsigned long src_start, 704 unsigned long len, 705 uffd_flags_t flags) 706 { 707 struct mm_struct *dst_mm = ctx->mm; 708 struct vm_area_struct *dst_vma; 709 ssize_t err; 710 pmd_t *dst_pmd; 711 unsigned long src_addr, dst_addr; 712 long copied; 713 struct folio *folio; 714 715 /* 716 * Sanitize the command parameters: 717 */ 718 BUG_ON(dst_start & ~PAGE_MASK); 719 BUG_ON(len & ~PAGE_MASK); 720 721 /* Does the address range wrap, or is the span zero-sized? */ 722 BUG_ON(src_start + len <= src_start); 723 BUG_ON(dst_start + len <= dst_start); 724 725 src_addr = src_start; 726 dst_addr = dst_start; 727 copied = 0; 728 folio = NULL; 729 retry: 730 /* 731 * Make sure the vma is not shared, that the dst range is 732 * both valid and fully within a single existing vma. 733 */ 734 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); 735 if (IS_ERR(dst_vma)) { 736 err = PTR_ERR(dst_vma); 737 goto out; 738 } 739 740 /* 741 * If memory mappings are changing because of non-cooperative 742 * operation (e.g. mremap) running in parallel, bail out and 743 * request the user to retry later 744 */ 745 down_read(&ctx->map_changing_lock); 746 err = -EAGAIN; 747 if (atomic_read(&ctx->mmap_changing)) 748 goto out_unlock; 749 750 err = -EINVAL; 751 /* 752 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 753 * it will overwrite vm_ops, so vma_is_anonymous must return false. 754 */ 755 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 756 dst_vma->vm_flags & VM_SHARED)) 757 goto out_unlock; 758 759 /* 760 * validate 'mode' now that we know the dst_vma: don't allow 761 * a wrprotect copy if the userfaultfd didn't register as WP. 762 */ 763 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) 764 goto out_unlock; 765 766 /* 767 * If this is a HUGETLB vma, pass off to appropriate routine 768 */ 769 if (is_vm_hugetlb_page(dst_vma)) 770 return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, 771 src_start, len, flags); 772 773 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) 774 goto out_unlock; 775 if (!vma_is_shmem(dst_vma) && 776 uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) 777 goto out_unlock; 778 779 while (src_addr < src_start + len) { 780 pmd_t dst_pmdval; 781 782 BUG_ON(dst_addr >= dst_start + len); 783 784 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); 785 if (unlikely(!dst_pmd)) { 786 err = -ENOMEM; 787 break; 788 } 789 790 dst_pmdval = pmdp_get_lockless(dst_pmd); 791 if (unlikely(pmd_none(dst_pmdval)) && 792 unlikely(__pte_alloc(dst_mm, dst_pmd))) { 793 err = -ENOMEM; 794 break; 795 } 796 dst_pmdval = pmdp_get_lockless(dst_pmd); 797 /* 798 * If the dst_pmd is THP don't override it and just be strict. 799 * (This includes the case where the PMD used to be THP and 800 * changed back to none after __pte_alloc().) 
801 */ 802 if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || 803 pmd_devmap(dst_pmdval))) { 804 err = -EEXIST; 805 break; 806 } 807 if (unlikely(pmd_bad(dst_pmdval))) { 808 err = -EFAULT; 809 break; 810 } 811 /* 812 * For shmem mappings, khugepaged is allowed to remove page 813 * tables under us; pte_offset_map_lock() will deal with that. 814 */ 815 816 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, 817 src_addr, flags, &folio); 818 cond_resched(); 819 820 if (unlikely(err == -ENOENT)) { 821 void *kaddr; 822 823 up_read(&ctx->map_changing_lock); 824 uffd_mfill_unlock(dst_vma); 825 BUG_ON(!folio); 826 827 kaddr = kmap_local_folio(folio, 0); 828 err = copy_from_user(kaddr, 829 (const void __user *) src_addr, 830 PAGE_SIZE); 831 kunmap_local(kaddr); 832 if (unlikely(err)) { 833 err = -EFAULT; 834 goto out; 835 } 836 flush_dcache_folio(folio); 837 goto retry; 838 } else 839 BUG_ON(folio); 840 841 if (!err) { 842 dst_addr += PAGE_SIZE; 843 src_addr += PAGE_SIZE; 844 copied += PAGE_SIZE; 845 846 if (fatal_signal_pending(current)) 847 err = -EINTR; 848 } 849 if (err) 850 break; 851 } 852 853 out_unlock: 854 up_read(&ctx->map_changing_lock); 855 uffd_mfill_unlock(dst_vma); 856 out: 857 if (folio) 858 folio_put(folio); 859 BUG_ON(copied < 0); 860 BUG_ON(err > 0); 861 BUG_ON(!copied && !err); 862 return copied ? copied : err; 863 } 864 865 ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, 866 unsigned long src_start, unsigned long len, 867 uffd_flags_t flags) 868 { 869 return mfill_atomic(ctx, dst_start, src_start, len, 870 uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); 871 } 872 873 ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, 874 unsigned long start, 875 unsigned long len) 876 { 877 return mfill_atomic(ctx, start, 0, len, 878 uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); 879 } 880 881 ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, 882 unsigned long len, uffd_flags_t flags) 883 { 884 885 /* 886 * A caller might reasonably assume that UFFDIO_CONTINUE contains an 887 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by 888 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to 889 * subsequent loads from the page through the newly mapped address range. 890 */ 891 smp_wmb(); 892 893 return mfill_atomic(ctx, start, 0, len, 894 uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); 895 } 896 897 ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, 898 unsigned long len, uffd_flags_t flags) 899 { 900 return mfill_atomic(ctx, start, 0, len, 901 uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); 902 } 903 904 long uffd_wp_range(struct vm_area_struct *dst_vma, 905 unsigned long start, unsigned long len, bool enable_wp) 906 { 907 unsigned int mm_cp_flags; 908 struct mmu_gather tlb; 909 long ret; 910 911 VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, 912 "The address range exceeds VMA boundary.\n"); 913 if (enable_wp) 914 mm_cp_flags = MM_CP_UFFD_WP; 915 else 916 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; 917 918 /* 919 * vma->vm_page_prot already reflects that uffd-wp is enabled for this 920 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed 921 * to be write-protected as default whenever protection changes. 922 * Try upgrading write permissions manually. 
923 */ 924 if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) 925 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; 926 tlb_gather_mmu(&tlb, dst_vma->vm_mm); 927 ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); 928 tlb_finish_mmu(&tlb); 929 930 return ret; 931 } 932 933 int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, 934 unsigned long len, bool enable_wp) 935 { 936 struct mm_struct *dst_mm = ctx->mm; 937 unsigned long end = start + len; 938 unsigned long _start, _end; 939 struct vm_area_struct *dst_vma; 940 unsigned long page_mask; 941 long err; 942 VMA_ITERATOR(vmi, dst_mm, start); 943 944 /* 945 * Sanitize the command parameters: 946 */ 947 BUG_ON(start & ~PAGE_MASK); 948 BUG_ON(len & ~PAGE_MASK); 949 950 /* Does the address range wrap, or is the span zero-sized? */ 951 BUG_ON(start + len <= start); 952 953 mmap_read_lock(dst_mm); 954 955 /* 956 * If memory mappings are changing because of non-cooperative 957 * operation (e.g. mremap) running in parallel, bail out and 958 * request the user to retry later 959 */ 960 down_read(&ctx->map_changing_lock); 961 err = -EAGAIN; 962 if (atomic_read(&ctx->mmap_changing)) 963 goto out_unlock; 964 965 err = -ENOENT; 966 for_each_vma_range(vmi, dst_vma, end) { 967 968 if (!userfaultfd_wp(dst_vma)) { 969 err = -ENOENT; 970 break; 971 } 972 973 if (is_vm_hugetlb_page(dst_vma)) { 974 err = -EINVAL; 975 page_mask = vma_kernel_pagesize(dst_vma) - 1; 976 if ((start & page_mask) || (len & page_mask)) 977 break; 978 } 979 980 _start = max(dst_vma->vm_start, start); 981 _end = min(dst_vma->vm_end, end); 982 983 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); 984 985 /* Return 0 on success, <0 on failures */ 986 if (err < 0) 987 break; 988 err = 0; 989 } 990 out_unlock: 991 up_read(&ctx->map_changing_lock); 992 mmap_read_unlock(dst_mm); 993 return err; 994 } 995 996 997 void double_pt_lock(spinlock_t *ptl1, 998 spinlock_t *ptl2) 999 __acquires(ptl1) 1000 __acquires(ptl2) 1001 { 1002 if (ptl1 > ptl2) 1003 swap(ptl1, ptl2); 1004 /* lock in virtual address order to avoid lock inversion */ 1005 spin_lock(ptl1); 1006 if (ptl1 != ptl2) 1007 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); 1008 else 1009 __acquire(ptl2); 1010 } 1011 1012 void double_pt_unlock(spinlock_t *ptl1, 1013 spinlock_t *ptl2) 1014 __releases(ptl1) 1015 __releases(ptl2) 1016 { 1017 spin_unlock(ptl1); 1018 if (ptl1 != ptl2) 1019 spin_unlock(ptl2); 1020 else 1021 __release(ptl2); 1022 } 1023 1024 static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, 1025 pte_t orig_dst_pte, pte_t orig_src_pte, 1026 pmd_t *dst_pmd, pmd_t dst_pmdval) 1027 { 1028 return pte_same(ptep_get(src_pte), orig_src_pte) && 1029 pte_same(ptep_get(dst_pte), orig_dst_pte) && 1030 pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); 1031 } 1032 1033 static int move_present_pte(struct mm_struct *mm, 1034 struct vm_area_struct *dst_vma, 1035 struct vm_area_struct *src_vma, 1036 unsigned long dst_addr, unsigned long src_addr, 1037 pte_t *dst_pte, pte_t *src_pte, 1038 pte_t orig_dst_pte, pte_t orig_src_pte, 1039 pmd_t *dst_pmd, pmd_t dst_pmdval, 1040 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1041 struct folio *src_folio) 1042 { 1043 int err = 0; 1044 1045 double_pt_lock(dst_ptl, src_ptl); 1046 1047 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1048 dst_pmd, dst_pmdval)) { 1049 err = -EAGAIN; 1050 goto out; 1051 } 1052 if (folio_test_large(src_folio) || 1053 folio_maybe_dma_pinned(src_folio) || 1054 
!PageAnonExclusive(&src_folio->page)) { 1055 err = -EBUSY; 1056 goto out; 1057 } 1058 1059 orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); 1060 /* Folio got pinned from under us. Put it back and fail the move. */ 1061 if (folio_maybe_dma_pinned(src_folio)) { 1062 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1063 err = -EBUSY; 1064 goto out; 1065 } 1066 1067 folio_move_anon_rmap(src_folio, dst_vma); 1068 src_folio->index = linear_page_index(dst_vma, dst_addr); 1069 1070 orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); 1071 /* Follow mremap() behavior and treat the entry dirty after the move */ 1072 orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); 1073 1074 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1075 out: 1076 double_pt_unlock(dst_ptl, src_ptl); 1077 return err; 1078 } 1079 1080 static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, 1081 unsigned long dst_addr, unsigned long src_addr, 1082 pte_t *dst_pte, pte_t *src_pte, 1083 pte_t orig_dst_pte, pte_t orig_src_pte, 1084 pmd_t *dst_pmd, pmd_t dst_pmdval, 1085 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1086 struct folio *src_folio) 1087 { 1088 double_pt_lock(dst_ptl, src_ptl); 1089 1090 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1091 dst_pmd, dst_pmdval)) { 1092 double_pt_unlock(dst_ptl, src_ptl); 1093 return -EAGAIN; 1094 } 1095 1096 /* 1097 * The src_folio resides in the swapcache, requiring an update to its 1098 * index and mapping to align with the dst_vma, where a swap-in may 1099 * occur and hit the swapcache after moving the PTE. 1100 */ 1101 if (src_folio) { 1102 folio_move_anon_rmap(src_folio, dst_vma); 1103 src_folio->index = linear_page_index(dst_vma, dst_addr); 1104 } 1105 1106 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1107 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1108 double_pt_unlock(dst_ptl, src_ptl); 1109 1110 return 0; 1111 } 1112 1113 static int move_zeropage_pte(struct mm_struct *mm, 1114 struct vm_area_struct *dst_vma, 1115 struct vm_area_struct *src_vma, 1116 unsigned long dst_addr, unsigned long src_addr, 1117 pte_t *dst_pte, pte_t *src_pte, 1118 pte_t orig_dst_pte, pte_t orig_src_pte, 1119 pmd_t *dst_pmd, pmd_t dst_pmdval, 1120 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1121 { 1122 pte_t zero_pte; 1123 1124 double_pt_lock(dst_ptl, src_ptl); 1125 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1126 dst_pmd, dst_pmdval)) { 1127 double_pt_unlock(dst_ptl, src_ptl); 1128 return -EAGAIN; 1129 } 1130 1131 zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), 1132 dst_vma->vm_page_prot)); 1133 ptep_clear_flush(src_vma, src_addr, src_pte); 1134 set_pte_at(mm, dst_addr, dst_pte, zero_pte); 1135 double_pt_unlock(dst_ptl, src_ptl); 1136 1137 return 0; 1138 } 1139 1140 1141 /* 1142 * The mmap_lock for reading is held by the caller. Just move the page 1143 * from src_pmd to dst_pmd if possible, and return true if succeeded 1144 * in moving the page. 
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			  struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma,
			  unsigned long dst_addr, unsigned long src_addr,
			  __u64 mode)
{
	swp_entry_t entry;
	struct swap_info_struct *si = NULL;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;
	pmd_t dummy_pmdval;
	pmd_t dst_pmdval;
	struct folio *src_folio = NULL;
	struct anon_vma *src_anon_vma = NULL;
	struct mmu_notifier_range range;
	int err = 0;

	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
retry:
	/*
	 * Use the maywrite version to indicate that dst_pte will be modified.
	 * Since dst_pte needs to be none, the subsequent pte_same() check
	 * cannot prevent the dst_pte page from being freed concurrently, so we
	 * also need to obtain dst_pmdval and recheck pmd_same() later.
	 */
	dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
					   &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		err = -EAGAIN;
		goto out;
	}

	/*
	 * Unlike dst_pte, the subsequent pte_same() check can ensure the
	 * stability of the src_pte page, so there is no need to get pmdval,
	 * just pass a dummy variable to it.
	 */
	src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
					   &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		err = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
	    pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
		err = -EINVAL;
		goto out;
	}

	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		err = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			err = -ENOENT;
		else /* nothing to do to move a hole */
			err = 0;
		goto out;
	}

	/* If the PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		err = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			err = move_zeropage_pte(mm, dst_vma, src_vma,
						dst_addr, src_addr, dst_pte, src_pte,
						orig_dst_pte, orig_src_pte,
						dst_pmd, dst_pmdval, dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
		 * unmap the ptes, obtain the lock and retry.
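		 * Every such slow path below (folio_lock(), anon_vma_lock_write(),
		 * split_folio()) first unmaps src_pte/dst_pte and then jumps back
		 * to the retry label, so all of the checks above are redone.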
1250 */ 1251 if (!src_folio) { 1252 struct folio *folio; 1253 bool locked; 1254 1255 /* 1256 * Pin the page while holding the lock to be sure the 1257 * page isn't freed under us 1258 */ 1259 spin_lock(src_ptl); 1260 if (!pte_same(orig_src_pte, ptep_get(src_pte))) { 1261 spin_unlock(src_ptl); 1262 err = -EAGAIN; 1263 goto out; 1264 } 1265 1266 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1267 if (!folio || !PageAnonExclusive(&folio->page)) { 1268 spin_unlock(src_ptl); 1269 err = -EBUSY; 1270 goto out; 1271 } 1272 1273 locked = folio_trylock(folio); 1274 /* 1275 * We avoid waiting for folio lock with a raised 1276 * refcount for large folios because extra refcounts 1277 * will result in split_folio() failing later and 1278 * retrying. If multiple tasks are trying to move a 1279 * large folio we can end up livelocking. 1280 */ 1281 if (!locked && folio_test_large(folio)) { 1282 spin_unlock(src_ptl); 1283 err = -EAGAIN; 1284 goto out; 1285 } 1286 1287 folio_get(folio); 1288 src_folio = folio; 1289 src_folio_pte = orig_src_pte; 1290 spin_unlock(src_ptl); 1291 1292 if (!locked) { 1293 pte_unmap(src_pte); 1294 pte_unmap(dst_pte); 1295 src_pte = dst_pte = NULL; 1296 /* now we can block and wait */ 1297 folio_lock(src_folio); 1298 goto retry; 1299 } 1300 1301 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1302 err = -EBUSY; 1303 goto out; 1304 } 1305 } 1306 1307 /* at this point we have src_folio locked */ 1308 if (folio_test_large(src_folio)) { 1309 /* split_folio() can block */ 1310 pte_unmap(src_pte); 1311 pte_unmap(dst_pte); 1312 src_pte = dst_pte = NULL; 1313 err = split_folio(src_folio); 1314 if (err) 1315 goto out; 1316 /* have to reacquire the folio after it got split */ 1317 folio_unlock(src_folio); 1318 folio_put(src_folio); 1319 src_folio = NULL; 1320 goto retry; 1321 } 1322 1323 if (!src_anon_vma) { 1324 /* 1325 * folio_referenced walks the anon_vma chain 1326 * without the folio lock. Serialize against it with 1327 * the anon_vma lock, the folio lock is not enough. 1328 */ 1329 src_anon_vma = folio_get_anon_vma(src_folio); 1330 if (!src_anon_vma) { 1331 /* page was unmapped from under us */ 1332 err = -EAGAIN; 1333 goto out; 1334 } 1335 if (!anon_vma_trylock_write(src_anon_vma)) { 1336 pte_unmap(src_pte); 1337 pte_unmap(dst_pte); 1338 src_pte = dst_pte = NULL; 1339 /* now we can block and wait */ 1340 anon_vma_lock_write(src_anon_vma); 1341 goto retry; 1342 } 1343 } 1344 1345 err = move_present_pte(mm, dst_vma, src_vma, 1346 dst_addr, src_addr, dst_pte, src_pte, 1347 orig_dst_pte, orig_src_pte, dst_pmd, 1348 dst_pmdval, dst_ptl, src_ptl, src_folio); 1349 } else { 1350 struct folio *folio = NULL; 1351 1352 entry = pte_to_swp_entry(orig_src_pte); 1353 if (non_swap_entry(entry)) { 1354 if (is_migration_entry(entry)) { 1355 pte_unmap(src_pte); 1356 pte_unmap(dst_pte); 1357 src_pte = dst_pte = NULL; 1358 migration_entry_wait(mm, src_pmd, src_addr); 1359 err = -EAGAIN; 1360 } else 1361 err = -EFAULT; 1362 goto out; 1363 } 1364 1365 if (!pte_swp_exclusive(orig_src_pte)) { 1366 err = -EBUSY; 1367 goto out; 1368 } 1369 1370 si = get_swap_device(entry); 1371 if (unlikely(!si)) { 1372 err = -EAGAIN; 1373 goto out; 1374 } 1375 /* 1376 * Verify the existence of the swapcache. If present, the folio's 1377 * index and mapping must be updated even when the PTE is a swap 1378 * entry. The anon_vma lock is not taken during this process since 1379 * the folio has already been unmapped, and the swap entry is 1380 * exclusive, preventing rmap walks. 
		 *
		 * For large folios, return -EBUSY immediately, as split_folio()
		 * also returns -EBUSY when attempting to split unmapped large
		 * folios in the swapcache. This issue needs to be resolved
		 * separately to allow proper handling.
		 */
		if (!src_folio)
			folio = filemap_get_folio(swap_address_space(entry),
						  swap_cache_index(entry));
		if (!IS_ERR_OR_NULL(folio)) {
			if (folio_test_large(folio)) {
				err = -EBUSY;
				folio_put(folio);
				goto out;
			}
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			if (!folio_trylock(src_folio)) {
				pte_unmap(src_pte);
				pte_unmap(dst_pte);
				src_pte = dst_pte = NULL;
				put_swap_device(si);
				si = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}
		}
		err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
				    dst_ptl, src_ptl, src_folio);
	}

out:
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	if (dst_pte)
		pte_unmap(dst_pte);
	if (src_pte)
		pte_unmap(src_pte);
	mmu_notifier_invalidate_range_end(&range);
	if (si)
		put_swap_device(si);

	return err;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
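	 * (VM_ACCESS_FLAGS includes VM_WRITE, so identical access flags
	 * guarantee the destination is writable whenever the source is.)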
1474 */ 1475 if (!(src_vma->vm_flags & VM_WRITE)) 1476 return -EINVAL; 1477 1478 /* Check if vma flags indicate content which can be moved */ 1479 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1480 return -EINVAL; 1481 1482 /* Ensure dst_vma is registered in uffd we are operating on */ 1483 if (!dst_vma->vm_userfaultfd_ctx.ctx || 1484 dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1485 return -EINVAL; 1486 1487 /* Only allow moving across anonymous vmas */ 1488 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1489 return -EINVAL; 1490 1491 return 0; 1492 } 1493 1494 static __always_inline 1495 int find_vmas_mm_locked(struct mm_struct *mm, 1496 unsigned long dst_start, 1497 unsigned long src_start, 1498 struct vm_area_struct **dst_vmap, 1499 struct vm_area_struct **src_vmap) 1500 { 1501 struct vm_area_struct *vma; 1502 1503 mmap_assert_locked(mm); 1504 vma = find_vma_and_prepare_anon(mm, dst_start); 1505 if (IS_ERR(vma)) 1506 return PTR_ERR(vma); 1507 1508 *dst_vmap = vma; 1509 /* Skip finding src_vma if src_start is in dst_vma */ 1510 if (src_start >= vma->vm_start && src_start < vma->vm_end) 1511 goto out_success; 1512 1513 vma = vma_lookup(mm, src_start); 1514 if (!vma) 1515 return -ENOENT; 1516 out_success: 1517 *src_vmap = vma; 1518 return 0; 1519 } 1520 1521 #ifdef CONFIG_PER_VMA_LOCK 1522 static int uffd_move_lock(struct mm_struct *mm, 1523 unsigned long dst_start, 1524 unsigned long src_start, 1525 struct vm_area_struct **dst_vmap, 1526 struct vm_area_struct **src_vmap) 1527 { 1528 struct vm_area_struct *vma; 1529 int err; 1530 1531 vma = uffd_lock_vma(mm, dst_start); 1532 if (IS_ERR(vma)) 1533 return PTR_ERR(vma); 1534 1535 *dst_vmap = vma; 1536 /* 1537 * Skip finding src_vma if src_start is in dst_vma. This also ensures 1538 * that we don't lock the same vma twice. 1539 */ 1540 if (src_start >= vma->vm_start && src_start < vma->vm_end) { 1541 *src_vmap = vma; 1542 return 0; 1543 } 1544 1545 /* 1546 * Using uffd_lock_vma() to get src_vma can lead to following deadlock: 1547 * 1548 * Thread1 Thread2 1549 * ------- ------- 1550 * vma_start_read(dst_vma) 1551 * mmap_write_lock(mm) 1552 * vma_start_write(src_vma) 1553 * vma_start_read(src_vma) 1554 * mmap_read_lock(mm) 1555 * vma_start_write(dst_vma) 1556 */ 1557 *src_vmap = lock_vma_under_rcu(mm, src_start); 1558 if (likely(*src_vmap)) 1559 return 0; 1560 1561 /* Undo any locking and retry in mmap_lock critical section */ 1562 vma_end_read(*dst_vmap); 1563 1564 mmap_read_lock(mm); 1565 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1566 if (!err) { 1567 /* 1568 * See comment in uffd_lock_vma() as to why not using 1569 * vma_start_read() here. 
1570 */ 1571 down_read(&(*dst_vmap)->vm_lock->lock); 1572 if (*dst_vmap != *src_vmap) 1573 down_read_nested(&(*src_vmap)->vm_lock->lock, 1574 SINGLE_DEPTH_NESTING); 1575 } 1576 mmap_read_unlock(mm); 1577 return err; 1578 } 1579 1580 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1581 struct vm_area_struct *src_vma) 1582 { 1583 vma_end_read(src_vma); 1584 if (src_vma != dst_vma) 1585 vma_end_read(dst_vma); 1586 } 1587 1588 #else 1589 1590 static int uffd_move_lock(struct mm_struct *mm, 1591 unsigned long dst_start, 1592 unsigned long src_start, 1593 struct vm_area_struct **dst_vmap, 1594 struct vm_area_struct **src_vmap) 1595 { 1596 int err; 1597 1598 mmap_read_lock(mm); 1599 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1600 if (err) 1601 mmap_read_unlock(mm); 1602 return err; 1603 } 1604 1605 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1606 struct vm_area_struct *src_vma) 1607 { 1608 mmap_assert_locked(src_vma->vm_mm); 1609 mmap_read_unlock(dst_vma->vm_mm); 1610 } 1611 #endif 1612 1613 /** 1614 * move_pages - move arbitrary anonymous pages of an existing vma 1615 * @ctx: pointer to the userfaultfd context 1616 * @dst_start: start of the destination virtual memory range 1617 * @src_start: start of the source virtual memory range 1618 * @len: length of the virtual memory range 1619 * @mode: flags from uffdio_move.mode 1620 * 1621 * It will either use the mmap_lock in read mode or per-vma locks 1622 * 1623 * move_pages() remaps arbitrary anonymous pages atomically in zero 1624 * copy. It only works on non shared anonymous pages because those can 1625 * be relocated without generating non linear anon_vmas in the rmap 1626 * code. 1627 * 1628 * It provides a zero copy mechanism to handle userspace page faults. 1629 * The source vma pages should have mapcount == 1, which can be 1630 * enforced by using madvise(MADV_DONTFORK) on src vma. 1631 * 1632 * The thread receiving the page during the userland page fault 1633 * will receive the faulting page in the source vma through the network, 1634 * storage or any other I/O device (MADV_DONTFORK in the source vma 1635 * avoids move_pages() to fail with -EBUSY if the process forks before 1636 * move_pages() is called), then it will call move_pages() to map the 1637 * page in the faulting address in the destination vma. 1638 * 1639 * This userfaultfd command works purely via pagetables, so it's the 1640 * most efficient way to move physical non shared anonymous pages 1641 * across different virtual addresses. Unlike mremap()/mmap()/munmap() 1642 * it does not create any new vmas. The mapping in the destination 1643 * address is atomic. 1644 * 1645 * It only works if the vma protection bits are identical from the 1646 * source and destination vma. 1647 * 1648 * It can remap non shared anonymous pages within the same vma too. 1649 * 1650 * If the source virtual memory range has any unmapped holes, or if 1651 * the destination virtual memory range is not a whole unmapped hole, 1652 * move_pages() will fail respectively with -ENOENT or -EEXIST. This 1653 * provides a very strict behavior to avoid any chance of memory 1654 * corruption going unnoticed if there are userland race conditions. 1655 * Only one thread should resolve the userland page fault at any given 1656 * time for any given faulting address. This means that if two threads 1657 * try to both call move_pages() on the same destination address at the 1658 * same time, the second thread will get an explicit error from this 1659 * command. 
 *
 * The command retval will be "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), it will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed it
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 */
1723 */ 1724 err = -EINVAL; 1725 if (src_vma->vm_flags & VM_SHARED) 1726 goto out_unlock; 1727 if (src_start + len > src_vma->vm_end) 1728 goto out_unlock; 1729 1730 if (dst_vma->vm_flags & VM_SHARED) 1731 goto out_unlock; 1732 if (dst_start + len > dst_vma->vm_end) 1733 goto out_unlock; 1734 1735 err = validate_move_areas(ctx, src_vma, dst_vma); 1736 if (err) 1737 goto out_unlock; 1738 1739 for (src_addr = src_start, dst_addr = dst_start; 1740 src_addr < src_start + len;) { 1741 spinlock_t *ptl; 1742 pmd_t dst_pmdval; 1743 unsigned long step_size; 1744 1745 /* 1746 * Below works because anonymous area would not have a 1747 * transparent huge PUD. If file-backed support is added, 1748 * that case would need to be handled here. 1749 */ 1750 src_pmd = mm_find_pmd(mm, src_addr); 1751 if (unlikely(!src_pmd)) { 1752 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1753 err = -ENOENT; 1754 break; 1755 } 1756 src_pmd = mm_alloc_pmd(mm, src_addr); 1757 if (unlikely(!src_pmd)) { 1758 err = -ENOMEM; 1759 break; 1760 } 1761 } 1762 dst_pmd = mm_alloc_pmd(mm, dst_addr); 1763 if (unlikely(!dst_pmd)) { 1764 err = -ENOMEM; 1765 break; 1766 } 1767 1768 dst_pmdval = pmdp_get_lockless(dst_pmd); 1769 /* 1770 * If the dst_pmd is mapped as THP don't override it and just 1771 * be strict. If dst_pmd changes into TPH after this check, the 1772 * move_pages_huge_pmd() will detect the change and retry 1773 * while move_pages_pte() will detect the change and fail. 1774 */ 1775 if (unlikely(pmd_trans_huge(dst_pmdval))) { 1776 err = -EEXIST; 1777 break; 1778 } 1779 1780 ptl = pmd_trans_huge_lock(src_pmd, src_vma); 1781 if (ptl) { 1782 if (pmd_devmap(*src_pmd)) { 1783 spin_unlock(ptl); 1784 err = -ENOENT; 1785 break; 1786 } 1787 1788 /* Check if we can move the pmd without splitting it. */ 1789 if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || 1790 !pmd_none(dst_pmdval)) { 1791 struct folio *folio = pmd_folio(*src_pmd); 1792 1793 if (!folio || (!is_huge_zero_folio(folio) && 1794 !PageAnonExclusive(&folio->page))) { 1795 spin_unlock(ptl); 1796 err = -EBUSY; 1797 break; 1798 } 1799 1800 spin_unlock(ptl); 1801 split_huge_pmd(src_vma, src_pmd, src_addr); 1802 /* The folio will be split by move_pages_pte() */ 1803 continue; 1804 } 1805 1806 err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, 1807 dst_pmdval, dst_vma, src_vma, 1808 dst_addr, src_addr); 1809 step_size = HPAGE_PMD_SIZE; 1810 } else { 1811 if (pmd_none(*src_pmd)) { 1812 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1813 err = -ENOENT; 1814 break; 1815 } 1816 if (unlikely(__pte_alloc(mm, src_pmd))) { 1817 err = -ENOMEM; 1818 break; 1819 } 1820 } 1821 1822 if (unlikely(pte_alloc(mm, dst_pmd))) { 1823 err = -ENOMEM; 1824 break; 1825 } 1826 1827 err = move_pages_pte(mm, dst_pmd, src_pmd, 1828 dst_vma, src_vma, 1829 dst_addr, src_addr, mode); 1830 step_size = PAGE_SIZE; 1831 } 1832 1833 cond_resched(); 1834 1835 if (fatal_signal_pending(current)) { 1836 /* Do not override an error */ 1837 if (!err || err == -EAGAIN) 1838 err = -EINTR; 1839 break; 1840 } 1841 1842 if (err) { 1843 if (err == -EAGAIN) 1844 continue; 1845 break; 1846 } 1847 1848 /* Proceed to the next page */ 1849 dst_addr += step_size; 1850 src_addr += step_size; 1851 moved += step_size; 1852 } 1853 1854 out_unlock: 1855 up_read(&ctx->map_changing_lock); 1856 uffd_move_unlock(dst_vma, src_vma); 1857 out: 1858 VM_WARN_ON(moved < 0); 1859 VM_WARN_ON(err > 0); 1860 VM_WARN_ON(!moved && !err); 1861 return moved ? 
moved : err; 1862 } 1863 1864 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, 1865 vm_flags_t flags) 1866 { 1867 const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; 1868 1869 vm_flags_reset(vma, flags); 1870 /* 1871 * For shared mappings, we want to enable writenotify while 1872 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply 1873 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. 1874 */ 1875 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) 1876 vma_set_page_prot(vma); 1877 } 1878 1879 static void userfaultfd_set_ctx(struct vm_area_struct *vma, 1880 struct userfaultfd_ctx *ctx, 1881 unsigned long flags) 1882 { 1883 vma_start_write(vma); 1884 vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; 1885 userfaultfd_set_vm_flags(vma, 1886 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); 1887 } 1888 1889 void userfaultfd_reset_ctx(struct vm_area_struct *vma) 1890 { 1891 userfaultfd_set_ctx(vma, NULL, 0); 1892 } 1893 1894 struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, 1895 struct vm_area_struct *prev, 1896 struct vm_area_struct *vma, 1897 unsigned long start, 1898 unsigned long end) 1899 { 1900 struct vm_area_struct *ret; 1901 1902 /* Reset ptes for the whole vma range if wr-protected */ 1903 if (userfaultfd_wp(vma)) 1904 uffd_wp_range(vma, start, end - start, false); 1905 1906 ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, 1907 vma->vm_flags & ~__VM_UFFD_FLAGS, 1908 NULL_VM_UFFD_CTX); 1909 1910 /* 1911 * In the vma_merge() successful mprotect-like case 8: 1912 * the next vma was merged into the current one and 1913 * the current one has not been updated yet. 1914 */ 1915 if (!IS_ERR(ret)) 1916 userfaultfd_reset_ctx(ret); 1917 1918 return ret; 1919 } 1920 1921 /* Assumes mmap write lock taken, and mm_struct pinned. */ 1922 int userfaultfd_register_range(struct userfaultfd_ctx *ctx, 1923 struct vm_area_struct *vma, 1924 unsigned long vm_flags, 1925 unsigned long start, unsigned long end, 1926 bool wp_async) 1927 { 1928 VMA_ITERATOR(vmi, ctx->mm, start); 1929 struct vm_area_struct *prev = vma_prev(&vmi); 1930 unsigned long vma_end; 1931 unsigned long new_flags; 1932 1933 if (vma->vm_start < start) 1934 prev = vma; 1935 1936 for_each_vma_range(vmi, vma, end) { 1937 cond_resched(); 1938 1939 BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); 1940 BUG_ON(vma->vm_userfaultfd_ctx.ctx && 1941 vma->vm_userfaultfd_ctx.ctx != ctx); 1942 WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); 1943 1944 /* 1945 * Nothing to do: this vma is already registered into this 1946 * userfaultfd and with the right tracking mode too. 1947 */ 1948 if (vma->vm_userfaultfd_ctx.ctx == ctx && 1949 (vma->vm_flags & vm_flags) == vm_flags) 1950 goto skip; 1951 1952 if (vma->vm_start > start) 1953 start = vma->vm_start; 1954 vma_end = min(end, vma->vm_end); 1955 1956 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; 1957 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, 1958 new_flags, 1959 (struct vm_userfaultfd_ctx){ctx}); 1960 if (IS_ERR(vma)) 1961 return PTR_ERR(vma); 1962 1963 /* 1964 * In the vma_merge() successful mprotect-like case 8: 1965 * the next vma was merged into the current one and 1966 * the current one has not been updated yet. 
1967 */ 1968 userfaultfd_set_ctx(vma, ctx, vm_flags); 1969 1970 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) 1971 hugetlb_unshare_all_pmds(vma); 1972 1973 skip: 1974 prev = vma; 1975 start = vma->vm_end; 1976 } 1977 1978 return 0; 1979 } 1980 1981 void userfaultfd_release_new(struct userfaultfd_ctx *ctx) 1982 { 1983 struct mm_struct *mm = ctx->mm; 1984 struct vm_area_struct *vma; 1985 VMA_ITERATOR(vmi, mm, 0); 1986 1987 /* the various vma->vm_userfaultfd_ctx still points to it */ 1988 mmap_write_lock(mm); 1989 for_each_vma(vmi, vma) { 1990 if (vma->vm_userfaultfd_ctx.ctx == ctx) 1991 userfaultfd_reset_ctx(vma); 1992 } 1993 mmap_write_unlock(mm); 1994 } 1995 1996 void userfaultfd_release_all(struct mm_struct *mm, 1997 struct userfaultfd_ctx *ctx) 1998 { 1999 struct vm_area_struct *vma, *prev; 2000 VMA_ITERATOR(vmi, mm, 0); 2001 2002 if (!mmget_not_zero(mm)) 2003 return; 2004 2005 /* 2006 * Flush page faults out of all CPUs. NOTE: all page faults 2007 * must be retried without returning VM_FAULT_SIGBUS if 2008 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx 2009 * changes while handle_userfault released the mmap_lock. So 2010 * it's critical that released is set to true (above), before 2011 * taking the mmap_lock for writing. 2012 */ 2013 mmap_write_lock(mm); 2014 prev = NULL; 2015 for_each_vma(vmi, vma) { 2016 cond_resched(); 2017 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ 2018 !!(vma->vm_flags & __VM_UFFD_FLAGS)); 2019 if (vma->vm_userfaultfd_ctx.ctx != ctx) { 2020 prev = vma; 2021 continue; 2022 } 2023 2024 vma = userfaultfd_clear_vma(&vmi, prev, vma, 2025 vma->vm_start, vma->vm_end); 2026 prev = vma; 2027 } 2028 mmap_write_unlock(mm); 2029 mmput(mm); 2030 } 2031