// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
					    unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		bool locked = vma_start_read_locked(vma);

		if (!locked)
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);
	return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	bool page_in_cache = folio_mapping(folio);
	pte_t dst_ptep;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;

	dst_ptep = ptep_get(dst_pte);

	/*
	 * We are allowed to overwrite a UFFD pte marker: consider when both
	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
	 * page cache page backing it, then access the page.
	 */
	if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
		goto out_unlock;

	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the
		 * mmap_lock being read recursive a deadlock is still
		 * possible if a writer has taken a lock. For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret = -ENOMEM;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return ret;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_put;

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
	__folio_mark_uptodate(folio);

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		goto out_put;

	return 0;
out_put:
	folio_put(folio);
	return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm))
		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find the folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was missing;
	 * the *pmd may already be established and in turn it may also
	 * be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas. Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb. PMD_SIZE huge pages may exist as used
	 * by THP. Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set. If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds. fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);

			if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
				err = -EEXIST;
				hugetlb_vma_unlock_read(dst_vma);
				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				goto out_unlock;
			}
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}

static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mm_struct *dst_mm = ctx->mm;
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
			 dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is THP don't override it and just be strict.
		 * (This includes the case where the PMD used to be THP and
		 * changed back to none after __pte_alloc().)
		 */
		if (unlikely(!pmd_present(dst_pmdval) ||
			     pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_bad(dst_pmdval))) {
			err = -EFAULT;
			break;
		}
		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}

ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
		     "The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	if (ptl1 > ptl2)
		swap(ptl1, ptl2);
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}

static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
				       pte_t orig_dst_pte, pte_t orig_src_pte,
				       pmd_t *dst_pmd, pmd_t dst_pmdval)
{
	return pte_same(ptep_get(src_pte), orig_src_pte) &&
	       pte_same(ptep_get(dst_pte), orig_dst_pte) &&
	       pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}

/*
 * Checks if the two ptes and the corresponding folio are eligible for batched
 * move. If so, returns a pointer to the locked folio. Otherwise, returns NULL.
 *
 * NOTE: folio's reference is not required as the whole operation is within
 * PTL's critical section.
 */
static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
						 unsigned long src_addr,
						 pte_t *src_pte, pte_t *dst_pte)
{
	pte_t orig_dst_pte, orig_src_pte;
	struct folio *folio;

	orig_dst_pte = ptep_get(dst_pte);
	if (!pte_none(orig_dst_pte))
		return NULL;

	orig_src_pte = ptep_get(src_pte);
	if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
		return NULL;

	folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
	if (!folio || !folio_trylock(folio))
		return NULL;
	if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
		folio_unlock(folio);
		return NULL;
	}
	return folio;
}

/*
 * Moves src folios to dst in a batch as long as they are not large, and can
 * successfully take the lock via folio_trylock().
 */
static long move_present_ptes(struct mm_struct *mm,
			      struct vm_area_struct *dst_vma,
			      struct vm_area_struct *src_vma,
			      unsigned long dst_addr, unsigned long src_addr,
			      pte_t *dst_pte, pte_t *src_pte,
			      pte_t orig_dst_pte, pte_t orig_src_pte,
			      pmd_t *dst_pmd, pmd_t dst_pmdval,
			      spinlock_t *dst_ptl, spinlock_t *src_ptl,
			      struct folio **first_src_folio, unsigned long len)
{
	int err = 0;
	struct folio *src_folio = *first_src_folio;
	unsigned long src_start = src_addr;
	unsigned long src_end;

	len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
	src_end = pmd_addr_end(src_addr, src_addr + len);
	flush_cache_range(src_vma, src_addr, src_end);
	double_pt_lock(dst_ptl, src_ptl);

	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		err = -EAGAIN;
		goto out;
	}
	if (folio_test_large(src_folio) ||
	    folio_maybe_dma_pinned(src_folio) ||
	    !PageAnonExclusive(&src_folio->page)) {
		err = -EBUSY;
		goto out;
	}
	/* It's safe to drop the reference now as the page-table is holding one. */
	folio_put(*first_src_folio);
	*first_src_folio = NULL;
	lazy_mmu_mode_enable();

	while (true) {
		orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
		/* Folio got pinned from under us. Put it back and fail the move. */
		if (folio_maybe_dma_pinned(src_folio)) {
			set_pte_at(mm, src_addr, src_pte, orig_src_pte);
			err = -EBUSY;
			break;
		}

		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);

		orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
		/* Set soft dirty bit so userspace can notice the pte was moved */
		if (pgtable_supports_soft_dirty())
			orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
		if (pte_dirty(orig_src_pte))
			orig_dst_pte = pte_mkdirty(orig_dst_pte);
		orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
		set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);

		src_addr += PAGE_SIZE;
		if (src_addr == src_end)
			break;
		dst_addr += PAGE_SIZE;
		dst_pte++;
		src_pte++;

		folio_unlock(src_folio);
		src_folio = check_ptes_for_batched_move(src_vma, src_addr,
							src_pte, dst_pte);
		if (!src_folio)
			break;
	}

	lazy_mmu_mode_disable();
	if (src_addr > src_start)
		flush_tlb_range(src_vma, src_start, src_addr);

	if (src_folio)
		folio_unlock(src_folio);
out:
	double_pt_unlock(dst_ptl, src_ptl);
	return src_addr > src_start ? src_addr - src_start : err;
}

static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
			 unsigned long dst_addr, unsigned long src_addr,
			 pte_t *dst_pte, pte_t *src_pte,
			 pte_t orig_dst_pte, pte_t orig_src_pte,
			 pmd_t *dst_pmd, pmd_t dst_pmdval,
			 spinlock_t *dst_ptl, spinlock_t *src_ptl,
			 struct folio *src_folio,
			 struct swap_info_struct *si, swp_entry_t entry)
{
	/*
	 * Check if the folio still belongs to the target swap entry after
	 * acquiring the lock. Folio can be freed in the swap cache while
	 * not locked.
	 */
	if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
				  entry.val != src_folio->swap.val))
		return -EAGAIN;

	double_pt_lock(dst_ptl, src_ptl);

	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	/*
	 * The src_folio resides in the swapcache, requiring an update to its
	 * index and mapping to align with the dst_vma, where a swap-in may
	 * occur and hit the swapcache after moving the PTE.
	 */
	if (src_folio) {
		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);
	} else {
		/*
		 * Check if the swap entry is cached after acquiring the src_pte
		 * lock. Otherwise, we might miss a newly loaded swap cache folio.
		 *
		 * We are trying to catch newly added swap cache, the only possible case is
		 * when a folio is swapped in and out again staying in swap cache, using the
		 * same entry before the PTE check above. The PTL is acquired and released
		 * twice, each time after updating the swap table. So holding
		 * the PTL here ensures we see the updated value.
		 */
		if (swap_cache_has_folio(entry)) {
			double_pt_unlock(dst_ptl, src_ptl);
			return -EAGAIN;
		}
	}

	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
	if (pgtable_supports_soft_dirty())
		orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return PAGE_SIZE;
}

static int move_zeropage_pte(struct mm_struct *mm,
			     struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma,
			     unsigned long dst_addr, unsigned long src_addr,
			     pte_t *dst_pte, pte_t *src_pte,
			     pte_t orig_dst_pte, pte_t orig_src_pte,
			     pmd_t *dst_pmd, pmd_t dst_pmdval,
			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return PAGE_SIZE;
}


/*
 * The mmap_lock for reading is held by the caller. Just move the page(s)
 * from src_pmd to dst_pmd if possible, and return number of bytes moved.
 * On failure, an error code is returned.
 */
static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long dst_addr, unsigned long src_addr,
			    unsigned long len, __u64 mode)
{
	struct swap_info_struct *si = NULL;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;
	pmd_t dummy_pmdval;
	pmd_t dst_pmdval;
	struct folio *src_folio = NULL;
	struct mmu_notifier_range range;
	long ret = 0;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + len);
	mmu_notifier_invalidate_range_start(&range);
retry:
	/*
	 * Use the maywrite version to indicate that dst_pte will be modified,
	 * since dst_pte needs to be none, the subsequent pte_same() check
	 * cannot prevent the dst_pte page from being freed concurrently, so we
	 * also need to obtain dst_pmdval and recheck pmd_same() later.
	 */
	dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
					   &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		ret = -EAGAIN;
		goto out;
	}

	/*
	 * Unlike dst_pte, the subsequent pte_same() check can ensure the
	 * stability of the src_pte page, so there is no need to get pmdval,
	 * just pass a dummy variable to it.
	 */
	src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
					   &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		ret = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
	    pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
		ret = -EINVAL;
		goto out;
	}

	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		ret = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			ret = -ENOENT;
		else /* nothing to do to move a hole */
			ret = PAGE_SIZE;
		goto out;
	}

	/* If PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		ret = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			ret = move_zeropage_pte(mm, dst_vma, src_vma,
						dst_addr, src_addr, dst_pte, src_pte,
						orig_dst_pte, orig_src_pte,
						dst_pmd, dst_pmdval, dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock source folio. Since we are in RCU read section,
		 * we can't block, so on contention have to unmap the ptes,
		 * obtain the lock and retry.
		 */
		if (!src_folio) {
			struct folio *folio;
			bool locked;

			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
			spin_lock(src_ptl);
			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
				spin_unlock(src_ptl);
				ret = -EAGAIN;
				goto out;
			}

			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
			if (!folio || !PageAnonExclusive(&folio->page)) {
				spin_unlock(src_ptl);
				ret = -EBUSY;
				goto out;
			}

			locked = folio_trylock(folio);
			/*
			 * We avoid waiting for folio lock with a raised
			 * refcount for large folios because extra refcounts
			 * will result in split_folio() failing later and
			 * retrying. If multiple tasks are trying to move a
			 * large folio we can end up livelocking.
			 */
			if (!locked && folio_test_large(folio)) {
				spin_unlock(src_ptl);
				ret = -EAGAIN;
				goto out;
			}

			folio_get(folio);
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			spin_unlock(src_ptl);

			if (!locked) {
				pte_unmap(src_pte);
				pte_unmap(dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}

			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
				ret = -EBUSY;
				goto out;
			}
		}

		/* at this point we have src_folio locked */
		if (folio_test_large(src_folio)) {
			/* split_folio() can block */
			pte_unmap(src_pte);
			pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			ret = split_folio(src_folio);
			if (ret)
				goto out;
			/* have to reacquire the folio after it got split */
			folio_unlock(src_folio);
			folio_put(src_folio);
			src_folio = NULL;
			goto retry;
		}

		ret = move_present_ptes(mm, dst_vma, src_vma,
					dst_addr, src_addr, dst_pte, src_pte,
					orig_dst_pte, orig_src_pte, dst_pmd,
					dst_pmdval, dst_ptl, src_ptl, &src_folio,
					len);
	} else { /* !pte_present() */
		struct folio *folio = NULL;
		const softleaf_t entry = softleaf_from_pte(orig_src_pte);

		if (softleaf_is_migration(entry)) {
			pte_unmap(src_pte);
			pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			migration_entry_wait(mm, src_pmd, src_addr);

			ret = -EAGAIN;
			goto out;
		} else if (!softleaf_is_swap(entry)) {
			ret = -EFAULT;
			goto out;
		}

		if (!pte_swp_exclusive(orig_src_pte)) {
			ret = -EBUSY;
			goto out;
		}

		si = get_swap_device(entry);
		if (unlikely(!si)) {
			ret = -EAGAIN;
			goto out;
		}
		/*
		 * Verify the existence of the swapcache. If present, the folio's
		 * index and mapping must be updated even when the PTE is a swap
		 * entry. The anon_vma lock is not taken during this process since
		 * the folio has already been unmapped, and the swap entry is
		 * exclusive, preventing rmap walks.
		 *
		 * For large folios, return -EBUSY immediately, as split_folio()
		 * also returns -EBUSY when attempting to split unmapped large
		 * folios in the swapcache. This issue needs to be resolved
		 * separately to allow proper handling.
		 */
		if (!src_folio)
			folio = swap_cache_get_folio(entry);
		if (folio) {
			if (folio_test_large(folio)) {
				ret = -EBUSY;
				folio_put(folio);
				goto out;
			}
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			if (!folio_trylock(src_folio)) {
				pte_unmap(src_pte);
				pte_unmap(dst_pte);
				src_pte = dst_pte = NULL;
				put_swap_device(si);
				si = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}
		}
		ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
				    dst_ptl, src_ptl, src_folio, si, entry);
	}

out:
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	/*
	 * Unmap in reverse order (LIFO) to maintain proper kmap_local
	 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
	 * first, then src_pte, so we must unmap src_pte first, then dst_pte.
	 */
	if (src_pte)
		pte_unmap(src_pte);
	if (dst_pte)
		pte_unmap(dst_pte);
	mmu_notifier_invalidate_range_end(&range);
	if (si)
		put_swap_device(si);

	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}

static __always_inline
int find_vmas_mm_locked(struct mm_struct *mm,
			unsigned long dst_start,
			unsigned long src_start,
			struct vm_area_struct **dst_vmap,
			struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma_and_prepare_anon(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/* Skip finding src_vma if src_start is in dst_vma */
	if (src_start >= vma->vm_start && src_start < vma->vm_end)
		goto out_success;

	vma = vma_lookup(mm, src_start);
	if (!vma)
		return -ENOENT;
out_success:
	*src_vmap = vma;
	return 0;
}

#ifdef CONFIG_PER_VMA_LOCK
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;
	int err;

	vma = uffd_lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;

	/* Undo any locking and retry in mmap_lock critical section */
	vma_end_read(*dst_vmap);

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		goto out;

	if (!vma_start_read_locked(*dst_vmap)) {
		err = -EAGAIN;
		goto out;
	}

	/* Nothing further to do if both vmas are locked. */
	if (*dst_vmap == *src_vmap)
		goto out;

	if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
		/* Undo dst_vmap locking if src_vmap failed to lock */
		vma_end_read(*dst_vmap);
		err = -EAGAIN;
	}
out:
	mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}

#else

static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	mmap_assert_locked(src_vma->vm_mm);
	mmap_read_unlock(dst_vma->vm_mm);
}
#endif

/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * prevents move_pages() from failing with -EBUSY if the process forks
 * before move_pages() is called), then it will call move_pages() to map
 * the page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical between the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * the number of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr, src_end;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len < src_start);
	VM_WARN_ON_ONCE(dst_start + len < dst_start);

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
	     src_addr < src_end;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_ptes() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				/* Can be a migration entry */
				if (pmd_present(*src_pmd)) {
					struct folio *folio = pmd_folio(*src_pmd);

					if (!is_huge_zero_folio(folio) &&
					    !PageAnonExclusive(&folio->page)) {
						spin_unlock(ptl);
						err = -EBUSY;
						break;
					}
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_ptes() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			long ret;

			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			ret = move_pages_ptes(mm, dst_pmd, src_pmd,
					      dst_vma, src_vma, dst_addr,
					      src_addr, src_end - src_addr, mode);
			if (ret < 0)
				err = ret;
			else
				step_size = ret;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON_ONCE(moved < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!moved && !err);
	return moved ? moved : err;
}

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t vm_flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;

	vm_flags_reset(vma, vm_flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				vm_flags_t vm_flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;
	bool give_up_on_oom = false;

	/*
	 * If we are modifying only and not splitting, just give up on the merge
	 * if OOM prevents us from merging successfully.
	 */
	if (start == vma->vm_start && end == vma->vm_end)
		give_up_on_oom = true;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX, give_up_on_oom);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       vm_flags_t vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	vm_flags_t new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
				vma->vm_userfaultfd_ctx.ctx != ctx);
		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx},
					    /* give_up_on_oom = */false);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
				!!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}