// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
					    unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		bool locked = vma_start_read_locked(vma);

		if (!locked)
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);
	return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	bool page_in_cache = folio_mapping(folio);
	pte_t dst_ptep;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;

	dst_ptep = ptep_get(dst_pte);

	/*
	 * We are allowed to overwrite a UFFD pte marker: consider when both
	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
	 * page cache page backing it, then access the page.
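	 *
	 * In that sequence the wr-protect installs a uffd-wp pte marker for
	 * the hole; the later mfill operation (e.g. UFFDIO_COPY) that resolves
	 * the fault must be able to replace that marker rather than fail with
	 * -EEXIST, which is what the pte_is_uffd_marker() check below permits.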
	 */
	if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
		goto out_unlock;

	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the
		 * mmap_lock being read recursive a deadlock is still
		 * possible if a writer has taken a lock. For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret = -ENOMEM;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return ret;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_put;

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
	__folio_mark_uptodate(folio);

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		goto out_put;

	return 0;
out_put:
	folio_put(folio);
	return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm))
		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas. Note that this routine is
 * called with either vma-lock or mmap_lock held; it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb. A PMD_SIZE huge page may exist as used
	 * by THP. Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set. If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds. fault mutex prevents
		 * races with other faulting threads.
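		 *
		 * The ordering below is: take the hashed hugetlb fault mutex
		 * first, then the hugetlb VMA read lock; both are dropped
		 * again before the copy_folio_from_user() fallback, which may
		 * sleep and fault on the source pages.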
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);

			if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
				err = -EEXIST;
				hugetlb_vma_unlock_read(dst_vma);
				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				goto out_unlock;
			}
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}

static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mm_struct *dst_mm = ctx->mm;
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is THP don't override it and just be strict.
		 * (This includes the case where the PMD used to be THP and
		 * changed back to none after __pte_alloc().)
		 */
		if (unlikely(!pmd_present(dst_pmdval) ||
			     pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_bad(dst_pmdval))) {
			err = -EFAULT;
			break;
		}
		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}

ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	if (ptl1 > ptl2)
		swap(ptl1, ptl2);
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}

static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
				       pte_t orig_dst_pte, pte_t orig_src_pte,
				       pmd_t *dst_pmd, pmd_t dst_pmdval)
{
	return pte_same(ptep_get(src_pte), orig_src_pte) &&
	       pte_same(ptep_get(dst_pte), orig_dst_pte) &&
	       pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}

/*
 * Checks if the two ptes and the corresponding folio are eligible for batched
 * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
 *
 * NOTE: folio's reference is not required as the whole operation is within
 * PTL's critical section.
 */
static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
						 unsigned long src_addr,
						 pte_t *src_pte, pte_t *dst_pte)
{
	pte_t orig_dst_pte, orig_src_pte;
	struct folio *folio;

	orig_dst_pte = ptep_get(dst_pte);
	if (!pte_none(orig_dst_pte))
		return NULL;

	orig_src_pte = ptep_get(src_pte);
	if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
		return NULL;

	folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
	if (!folio || !folio_trylock(folio))
		return NULL;
	if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
		folio_unlock(folio);
		return NULL;
	}
	return folio;
}

/*
 * Moves src folios to dst in a batch as long as they are not large, and can
 * successfully take the lock via folio_trylock().
 */
static long move_present_ptes(struct mm_struct *mm,
			      struct vm_area_struct *dst_vma,
			      struct vm_area_struct *src_vma,
			      unsigned long dst_addr, unsigned long src_addr,
			      pte_t *dst_pte, pte_t *src_pte,
			      pte_t orig_dst_pte, pte_t orig_src_pte,
			      pmd_t *dst_pmd, pmd_t dst_pmdval,
			      spinlock_t *dst_ptl, spinlock_t *src_ptl,
			      struct folio **first_src_folio, unsigned long len)
{
	int err = 0;
	struct folio *src_folio = *first_src_folio;
	unsigned long src_start = src_addr;
	unsigned long src_end;

	len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
	src_end = pmd_addr_end(src_addr, src_addr + len);
	flush_cache_range(src_vma, src_addr, src_end);
	double_pt_lock(dst_ptl, src_ptl);

	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		err = -EAGAIN;
		goto out;
	}
	if (folio_test_large(src_folio) ||
	    folio_maybe_dma_pinned(src_folio) ||
	    !PageAnonExclusive(&src_folio->page)) {
		err = -EBUSY;
		goto out;
	}
	/* It's safe to drop the reference now as the page-table is holding one. */
	folio_put(*first_src_folio);
	*first_src_folio = NULL;
	arch_enter_lazy_mmu_mode();

	while (true) {
		orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
		/* Folio got pinned from under us. Put it back and fail the move. */
		if (folio_maybe_dma_pinned(src_folio)) {
			set_pte_at(mm, src_addr, src_pte, orig_src_pte);
			err = -EBUSY;
			break;
		}

		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);

		orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
		/* Set soft dirty bit so userspace can notice the pte was moved */
		if (pgtable_supports_soft_dirty())
			orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
		if (pte_dirty(orig_src_pte))
			orig_dst_pte = pte_mkdirty(orig_dst_pte);
		orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
		set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);

		src_addr += PAGE_SIZE;
		if (src_addr == src_end)
			break;
		dst_addr += PAGE_SIZE;
		dst_pte++;
		src_pte++;

		folio_unlock(src_folio);
		src_folio = check_ptes_for_batched_move(src_vma, src_addr,
							src_pte, dst_pte);
		if (!src_folio)
			break;
	}

	arch_leave_lazy_mmu_mode();
	if (src_addr > src_start)
		flush_tlb_range(src_vma, src_start, src_addr);

	if (src_folio)
		folio_unlock(src_folio);
out:
	double_pt_unlock(dst_ptl, src_ptl);
	return src_addr > src_start ? src_addr - src_start : err;
}

static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
			 unsigned long dst_addr, unsigned long src_addr,
			 pte_t *dst_pte, pte_t *src_pte,
			 pte_t orig_dst_pte, pte_t orig_src_pte,
			 pmd_t *dst_pmd, pmd_t dst_pmdval,
			 spinlock_t *dst_ptl, spinlock_t *src_ptl,
			 struct folio *src_folio,
			 struct swap_info_struct *si, swp_entry_t entry)
{
	/*
	 * Check if the folio still belongs to the target swap entry after
	 * acquiring the lock. Folio can be freed in the swap cache while
	 * not locked.
	 */
	if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
				  entry.val != src_folio->swap.val))
		return -EAGAIN;

	double_pt_lock(dst_ptl, src_ptl);

	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	/*
	 * The src_folio resides in the swapcache, requiring an update to its
	 * index and mapping to align with the dst_vma, where a swap-in may
	 * occur and hit the swapcache after moving the PTE.
	 */
	if (src_folio) {
		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);
	} else {
		/*
		 * Check if the swap entry is cached after acquiring the src_pte
		 * lock. Otherwise, we might miss a newly loaded swap cache folio.
		 *
		 * Check swap_map directly to minimize overhead; READ_ONCE is
		 * sufficient. We are trying to catch newly added swap cache:
		 * the only possible case is when a folio is swapped in and out
		 * again, staying in the swap cache and using the same entry,
		 * before the PTE check above. The PTL is acquired and released
		 * twice, each time after updating the swap_map's flag, so
		 * holding the PTL here ensures we see the updated value. A
		 * false positive is possible, e.g. SWP_SYNCHRONOUS_IO swapin
		 * may set the flag without touching the cache, or during the
		 * tiny synchronization window between swap cache and swap_map,
		 * but it will be gone very quickly; the worst result is retry
		 * jitters.
		 */
		if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
			double_pt_unlock(dst_ptl, src_ptl);
			return -EAGAIN;
		}
	}

	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
	if (pgtable_supports_soft_dirty())
		orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return PAGE_SIZE;
}

static int move_zeropage_pte(struct mm_struct *mm,
			     struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma,
			     unsigned long dst_addr, unsigned long src_addr,
			     pte_t *dst_pte, pte_t *src_pte,
			     pte_t orig_dst_pte, pte_t orig_src_pte,
			     pmd_t *dst_pmd, pmd_t dst_pmdval,
			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
				 dst_pmd, dst_pmdval)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return PAGE_SIZE;
}


/*
 * The mmap_lock for reading is held by the caller. Just move the page(s)
 * from src_pmd to dst_pmd if possible, and return number of bytes moved.
 * On failure, an error code is returned.
 */
static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long dst_addr, unsigned long src_addr,
			    unsigned long len, __u64 mode)
{
	struct swap_info_struct *si = NULL;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;
	pmd_t dummy_pmdval;
	pmd_t dst_pmdval;
	struct folio *src_folio = NULL;
	struct mmu_notifier_range range;
	long ret = 0;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + len);
	mmu_notifier_invalidate_range_start(&range);
retry:
	/*
	 * Use the maywrite version to indicate that dst_pte will be modified.
	 * Since dst_pte needs to be none, the subsequent pte_same() check
	 * cannot prevent the dst_pte page from being freed concurrently, so we
	 * also need to obtain dst_pmdval and recheck pmd_same() later.
	 */
	dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
					   &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		ret = -EAGAIN;
		goto out;
	}

	/*
	 * Unlike dst_pte, the subsequent pte_same() check can ensure the
	 * stability of the src_pte page, so there is no need to get pmdval,
	 * just pass a dummy variable to it.
	 */
	src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
					   &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		ret = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
	    pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
		ret = -EINVAL;
		goto out;
	}

	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		ret = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			ret = -ENOENT;
		else /* nothing to do to move a hole */
			ret = PAGE_SIZE;
		goto out;
	}

	/* If the PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		ret = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			ret = move_zeropage_pte(mm, dst_vma, src_vma,
						dst_addr, src_addr, dst_pte, src_pte,
						orig_dst_pte, orig_src_pte,
						dst_pmd, dst_pmdval, dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock source folio. Since we are in RCU read section,
		 * we can't block, so on contention have to unmap the ptes,
		 * obtain the lock and retry.
		 */
		if (!src_folio) {
			struct folio *folio;
			bool locked;

			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
			spin_lock(src_ptl);
			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
				spin_unlock(src_ptl);
				ret = -EAGAIN;
				goto out;
			}

			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
			if (!folio || !PageAnonExclusive(&folio->page)) {
				spin_unlock(src_ptl);
				ret = -EBUSY;
				goto out;
			}

			locked = folio_trylock(folio);
			/*
			 * We avoid waiting for folio lock with a raised
			 * refcount for large folios because extra refcounts
			 * will result in split_folio() failing later and
			 * retrying. If multiple tasks are trying to move a
			 * large folio we can end up livelocking.
			 */
			if (!locked && folio_test_large(folio)) {
				spin_unlock(src_ptl);
				ret = -EAGAIN;
				goto out;
			}

			folio_get(folio);
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			spin_unlock(src_ptl);

			if (!locked) {
				pte_unmap(src_pte);
				pte_unmap(dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}

			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
				ret = -EBUSY;
				goto out;
			}
		}

		/* at this point we have src_folio locked */
		if (folio_test_large(src_folio)) {
			/* split_folio() can block */
			pte_unmap(src_pte);
			pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			ret = split_folio(src_folio);
			if (ret)
				goto out;
			/* have to reacquire the folio after it got split */
			folio_unlock(src_folio);
			folio_put(src_folio);
			src_folio = NULL;
			goto retry;
		}

		ret = move_present_ptes(mm, dst_vma, src_vma,
					dst_addr, src_addr, dst_pte, src_pte,
					orig_dst_pte, orig_src_pte, dst_pmd,
					dst_pmdval, dst_ptl, src_ptl, &src_folio,
					len);
	} else { /* !pte_present() */
		struct folio *folio = NULL;
		const softleaf_t entry = softleaf_from_pte(orig_src_pte);

		if (softleaf_is_migration(entry)) {
			pte_unmap(src_pte);
			pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			migration_entry_wait(mm, src_pmd, src_addr);

			ret = -EAGAIN;
			goto out;
		} else if (!softleaf_is_swap(entry)) {
			ret = -EFAULT;
			goto out;
		}

		if (!pte_swp_exclusive(orig_src_pte)) {
			ret = -EBUSY;
			goto out;
		}

		si = get_swap_device(entry);
		if (unlikely(!si)) {
			ret = -EAGAIN;
			goto out;
		}
		/*
		 * Verify the existence of the swapcache. If present, the folio's
		 * index and mapping must be updated even when the PTE is a swap
		 * entry. The anon_vma lock is not taken during this process since
		 * the folio has already been unmapped, and the swap entry is
		 * exclusive, preventing rmap walks.
		 *
		 * For large folios, return -EBUSY immediately, as split_folio()
		 * also returns -EBUSY when attempting to split unmapped large
		 * folios in the swapcache. This issue needs to be resolved
		 * separately to allow proper handling.
		 */
		if (!src_folio)
			folio = swap_cache_get_folio(entry);
		if (folio) {
			if (folio_test_large(folio)) {
				ret = -EBUSY;
				folio_put(folio);
				goto out;
			}
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			if (!folio_trylock(src_folio)) {
				pte_unmap(src_pte);
				pte_unmap(dst_pte);
				src_pte = dst_pte = NULL;
				put_swap_device(si);
				si = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}
		}
		ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
				    dst_ptl, src_ptl, src_folio, si, entry);
	}

out:
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	/*
	 * Unmap in reverse order (LIFO) to maintain proper kmap_local
	 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
	 * first, then src_pte, so we must unmap src_pte first, then dst_pte.
	 */
	if (src_pte)
		pte_unmap(src_pte);
	if (dst_pte)
		pte_unmap(dst_pte);
	mmu_notifier_invalidate_range_end(&range);
	if (si)
		put_swap_device(si);

	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}

static __always_inline
int find_vmas_mm_locked(struct mm_struct *mm,
			unsigned long dst_start,
			unsigned long src_start,
			struct vm_area_struct **dst_vmap,
			struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma_and_prepare_anon(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/* Skip finding src_vma if src_start is in dst_vma */
	if (src_start >= vma->vm_start && src_start < vma->vm_end)
		goto out_success;

	vma = vma_lookup(mm, src_start);
	if (!vma)
		return -ENOENT;
out_success:
	*src_vmap = vma;
	return 0;
}

#ifdef CONFIG_PER_VMA_LOCK
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;
	int err;

	vma = uffd_lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using uffd_lock_vma() to get src_vma can lead to the following
	 * deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;

	/* Undo any locking and retry in mmap_lock critical section */
	vma_end_read(*dst_vmap);

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		goto out;

	if (!vma_start_read_locked(*dst_vmap)) {
		err = -EAGAIN;
		goto out;
	}

	/* Nothing further to do if both vmas are locked. */
	if (*dst_vmap == *src_vmap)
		goto out;

	if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
		/* Undo dst_vmap locking if src_vmap failed to lock */
		vma_end_read(*dst_vmap);
		err = -EAGAIN;
	}
out:
	mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}

#else

static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	mmap_assert_locked(src_vma->vm_mm);
	mmap_read_unlock(dst_vma->vm_mm);
}
#endif

/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * avoids move_pages() failing with -EBUSY if the process forks before
 * move_pages() is called), then it will call move_pages() to map the
 * page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical from the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * (An illustrative userspace invocation sketch follows the function
 * body below.)
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr, src_end;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len < src_start);
	VM_WARN_ON_ONCE(dst_start + len < dst_start);

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
	     src_addr < src_end;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				/* Can be a migration entry */
				if (pmd_present(*src_pmd)) {
					struct folio *folio = pmd_folio(*src_pmd);

					if (!is_huge_zero_folio(folio) &&
					    !PageAnonExclusive(&folio->page)) {
						spin_unlock(ptl);
						err = -EBUSY;
						break;
					}
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_pte() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			long ret;

			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			ret = move_pages_ptes(mm, dst_pmd, src_pmd,
					      dst_vma, src_vma, dst_addr,
					      src_addr, src_end - src_addr, mode);
			if (ret < 0)
				err = ret;
			else
				step_size = ret;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON_ONCE(moved < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!moved && !err);
	return moved ? moved : err;
}
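
/*
 * Illustrative sketch only, not derived from this file: a userspace caller
 * would typically reach move_pages() through the UFFDIO_MOVE ioctl, roughly
 * as follows ("uffd", "dst_addr", "src_addr" and "length" are placeholder
 * names, with uffd being a userfaultfd file descriptor whose destination
 * range has been registered):
 *
 *	struct uffdio_move move = {
 *		.dst = dst_addr,
 *		.src = src_addr,
 *		.len = length,
 *		.mode = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_MOVE, &move);
 *
 * The number of bytes moved (or a negative error) is reported back in
 * move.move; per the comment above, a "short" move can be continued by
 * re-issuing the ioctl with src and dst advanced, and len reduced, by
 * move.move.
 */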

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t vm_flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;

	vm_flags_reset(vma, vm_flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				vm_flags_t vm_flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;
	bool give_up_on_oom = false;

	/*
	 * If we are modifying only and not splitting, just give up on the merge
	 * if OOM prevents us from merging successfully.
	 */
	if (start == vma->vm_start && end == vma->vm_end)
		give_up_on_oom = true;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX, give_up_on_oom);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       vm_flags_t vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	vm_flags_t new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
				vma->vm_userfaultfd_ctx.ctx != ctx);
		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx},
					    /* give_up_on_oom = */false);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still points to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
				!!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}