1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * mm/userfaultfd.c 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 */ 7 8 #include <linux/mm.h> 9 #include <linux/sched/signal.h> 10 #include <linux/pagemap.h> 11 #include <linux/rmap.h> 12 #include <linux/swap.h> 13 #include <linux/leafops.h> 14 #include <linux/userfaultfd_k.h> 15 #include <linux/mmu_notifier.h> 16 #include <linux/hugetlb.h> 17 #include <asm/tlbflush.h> 18 #include <asm/tlb.h> 19 #include "internal.h" 20 #include "swap.h" 21 22 struct mfill_state { 23 struct userfaultfd_ctx *ctx; 24 unsigned long src_start; 25 unsigned long dst_start; 26 unsigned long len; 27 uffd_flags_t flags; 28 29 struct vm_area_struct *vma; 30 unsigned long src_addr; 31 unsigned long dst_addr; 32 pmd_t *pmd; 33 }; 34 35 static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) 36 { 37 /* anonymous memory does not support MINOR mode */ 38 if (vm_flags & VM_UFFD_MINOR) 39 return false; 40 return true; 41 } 42 43 static struct folio *anon_alloc_folio(struct vm_area_struct *vma, 44 unsigned long addr) 45 { 46 struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, 47 addr); 48 49 if (!folio) 50 return NULL; 51 52 if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { 53 folio_put(folio); 54 return NULL; 55 } 56 57 return folio; 58 } 59 60 static const struct vm_uffd_ops anon_uffd_ops = { 61 .can_userfault = anon_can_userfault, 62 .alloc_folio = anon_alloc_folio, 63 }; 64 65 static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) 66 { 67 if (vma_is_anonymous(vma)) 68 return &anon_uffd_ops; 69 return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; 70 } 71 72 static __always_inline 73 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) 74 { 75 /* Make sure that the dst range is fully within dst_vma. */ 76 if (dst_end > dst_vma->vm_end) 77 return false; 78 79 /* 80 * Check the vma is registered in uffd, this is required to 81 * enforce the VM_MAYWRITE check done at uffd registration 82 * time. 83 */ 84 if (!dst_vma->vm_userfaultfd_ctx.ctx) 85 return false; 86 87 return true; 88 } 89 90 static __always_inline 91 struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, 92 unsigned long addr) 93 { 94 struct vm_area_struct *vma; 95 96 mmap_assert_locked(mm); 97 vma = vma_lookup(mm, addr); 98 if (!vma) 99 vma = ERR_PTR(-ENOENT); 100 else if (!(vma->vm_flags & VM_SHARED) && 101 unlikely(anon_vma_prepare(vma))) 102 vma = ERR_PTR(-ENOMEM); 103 104 return vma; 105 } 106 107 #ifdef CONFIG_PER_VMA_LOCK 108 /* 109 * uffd_lock_vma() - Lookup and lock vma corresponding to @address. 110 * @mm: mm to search vma in. 111 * @address: address that the vma should contain. 112 * 113 * Should be called without holding mmap_lock. 114 * 115 * Return: A locked vma containing @address, -ENOENT if no vma is found, or 116 * -ENOMEM if anon_vma couldn't be allocated. 117 */ 118 static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, 119 unsigned long address) 120 { 121 struct vm_area_struct *vma; 122 123 vma = lock_vma_under_rcu(mm, address); 124 if (vma) { 125 /* 126 * We know we're going to need to use anon_vma, so check 127 * that early. 
128 */ 129 if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) 130 vma_end_read(vma); 131 else 132 return vma; 133 } 134 135 mmap_read_lock(mm); 136 vma = find_vma_and_prepare_anon(mm, address); 137 if (!IS_ERR(vma)) { 138 bool locked = vma_start_read_locked(vma); 139 140 if (!locked) 141 vma = ERR_PTR(-EAGAIN); 142 } 143 144 mmap_read_unlock(mm); 145 return vma; 146 } 147 148 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 149 unsigned long dst_start, 150 unsigned long len) 151 { 152 struct vm_area_struct *dst_vma; 153 154 dst_vma = uffd_lock_vma(dst_mm, dst_start); 155 if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) 156 return dst_vma; 157 158 vma_end_read(dst_vma); 159 return ERR_PTR(-ENOENT); 160 } 161 162 static void uffd_mfill_unlock(struct vm_area_struct *vma) 163 { 164 vma_end_read(vma); 165 } 166 167 #else 168 169 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 170 unsigned long dst_start, 171 unsigned long len) 172 { 173 struct vm_area_struct *dst_vma; 174 175 mmap_read_lock(dst_mm); 176 dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); 177 if (IS_ERR(dst_vma)) 178 goto out_unlock; 179 180 if (validate_dst_vma(dst_vma, dst_start + len)) 181 return dst_vma; 182 183 dst_vma = ERR_PTR(-ENOENT); 184 out_unlock: 185 mmap_read_unlock(dst_mm); 186 return dst_vma; 187 } 188 189 static void uffd_mfill_unlock(struct vm_area_struct *vma) 190 { 191 mmap_read_unlock(vma->vm_mm); 192 } 193 #endif 194 195 static void mfill_put_vma(struct mfill_state *state) 196 { 197 if (!state->vma) 198 return; 199 200 up_read(&state->ctx->map_changing_lock); 201 uffd_mfill_unlock(state->vma); 202 state->vma = NULL; 203 } 204 205 static int mfill_get_vma(struct mfill_state *state) 206 { 207 struct userfaultfd_ctx *ctx = state->ctx; 208 uffd_flags_t flags = state->flags; 209 struct vm_area_struct *dst_vma; 210 const struct vm_uffd_ops *ops; 211 int err; 212 213 /* 214 * Make sure the vma is not shared, that the dst range is 215 * both valid and fully within a single existing vma. 216 */ 217 dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len); 218 if (IS_ERR(dst_vma)) 219 return PTR_ERR(dst_vma); 220 221 /* 222 * If memory mappings are changing because of non-cooperative 223 * operation (e.g. mremap) running in parallel, bail out and 224 * request the user to retry later 225 */ 226 down_read(&ctx->map_changing_lock); 227 state->vma = dst_vma; 228 err = -EAGAIN; 229 if (atomic_read(&ctx->mmap_changing)) 230 goto out_unlock; 231 232 err = -EINVAL; 233 234 /* 235 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 236 * it will overwrite vm_ops, so vma_is_anonymous must return false. 237 */ 238 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 239 dst_vma->vm_flags & VM_SHARED)) 240 goto out_unlock; 241 242 /* 243 * validate 'mode' now that we know the dst_vma: don't allow 244 * a wrprotect copy if the userfaultfd didn't register as WP. 
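	 * (A wrprotect request, i.e. UFFDIO_COPY_MODE_WP mapping to
	 * MFILL_ATOMIC_WP, is only legal on a range registered with
	 * UFFDIO_REGISTER_MODE_WP, which sets VM_UFFD_WP on the vma.)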
245 */ 246 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) 247 goto out_unlock; 248 249 if (is_vm_hugetlb_page(dst_vma)) 250 return 0; 251 252 ops = vma_uffd_ops(dst_vma); 253 if (!ops) 254 goto out_unlock; 255 256 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && 257 !ops->get_folio_noalloc) 258 goto out_unlock; 259 260 return 0; 261 262 out_unlock: 263 mfill_put_vma(state); 264 return err; 265 } 266 267 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 268 { 269 pgd_t *pgd; 270 p4d_t *p4d; 271 pud_t *pud; 272 273 pgd = pgd_offset(mm, address); 274 p4d = p4d_alloc(mm, pgd, address); 275 if (!p4d) 276 return NULL; 277 pud = pud_alloc(mm, p4d, address); 278 if (!pud) 279 return NULL; 280 /* 281 * Note that we didn't run this because the pmd was 282 * missing, the *pmd may be already established and in 283 * turn it may also be a trans_huge_pmd. 284 */ 285 return pmd_alloc(mm, pud, address); 286 } 287 288 static int mfill_establish_pmd(struct mfill_state *state) 289 { 290 struct mm_struct *dst_mm = state->ctx->mm; 291 pmd_t *dst_pmd, dst_pmdval; 292 293 dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr); 294 if (unlikely(!dst_pmd)) 295 return -ENOMEM; 296 297 dst_pmdval = pmdp_get_lockless(dst_pmd); 298 if (unlikely(pmd_none(dst_pmdval)) && 299 unlikely(__pte_alloc(dst_mm, dst_pmd))) 300 return -ENOMEM; 301 302 dst_pmdval = pmdp_get_lockless(dst_pmd); 303 /* 304 * If the dst_pmd is THP don't override it and just be strict. 305 * (This includes the case where the PMD used to be THP and 306 * changed back to none after __pte_alloc().) 307 */ 308 if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval))) 309 return -EEXIST; 310 if (unlikely(pmd_bad(dst_pmdval))) 311 return -EFAULT; 312 313 state->pmd = dst_pmd; 314 return 0; 315 } 316 317 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ 318 static bool mfill_file_over_size(struct vm_area_struct *dst_vma, 319 unsigned long dst_addr) 320 { 321 struct inode *inode; 322 pgoff_t offset, max_off; 323 324 if (!dst_vma->vm_file) 325 return false; 326 327 inode = dst_vma->vm_file->f_inode; 328 offset = linear_page_index(dst_vma, dst_addr); 329 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 330 return offset >= max_off; 331 } 332 333 /* 334 * Install PTEs, to map dst_addr (within dst_vma) to page. 335 * 336 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem 337 * and anon, and for both shared and private VMAs. 
338 */ 339 static int mfill_atomic_install_pte(pmd_t *dst_pmd, 340 struct vm_area_struct *dst_vma, 341 unsigned long dst_addr, struct page *page, 342 uffd_flags_t flags) 343 { 344 int ret; 345 struct mm_struct *dst_mm = dst_vma->vm_mm; 346 pte_t _dst_pte, *dst_pte; 347 bool writable = dst_vma->vm_flags & VM_WRITE; 348 bool vm_shared = dst_vma->vm_flags & VM_SHARED; 349 spinlock_t *ptl; 350 struct folio *folio = page_folio(page); 351 bool page_in_cache = folio_mapping(folio); 352 pte_t dst_ptep; 353 354 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 355 _dst_pte = pte_mkdirty(_dst_pte); 356 if (page_in_cache && !vm_shared) 357 writable = false; 358 if (writable) 359 _dst_pte = pte_mkwrite(_dst_pte, dst_vma); 360 if (flags & MFILL_ATOMIC_WP) 361 _dst_pte = pte_mkuffd_wp(_dst_pte); 362 363 ret = -EAGAIN; 364 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 365 if (!dst_pte) 366 goto out; 367 368 if (mfill_file_over_size(dst_vma, dst_addr)) { 369 ret = -EFAULT; 370 goto out_unlock; 371 } 372 373 ret = -EEXIST; 374 375 dst_ptep = ptep_get(dst_pte); 376 377 /* 378 * We are allowed to overwrite a UFFD pte marker: consider when both 379 * MISSING|WP registered, we firstly wr-protect a none pte which has no 380 * page cache page backing it, then access the page. 381 */ 382 if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) 383 goto out_unlock; 384 385 if (page_in_cache) { 386 folio_add_file_rmap_pte(folio, page, dst_vma); 387 } else { 388 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); 389 folio_add_lru_vma(folio, dst_vma); 390 } 391 392 /* 393 * Must happen after rmap, as mm_counter() checks mapping (via 394 * PageAnon()), which is set by __page_set_anon_rmap(). 395 */ 396 inc_mm_counter(dst_mm, mm_counter(folio)); 397 398 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 399 400 if (page_in_cache) 401 folio_unlock(folio); 402 403 /* No need to invalidate - it was non-present before */ 404 update_mmu_cache(dst_vma, dst_addr, dst_pte); 405 ret = 0; 406 out_unlock: 407 pte_unmap_unlock(dst_pte, ptl); 408 out: 409 return ret; 410 } 411 412 static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) 413 { 414 void *kaddr; 415 int ret; 416 417 kaddr = kmap_local_folio(folio, 0); 418 /* 419 * The read mmap_lock is held here. Despite the 420 * mmap_lock being read recursive a deadlock is still 421 * possible if a writer has taken a lock. For example: 422 * 423 * process A thread 1 takes read lock on own mmap_lock 424 * process A thread 2 calls mmap, blocks taking write lock 425 * process B thread 1 takes page fault, read lock on own mmap lock 426 * process B thread 2 calls mmap, blocks taking write lock 427 * process A thread 1 blocks taking read lock on process B 428 * process B thread 1 blocks taking read lock on process A 429 * 430 * Disable page faults to prevent potential deadlock 431 * and retry the copy outside the mmap_lock. 
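	 *
	 * With page faults disabled, copy_from_user() returns the number
	 * of bytes it could not copy instead of sleeping on a fault; any
	 * non-zero return is turned into -EFAULT below and the caller
	 * falls back to mfill_copy_folio_retry(), which copies with the
	 * locks dropped.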
	 */
	pagefault_disable();
	ret = copy_from_user(kaddr, (const void __user *) src_addr,
			     PAGE_SIZE);
	pagefault_enable();
	kunmap_local(kaddr);

	if (ret)
		return -EFAULT;

	flush_dcache_folio(folio);
	return ret;
}

static int mfill_copy_folio_retry(struct mfill_state *state, struct folio *folio)
{
	unsigned long src_addr = state->src_addr;
	void *kaddr;
	int err;

	/* retry copying with mm_lock dropped */
	mfill_put_vma(state);

	kaddr = kmap_local_folio(folio, 0);
	err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE);
	kunmap_local(kaddr);
	if (unlikely(err))
		return -EFAULT;

	flush_dcache_folio(folio);

	/* reget VMA and PMD, they could change underneath us */
	err = mfill_get_vma(state);
	if (err)
		return err;

	err = mfill_establish_pmd(state);
	if (err)
		return err;

	return 0;
}

static int __mfill_atomic_pte(struct mfill_state *state,
			      const struct vm_uffd_ops *ops)
{
	unsigned long dst_addr = state->dst_addr;
	unsigned long src_addr = state->src_addr;
	uffd_flags_t flags = state->flags;
	struct folio *folio;
	int ret;

	folio = ops->alloc_folio(state->vma, state->dst_addr);
	if (!folio)
		return -ENOMEM;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
		ret = mfill_copy_folio_locked(folio, src_addr);
		/*
		 * Fall back to copy_from_user() outside mmap_lock.
		 * If the retry is successful, mfill_copy_folio_retry()
		 * returns with the locks retaken by mfill_get_vma().
		 * If there was an error, the caller must mfill_put_vma()
		 * anyway and it will take care of unlocking if needed.
		 */
		if (unlikely(ret)) {
			ret = mfill_copy_folio_retry(state, folio);
			if (ret)
				goto err_folio_put;
		}
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		clear_user_highpage(&folio->page, state->dst_addr);
	} else {
		VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	if (ops->filemap_add) {
		ret = ops->filemap_add(folio, state->vma, state->dst_addr);
		if (ret)
			goto err_folio_put;
	}

	ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr,
				       &folio->page, flags);
	if (ret)
		goto err_filemap_remove;

	return 0;

err_filemap_remove:
	if (ops->filemap_remove)
		ops->filemap_remove(folio, state->vma);
err_folio_put:
	folio_put(folio);
	return ret;
}

static int mfill_atomic_pte_copy(struct mfill_state *state)
{
	const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);

	/*
	 * The normal page fault path for a MAP_PRIVATE mapping in a
	 * file-backed VMA will invoke the fault, fill the hole in the file and
	 * COW it right away. The result generates plain anonymous memory.
	 * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll
	 * generate anonymous memory directly without actually filling the
	 * hole. For the MAP_PRIVATE case the robustness check only happens in
	 * the pagetable (to verify it's still none) and not in the page cache.
548 */ 549 if (!(state->vma->vm_flags & VM_SHARED)) 550 ops = &anon_uffd_ops; 551 552 return __mfill_atomic_pte(state, ops); 553 } 554 555 static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) 556 { 557 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 558 559 return __mfill_atomic_pte(state, ops); 560 } 561 562 static int mfill_atomic_pte_zeropage(struct mfill_state *state) 563 { 564 struct vm_area_struct *dst_vma = state->vma; 565 unsigned long dst_addr = state->dst_addr; 566 pmd_t *dst_pmd = state->pmd; 567 pte_t _dst_pte, *dst_pte; 568 spinlock_t *ptl; 569 int ret; 570 571 if (mm_forbids_zeropage(dst_vma->vm_mm) || 572 (dst_vma->vm_flags & VM_SHARED)) 573 return mfill_atomic_pte_zeroed_folio(state); 574 575 _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 576 dst_vma->vm_page_prot)); 577 ret = -EAGAIN; 578 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); 579 if (!dst_pte) 580 goto out; 581 if (mfill_file_over_size(dst_vma, dst_addr)) { 582 ret = -EFAULT; 583 goto out_unlock; 584 } 585 ret = -EEXIST; 586 if (!pte_none(ptep_get(dst_pte))) 587 goto out_unlock; 588 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); 589 /* No need to invalidate - it was non-present before */ 590 update_mmu_cache(dst_vma, dst_addr, dst_pte); 591 ret = 0; 592 out_unlock: 593 pte_unmap_unlock(dst_pte, ptl); 594 out: 595 return ret; 596 } 597 598 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ 599 static int mfill_atomic_pte_continue(struct mfill_state *state) 600 { 601 struct vm_area_struct *dst_vma = state->vma; 602 const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); 603 unsigned long dst_addr = state->dst_addr; 604 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 605 struct inode *inode = file_inode(dst_vma->vm_file); 606 uffd_flags_t flags = state->flags; 607 pmd_t *dst_pmd = state->pmd; 608 struct folio *folio; 609 struct page *page; 610 int ret; 611 612 if (!ops) { 613 VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); 614 return -EOPNOTSUPP; 615 } 616 617 folio = ops->get_folio_noalloc(inode, pgoff); 618 /* Our caller expects us to return -EFAULT if we failed to find folio */ 619 if (IS_ERR_OR_NULL(folio)) 620 return -EFAULT; 621 622 page = folio_file_page(folio, pgoff); 623 if (PageHWPoison(page)) { 624 ret = -EIO; 625 goto out_release; 626 } 627 628 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 629 page, flags); 630 if (ret) 631 goto out_release; 632 633 return 0; 634 635 out_release: 636 folio_unlock(folio); 637 folio_put(folio); 638 return ret; 639 } 640 641 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ 642 static int mfill_atomic_pte_poison(struct mfill_state *state) 643 { 644 struct vm_area_struct *dst_vma = state->vma; 645 struct mm_struct *dst_mm = dst_vma->vm_mm; 646 unsigned long dst_addr = state->dst_addr; 647 pmd_t *dst_pmd = state->pmd; 648 pte_t _dst_pte, *dst_pte; 649 spinlock_t *ptl; 650 int ret; 651 652 _dst_pte = make_pte_marker(PTE_MARKER_POISONED); 653 ret = -EAGAIN; 654 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 655 if (!dst_pte) 656 goto out; 657 658 if (mfill_file_over_size(dst_vma, dst_addr)) { 659 ret = -EFAULT; 660 goto out_unlock; 661 } 662 663 ret = -EEXIST; 664 /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). 
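	 * Unlike mfill_atomic_install_pte(), which may replace a uffd PTE
	 * marker, UFFDIO_POISON returns -EEXIST if any marker is already
	 * present.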
*/ 665 if (!pte_none(ptep_get(dst_pte))) 666 goto out_unlock; 667 668 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 669 670 /* No need to invalidate - it was non-present before */ 671 update_mmu_cache(dst_vma, dst_addr, dst_pte); 672 ret = 0; 673 out_unlock: 674 pte_unmap_unlock(dst_pte, ptl); 675 out: 676 return ret; 677 } 678 679 #ifdef CONFIG_HUGETLB_PAGE 680 /* 681 * mfill_atomic processing for HUGETLB vmas. Note that this routine is 682 * called with either vma-lock or mmap_lock held, it will release the lock 683 * before returning. 684 */ 685 static __always_inline ssize_t mfill_atomic_hugetlb( 686 struct userfaultfd_ctx *ctx, 687 struct vm_area_struct *dst_vma, 688 unsigned long dst_start, 689 unsigned long src_start, 690 unsigned long len, 691 uffd_flags_t flags) 692 { 693 struct mm_struct *dst_mm = dst_vma->vm_mm; 694 ssize_t err; 695 pte_t *dst_pte; 696 unsigned long src_addr, dst_addr; 697 long copied; 698 struct folio *folio; 699 unsigned long vma_hpagesize; 700 pgoff_t idx; 701 u32 hash; 702 struct address_space *mapping; 703 704 /* 705 * There is no default zero huge page for all huge page sizes as 706 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 707 * by THP. Since we can not reliably insert a zero page, this 708 * feature is not supported. 709 */ 710 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { 711 up_read(&ctx->map_changing_lock); 712 uffd_mfill_unlock(dst_vma); 713 return -EINVAL; 714 } 715 716 src_addr = src_start; 717 dst_addr = dst_start; 718 copied = 0; 719 folio = NULL; 720 vma_hpagesize = vma_kernel_pagesize(dst_vma); 721 722 /* 723 * Validate alignment based on huge page size 724 */ 725 err = -EINVAL; 726 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 727 goto out_unlock; 728 729 retry: 730 /* 731 * On routine entry dst_vma is set. If we had to drop mmap_lock and 732 * retry, dst_vma will be set to NULL and we must lookup again. 733 */ 734 if (!dst_vma) { 735 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); 736 if (IS_ERR(dst_vma)) { 737 err = PTR_ERR(dst_vma); 738 goto out; 739 } 740 741 err = -ENOENT; 742 if (!is_vm_hugetlb_page(dst_vma)) 743 goto out_unlock_vma; 744 745 err = -EINVAL; 746 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 747 goto out_unlock_vma; 748 749 /* 750 * If memory mappings are changing because of non-cooperative 751 * operation (e.g. mremap) running in parallel, bail out and 752 * request the user to retry later 753 */ 754 down_read(&ctx->map_changing_lock); 755 err = -EAGAIN; 756 if (atomic_read(&ctx->mmap_changing)) 757 goto out_unlock; 758 } 759 760 while (src_addr < src_start + len) { 761 VM_WARN_ON_ONCE(dst_addr >= dst_start + len); 762 763 /* 764 * Serialize via vma_lock and hugetlb_fault_mutex. 765 * vma_lock ensures the dst_pte remains valid even 766 * in the case of shared pmds. fault mutex prevents 767 * races with other faulting threads. 
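		 * The order below is the fault mutex first, then the
		 * hugetlb vma read lock; both are dropped again before the
		 * copy_folio_from_user() fallback runs without locks held.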
768 */ 769 idx = hugetlb_linear_page_index(dst_vma, dst_addr); 770 mapping = dst_vma->vm_file->f_mapping; 771 hash = hugetlb_fault_mutex_hash(mapping, idx); 772 mutex_lock(&hugetlb_fault_mutex_table[hash]); 773 hugetlb_vma_lock_read(dst_vma); 774 775 err = -ENOMEM; 776 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); 777 if (!dst_pte) { 778 hugetlb_vma_unlock_read(dst_vma); 779 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 780 goto out_unlock; 781 } 782 783 if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { 784 const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); 785 786 if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { 787 err = -EEXIST; 788 hugetlb_vma_unlock_read(dst_vma); 789 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 790 goto out_unlock; 791 } 792 } 793 794 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, 795 src_addr, flags, &folio); 796 797 hugetlb_vma_unlock_read(dst_vma); 798 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 799 800 cond_resched(); 801 802 if (unlikely(err == -ENOENT)) { 803 up_read(&ctx->map_changing_lock); 804 uffd_mfill_unlock(dst_vma); 805 VM_WARN_ON_ONCE(!folio); 806 807 err = copy_folio_from_user(folio, 808 (const void __user *)src_addr, true); 809 if (unlikely(err)) { 810 err = -EFAULT; 811 goto out; 812 } 813 814 dst_vma = NULL; 815 goto retry; 816 } else 817 VM_WARN_ON_ONCE(folio); 818 819 if (!err) { 820 dst_addr += vma_hpagesize; 821 src_addr += vma_hpagesize; 822 copied += vma_hpagesize; 823 824 if (fatal_signal_pending(current)) 825 err = -EINTR; 826 } 827 if (err) 828 break; 829 } 830 831 out_unlock: 832 up_read(&ctx->map_changing_lock); 833 out_unlock_vma: 834 uffd_mfill_unlock(dst_vma); 835 out: 836 if (folio) 837 folio_put(folio); 838 VM_WARN_ON_ONCE(copied < 0); 839 VM_WARN_ON_ONCE(err > 0); 840 VM_WARN_ON_ONCE(!copied && !err); 841 return copied ? copied : err; 842 } 843 #else /* !CONFIG_HUGETLB_PAGE */ 844 /* fail at build time if gcc attempts to use this */ 845 extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, 846 struct vm_area_struct *dst_vma, 847 unsigned long dst_start, 848 unsigned long src_start, 849 unsigned long len, 850 uffd_flags_t flags); 851 #endif /* CONFIG_HUGETLB_PAGE */ 852 853 static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) 854 { 855 uffd_flags_t flags = state->flags; 856 857 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) 858 return mfill_atomic_pte_continue(state); 859 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) 860 return mfill_atomic_pte_poison(state); 861 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) 862 return mfill_atomic_pte_copy(state); 863 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) 864 return mfill_atomic_pte_zeropage(state); 865 866 VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); 867 return -EOPNOTSUPP; 868 } 869 870 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, 871 unsigned long dst_start, 872 unsigned long src_start, 873 unsigned long len, 874 uffd_flags_t flags) 875 { 876 struct mfill_state state = (struct mfill_state){ 877 .ctx = ctx, 878 .dst_start = dst_start, 879 .src_start = src_start, 880 .flags = flags, 881 .len = len, 882 .src_addr = src_start, 883 .dst_addr = dst_start, 884 }; 885 long copied = 0; 886 ssize_t err; 887 888 /* 889 * Sanitize the command parameters: 890 */ 891 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); 892 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 893 894 /* Does the address range wrap, or is the span zero-sized? 
*/ 895 VM_WARN_ON_ONCE(src_start + len <= src_start); 896 VM_WARN_ON_ONCE(dst_start + len <= dst_start); 897 898 err = mfill_get_vma(&state); 899 if (err) 900 goto out; 901 902 /* 903 * If this is a HUGETLB vma, pass off to appropriate routine 904 */ 905 if (is_vm_hugetlb_page(state.vma)) 906 return mfill_atomic_hugetlb(ctx, state.vma, dst_start, 907 src_start, len, flags); 908 909 while (state.src_addr < src_start + len) { 910 VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len); 911 912 err = mfill_establish_pmd(&state); 913 if (err) 914 break; 915 916 /* 917 * For shmem mappings, khugepaged is allowed to remove page 918 * tables under us; pte_offset_map_lock() will deal with that. 919 */ 920 921 err = mfill_atomic_pte(&state); 922 cond_resched(); 923 924 if (!err) { 925 state.dst_addr += PAGE_SIZE; 926 state.src_addr += PAGE_SIZE; 927 copied += PAGE_SIZE; 928 929 if (fatal_signal_pending(current)) 930 err = -EINTR; 931 } 932 if (err) 933 break; 934 } 935 936 mfill_put_vma(&state); 937 out: 938 VM_WARN_ON_ONCE(copied < 0); 939 VM_WARN_ON_ONCE(err > 0); 940 VM_WARN_ON_ONCE(!copied && !err); 941 return copied ? copied : err; 942 } 943 944 ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, 945 unsigned long src_start, unsigned long len, 946 uffd_flags_t flags) 947 { 948 return mfill_atomic(ctx, dst_start, src_start, len, 949 uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); 950 } 951 952 ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, 953 unsigned long start, 954 unsigned long len) 955 { 956 return mfill_atomic(ctx, start, 0, len, 957 uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); 958 } 959 960 ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, 961 unsigned long len, uffd_flags_t flags) 962 { 963 964 /* 965 * A caller might reasonably assume that UFFDIO_CONTINUE contains an 966 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by 967 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to 968 * subsequent loads from the page through the newly mapped address range. 969 */ 970 smp_wmb(); 971 972 return mfill_atomic(ctx, start, 0, len, 973 uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); 974 } 975 976 ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, 977 unsigned long len, uffd_flags_t flags) 978 { 979 return mfill_atomic(ctx, start, 0, len, 980 uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); 981 } 982 983 long uffd_wp_range(struct vm_area_struct *dst_vma, 984 unsigned long start, unsigned long len, bool enable_wp) 985 { 986 unsigned int mm_cp_flags; 987 struct mmu_gather tlb; 988 long ret; 989 990 VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, 991 "The address range exceeds VMA boundary.\n"); 992 if (enable_wp) 993 mm_cp_flags = MM_CP_UFFD_WP; 994 else 995 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; 996 997 /* 998 * vma->vm_page_prot already reflects that uffd-wp is enabled for this 999 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed 1000 * to be write-protected as default whenever protection changes. 1001 * Try upgrading write permissions manually. 
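	 *
	 * MM_CP_TRY_CHANGE_WRITABLE asks change_protection() to map PTEs
	 * writable again where it can prove that is safe, instead of
	 * leaving them read-only until the next write fault.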
1002 */ 1003 if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) 1004 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; 1005 tlb_gather_mmu(&tlb, dst_vma->vm_mm); 1006 ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); 1007 tlb_finish_mmu(&tlb); 1008 1009 return ret; 1010 } 1011 1012 int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, 1013 unsigned long len, bool enable_wp) 1014 { 1015 struct mm_struct *dst_mm = ctx->mm; 1016 unsigned long end = start + len; 1017 unsigned long _start, _end; 1018 struct vm_area_struct *dst_vma; 1019 unsigned long page_mask; 1020 long err; 1021 VMA_ITERATOR(vmi, dst_mm, start); 1022 1023 /* 1024 * Sanitize the command parameters: 1025 */ 1026 VM_WARN_ON_ONCE(start & ~PAGE_MASK); 1027 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 1028 1029 /* Does the address range wrap, or is the span zero-sized? */ 1030 VM_WARN_ON_ONCE(start + len <= start); 1031 1032 mmap_read_lock(dst_mm); 1033 1034 /* 1035 * If memory mappings are changing because of non-cooperative 1036 * operation (e.g. mremap) running in parallel, bail out and 1037 * request the user to retry later 1038 */ 1039 down_read(&ctx->map_changing_lock); 1040 err = -EAGAIN; 1041 if (atomic_read(&ctx->mmap_changing)) 1042 goto out_unlock; 1043 1044 err = -ENOENT; 1045 for_each_vma_range(vmi, dst_vma, end) { 1046 1047 if (!userfaultfd_wp(dst_vma)) { 1048 err = -ENOENT; 1049 break; 1050 } 1051 1052 if (is_vm_hugetlb_page(dst_vma)) { 1053 err = -EINVAL; 1054 page_mask = vma_kernel_pagesize(dst_vma) - 1; 1055 if ((start & page_mask) || (len & page_mask)) 1056 break; 1057 } 1058 1059 _start = max(dst_vma->vm_start, start); 1060 _end = min(dst_vma->vm_end, end); 1061 1062 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); 1063 1064 /* Return 0 on success, <0 on failures */ 1065 if (err < 0) 1066 break; 1067 err = 0; 1068 } 1069 out_unlock: 1070 up_read(&ctx->map_changing_lock); 1071 mmap_read_unlock(dst_mm); 1072 return err; 1073 } 1074 1075 1076 void double_pt_lock(spinlock_t *ptl1, 1077 spinlock_t *ptl2) 1078 __acquires(ptl1) 1079 __acquires(ptl2) 1080 { 1081 if (ptl1 > ptl2) 1082 swap(ptl1, ptl2); 1083 /* lock in virtual address order to avoid lock inversion */ 1084 spin_lock(ptl1); 1085 if (ptl1 != ptl2) 1086 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); 1087 else 1088 __acquire(ptl2); 1089 } 1090 1091 void double_pt_unlock(spinlock_t *ptl1, 1092 spinlock_t *ptl2) 1093 __releases(ptl1) 1094 __releases(ptl2) 1095 { 1096 spin_unlock(ptl1); 1097 if (ptl1 != ptl2) 1098 spin_unlock(ptl2); 1099 else 1100 __release(ptl2); 1101 } 1102 1103 static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, 1104 pte_t orig_dst_pte, pte_t orig_src_pte, 1105 pmd_t *dst_pmd, pmd_t dst_pmdval) 1106 { 1107 return pte_same(ptep_get(src_pte), orig_src_pte) && 1108 pte_same(ptep_get(dst_pte), orig_dst_pte) && 1109 pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); 1110 } 1111 1112 /* 1113 * Checks if the two ptes and the corresponding folio are eligible for batched 1114 * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. 1115 * 1116 * NOTE: folio's reference is not required as the whole operation is within 1117 * PTL's critical section. 
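 *
 * The folio is returned trylocked; the caller (move_present_ptes()) is
 * responsible for unlocking it once the batch stops.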
1118 */ 1119 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, 1120 unsigned long src_addr, 1121 pte_t *src_pte, pte_t *dst_pte) 1122 { 1123 pte_t orig_dst_pte, orig_src_pte; 1124 struct folio *folio; 1125 1126 orig_dst_pte = ptep_get(dst_pte); 1127 if (!pte_none(orig_dst_pte)) 1128 return NULL; 1129 1130 orig_src_pte = ptep_get(src_pte); 1131 if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) 1132 return NULL; 1133 1134 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1135 if (!folio || !folio_trylock(folio)) 1136 return NULL; 1137 if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { 1138 folio_unlock(folio); 1139 return NULL; 1140 } 1141 return folio; 1142 } 1143 1144 /* 1145 * Moves src folios to dst in a batch as long as they are not large, and can 1146 * successfully take the lock via folio_trylock(). 1147 */ 1148 static long move_present_ptes(struct mm_struct *mm, 1149 struct vm_area_struct *dst_vma, 1150 struct vm_area_struct *src_vma, 1151 unsigned long dst_addr, unsigned long src_addr, 1152 pte_t *dst_pte, pte_t *src_pte, 1153 pte_t orig_dst_pte, pte_t orig_src_pte, 1154 pmd_t *dst_pmd, pmd_t dst_pmdval, 1155 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1156 struct folio **first_src_folio, unsigned long len) 1157 { 1158 int err = 0; 1159 struct folio *src_folio = *first_src_folio; 1160 unsigned long src_start = src_addr; 1161 unsigned long src_end; 1162 1163 len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; 1164 src_end = pmd_addr_end(src_addr, src_addr + len); 1165 flush_cache_range(src_vma, src_addr, src_end); 1166 double_pt_lock(dst_ptl, src_ptl); 1167 1168 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1169 dst_pmd, dst_pmdval)) { 1170 err = -EAGAIN; 1171 goto out; 1172 } 1173 if (folio_test_large(src_folio) || 1174 folio_maybe_dma_pinned(src_folio) || 1175 !PageAnonExclusive(&src_folio->page)) { 1176 err = -EBUSY; 1177 goto out; 1178 } 1179 /* It's safe to drop the reference now as the page-table is holding one. */ 1180 folio_put(*first_src_folio); 1181 *first_src_folio = NULL; 1182 lazy_mmu_mode_enable(); 1183 1184 while (true) { 1185 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1186 /* Folio got pinned from under us. Put it back and fail the move. 
*/ 1187 if (folio_maybe_dma_pinned(src_folio)) { 1188 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1189 err = -EBUSY; 1190 break; 1191 } 1192 1193 folio_move_anon_rmap(src_folio, dst_vma); 1194 src_folio->index = linear_page_index(dst_vma, dst_addr); 1195 1196 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); 1197 /* Set soft dirty bit so userspace can notice the pte was moved */ 1198 if (pgtable_supports_soft_dirty()) 1199 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); 1200 if (pte_dirty(orig_src_pte)) 1201 orig_dst_pte = pte_mkdirty(orig_dst_pte); 1202 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); 1203 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1204 1205 src_addr += PAGE_SIZE; 1206 if (src_addr == src_end) 1207 break; 1208 dst_addr += PAGE_SIZE; 1209 dst_pte++; 1210 src_pte++; 1211 1212 folio_unlock(src_folio); 1213 src_folio = check_ptes_for_batched_move(src_vma, src_addr, 1214 src_pte, dst_pte); 1215 if (!src_folio) 1216 break; 1217 } 1218 1219 lazy_mmu_mode_disable(); 1220 if (src_addr > src_start) 1221 flush_tlb_range(src_vma, src_start, src_addr); 1222 1223 if (src_folio) 1224 folio_unlock(src_folio); 1225 out: 1226 double_pt_unlock(dst_ptl, src_ptl); 1227 return src_addr > src_start ? src_addr - src_start : err; 1228 } 1229 1230 static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, 1231 unsigned long dst_addr, unsigned long src_addr, 1232 pte_t *dst_pte, pte_t *src_pte, 1233 pte_t orig_dst_pte, pte_t orig_src_pte, 1234 pmd_t *dst_pmd, pmd_t dst_pmdval, 1235 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1236 struct folio *src_folio, 1237 struct swap_info_struct *si, swp_entry_t entry) 1238 { 1239 /* 1240 * Check if the folio still belongs to the target swap entry after 1241 * acquiring the lock. Folio can be freed in the swap cache while 1242 * not locked. 1243 */ 1244 if (src_folio && unlikely(!folio_test_swapcache(src_folio) || 1245 entry.val != src_folio->swap.val)) 1246 return -EAGAIN; 1247 1248 double_pt_lock(dst_ptl, src_ptl); 1249 1250 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1251 dst_pmd, dst_pmdval)) { 1252 double_pt_unlock(dst_ptl, src_ptl); 1253 return -EAGAIN; 1254 } 1255 1256 /* 1257 * The src_folio resides in the swapcache, requiring an update to its 1258 * index and mapping to align with the dst_vma, where a swap-in may 1259 * occur and hit the swapcache after moving the PTE. 1260 */ 1261 if (src_folio) { 1262 folio_move_anon_rmap(src_folio, dst_vma); 1263 src_folio->index = linear_page_index(dst_vma, dst_addr); 1264 } else { 1265 /* 1266 * Check if the swap entry is cached after acquiring the src_pte 1267 * lock. Otherwise, we might miss a newly loaded swap cache folio. 1268 * 1269 * We are trying to catch newly added swap cache, the only possible case is 1270 * when a folio is swapped in and out again staying in swap cache, using the 1271 * same entry before the PTE check above. The PTL is acquired and released 1272 * twice, each time after updating the swap table. So holding 1273 * the PTL here ensures we see the updated value. 
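		 *
		 * If nothing is cached, the swap entry is simply transferred
		 * below: the source PTE is cleared and the same entry is
		 * written at the destination, so no swap I/O is needed.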
1274 */ 1275 if (swap_cache_has_folio(entry)) { 1276 double_pt_unlock(dst_ptl, src_ptl); 1277 return -EAGAIN; 1278 } 1279 } 1280 1281 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1282 if (pgtable_supports_soft_dirty()) 1283 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); 1284 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1285 double_pt_unlock(dst_ptl, src_ptl); 1286 1287 return PAGE_SIZE; 1288 } 1289 1290 static int move_zeropage_pte(struct mm_struct *mm, 1291 struct vm_area_struct *dst_vma, 1292 struct vm_area_struct *src_vma, 1293 unsigned long dst_addr, unsigned long src_addr, 1294 pte_t *dst_pte, pte_t *src_pte, 1295 pte_t orig_dst_pte, pte_t orig_src_pte, 1296 pmd_t *dst_pmd, pmd_t dst_pmdval, 1297 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1298 { 1299 pte_t zero_pte; 1300 1301 double_pt_lock(dst_ptl, src_ptl); 1302 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1303 dst_pmd, dst_pmdval)) { 1304 double_pt_unlock(dst_ptl, src_ptl); 1305 return -EAGAIN; 1306 } 1307 1308 zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 1309 dst_vma->vm_page_prot)); 1310 ptep_clear_flush(src_vma, src_addr, src_pte); 1311 set_pte_at(mm, dst_addr, dst_pte, zero_pte); 1312 double_pt_unlock(dst_ptl, src_ptl); 1313 1314 return PAGE_SIZE; 1315 } 1316 1317 1318 /* 1319 * The mmap_lock for reading is held by the caller. Just move the page(s) 1320 * from src_pmd to dst_pmd if possible, and return number of bytes moved. 1321 * On failure, an error code is returned. 1322 */ 1323 static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, 1324 struct vm_area_struct *dst_vma, 1325 struct vm_area_struct *src_vma, 1326 unsigned long dst_addr, unsigned long src_addr, 1327 unsigned long len, __u64 mode) 1328 { 1329 struct swap_info_struct *si = NULL; 1330 pte_t orig_src_pte, orig_dst_pte; 1331 pte_t src_folio_pte; 1332 spinlock_t *src_ptl, *dst_ptl; 1333 pte_t *src_pte = NULL; 1334 pte_t *dst_pte = NULL; 1335 pmd_t dummy_pmdval; 1336 pmd_t dst_pmdval; 1337 struct folio *src_folio = NULL; 1338 struct mmu_notifier_range range; 1339 long ret = 0; 1340 1341 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1342 src_addr, src_addr + len); 1343 mmu_notifier_invalidate_range_start(&range); 1344 retry: 1345 /* 1346 * Use the maywrite version to indicate that dst_pte will be modified, 1347 * since dst_pte needs to be none, the subsequent pte_same() check 1348 * cannot prevent the dst_pte page from being freed concurrently, so we 1349 * also need to obtain dst_pmdval and recheck pmd_same() later. 1350 */ 1351 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, 1352 &dst_ptl); 1353 1354 /* Retry if a huge pmd materialized from under us */ 1355 if (unlikely(!dst_pte)) { 1356 ret = -EAGAIN; 1357 goto out; 1358 } 1359 1360 /* 1361 * Unlike dst_pte, the subsequent pte_same() check can ensure the 1362 * stability of the src_pte page, so there is no need to get pmdval, 1363 * just pass a dummy variable to it. 1364 */ 1365 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, 1366 &src_ptl); 1367 1368 /* 1369 * We held the mmap_lock for reading so MADV_DONTNEED 1370 * can zap transparent huge pages under us, or the 1371 * transparent huge page fault can establish new 1372 * transparent huge pages under us. 
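	 *
	 * If the page table we expected has been removed or replaced,
	 * treat it as a transient condition and return -EAGAIN so that
	 * the caller retries.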
1373 */ 1374 if (unlikely(!src_pte)) { 1375 ret = -EAGAIN; 1376 goto out; 1377 } 1378 1379 /* Sanity checks before the operation */ 1380 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || 1381 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { 1382 ret = -EINVAL; 1383 goto out; 1384 } 1385 1386 spin_lock(dst_ptl); 1387 orig_dst_pte = ptep_get(dst_pte); 1388 spin_unlock(dst_ptl); 1389 if (!pte_none(orig_dst_pte)) { 1390 ret = -EEXIST; 1391 goto out; 1392 } 1393 1394 spin_lock(src_ptl); 1395 orig_src_pte = ptep_get(src_pte); 1396 spin_unlock(src_ptl); 1397 if (pte_none(orig_src_pte)) { 1398 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) 1399 ret = -ENOENT; 1400 else /* nothing to do to move a hole */ 1401 ret = PAGE_SIZE; 1402 goto out; 1403 } 1404 1405 /* If PTE changed after we locked the folio then start over */ 1406 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { 1407 ret = -EAGAIN; 1408 goto out; 1409 } 1410 1411 if (pte_present(orig_src_pte)) { 1412 if (is_zero_pfn(pte_pfn(orig_src_pte))) { 1413 ret = move_zeropage_pte(mm, dst_vma, src_vma, 1414 dst_addr, src_addr, dst_pte, src_pte, 1415 orig_dst_pte, orig_src_pte, 1416 dst_pmd, dst_pmdval, dst_ptl, src_ptl); 1417 goto out; 1418 } 1419 1420 /* 1421 * Pin and lock source folio. Since we are in RCU read section, 1422 * we can't block, so on contention have to unmap the ptes, 1423 * obtain the lock and retry. 1424 */ 1425 if (!src_folio) { 1426 struct folio *folio; 1427 bool locked; 1428 1429 /* 1430 * Pin the page while holding the lock to be sure the 1431 * page isn't freed under us 1432 */ 1433 spin_lock(src_ptl); 1434 if (!pte_same(orig_src_pte, ptep_get(src_pte))) { 1435 spin_unlock(src_ptl); 1436 ret = -EAGAIN; 1437 goto out; 1438 } 1439 1440 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1441 if (!folio || !PageAnonExclusive(&folio->page)) { 1442 spin_unlock(src_ptl); 1443 ret = -EBUSY; 1444 goto out; 1445 } 1446 1447 locked = folio_trylock(folio); 1448 /* 1449 * We avoid waiting for folio lock with a raised 1450 * refcount for large folios because extra refcounts 1451 * will result in split_folio() failing later and 1452 * retrying. If multiple tasks are trying to move a 1453 * large folio we can end up livelocking. 
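			 * So for a contended large folio, back off with
			 * -EAGAIN instead of taking a reference and sleeping
			 * on the lock.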
1454 */ 1455 if (!locked && folio_test_large(folio)) { 1456 spin_unlock(src_ptl); 1457 ret = -EAGAIN; 1458 goto out; 1459 } 1460 1461 folio_get(folio); 1462 src_folio = folio; 1463 src_folio_pte = orig_src_pte; 1464 spin_unlock(src_ptl); 1465 1466 if (!locked) { 1467 pte_unmap(src_pte); 1468 pte_unmap(dst_pte); 1469 src_pte = dst_pte = NULL; 1470 /* now we can block and wait */ 1471 folio_lock(src_folio); 1472 goto retry; 1473 } 1474 1475 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1476 ret = -EBUSY; 1477 goto out; 1478 } 1479 } 1480 1481 /* at this point we have src_folio locked */ 1482 if (folio_test_large(src_folio)) { 1483 /* split_folio() can block */ 1484 pte_unmap(src_pte); 1485 pte_unmap(dst_pte); 1486 src_pte = dst_pte = NULL; 1487 ret = split_folio(src_folio); 1488 if (ret) 1489 goto out; 1490 /* have to reacquire the folio after it got split */ 1491 folio_unlock(src_folio); 1492 folio_put(src_folio); 1493 src_folio = NULL; 1494 goto retry; 1495 } 1496 1497 ret = move_present_ptes(mm, dst_vma, src_vma, 1498 dst_addr, src_addr, dst_pte, src_pte, 1499 orig_dst_pte, orig_src_pte, dst_pmd, 1500 dst_pmdval, dst_ptl, src_ptl, &src_folio, 1501 len); 1502 } else { /* !pte_present() */ 1503 struct folio *folio = NULL; 1504 const softleaf_t entry = softleaf_from_pte(orig_src_pte); 1505 1506 if (softleaf_is_migration(entry)) { 1507 pte_unmap(src_pte); 1508 pte_unmap(dst_pte); 1509 src_pte = dst_pte = NULL; 1510 migration_entry_wait(mm, src_pmd, src_addr); 1511 1512 ret = -EAGAIN; 1513 goto out; 1514 } else if (!softleaf_is_swap(entry)) { 1515 ret = -EFAULT; 1516 goto out; 1517 } 1518 1519 if (!pte_swp_exclusive(orig_src_pte)) { 1520 ret = -EBUSY; 1521 goto out; 1522 } 1523 1524 si = get_swap_device(entry); 1525 if (unlikely(!si)) { 1526 ret = -EAGAIN; 1527 goto out; 1528 } 1529 /* 1530 * Verify the existence of the swapcache. If present, the folio's 1531 * index and mapping must be updated even when the PTE is a swap 1532 * entry. The anon_vma lock is not taken during this process since 1533 * the folio has already been unmapped, and the swap entry is 1534 * exclusive, preventing rmap walks. 1535 * 1536 * For large folios, return -EBUSY immediately, as split_folio() 1537 * also returns -EBUSY when attempting to split unmapped large 1538 * folios in the swapcache. This issue needs to be resolved 1539 * separately to allow proper handling. 1540 */ 1541 if (!src_folio) 1542 folio = swap_cache_get_folio(entry); 1543 if (folio) { 1544 if (folio_test_large(folio)) { 1545 ret = -EBUSY; 1546 folio_put(folio); 1547 goto out; 1548 } 1549 src_folio = folio; 1550 src_folio_pte = orig_src_pte; 1551 if (!folio_trylock(src_folio)) { 1552 pte_unmap(src_pte); 1553 pte_unmap(dst_pte); 1554 src_pte = dst_pte = NULL; 1555 put_swap_device(si); 1556 si = NULL; 1557 /* now we can block and wait */ 1558 folio_lock(src_folio); 1559 goto retry; 1560 } 1561 } 1562 ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, 1563 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, 1564 dst_ptl, src_ptl, src_folio, si, entry); 1565 } 1566 1567 out: 1568 if (src_folio) { 1569 folio_unlock(src_folio); 1570 folio_put(src_folio); 1571 } 1572 /* 1573 * Unmap in reverse order (LIFO) to maintain proper kmap_local 1574 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte 1575 * first, then src_pte, so we must unmap src_pte first, then dst_pte. 
1576 */ 1577 if (src_pte) 1578 pte_unmap(src_pte); 1579 if (dst_pte) 1580 pte_unmap(dst_pte); 1581 mmu_notifier_invalidate_range_end(&range); 1582 if (si) 1583 put_swap_device(si); 1584 1585 return ret; 1586 } 1587 1588 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1589 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1590 unsigned long src_addr, 1591 unsigned long src_end) 1592 { 1593 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || 1594 src_end - src_addr < HPAGE_PMD_SIZE; 1595 } 1596 #else 1597 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1598 unsigned long src_addr, 1599 unsigned long src_end) 1600 { 1601 /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ 1602 return false; 1603 } 1604 #endif 1605 1606 static inline bool vma_move_compatible(struct vm_area_struct *vma) 1607 { 1608 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | 1609 VM_MIXEDMAP | VM_SHADOW_STACK)); 1610 } 1611 1612 static int validate_move_areas(struct userfaultfd_ctx *ctx, 1613 struct vm_area_struct *src_vma, 1614 struct vm_area_struct *dst_vma) 1615 { 1616 /* Only allow moving if both have the same access and protection */ 1617 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || 1618 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) 1619 return -EINVAL; 1620 1621 /* Only allow moving if both are mlocked or both aren't */ 1622 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) 1623 return -EINVAL; 1624 1625 /* 1626 * For now, we keep it simple and only move between writable VMAs. 1627 * Access flags are equal, therefore checking only the source is enough. 1628 */ 1629 if (!(src_vma->vm_flags & VM_WRITE)) 1630 return -EINVAL; 1631 1632 /* Check if vma flags indicate content which can be moved */ 1633 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1634 return -EINVAL; 1635 1636 /* Ensure dst_vma is registered in uffd we are operating on */ 1637 if (!dst_vma->vm_userfaultfd_ctx.ctx || 1638 dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1639 return -EINVAL; 1640 1641 /* Only allow moving across anonymous vmas */ 1642 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1643 return -EINVAL; 1644 1645 return 0; 1646 } 1647 1648 static __always_inline 1649 int find_vmas_mm_locked(struct mm_struct *mm, 1650 unsigned long dst_start, 1651 unsigned long src_start, 1652 struct vm_area_struct **dst_vmap, 1653 struct vm_area_struct **src_vmap) 1654 { 1655 struct vm_area_struct *vma; 1656 1657 mmap_assert_locked(mm); 1658 vma = find_vma_and_prepare_anon(mm, dst_start); 1659 if (IS_ERR(vma)) 1660 return PTR_ERR(vma); 1661 1662 *dst_vmap = vma; 1663 /* Skip finding src_vma if src_start is in dst_vma */ 1664 if (src_start >= vma->vm_start && src_start < vma->vm_end) 1665 goto out_success; 1666 1667 vma = vma_lookup(mm, src_start); 1668 if (!vma) 1669 return -ENOENT; 1670 out_success: 1671 *src_vmap = vma; 1672 return 0; 1673 } 1674 1675 #ifdef CONFIG_PER_VMA_LOCK 1676 static int uffd_move_lock(struct mm_struct *mm, 1677 unsigned long dst_start, 1678 unsigned long src_start, 1679 struct vm_area_struct **dst_vmap, 1680 struct vm_area_struct **src_vmap) 1681 { 1682 struct vm_area_struct *vma; 1683 int err; 1684 1685 vma = uffd_lock_vma(mm, dst_start); 1686 if (IS_ERR(vma)) 1687 return PTR_ERR(vma); 1688 1689 *dst_vmap = vma; 1690 /* 1691 * Skip finding src_vma if src_start is in dst_vma. This also ensures 1692 * that we don't lock the same vma twice. 
1693 */ 1694 if (src_start >= vma->vm_start && src_start < vma->vm_end) { 1695 *src_vmap = vma; 1696 return 0; 1697 } 1698 1699 /* 1700 * Using uffd_lock_vma() to get src_vma can lead to following deadlock: 1701 * 1702 * Thread1 Thread2 1703 * ------- ------- 1704 * vma_start_read(dst_vma) 1705 * mmap_write_lock(mm) 1706 * vma_start_write(src_vma) 1707 * vma_start_read(src_vma) 1708 * mmap_read_lock(mm) 1709 * vma_start_write(dst_vma) 1710 */ 1711 *src_vmap = lock_vma_under_rcu(mm, src_start); 1712 if (likely(*src_vmap)) 1713 return 0; 1714 1715 /* Undo any locking and retry in mmap_lock critical section */ 1716 vma_end_read(*dst_vmap); 1717 1718 mmap_read_lock(mm); 1719 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1720 if (err) 1721 goto out; 1722 1723 if (!vma_start_read_locked(*dst_vmap)) { 1724 err = -EAGAIN; 1725 goto out; 1726 } 1727 1728 /* Nothing further to do if both vmas are locked. */ 1729 if (*dst_vmap == *src_vmap) 1730 goto out; 1731 1732 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { 1733 /* Undo dst_vmap locking if src_vmap failed to lock */ 1734 vma_end_read(*dst_vmap); 1735 err = -EAGAIN; 1736 } 1737 out: 1738 mmap_read_unlock(mm); 1739 return err; 1740 } 1741 1742 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1743 struct vm_area_struct *src_vma) 1744 { 1745 vma_end_read(src_vma); 1746 if (src_vma != dst_vma) 1747 vma_end_read(dst_vma); 1748 } 1749 1750 #else 1751 1752 static int uffd_move_lock(struct mm_struct *mm, 1753 unsigned long dst_start, 1754 unsigned long src_start, 1755 struct vm_area_struct **dst_vmap, 1756 struct vm_area_struct **src_vmap) 1757 { 1758 int err; 1759 1760 mmap_read_lock(mm); 1761 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1762 if (err) 1763 mmap_read_unlock(mm); 1764 return err; 1765 } 1766 1767 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1768 struct vm_area_struct *src_vma) 1769 { 1770 mmap_assert_locked(src_vma->vm_mm); 1771 mmap_read_unlock(dst_vma->vm_mm); 1772 } 1773 #endif 1774 1775 /** 1776 * move_pages - move arbitrary anonymous pages of an existing vma 1777 * @ctx: pointer to the userfaultfd context 1778 * @dst_start: start of the destination virtual memory range 1779 * @src_start: start of the source virtual memory range 1780 * @len: length of the virtual memory range 1781 * @mode: flags from uffdio_move.mode 1782 * 1783 * It will either use the mmap_lock in read mode or per-vma locks 1784 * 1785 * move_pages() remaps arbitrary anonymous pages atomically in zero 1786 * copy. It only works on non shared anonymous pages because those can 1787 * be relocated without generating non linear anon_vmas in the rmap 1788 * code. 1789 * 1790 * It provides a zero copy mechanism to handle userspace page faults. 1791 * The source vma pages should have mapcount == 1, which can be 1792 * enforced by using madvise(MADV_DONTFORK) on src vma. 1793 * 1794 * The thread receiving the page during the userland page fault 1795 * will receive the faulting page in the source vma through the network, 1796 * storage or any other I/O device (MADV_DONTFORK in the source vma 1797 * avoids move_pages() to fail with -EBUSY if the process forks before 1798 * move_pages() is called), then it will call move_pages() to map the 1799 * page in the faulting address in the destination vma. 
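 *
 * As a rough userspace sketch (not part of this file; uffd, fault_addr,
 * staging_addr and page_size are illustrative), the thread resolving
 * the fault would then issue:
 *
 *	struct uffdio_move mv = {
 *		.dst = fault_addr & ~(page_size - 1),
 *		.src = staging_addr,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_MOVE, &mv);
 *
 * On return, mv.move holds the number of bytes moved or a negative
 * error, and the ioctl() itself fails with EAGAIN on a short move.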
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical between the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will be "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage-aligned virtual
 * regions without knowing whether they contain transparent hugepages
 * or not, while avoiding the risk of having to split the hugepmd
 * during the remap.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr, src_end;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len < src_start);
	VM_WARN_ON_ONCE(dst_start + len < dst_start);

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
	     src_addr < src_end;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous areas would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check,
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_ptes() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				/* Can be a migration entry */
				if (pmd_present(*src_pmd)) {
					struct folio *folio = pmd_folio(*src_pmd);

					if (!is_huge_zero_folio(folio) &&
					    !PageAnonExclusive(&folio->page)) {
						spin_unlock(ptl);
						err = -EBUSY;
						break;
					}
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_ptes() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			long ret;

			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			ret = move_pages_ptes(mm, dst_pmd, src_pmd,
					      dst_vma, src_vma, dst_addr,
					      src_addr, src_end - src_addr, mode);
			if (ret < 0) {
				err = ret;
			} else {
				/* Clear any error left over from a retried iteration */
				err = 0;
				step_size = ret;
			}
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON_ONCE(moved < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!moved && !err);
	return moved ?
bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
		       bool wp_async)
{
	const struct vm_uffd_ops *ops = vma_uffd_ops(vma);

	if (vma->vm_flags & VM_DROPPABLE)
		return false;

	vm_flags &= __VM_UFFD_FLAGS;

	/*
	 * If WP is the only mode enabled and context is wp async, allow any
	 * memory type.
	 */
	if (wp_async && (vm_flags == VM_UFFD_WP))
		return true;

	/* For any other mode reject VMAs that don't implement vm_uffd_ops */
	if (!ops)
		return false;

	/*
	 * If user requested uffd-wp but not enabled pte markers for
	 * uffd-wp, then only anonymous memory is supported.
	 */
	if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
	    !vma_is_anonymous(vma))
		return false;

	return ops->can_userfault(vma, vm_flags);
}

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t vm_flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;

	vm_flags_reset(vma, vm_flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				vm_flags_t vm_flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;
	bool give_up_on_oom = false;
	vma_flags_t new_vma_flags = vma->flags;

	vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS);

	/*
	 * If we are modifying only and not splitting, just give up on the
	 * merge if OOM prevents us from merging successfully.
	 */
	if (start == vma->vm_start && end == vma->vm_end)
		give_up_on_oom = true;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    &new_vma_flags, NULL_VM_UFFD_CTX,
				    give_up_on_oom);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}
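
/*
 * Register every VMA overlapping [start, end) with @ctx: install the
 * context pointer and apply the tracking modes carried in @vm_flags,
 * splitting and merging VMAs as required.  VMAs that are already
 * registered with @ctx in the right mode are skipped, and hugetlb VMAs
 * that must not share PMDs under uffd get their shared PMDs unshared.
 * Returns 0 on success or a negative error if modifying a VMA fails.
 */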
/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       vm_flags_t vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags);
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	vma_flags_t new_vma_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
				vma->vm_userfaultfd_ctx.ctx != ctx);
		VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    vma_test_all_mask(vma, vma_flags))
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_vma_flags = vma->flags;
		vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS);
		vma_flags_set_mask(&new_vma_flags, vma_flags);

		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    &new_vma_flags,
					    (struct vm_userfaultfd_ctx){ctx},
					    /* give_up_on_oom = */false);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still point to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
				!!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}