// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/file.h>
#include <linux/cleanup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"

/*
 * State carried through a single mfill_atomic() request.
 *
 * @ctx, @src_start, @dst_start, @len and @flags describe the request and are
 * filled in once by mfill_atomic().
 * @vma is the locked destination VMA; it is set by mfill_get_vma() and reset
 * to NULL by mfill_put_vma().
 * @src_addr and @dst_addr are the addresses of the page currently being
 * processed; mfill_atomic() advances them by PAGE_SIZE per iteration.
 * @pmd is the destination PMD stored by mfill_establish_pmd().
 */
struct mfill_state {
	struct userfaultfd_ctx *ctx;
	unsigned long src_start;
	unsigned long dst_start;
	unsigned long len;
	uffd_flags_t flags;

	struct vm_area_struct *vma;
	unsigned long src_addr;
	unsigned long dst_addr;
	pmd_t *pmd;
};

/* ->can_userfault() for anonymous VMAs: everything but MINOR mode is allowed. */
static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	/* anonymous memory does not support MINOR mode */
	if (vm_flags & VM_UFFD_MINOR)
		return false;
	return true;
}

/*
 * ->alloc_folio() for anonymous memory: allocate an order-0 folio for @addr
 * in @vma and charge it to the memcg of vma->vm_mm.
 *
 * Return: the new folio, or NULL if the allocation or the charge failed (the
 * folio is released in the latter case).
 */
static struct folio *anon_alloc_folio(struct vm_area_struct *vma,
				      unsigned long addr)
{
	struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
					      addr);

	if (!folio)
		return NULL;

	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
		folio_put(folio);
		return NULL;
	}

	return folio;
}

/*
 * uffd ops used for anonymous VMAs; mfill_atomic_pte_copy() also uses them
 * for private (!VM_SHARED) mappings.
 */
static const struct vm_uffd_ops anon_uffd_ops = {
	.can_userfault	= anon_can_userfault,
	.alloc_folio	= anon_alloc_folio,
};

/*
 * Return the uffd ops that apply to @vma: anon_uffd_ops for anonymous VMAs,
 * otherwise whatever vma->vm_ops->uffd_ops holds. Callers in this file check
 * the result for NULL before using it.
 */
static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return &anon_uffd_ops;
	return vma->vm_ops->uffd_ops;
}

/*
 * Check that @dst_vma can be the target of an mfill operation ending at
 * @dst_end: the range must not run past the VMA and the VMA must be
 * registered with a userfaultfd context.
 */
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

/*
 * Look up the VMA containing @addr with mmap_lock held (asserted) and, for a
 * non-shared VMA, make sure its anon_vma is prepared.
 *
 * Return: the VMA, ERR_PTR(-ENOENT) if no VMA contains @addr, or
 * ERR_PTR(-ENOMEM) if anon_vma_prepare() failed.
 */
static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found,
 * -ENOMEM if anon_vma couldn't be allocated, or -EAGAIN if the vma could not
 * be read-locked while holding mmap_lock.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
				       unsigned long address)
{
	struct vm_area_struct *vma;

	/* Fast path: try to take the per-VMA lock without mmap_lock. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	/*
	 * Slow path: look the VMA up (and prepare its anon_vma) under
	 * mmap_lock, take the per-VMA read lock, then drop mmap_lock again.
	 */
	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		bool locked = vma_start_read_locked(vma);

		if (!locked)
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);
	return vma;
}

/*
 * Lock the VMA containing @dst_start (per-VMA lock variant) and validate that
 * [@dst_start, @dst_start + @len) fits in it and that it is uffd-registered.
 *
 * Return: the read-locked VMA, the error from uffd_lock_vma(), or
 * ERR_PTR(-ENOENT) (with the VMA unlocked again) if validation fails.
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

/* Release the per-VMA read lock taken by uffd_mfill_lock(). */
static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

/*
 * Lock the destination for an mfill operation (mmap_lock variant): take
 * mmap_lock for read, look up the VMA containing @dst_start and validate the
 * range against it.
 *
 * Return: the VMA with mmap_lock still held for read, or an ERR_PTR with
 * mmap_lock released (-ENOENT if validation fails).
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

/* Release the mmap_lock read lock taken by uffd_mfill_lock(). */
static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/*
 * Undo mfill_get_vma(): drop map_changing_lock and the VMA/mmap lock, and
 * clear state->vma. Safe to call when no VMA is currently held.
 */
static void mfill_put_vma(struct mfill_state *state)
{
	if (!state->vma)
		return;

	up_read(&state->ctx->map_changing_lock);
	uffd_mfill_unlock(state->vma);
	state->vma = NULL;
}

/*
 * Look up, lock and validate the destination VMA for the request in @state.
 *
 * On success state->vma is set and both the VMA (or mmap) lock and
 * ctx->map_changing_lock are held for read; release them with
 * mfill_put_vma(). On failure nothing is left locked.
 *
 * Return: 0 on success, the error from uffd_mfill_lock(), -EAGAIN if
 * mappings are changing, or -EINVAL if the VMA does not support the
 * requested operation.
 */
static int mfill_get_vma(struct mfill_state *state)
{
	struct userfaultfd_ctx *ctx = state->ctx;
	uffd_flags_t flags = state->flags;
	struct vm_area_struct *dst_vma;
	const struct vm_uffd_ops *ops;
	int err;

	/*
	 * Lock the destination vma and make sure that the dst range is
	 * both valid and fully within a single existing, uffd-registered
	 * vma.
	 */
	dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len);
	if (IS_ERR(dst_vma))
		return PTR_ERR(dst_vma);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	/* From here on mfill_put_vma() knows how to unwind both locks. */
	state->vma = dst_vma;
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;

	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/* hugetlb VMAs are handled by mfill_atomic_hugetlb(), not via uffd ops */
	if (is_vm_hugetlb_page(dst_vma))
		return 0;

	ops = vma_uffd_ops(dst_vma);
	if (!ops)
		goto out_unlock;

	/* UFFDIO_CONTINUE needs a way to look up an existing folio */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
	    !ops->get_folio_noalloc)
		goto out_unlock;

	return 0;

out_unlock:
	mfill_put_vma(state);
	return err;
}

/*
 * Walk (allocating as needed) the p4d/pud/pmd levels for @address in @mm.
 *
 * Return: pointer to the PMD entry, or NULL if an allocation failed.
 */
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

/*
 * Make sure a PTE page table exists for state->dst_addr and store the PMD
 * pointer in state->pmd.
 *
 * Return: 0 on success, -ENOMEM if a page table level could not be
 * allocated, -EEXIST if the PMD is not present or is a leaf (huge) entry,
 * -EFAULT if the PMD is bad.
 */
static int mfill_establish_pmd(struct mfill_state *state)
{
	struct mm_struct *dst_mm = state->ctx->mm;
	pmd_t *dst_pmd, dst_pmdval;

	dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
	if (unlikely(!dst_pmd))
		return -ENOMEM;

	dst_pmdval = pmdp_get_lockless(dst_pmd);
	if (unlikely(pmd_none(dst_pmdval)) &&
	    unlikely(__pte_alloc(dst_mm, dst_pmd)))
		return -ENOMEM;

	/* Re-read: the PMD may have changed since the first lockless read. */
	dst_pmdval = pmdp_get_lockless(dst_pmd);
	/*
	 * If the dst_pmd is THP don't override it and just be strict.
	 * (This includes the case where the PMD used to be THP and
	 * changed back to none after __pte_alloc().)
	 */
	if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval)))
		return -EEXIST;
	if (unlikely(pmd_bad(dst_pmdval)))
		return -EFAULT;

	state->pmd = dst_pmd;
	return 0;
}

/*
 * Check if dst_addr is outside of file's size. Must be called with ptl held.
 * Always false for VMAs without a backing file.
 */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	/* Number of pages needed to cover i_size, i.e. first invalid index. */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
*/
static int mfill_atomic_install_pte(pmd_t *dst_pmd,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_addr, struct page *page,
				    uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	/* A folio with a mapping is treated as page cache, otherwise as anon. */
	bool page_in_cache = folio_mapping(folio);
	pte_t dst_ptep;

	/* Build the new PTE value before taking the page table lock. */
	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	/* A page cache page in a private mapping is never mapped writable. */
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	/* -EAGAIN if the PTE table could not be mapped and locked. */
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;

	dst_ptep = ptep_get(dst_pte);

	/*
	 * We are allowed to overwrite a UFFD pte marker: consider when both
	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
	 * page cache page backing it, then access the page.
	 */
	if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
		goto out_unlock;

	if (page_in_cache) {
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/*
	 * Page cache folios are unlocked here once mapped.
	 * NOTE(review): presumably callers always pass them in locked - confirm.
	 */
	if (page_in_cache)
		folio_unlock(folio);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/*
 * Copy one page from user address @src_addr into @folio with page faults
 * disabled, so the copy never sleeps while the caller's locks are held.
 *
 * Return: 0 on success, -EFAULT if any byte could not be copied (the caller
 * is expected to retry via mfill_copy_folio_retry()).
 */
static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
{
	void *kaddr;
	int ret;

	kaddr = kmap_local_folio(folio, 0);
	/*
	 * The read mmap_lock is held here.  Despite the
	 * mmap_lock being read recursive a deadlock is still
	 * possible if a writer has taken a lock.  For example:
	 *
	 * process A thread 1 takes read lock on own mmap_lock
	 * process A thread 2 calls mmap, blocks taking write lock
	 * process B thread 1 takes page fault, read lock on own mmap lock
	 * process B thread 2 calls mmap, blocks taking write lock
	 * process A thread 1 blocks taking read lock on process B
	 * process B thread 1 blocks taking read lock on process A
	 *
	 * Disable page faults to prevent potential deadlock
	 * and retry the copy outside the mmap_lock.
	 */
	pagefault_disable();
	ret = copy_from_user(kaddr, (const void __user *) src_addr,
			     PAGE_SIZE);
	pagefault_enable();
	kunmap_local(kaddr);

	/* copy_from_user() returns the number of bytes NOT copied. */
	if (ret)
		return -EFAULT;

	flush_dcache_folio(folio);
	return ret;
}

/*
 * VMA flag bits that are compared before and after the locks are dropped in
 * mfill_copy_folio_retry(): the uffd flags plus the shared bit.
 */
#define MFILL_RETRY_STATE_VMA_FLAGS \
	append_vma_flags(__VMA_UFFD_FLAGS, VMA_SHARED_BIT)

/*
 * VMA state saved before dropping the locks in mfill_copy_folio_retry().
 * Used to detect VMA replacement or incompatible changes after reacquiring the
 * locks.
455 */ 456 struct mfill_retry_state { 457 const struct vm_uffd_ops *ops; 458 struct file *file; 459 vma_flags_t flags; 460 pgoff_t pgoff; 461 }; 462 463 static void mfill_retry_state_save(struct mfill_retry_state *s, 464 struct vm_area_struct *vma) 465 { 466 s->flags = vma_flags_and_mask(&vma->flags, MFILL_RETRY_STATE_VMA_FLAGS); 467 s->ops = vma_uffd_ops(vma); 468 s->pgoff = vma->vm_pgoff; 469 470 if (vma->vm_file) 471 s->file = get_file(vma->vm_file); 472 } 473 474 static bool mfill_retry_state_changed(struct mfill_retry_state *state, 475 struct vm_area_struct *vma) 476 { 477 vma_flags_t flags = vma_flags_and_mask(&vma->flags, 478 MFILL_RETRY_STATE_VMA_FLAGS); 479 480 /* Have any UFFD flags (missing, WP, minor) changed? */ 481 if (!vma_flags_same_pair(&state->flags, &flags)) 482 return true; 483 484 /* VMA type or effective uffd_ops changed while the lock was dropped */ 485 if (state->ops != vma_uffd_ops(vma)) 486 return true; 487 488 /* VMA was anonymous before; changed only if it no longer is */ 489 if (!state->file) 490 return !vma_is_anonymous(vma); 491 492 /* VMA was file backed, but file, inode or offset has changed */ 493 if (!vma->vm_file || vma->vm_file->f_inode != state->file->f_inode || 494 state->file != vma->vm_file || vma->vm_pgoff != state->pgoff) 495 return true; 496 497 return false; 498 } 499 500 static void mfill_retry_state_put(struct mfill_retry_state *s) 501 { 502 if (s->file) 503 fput(s->file); 504 } 505 506 DEFINE_FREE(retry_put, struct mfill_retry_state *, 507 if (_T) mfill_retry_state_put(_T)); 508 509 static int mfill_copy_folio_retry(struct mfill_state *mfill_state, 510 struct folio *folio) 511 { 512 struct mfill_retry_state retry_state = { 0 }; 513 struct mfill_retry_state *for_free __free(retry_put) = &retry_state; 514 unsigned long src_addr = mfill_state->src_addr; 515 void *kaddr; 516 int err; 517 518 mfill_retry_state_save(&retry_state, mfill_state->vma); 519 520 /* retry copying with mm_lock dropped */ 521 
mfill_put_vma(mfill_state); 522 523 kaddr = kmap_local_folio(folio, 0); 524 err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); 525 kunmap_local(kaddr); 526 if (unlikely(err)) 527 return -EFAULT; 528 529 flush_dcache_folio(folio); 530 531 /* reget VMA and PMD, they could change underneath us */ 532 err = mfill_get_vma(mfill_state); 533 if (err) 534 return err; 535 536 if (mfill_retry_state_changed(&retry_state, mfill_state->vma)) 537 return -EAGAIN; 538 539 err = mfill_establish_pmd(mfill_state); 540 if (err) 541 return err; 542 543 return 0; 544 } 545 546 static int __mfill_atomic_pte(struct mfill_state *state, 547 const struct vm_uffd_ops *ops) 548 { 549 unsigned long dst_addr = state->dst_addr; 550 unsigned long src_addr = state->src_addr; 551 uffd_flags_t flags = state->flags; 552 struct folio *folio; 553 int ret; 554 555 if (!ops) { 556 VM_WARN_ONCE(1, "UFFDIO_COPY for unsupported VMA"); 557 return -EOPNOTSUPP; 558 } 559 560 folio = ops->alloc_folio(state->vma, state->dst_addr); 561 if (!folio) 562 return -ENOMEM; 563 564 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 565 ret = mfill_copy_folio_locked(folio, src_addr); 566 /* 567 * Fallback to copy_from_user outside mmap_lock. 568 * If retry is successful, mfill_copy_folio_locked() returns 569 * with locks retaken by mfill_get_vma(). 570 * If there was an error, we must mfill_put_vma() anyway and it 571 * will take care of unlocking if needed. 572 */ 573 if (unlikely(ret)) { 574 ret = mfill_copy_folio_retry(state, folio); 575 if (ret) 576 goto err_folio_put; 577 } 578 } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { 579 clear_user_highpage(&folio->page, state->dst_addr); 580 } else { 581 VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); 582 } 583 584 /* 585 * The memory barrier inside __folio_mark_uptodate makes sure that 586 * preceding stores to the page contents become visible before 587 * the set_pte_at() write. 
588 */ 589 __folio_mark_uptodate(folio); 590 591 if (ops->filemap_add) { 592 ret = ops->filemap_add(folio, state->vma, state->dst_addr); 593 if (ret) 594 goto err_folio_put; 595 } 596 597 ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, 598 &folio->page, flags); 599 if (ret) 600 goto err_filemap_remove; 601 602 return 0; 603 604 err_filemap_remove: 605 if (ops->filemap_remove) 606 ops->filemap_remove(folio, state->vma); 607 err_folio_put: 608 folio_put(folio); 609 return ret; 610 } 611 612 static int mfill_atomic_pte_copy(struct mfill_state *state) 613 { 614 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 615 616 /* 617 * The normal page fault path for a MAP_PRIVATE mapping in a 618 * file-backed VMA will invoke the fault, fill the hole in the file and 619 * COW it right away. The result generates plain anonymous memory. 620 * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll 621 * generate anonymous memory directly without actually filling the 622 * hole. For the MAP_PRIVATE case the robustness check only happens in 623 * the pagetable (to verify it's still none) and not in the page cache. 
624 */ 625 if (!(state->vma->vm_flags & VM_SHARED)) 626 ops = &anon_uffd_ops; 627 628 return __mfill_atomic_pte(state, ops); 629 } 630 631 static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) 632 { 633 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 634 635 return __mfill_atomic_pte(state, ops); 636 } 637 638 static int mfill_atomic_pte_zeropage(struct mfill_state *state) 639 { 640 struct vm_area_struct *dst_vma = state->vma; 641 unsigned long dst_addr = state->dst_addr; 642 pmd_t *dst_pmd = state->pmd; 643 pte_t _dst_pte, *dst_pte; 644 spinlock_t *ptl; 645 int ret; 646 647 if (mm_forbids_zeropage(dst_vma->vm_mm) || 648 (dst_vma->vm_flags & VM_SHARED)) 649 return mfill_atomic_pte_zeroed_folio(state); 650 651 _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 652 dst_vma->vm_page_prot)); 653 ret = -EAGAIN; 654 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); 655 if (!dst_pte) 656 goto out; 657 if (mfill_file_over_size(dst_vma, dst_addr)) { 658 ret = -EFAULT; 659 goto out_unlock; 660 } 661 ret = -EEXIST; 662 if (!pte_none(ptep_get(dst_pte))) 663 goto out_unlock; 664 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); 665 /* No need to invalidate - it was non-present before */ 666 update_mmu_cache(dst_vma, dst_addr, dst_pte); 667 ret = 0; 668 out_unlock: 669 pte_unmap_unlock(dst_pte, ptl); 670 out: 671 return ret; 672 } 673 674 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). 
*/
static int mfill_atomic_pte_continue(struct mfill_state *state)
{
	struct vm_area_struct *dst_vma = state->vma;
	const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma);
	unsigned long dst_addr = state->dst_addr;
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct inode *inode = file_inode(dst_vma->vm_file);
	uffd_flags_t flags = state->flags;
	pmd_t *dst_pmd = state->pmd;
	struct folio *folio;
	struct page *page;
	int ret;

	if (!ops) {
		VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA");
		return -EOPNOTSUPP;
	}

	/*
	 * mfill_get_vma() already rejected VMAs without ->get_folio_noalloc.
	 * NOTE(review): the error path below implies the folio comes back
	 * locked with a reference held - confirm against the implementations.
	 */
	folio = ops->get_folio_noalloc(inode, pgoff);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (IS_ERR_OR_NULL(folio))
		return -EFAULT;

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, flags);
	if (ret)
		goto out_release;

	return 0;

out_release:
	folio_unlock(folio);
	folio_put(folio);
	return ret;
}

/*
 * Handles UFFDIO_POISON for all non-hugetlb VMAs.
 *
 * Installs a PTE_MARKER_POISONED marker at state->dst_addr.
 *
 * Return: 0 on success, -EAGAIN if the PTE table could not be locked,
 * -EFAULT if the address is beyond the end of the backing file, -EEXIST if
 * the PTE is not none.
 */
static int mfill_atomic_pte_poison(struct mfill_state *state)
{
	struct vm_area_struct *dst_vma = state->vma;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	unsigned long dst_addr = state->dst_addr;
	pmd_t *dst_pmd = state->pmd;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 *
 * ctx->map_changing_lock is also held on entry and released before
 * returning.
 *
 * Return: number of bytes filled if any huge page was processed, otherwise a
 * negative error code.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		/* The VMA may have been replaced while the locks were dropped. */
		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = hugetlb_linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		/* Except for CONTINUE, the destination must not be populated. */
		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);

			if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
				err = -EEXIST;
				hugetlb_vma_unlock_read(dst_vma);
				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				goto out_unlock;
			}
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		/*
		 * -ENOENT here means: a folio was handed back in @folio and
		 * the user copy has to be done with all locks dropped, after
		 * which the whole lookup is retried.
		 */
		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	/* A folio left over from an aborted retry is released here. */
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Dispatch one page worth of work to the handler for the mode encoded in
 * state->flags.
 *
 * Return: the handler's result, or -EOPNOTSUPP for an unknown mode.
 */
static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
	uffd_flags_t flags = state->flags;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		return mfill_atomic_pte_continue(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
		return mfill_atomic_pte_poison(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
		return mfill_atomic_pte_copy(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE))
		return mfill_atomic_pte_zeropage(state);

	VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
	return -EOPNOTSUPP;
}

/*
 * Common implementation of the mfill_atomic_*() entry points: lock and
 * validate the destination VMA, then process [dst_start, dst_start + len)
 * one page at a time (or hand the whole range to mfill_atomic_hugetlb()).
 *
 * Return: number of bytes processed if at least one page succeeded,
 * otherwise a negative error code.
 */
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mfill_state state = (struct mfill_state){
		.ctx = ctx,
		.dst_start = dst_start,
		.src_start = src_start,
		.flags = flags,
		.len = len,
		.src_addr = src_start,
		.dst_addr = dst_start,
	};
	long copied = 0;
	ssize_t err;

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);

	err = mfill_get_vma(&state);
	if (err)
		goto out;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 * (which releases the locks taken by mfill_get_vma() itself).
	 */
	if (is_vm_hugetlb_page(state.vma))
		return  mfill_atomic_hugetlb(ctx, state.vma, dst_start,
					     src_start, len, flags);

	while (state.src_addr < src_start + len) {
		VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);

		err = mfill_establish_pmd(&state);
		if (err)
			break;

		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

		err = mfill_atomic_pte(&state);
		cond_resched();

		if (!err) {
			state.dst_addr += PAGE_SIZE;
			state.src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

	/* No-op if a failed retry already dropped the locks. */
	mfill_put_vma(&state);
out:
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}

/* UFFDIO_COPY entry point: fill the destination range from @src_start. */
ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

/* UFFDIO_ZEROPAGE entry point: fill the range with zero pages. */
ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

/* UFFDIO_CONTINUE entry point: map folios that the backing store already has. */
ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

/* UFFDIO_POISON entry point: install poison markers over the range. */
ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

/*
 * Apply (@enable_wp) or resolve (!@enable_wp) uffd write protection on
 * [@start, @start + @len) of @dst_vma via change_protection().
 *
 * The range is expected to lie within @dst_vma (only warned about).
 *
 * Return: the value returned by change_protection().
 */
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

/*
 * Apply or resolve uffd write protection on [@start, @start + @len) of
 * ctx->mm, walking every VMA that intersects the range under mmap_lock.
 *
 * Return: 0 on success; -EAGAIN if mappings are changing; -ENOENT if the
 * range contains no VMA or hits a VMA that is not uffd-wp registered;
 * -EINVAL if the range is not huge-page aligned for a hugetlb VMA; or the
 * negative error from uffd_wp_range(). VMAs processed before a failure are
 * not rolled back.
 */
int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	/* Stays -ENOENT if the loop body never runs (no VMA in range). */
	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		/* Clamp the request to the part that overlaps this VMA. */
		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


/*
 * Take two page table locks, which may be the same lock, in a fixed order
 * (ascending lock address) so that concurrent callers cannot deadlock.
 */
void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	if (ptl1 > ptl2)
		swap(ptl1, ptl2);
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

/* Release the lock(s) taken by double_pt_lock(). */
void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}

/*
 * Return true if both PTEs still hold the values sampled earlier
 * (@orig_src_pte, @orig_dst_pte) and the destination PMD still equals
 * @dst_pmdval.
 */
static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
				       pte_t orig_dst_pte, pte_t orig_src_pte,
				       pmd_t *dst_pmd, pmd_t dst_pmdval)
{
	return pte_same(ptep_get(src_pte), orig_src_pte) &&
	       pte_same(ptep_get(dst_pte), orig_dst_pte) &&
	       pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}

/*
 * Checks if the two ptes and the corresponding folio are eligible for batched
 * move.
If so, then returns pointer to the locked folio. Otherwise, returns NULL. 1191 * 1192 * NOTE: folio's reference is not required as the whole operation is within 1193 * PTL's critical section. 1194 */ 1195 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, 1196 unsigned long src_addr, 1197 pte_t *src_pte, pte_t *dst_pte) 1198 { 1199 pte_t orig_dst_pte, orig_src_pte; 1200 struct folio *folio; 1201 1202 orig_dst_pte = ptep_get(dst_pte); 1203 if (!pte_none(orig_dst_pte)) 1204 return NULL; 1205 1206 orig_src_pte = ptep_get(src_pte); 1207 if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) 1208 return NULL; 1209 1210 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1211 if (!folio || !folio_trylock(folio)) 1212 return NULL; 1213 if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { 1214 folio_unlock(folio); 1215 return NULL; 1216 } 1217 return folio; 1218 } 1219 1220 /* 1221 * Moves src folios to dst in a batch as long as they are not large, and can 1222 * successfully take the lock via folio_trylock(). 
1223 */ 1224 static long move_present_ptes(struct mm_struct *mm, 1225 struct vm_area_struct *dst_vma, 1226 struct vm_area_struct *src_vma, 1227 unsigned long dst_addr, unsigned long src_addr, 1228 pte_t *dst_pte, pte_t *src_pte, 1229 pte_t orig_dst_pte, pte_t orig_src_pte, 1230 pmd_t *dst_pmd, pmd_t dst_pmdval, 1231 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1232 struct folio **first_src_folio, unsigned long len) 1233 { 1234 int err = 0; 1235 struct folio *src_folio = *first_src_folio; 1236 unsigned long src_start = src_addr; 1237 unsigned long src_end; 1238 1239 len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; 1240 src_end = pmd_addr_end(src_addr, src_addr + len); 1241 flush_cache_range(src_vma, src_addr, src_end); 1242 double_pt_lock(dst_ptl, src_ptl); 1243 1244 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1245 dst_pmd, dst_pmdval)) { 1246 err = -EAGAIN; 1247 goto out; 1248 } 1249 if (folio_test_large(src_folio) || 1250 folio_maybe_dma_pinned(src_folio) || 1251 !PageAnonExclusive(&src_folio->page)) { 1252 err = -EBUSY; 1253 goto out; 1254 } 1255 /* It's safe to drop the reference now as the page-table is holding one. */ 1256 folio_put(*first_src_folio); 1257 *first_src_folio = NULL; 1258 lazy_mmu_mode_enable(); 1259 1260 while (true) { 1261 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1262 /* Folio got pinned from under us. Put it back and fail the move. 
*/ 1263 if (folio_maybe_dma_pinned(src_folio)) { 1264 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1265 err = -EBUSY; 1266 break; 1267 } 1268 1269 folio_move_anon_rmap(src_folio, dst_vma); 1270 src_folio->index = linear_page_index(dst_vma, dst_addr); 1271 1272 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); 1273 /* Set soft dirty bit so userspace can notice the pte was moved */ 1274 if (pgtable_supports_soft_dirty()) 1275 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); 1276 if (pte_dirty(orig_src_pte)) 1277 orig_dst_pte = pte_mkdirty(orig_dst_pte); 1278 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); 1279 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1280 1281 src_addr += PAGE_SIZE; 1282 if (src_addr == src_end) 1283 break; 1284 dst_addr += PAGE_SIZE; 1285 dst_pte++; 1286 src_pte++; 1287 1288 folio_unlock(src_folio); 1289 src_folio = check_ptes_for_batched_move(src_vma, src_addr, 1290 src_pte, dst_pte); 1291 if (!src_folio) 1292 break; 1293 } 1294 1295 lazy_mmu_mode_disable(); 1296 if (src_addr > src_start) 1297 flush_tlb_range(src_vma, src_start, src_addr); 1298 1299 if (src_folio) 1300 folio_unlock(src_folio); 1301 out: 1302 double_pt_unlock(dst_ptl, src_ptl); 1303 return src_addr > src_start ? src_addr - src_start : err; 1304 } 1305 1306 static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, 1307 unsigned long dst_addr, unsigned long src_addr, 1308 pte_t *dst_pte, pte_t *src_pte, 1309 pte_t orig_dst_pte, pte_t orig_src_pte, 1310 pmd_t *dst_pmd, pmd_t dst_pmdval, 1311 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1312 struct folio *src_folio, 1313 struct swap_info_struct *si, swp_entry_t entry) 1314 { 1315 /* 1316 * Check if the folio still belongs to the target swap entry after 1317 * acquiring the lock. Folio can be freed in the swap cache while 1318 * not locked. 
1319 */ 1320 if (src_folio && unlikely(!folio_test_swapcache(src_folio) || 1321 entry.val != src_folio->swap.val)) 1322 return -EAGAIN; 1323 1324 double_pt_lock(dst_ptl, src_ptl); 1325 1326 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1327 dst_pmd, dst_pmdval)) { 1328 double_pt_unlock(dst_ptl, src_ptl); 1329 return -EAGAIN; 1330 } 1331 1332 /* 1333 * The src_folio resides in the swapcache, requiring an update to its 1334 * index and mapping to align with the dst_vma, where a swap-in may 1335 * occur and hit the swapcache after moving the PTE. 1336 */ 1337 if (src_folio) { 1338 folio_move_anon_rmap(src_folio, dst_vma); 1339 src_folio->index = linear_page_index(dst_vma, dst_addr); 1340 } else { 1341 /* 1342 * Check if the swap entry is cached after acquiring the src_pte 1343 * lock. Otherwise, we might miss a newly loaded swap cache folio. 1344 * 1345 * We are trying to catch newly added swap cache, the only possible case is 1346 * when a folio is swapped in and out again staying in swap cache, using the 1347 * same entry before the PTE check above. The PTL is acquired and released 1348 * twice, each time after updating the swap table. So holding 1349 * the PTL here ensures we see the updated value. 
1350 */ 1351 if (swap_cache_has_folio(entry)) { 1352 double_pt_unlock(dst_ptl, src_ptl); 1353 return -EAGAIN; 1354 } 1355 } 1356 1357 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1358 if (pgtable_supports_soft_dirty()) 1359 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); 1360 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1361 double_pt_unlock(dst_ptl, src_ptl); 1362 1363 return PAGE_SIZE; 1364 } 1365 1366 static int move_zeropage_pte(struct mm_struct *mm, 1367 struct vm_area_struct *dst_vma, 1368 struct vm_area_struct *src_vma, 1369 unsigned long dst_addr, unsigned long src_addr, 1370 pte_t *dst_pte, pte_t *src_pte, 1371 pte_t orig_dst_pte, pte_t orig_src_pte, 1372 pmd_t *dst_pmd, pmd_t dst_pmdval, 1373 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1374 { 1375 pte_t zero_pte; 1376 1377 double_pt_lock(dst_ptl, src_ptl); 1378 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1379 dst_pmd, dst_pmdval)) { 1380 double_pt_unlock(dst_ptl, src_ptl); 1381 return -EAGAIN; 1382 } 1383 1384 zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 1385 dst_vma->vm_page_prot)); 1386 ptep_clear_flush(src_vma, src_addr, src_pte); 1387 set_pte_at(mm, dst_addr, dst_pte, zero_pte); 1388 double_pt_unlock(dst_ptl, src_ptl); 1389 1390 return PAGE_SIZE; 1391 } 1392 1393 1394 /* 1395 * The mmap_lock for reading is held by the caller. Just move the page(s) 1396 * from src_pmd to dst_pmd if possible, and return number of bytes moved. 1397 * On failure, an error code is returned. 
1398 */ 1399 static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, 1400 struct vm_area_struct *dst_vma, 1401 struct vm_area_struct *src_vma, 1402 unsigned long dst_addr, unsigned long src_addr, 1403 unsigned long len, __u64 mode) 1404 { 1405 struct swap_info_struct *si = NULL; 1406 pte_t orig_src_pte, orig_dst_pte; 1407 pte_t src_folio_pte; 1408 spinlock_t *src_ptl, *dst_ptl; 1409 pte_t *src_pte = NULL; 1410 pte_t *dst_pte = NULL; 1411 pmd_t dummy_pmdval; 1412 pmd_t dst_pmdval; 1413 struct folio *src_folio = NULL; 1414 struct mmu_notifier_range range; 1415 long ret = 0; 1416 1417 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1418 src_addr, src_addr + len); 1419 mmu_notifier_invalidate_range_start(&range); 1420 retry: 1421 /* 1422 * Use the maywrite version to indicate that dst_pte will be modified, 1423 * since dst_pte needs to be none, the subsequent pte_same() check 1424 * cannot prevent the dst_pte page from being freed concurrently, so we 1425 * also need to obtain dst_pmdval and recheck pmd_same() later. 1426 */ 1427 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, 1428 &dst_ptl); 1429 1430 /* Retry if a huge pmd materialized from under us */ 1431 if (unlikely(!dst_pte)) { 1432 ret = -EAGAIN; 1433 goto out; 1434 } 1435 1436 /* 1437 * Unlike dst_pte, the subsequent pte_same() check can ensure the 1438 * stability of the src_pte page, so there is no need to get pmdval, 1439 * just pass a dummy variable to it. 1440 */ 1441 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, 1442 &src_ptl); 1443 1444 /* 1445 * We held the mmap_lock for reading so MADV_DONTNEED 1446 * can zap transparent huge pages under us, or the 1447 * transparent huge page fault can establish new 1448 * transparent huge pages under us. 
1449 */ 1450 if (unlikely(!src_pte)) { 1451 ret = -EAGAIN; 1452 goto out; 1453 } 1454 1455 /* Sanity checks before the operation */ 1456 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || 1457 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { 1458 ret = -EINVAL; 1459 goto out; 1460 } 1461 1462 spin_lock(dst_ptl); 1463 orig_dst_pte = ptep_get(dst_pte); 1464 spin_unlock(dst_ptl); 1465 if (!pte_none(orig_dst_pte)) { 1466 ret = -EEXIST; 1467 goto out; 1468 } 1469 1470 spin_lock(src_ptl); 1471 orig_src_pte = ptep_get(src_pte); 1472 spin_unlock(src_ptl); 1473 if (pte_none(orig_src_pte)) { 1474 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) 1475 ret = -ENOENT; 1476 else /* nothing to do to move a hole */ 1477 ret = PAGE_SIZE; 1478 goto out; 1479 } 1480 1481 /* If PTE changed after we locked the folio then start over */ 1482 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { 1483 ret = -EAGAIN; 1484 goto out; 1485 } 1486 1487 if (pte_present(orig_src_pte)) { 1488 if (is_zero_pfn(pte_pfn(orig_src_pte))) { 1489 ret = move_zeropage_pte(mm, dst_vma, src_vma, 1490 dst_addr, src_addr, dst_pte, src_pte, 1491 orig_dst_pte, orig_src_pte, 1492 dst_pmd, dst_pmdval, dst_ptl, src_ptl); 1493 goto out; 1494 } 1495 1496 /* 1497 * Pin and lock source folio. Since we are in RCU read section, 1498 * we can't block, so on contention have to unmap the ptes, 1499 * obtain the lock and retry. 
1500 */ 1501 if (!src_folio) { 1502 struct folio *folio; 1503 bool locked; 1504 1505 /* 1506 * Pin the page while holding the lock to be sure the 1507 * page isn't freed under us 1508 */ 1509 spin_lock(src_ptl); 1510 if (!pte_same(orig_src_pte, ptep_get(src_pte))) { 1511 spin_unlock(src_ptl); 1512 ret = -EAGAIN; 1513 goto out; 1514 } 1515 1516 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1517 if (!folio || !PageAnonExclusive(&folio->page)) { 1518 spin_unlock(src_ptl); 1519 ret = -EBUSY; 1520 goto out; 1521 } 1522 1523 locked = folio_trylock(folio); 1524 /* 1525 * We avoid waiting for folio lock with a raised 1526 * refcount for large folios because extra refcounts 1527 * will result in split_folio() failing later and 1528 * retrying. If multiple tasks are trying to move a 1529 * large folio we can end up livelocking. 1530 */ 1531 if (!locked && folio_test_large(folio)) { 1532 spin_unlock(src_ptl); 1533 ret = -EAGAIN; 1534 goto out; 1535 } 1536 1537 folio_get(folio); 1538 src_folio = folio; 1539 src_folio_pte = orig_src_pte; 1540 spin_unlock(src_ptl); 1541 1542 if (!locked) { 1543 pte_unmap(src_pte); 1544 pte_unmap(dst_pte); 1545 src_pte = dst_pte = NULL; 1546 /* now we can block and wait */ 1547 folio_lock(src_folio); 1548 goto retry; 1549 } 1550 1551 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1552 ret = -EBUSY; 1553 goto out; 1554 } 1555 } 1556 1557 /* at this point we have src_folio locked */ 1558 if (folio_test_large(src_folio)) { 1559 /* split_folio() can block */ 1560 pte_unmap(src_pte); 1561 pte_unmap(dst_pte); 1562 src_pte = dst_pte = NULL; 1563 ret = split_folio(src_folio); 1564 if (ret) 1565 goto out; 1566 /* have to reacquire the folio after it got split */ 1567 folio_unlock(src_folio); 1568 folio_put(src_folio); 1569 src_folio = NULL; 1570 goto retry; 1571 } 1572 1573 ret = move_present_ptes(mm, dst_vma, src_vma, 1574 dst_addr, src_addr, dst_pte, src_pte, 1575 orig_dst_pte, orig_src_pte, dst_pmd, 1576 dst_pmdval, dst_ptl, src_ptl, 
&src_folio, 1577 len); 1578 } else { /* !pte_present() */ 1579 struct folio *folio = NULL; 1580 const softleaf_t entry = softleaf_from_pte(orig_src_pte); 1581 1582 if (softleaf_is_migration(entry)) { 1583 pte_unmap(src_pte); 1584 pte_unmap(dst_pte); 1585 src_pte = dst_pte = NULL; 1586 migration_entry_wait(mm, src_pmd, src_addr); 1587 1588 ret = -EAGAIN; 1589 goto out; 1590 } else if (!softleaf_is_swap(entry)) { 1591 ret = -EFAULT; 1592 goto out; 1593 } 1594 1595 if (!pte_swp_exclusive(orig_src_pte)) { 1596 ret = -EBUSY; 1597 goto out; 1598 } 1599 1600 si = get_swap_device(entry); 1601 if (unlikely(!si)) { 1602 ret = -EAGAIN; 1603 goto out; 1604 } 1605 /* 1606 * Verify the existence of the swapcache. If present, the folio's 1607 * index and mapping must be updated even when the PTE is a swap 1608 * entry. The anon_vma lock is not taken during this process since 1609 * the folio has already been unmapped, and the swap entry is 1610 * exclusive, preventing rmap walks. 1611 * 1612 * For large folios, return -EBUSY immediately, as split_folio() 1613 * also returns -EBUSY when attempting to split unmapped large 1614 * folios in the swapcache. This issue needs to be resolved 1615 * separately to allow proper handling. 
1616 */ 1617 if (!src_folio) 1618 folio = swap_cache_get_folio(entry); 1619 if (folio) { 1620 if (folio_test_large(folio)) { 1621 ret = -EBUSY; 1622 folio_put(folio); 1623 goto out; 1624 } 1625 src_folio = folio; 1626 src_folio_pte = orig_src_pte; 1627 if (!folio_trylock(src_folio)) { 1628 pte_unmap(src_pte); 1629 pte_unmap(dst_pte); 1630 src_pte = dst_pte = NULL; 1631 put_swap_device(si); 1632 si = NULL; 1633 /* now we can block and wait */ 1634 folio_lock(src_folio); 1635 goto retry; 1636 } 1637 } 1638 ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, 1639 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, 1640 dst_ptl, src_ptl, src_folio, si, entry); 1641 } 1642 1643 out: 1644 if (src_folio) { 1645 folio_unlock(src_folio); 1646 folio_put(src_folio); 1647 } 1648 /* 1649 * Unmap in reverse order (LIFO) to maintain proper kmap_local 1650 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte 1651 * first, then src_pte, so we must unmap src_pte first, then dst_pte. 
1652 */ 1653 if (src_pte) 1654 pte_unmap(src_pte); 1655 if (dst_pte) 1656 pte_unmap(dst_pte); 1657 mmu_notifier_invalidate_range_end(&range); 1658 if (si) 1659 put_swap_device(si); 1660 1661 return ret; 1662 } 1663 1664 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1665 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1666 unsigned long src_addr, 1667 unsigned long src_end) 1668 { 1669 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || 1670 src_end - src_addr < HPAGE_PMD_SIZE; 1671 } 1672 #else 1673 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1674 unsigned long src_addr, 1675 unsigned long src_end) 1676 { 1677 /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ 1678 return false; 1679 } 1680 #endif 1681 1682 static inline bool vma_move_compatible(struct vm_area_struct *vma) 1683 { 1684 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | 1685 VM_MIXEDMAP | VM_SHADOW_STACK)); 1686 } 1687 1688 static int validate_move_areas(struct userfaultfd_ctx *ctx, 1689 struct vm_area_struct *src_vma, 1690 struct vm_area_struct *dst_vma) 1691 { 1692 /* Only allow moving if both have the same access and protection */ 1693 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || 1694 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) 1695 return -EINVAL; 1696 1697 /* Only allow moving if both are mlocked or both aren't */ 1698 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) 1699 return -EINVAL; 1700 1701 /* 1702 * For now, we keep it simple and only move between writable VMAs. 1703 * Access flags are equal, therefore checking only the source is enough. 
1704 */ 1705 if (!(src_vma->vm_flags & VM_WRITE)) 1706 return -EINVAL; 1707 1708 /* Check if vma flags indicate content which can be moved */ 1709 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1710 return -EINVAL; 1711 1712 /* Ensure dst_vma is registered in uffd we are operating on */ 1713 if (!dst_vma->vm_userfaultfd_ctx.ctx || 1714 dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1715 return -EINVAL; 1716 1717 /* Only allow moving across anonymous vmas */ 1718 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1719 return -EINVAL; 1720 1721 return 0; 1722 } 1723 1724 static __always_inline 1725 int find_vmas_mm_locked(struct mm_struct *mm, 1726 unsigned long dst_start, 1727 unsigned long src_start, 1728 struct vm_area_struct **dst_vmap, 1729 struct vm_area_struct **src_vmap) 1730 { 1731 struct vm_area_struct *vma; 1732 1733 mmap_assert_locked(mm); 1734 vma = find_vma_and_prepare_anon(mm, dst_start); 1735 if (IS_ERR(vma)) 1736 return PTR_ERR(vma); 1737 1738 *dst_vmap = vma; 1739 /* Skip finding src_vma if src_start is in dst_vma */ 1740 if (src_start >= vma->vm_start && src_start < vma->vm_end) 1741 goto out_success; 1742 1743 vma = vma_lookup(mm, src_start); 1744 if (!vma) 1745 return -ENOENT; 1746 out_success: 1747 *src_vmap = vma; 1748 return 0; 1749 } 1750 1751 #ifdef CONFIG_PER_VMA_LOCK 1752 static int uffd_move_lock(struct mm_struct *mm, 1753 unsigned long dst_start, 1754 unsigned long src_start, 1755 struct vm_area_struct **dst_vmap, 1756 struct vm_area_struct **src_vmap) 1757 { 1758 struct vm_area_struct *vma; 1759 int err; 1760 1761 vma = uffd_lock_vma(mm, dst_start); 1762 if (IS_ERR(vma)) 1763 return PTR_ERR(vma); 1764 1765 *dst_vmap = vma; 1766 /* 1767 * Skip finding src_vma if src_start is in dst_vma. This also ensures 1768 * that we don't lock the same vma twice. 
1769 */ 1770 if (src_start >= vma->vm_start && src_start < vma->vm_end) { 1771 *src_vmap = vma; 1772 return 0; 1773 } 1774 1775 /* 1776 * Using uffd_lock_vma() to get src_vma can lead to following deadlock: 1777 * 1778 * Thread1 Thread2 1779 * ------- ------- 1780 * vma_start_read(dst_vma) 1781 * mmap_write_lock(mm) 1782 * vma_start_write(src_vma) 1783 * vma_start_read(src_vma) 1784 * mmap_read_lock(mm) 1785 * vma_start_write(dst_vma) 1786 */ 1787 *src_vmap = lock_vma_under_rcu(mm, src_start); 1788 if (likely(*src_vmap)) 1789 return 0; 1790 1791 /* Undo any locking and retry in mmap_lock critical section */ 1792 vma_end_read(*dst_vmap); 1793 1794 mmap_read_lock(mm); 1795 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1796 if (err) 1797 goto out; 1798 1799 if (!vma_start_read_locked(*dst_vmap)) { 1800 err = -EAGAIN; 1801 goto out; 1802 } 1803 1804 /* Nothing further to do if both vmas are locked. */ 1805 if (*dst_vmap == *src_vmap) 1806 goto out; 1807 1808 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { 1809 /* Undo dst_vmap locking if src_vmap failed to lock */ 1810 vma_end_read(*dst_vmap); 1811 err = -EAGAIN; 1812 } 1813 out: 1814 mmap_read_unlock(mm); 1815 return err; 1816 } 1817 1818 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1819 struct vm_area_struct *src_vma) 1820 { 1821 vma_end_read(src_vma); 1822 if (src_vma != dst_vma) 1823 vma_end_read(dst_vma); 1824 } 1825 1826 #else 1827 1828 static int uffd_move_lock(struct mm_struct *mm, 1829 unsigned long dst_start, 1830 unsigned long src_start, 1831 struct vm_area_struct **dst_vmap, 1832 struct vm_area_struct **src_vmap) 1833 { 1834 int err; 1835 1836 mmap_read_lock(mm); 1837 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1838 if (err) 1839 mmap_read_unlock(mm); 1840 return err; 1841 } 1842 1843 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1844 struct vm_area_struct *src_vma) 1845 { 1846 
mmap_assert_locked(src_vma->vm_mm); 1847 mmap_read_unlock(dst_vma->vm_mm); 1848 } 1849 #endif 1850 1851 /** 1852 * move_pages - move arbitrary anonymous pages of an existing vma 1853 * @ctx: pointer to the userfaultfd context 1854 * @dst_start: start of the destination virtual memory range 1855 * @src_start: start of the source virtual memory range 1856 * @len: length of the virtual memory range 1857 * @mode: flags from uffdio_move.mode 1858 * 1859 * It will either use the mmap_lock in read mode or per-vma locks 1860 * 1861 * move_pages() remaps arbitrary anonymous pages atomically in zero 1862 * copy. It only works on non shared anonymous pages because those can 1863 * be relocated without generating non linear anon_vmas in the rmap 1864 * code. 1865 * 1866 * It provides a zero copy mechanism to handle userspace page faults. 1867 * The source vma pages should have mapcount == 1, which can be 1868 * enforced by using madvise(MADV_DONTFORK) on src vma. 1869 * 1870 * The thread receiving the page during the userland page fault 1871 * will receive the faulting page in the source vma through the network, 1872 * storage or any other I/O device (MADV_DONTFORK in the source vma 1873 * avoids move_pages() to fail with -EBUSY if the process forks before 1874 * move_pages() is called), then it will call move_pages() to map the 1875 * page in the faulting address in the destination vma. 1876 * 1877 * This userfaultfd command works purely via pagetables, so it's the 1878 * most efficient way to move physical non shared anonymous pages 1879 * across different virtual addresses. Unlike mremap()/mmap()/munmap() 1880 * it does not create any new vmas. The mapping in the destination 1881 * address is atomic. 1882 * 1883 * It only works if the vma protection bits are identical from the 1884 * source and destination vma. 1885 * 1886 * It can remap non shared anonymous pages within the same vma too. 
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepage
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
1917 */ 1918 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, 1919 unsigned long src_start, unsigned long len, __u64 mode) 1920 { 1921 struct mm_struct *mm = ctx->mm; 1922 struct vm_area_struct *src_vma, *dst_vma; 1923 unsigned long src_addr, dst_addr, src_end; 1924 pmd_t *src_pmd, *dst_pmd; 1925 long err = -EINVAL; 1926 ssize_t moved = 0; 1927 1928 /* Sanitize the command parameters. */ 1929 VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); 1930 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); 1931 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 1932 1933 /* Does the address range wrap, or is the span zero-sized? */ 1934 VM_WARN_ON_ONCE(src_start + len < src_start); 1935 VM_WARN_ON_ONCE(dst_start + len < dst_start); 1936 1937 err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); 1938 if (err) 1939 goto out; 1940 1941 /* Re-check after taking map_changing_lock */ 1942 err = -EAGAIN; 1943 down_read(&ctx->map_changing_lock); 1944 if (likely(atomic_read(&ctx->mmap_changing))) 1945 goto out_unlock; 1946 /* 1947 * Make sure the vma is not shared, that the src and dst remap 1948 * ranges are both valid and fully within a single existing 1949 * vma. 1950 */ 1951 err = -EINVAL; 1952 if (src_vma->vm_flags & VM_SHARED) 1953 goto out_unlock; 1954 if (src_start + len > src_vma->vm_end) 1955 goto out_unlock; 1956 1957 if (dst_vma->vm_flags & VM_SHARED) 1958 goto out_unlock; 1959 if (dst_start + len > dst_vma->vm_end) 1960 goto out_unlock; 1961 1962 err = validate_move_areas(ctx, src_vma, dst_vma); 1963 if (err) 1964 goto out_unlock; 1965 1966 for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; 1967 src_addr < src_end;) { 1968 spinlock_t *ptl; 1969 pmd_t dst_pmdval; 1970 unsigned long step_size; 1971 1972 /* 1973 * Below works because anonymous area would not have a 1974 * transparent huge PUD. If file-backed support is added, 1975 * that case would need to be handled here. 
1976 */ 1977 src_pmd = mm_find_pmd(mm, src_addr); 1978 if (unlikely(!src_pmd)) { 1979 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1980 err = -ENOENT; 1981 break; 1982 } 1983 src_pmd = mm_alloc_pmd(mm, src_addr); 1984 if (unlikely(!src_pmd)) { 1985 err = -ENOMEM; 1986 break; 1987 } 1988 } 1989 dst_pmd = mm_alloc_pmd(mm, dst_addr); 1990 if (unlikely(!dst_pmd)) { 1991 err = -ENOMEM; 1992 break; 1993 } 1994 1995 dst_pmdval = pmdp_get_lockless(dst_pmd); 1996 /* 1997 * If the dst_pmd is mapped as THP don't override it and just 1998 * be strict. If dst_pmd changes into TPH after this check, the 1999 * move_pages_huge_pmd() will detect the change and retry 2000 * while move_pages_pte() will detect the change and fail. 2001 */ 2002 if (unlikely(pmd_trans_huge(dst_pmdval))) { 2003 err = -EEXIST; 2004 break; 2005 } 2006 2007 ptl = pmd_trans_huge_lock(src_pmd, src_vma); 2008 if (ptl) { 2009 /* Check if we can move the pmd without splitting it. */ 2010 if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || 2011 !pmd_none(dst_pmdval)) { 2012 /* Can be a migration entry */ 2013 if (pmd_present(*src_pmd)) { 2014 struct folio *folio = pmd_folio(*src_pmd); 2015 2016 if (!is_huge_zero_folio(folio) && 2017 !PageAnonExclusive(&folio->page)) { 2018 spin_unlock(ptl); 2019 err = -EBUSY; 2020 break; 2021 } 2022 } 2023 2024 spin_unlock(ptl); 2025 split_huge_pmd(src_vma, src_pmd, src_addr); 2026 /* The folio will be split by move_pages_pte() */ 2027 continue; 2028 } 2029 2030 err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, 2031 dst_pmdval, dst_vma, src_vma, 2032 dst_addr, src_addr); 2033 step_size = HPAGE_PMD_SIZE; 2034 } else { 2035 long ret; 2036 2037 if (pmd_none(*src_pmd)) { 2038 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 2039 err = -ENOENT; 2040 break; 2041 } 2042 if (unlikely(__pte_alloc(mm, src_pmd))) { 2043 err = -ENOMEM; 2044 break; 2045 } 2046 } 2047 2048 if (unlikely(pte_alloc(mm, dst_pmd))) { 2049 err = -ENOMEM; 2050 break; 2051 } 2052 2053 ret = 
move_pages_ptes(mm, dst_pmd, src_pmd, 2054 dst_vma, src_vma, dst_addr, 2055 src_addr, src_end - src_addr, mode); 2056 if (ret < 0) 2057 err = ret; 2058 else 2059 step_size = ret; 2060 } 2061 2062 cond_resched(); 2063 2064 if (fatal_signal_pending(current)) { 2065 /* Do not override an error */ 2066 if (!err || err == -EAGAIN) 2067 err = -EINTR; 2068 break; 2069 } 2070 2071 if (err) { 2072 if (err == -EAGAIN) 2073 continue; 2074 break; 2075 } 2076 2077 /* Proceed to the next page */ 2078 dst_addr += step_size; 2079 src_addr += step_size; 2080 moved += step_size; 2081 } 2082 2083 out_unlock: 2084 up_read(&ctx->map_changing_lock); 2085 uffd_move_unlock(dst_vma, src_vma); 2086 out: 2087 VM_WARN_ON_ONCE(moved < 0); 2088 VM_WARN_ON_ONCE(err > 0); 2089 VM_WARN_ON_ONCE(!moved && !err); 2090 return moved ? moved : err; 2091 } 2092 2093 bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, 2094 bool wp_async) 2095 { 2096 const struct vm_uffd_ops *ops = vma_uffd_ops(vma); 2097 2098 if (vma->vm_flags & VM_DROPPABLE) 2099 return false; 2100 2101 vm_flags &= __VM_UFFD_FLAGS; 2102 2103 /* 2104 * If WP is the only mode enabled and context is wp async, allow any 2105 * memory type. 
2106 */ 2107 if (wp_async && (vm_flags == VM_UFFD_WP)) 2108 return true; 2109 2110 /* For any other mode reject VMAs that don't implement vm_uffd_ops */ 2111 if (!ops) 2112 return false; 2113 2114 /* 2115 * If user requested uffd-wp but not enabled pte markers for 2116 * uffd-wp, then only anonymous memory is supported 2117 */ 2118 if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && 2119 !vma_is_anonymous(vma)) 2120 return false; 2121 2122 return ops->can_userfault(vma, vm_flags); 2123 } 2124 2125 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, 2126 vm_flags_t vm_flags) 2127 { 2128 const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; 2129 2130 vm_flags_reset(vma, vm_flags); 2131 /* 2132 * For shared mappings, we want to enable writenotify while 2133 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply 2134 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. 2135 */ 2136 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) 2137 vma_set_page_prot(vma); 2138 } 2139 2140 static void userfaultfd_set_ctx(struct vm_area_struct *vma, 2141 struct userfaultfd_ctx *ctx, 2142 vm_flags_t vm_flags) 2143 { 2144 vma_start_write(vma); 2145 vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; 2146 userfaultfd_set_vm_flags(vma, 2147 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); 2148 } 2149 2150 void userfaultfd_reset_ctx(struct vm_area_struct *vma) 2151 { 2152 userfaultfd_set_ctx(vma, NULL, 0); 2153 } 2154 2155 struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, 2156 struct vm_area_struct *prev, 2157 struct vm_area_struct *vma, 2158 unsigned long start, 2159 unsigned long end) 2160 { 2161 struct vm_area_struct *ret; 2162 bool give_up_on_oom = false; 2163 vma_flags_t new_vma_flags = vma->flags; 2164 2165 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); 2166 2167 /* 2168 * If we are modifying only and not splitting, just give up on the merge 2169 * if OOM prevents us from 
merging successfully. 2170 */ 2171 if (start == vma->vm_start && end == vma->vm_end) 2172 give_up_on_oom = true; 2173 2174 /* Reset ptes for the whole vma range if wr-protected */ 2175 if (userfaultfd_wp(vma)) 2176 uffd_wp_range(vma, start, end - start, false); 2177 2178 ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, 2179 &new_vma_flags, NULL_VM_UFFD_CTX, 2180 give_up_on_oom); 2181 2182 /* 2183 * In the vma_merge() successful mprotect-like case 8: 2184 * the next vma was merged into the current one and 2185 * the current one has not been updated yet. 2186 */ 2187 if (!IS_ERR(ret)) 2188 userfaultfd_reset_ctx(ret); 2189 2190 return ret; 2191 } 2192 2193 /* Assumes mmap write lock taken, and mm_struct pinned. */ 2194 int userfaultfd_register_range(struct userfaultfd_ctx *ctx, 2195 struct vm_area_struct *vma, 2196 vm_flags_t vm_flags, 2197 unsigned long start, unsigned long end, 2198 bool wp_async) 2199 { 2200 vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); 2201 VMA_ITERATOR(vmi, ctx->mm, start); 2202 struct vm_area_struct *prev = vma_prev(&vmi); 2203 unsigned long vma_end; 2204 vma_flags_t new_vma_flags; 2205 2206 if (vma->vm_start < start) 2207 prev = vma; 2208 2209 for_each_vma_range(vmi, vma, end) { 2210 cond_resched(); 2211 2212 VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); 2213 VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && 2214 vma->vm_userfaultfd_ctx.ctx != ctx); 2215 VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); 2216 2217 /* 2218 * Nothing to do: this vma is already registered into this 2219 * userfaultfd and with the right tracking mode too. 
2220 */ 2221 if (vma->vm_userfaultfd_ctx.ctx == ctx && 2222 vma_test_all_mask(vma, vma_flags)) 2223 goto skip; 2224 2225 if (vma->vm_start > start) 2226 start = vma->vm_start; 2227 vma_end = min(end, vma->vm_end); 2228 2229 new_vma_flags = vma->flags; 2230 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); 2231 vma_flags_set_mask(&new_vma_flags, vma_flags); 2232 2233 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, 2234 &new_vma_flags, 2235 (struct vm_userfaultfd_ctx){ctx}, 2236 /* give_up_on_oom = */false); 2237 if (IS_ERR(vma)) 2238 return PTR_ERR(vma); 2239 2240 /* 2241 * In the vma_merge() successful mprotect-like case 8: 2242 * the next vma was merged into the current one and 2243 * the current one has not been updated yet. 2244 */ 2245 userfaultfd_set_ctx(vma, ctx, vm_flags); 2246 2247 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) 2248 hugetlb_unshare_all_pmds(vma); 2249 2250 skip: 2251 prev = vma; 2252 start = vma->vm_end; 2253 } 2254 2255 return 0; 2256 } 2257 2258 void userfaultfd_release_new(struct userfaultfd_ctx *ctx) 2259 { 2260 struct mm_struct *mm = ctx->mm; 2261 struct vm_area_struct *vma; 2262 VMA_ITERATOR(vmi, mm, 0); 2263 2264 /* the various vma->vm_userfaultfd_ctx still points to it */ 2265 mmap_write_lock(mm); 2266 for_each_vma(vmi, vma) { 2267 if (vma->vm_userfaultfd_ctx.ctx == ctx) 2268 userfaultfd_reset_ctx(vma); 2269 } 2270 mmap_write_unlock(mm); 2271 } 2272 2273 void userfaultfd_release_all(struct mm_struct *mm, 2274 struct userfaultfd_ctx *ctx) 2275 { 2276 struct vm_area_struct *vma, *prev; 2277 VMA_ITERATOR(vmi, mm, 0); 2278 2279 if (!mmget_not_zero(mm)) 2280 return; 2281 2282 /* 2283 * Flush page faults out of all CPUs. NOTE: all page faults 2284 * must be retried without returning VM_FAULT_SIGBUS if 2285 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx 2286 * changes while handle_userfault released the mmap_lock. 
So 2287 * it's critical that released is set to true (above), before 2288 * taking the mmap_lock for writing. 2289 */ 2290 mmap_write_lock(mm); 2291 prev = NULL; 2292 for_each_vma(vmi, vma) { 2293 cond_resched(); 2294 VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ 2295 !!(vma->vm_flags & __VM_UFFD_FLAGS)); 2296 if (vma->vm_userfaultfd_ctx.ctx != ctx) { 2297 prev = vma; 2298 continue; 2299 } 2300 2301 vma = userfaultfd_clear_vma(&vmi, prev, vma, 2302 vma->vm_start, vma->vm_end); 2303 prev = vma; 2304 } 2305 mmap_write_unlock(mm); 2306 mmput(mm); 2307 } 2308