// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

/*
 * Return true if the range [dst_vma->vm_start, dst_end) lies fully inside
 * @dst_vma and the vma is still registered with a userfaultfd context.
 */
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

/*
 * Look up the vma containing @addr and, for private mappings, make sure its
 * anon_vma is allocated.  Caller must hold mmap_lock.  Returns the vma,
 * ERR_PTR(-ENOENT) if no vma covers @addr, or ERR_PTR(-ENOMEM) if the
 * anon_vma allocation failed.
 */
static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
				       unsigned long address)
{
	struct vm_area_struct *vma;

	/* Fast path: lock the vma under RCU without touching mmap_lock. */
	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	/* Slow path: take mmap_lock so anon_vma_prepare() is allowed. */
	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		/*
		 * We cannot use vma_start_read() as it may fail due to
		 * false locked (see comment in vma_start_read()). We
		 * can avoid that by directly locking vm_lock under
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
		down_read(&vma->vm_lock->lock);
	}

	mmap_read_unlock(mm);
	return vma;
}

/*
 * Lock (per-vma read lock) and validate the destination vma for an mfill
 * operation.  On success the returned vma is read-locked; on failure the
 * lock is dropped and an ERR_PTR is returned.
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

/*
 * !CONFIG_PER_VMA_LOCK fallback: serialise on mmap_lock (read) instead of
 * the per-vma lock.  On success the mmap_lock is held by the caller until
 * uffd_mfill_unlock().
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	/* Anonymous vmas have no backing file, so no size limit applies. */
	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	bool page_in_cache = folio_mapping(folio);

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	/* A private mapping of a cache page must stay read-only (CoW). */
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow to overwrite a pte marker: consider when both MISSING|WP
	 * registered, we firstly wr-protect a none pte which has no page cache
	 * page backing it, then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/*
 * UFFDIO_COPY into an anonymous (non-shared) vma: allocate a folio, copy
 * the user's bytes into it, then install the PTE.  Returns -ENOENT (with
 * *foliop set) when the copy must be retried outside the mmap_lock.
 */
static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr, false);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the
		 * mmap_lock being read recursive a deadlock is still
		 * possible if a writer has taken a lock. For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			/* Hand the folio back to the caller for the retry. */
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		/* Retry path: the caller already filled this folio for us. */
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

/* UFFDIO_ZEROPAGE into an anonymous vma: map the shared zero page. */
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	/* SGP_NOALLOC: only map a page that already exists in the cache. */
	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Walk/allocate down to the pmd level for @address; NULL on allocation failure. */
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		/* -ENOENT: fall back to copying the folio with faults enabled. */
		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			/* All locks dropped; force a fresh vma lookup. */
			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

/* Dispatch one page worth of mfill work to the mode-specific helper. */
static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}

/*
 * Common driver for UFFDIO_COPY/ZEROPAGE/CONTINUE/POISON: validate the
 * destination range, take the vma + map_changing locks, then fill one page
 * at a time.  Returns the number of bytes filled, or a negative errno if
 * nothing was filled.
 */
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mm_struct *dst_mm = ctx->mm;
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 * (which releases both locks on our behalf before returning).
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					     src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If an huge pmd materialized from under us fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		/*
		 * -ENOENT: the in-atomic copy faulted; drop all locks, copy
		 * with faults enabled into the folio handed back to us, then
		 * restart from the vma lookup.
		 */
		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

/* UFFDIO_COPY entry point. */
ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

/* UFFDIO_ZEROPAGE entry point. */
ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

/* UFFDIO_CONTINUE entry point. */
ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

/* UFFDIO_POISON entry point. */
ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

/*
 * Apply or clear uffd write-protection over [start, start + len) inside
 * @dst_vma.  Returns the number of pages changed, or a negative errno.
 */
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

/* UFFDIO_WRITEPROTECT entry point: walk every vma in the range. */
int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		/* Clamp the per-vma range to the requested range. */
		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


/* Take two page-table locks in a stable (address) order to avoid deadlock. */
void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	spinlock_t *ptl_tmp;

	if (ptl1 > ptl2) {
		/* exchange ptl1 and ptl2 */
		ptl_tmp = ptl1;
		ptl1 = ptl2;
		ptl2 = ptl_tmp;
	}
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

/* Release a pair of page-table locks taken by double_pt_lock(). */
void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}


/*
 * UFFDIO_MOVE helper: move one present anonymous page from src to dst.
 * Both PTE values are re-checked under the double lock; returns 0 on
 * success, -EAGAIN if either PTE changed, -EBUSY if the folio is pinned,
 * large, or not anon-exclusive.
 */
static int move_present_pte(struct mm_struct *mm,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long dst_addr, unsigned long src_addr,
			    pte_t *dst_pte, pte_t *src_pte,
			    pte_t orig_dst_pte, pte_t orig_src_pte,
			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
			    struct folio *src_folio)
{
	int err = 0;

	double_pt_lock(dst_ptl, src_ptl);

	if
(!pte_same(ptep_get(src_pte), orig_src_pte) || 1006 !pte_same(ptep_get(dst_pte), orig_dst_pte)) { 1007 err = -EAGAIN; 1008 goto out; 1009 } 1010 if (folio_test_large(src_folio) || 1011 folio_maybe_dma_pinned(src_folio) || 1012 !PageAnonExclusive(&src_folio->page)) { 1013 err = -EBUSY; 1014 goto out; 1015 } 1016 1017 orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); 1018 /* Folio got pinned from under us. Put it back and fail the move. */ 1019 if (folio_maybe_dma_pinned(src_folio)) { 1020 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1021 err = -EBUSY; 1022 goto out; 1023 } 1024 1025 folio_move_anon_rmap(src_folio, dst_vma); 1026 src_folio->index = linear_page_index(dst_vma, dst_addr); 1027 1028 orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); 1029 /* Follow mremap() behavior and treat the entry dirty after the move */ 1030 orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); 1031 1032 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1033 out: 1034 double_pt_unlock(dst_ptl, src_ptl); 1035 return err; 1036 } 1037 1038 static int move_swap_pte(struct mm_struct *mm, 1039 unsigned long dst_addr, unsigned long src_addr, 1040 pte_t *dst_pte, pte_t *src_pte, 1041 pte_t orig_dst_pte, pte_t orig_src_pte, 1042 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1043 { 1044 if (!pte_swp_exclusive(orig_src_pte)) 1045 return -EBUSY; 1046 1047 double_pt_lock(dst_ptl, src_ptl); 1048 1049 if (!pte_same(ptep_get(src_pte), orig_src_pte) || 1050 !pte_same(ptep_get(dst_pte), orig_dst_pte)) { 1051 double_pt_unlock(dst_ptl, src_ptl); 1052 return -EAGAIN; 1053 } 1054 1055 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1056 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1057 double_pt_unlock(dst_ptl, src_ptl); 1058 1059 return 0; 1060 } 1061 1062 static int move_zeropage_pte(struct mm_struct *mm, 1063 struct vm_area_struct *dst_vma, 1064 struct vm_area_struct *src_vma, 1065 unsigned long dst_addr, unsigned long src_addr, 1066 pte_t *dst_pte, pte_t 
*src_pte,
			      pte_t orig_dst_pte, pte_t orig_src_pte,
			      spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	/*
	 * Re-validate both PTEs now that their locks are held; if either
	 * changed since the caller sampled them, the move must be retried.
	 */
	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	/* Clear the src zero-page mapping and install a fresh one at dst. */
	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}

/*
 * The mmap_lock for reading is held by the caller.  Move a single page
 * mapping from src_pmd to dst_pmd if possible.
 *
 * Returns 0 on success (including the "source hole" no-op when
 * UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES is set), -EAGAIN when the caller
 * should retry, or another negative error code on failure.
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			  struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma,
			  unsigned long dst_addr, unsigned long src_addr,
			  __u64 mode)
{
	swp_entry_t entry;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;

	struct folio *src_folio = NULL;
	struct anon_vma *src_anon_vma = NULL;
	struct mmu_notifier_range range;
	int err = 0;

	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
	/* Tell secondary MMUs the src page is about to be unmapped. */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
retry:
	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		err = -EAGAIN;
		goto out;
	}

	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		err = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) ||	WARN_ON_ONCE(pmd_none(*src_pmd)) ||
	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
		err = -EINVAL;
		goto out;
	}

	/* The destination PTE must be an unmapped hole. */
	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		err = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			err = -ENOENT;
		else /* nothing to do to move a hole */
			err = 0;
		goto out;
	}

	/* If PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		err = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			err = move_zeropage_pte(mm, dst_vma, src_vma,
					       dst_addr, src_addr, dst_pte, src_pte,
					       orig_dst_pte, orig_src_pte,
					       dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
		 * unmap the ptes, obtain the lock and retry.
		 */
		if (!src_folio) {
			struct folio *folio;

			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
			spin_lock(src_ptl);
			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
				spin_unlock(src_ptl);
				err = -EAGAIN;
				goto out;
			}

			/* Only exclusively-mapped anon pages may be moved. */
			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
			if (!folio || !PageAnonExclusive(&folio->page)) {
				spin_unlock(src_ptl);
				err = -EBUSY;
				goto out;
			}

			folio_get(folio);
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			spin_unlock(src_ptl);

			if (!folio_trylock(src_folio)) {
				/*
				 * NOTE(review): these pte_unmap() calls take
				 * the address of the local pte_t copies rather
				 * than src_pte/dst_pte — matches the original
				 * calling convention here, but worth
				 * confirming against upstream for
				 * CONFIG_HIGHPTE kernels.
				 */
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}

			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
				err = -EBUSY;
				goto out;
			}
		}

		/* at this point we have src_folio locked */
		if (folio_test_large(src_folio)) {
			/* split_folio() can block */
			pte_unmap(&orig_src_pte);
			pte_unmap(&orig_dst_pte);
			src_pte = dst_pte = NULL;
			err = split_folio(src_folio);
			if (err)
				goto out;
			/* have to reacquire the folio after it got split */
			folio_unlock(src_folio);
			folio_put(src_folio);
			src_folio = NULL;
			goto retry;
		}

		if (!src_anon_vma) {
			/*
			 * folio_referenced walks the anon_vma chain
			 * without the folio lock. Serialize against it with
			 * the anon_vma lock, the folio lock is not enough.
			 */
			src_anon_vma = folio_get_anon_vma(src_folio);
			if (!src_anon_vma) {
				/* page was unmapped from under us */
				err = -EAGAIN;
				goto out;
			}
			if (!anon_vma_trylock_write(src_anon_vma)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				anon_vma_lock_write(src_anon_vma);
				goto retry;
			}
		}

		err = move_present_pte(mm,  dst_vma, src_vma,
				       dst_addr, src_addr, dst_pte, src_pte,
				       orig_dst_pte, orig_src_pte,
				       dst_ptl, src_ptl, src_folio);
	} else {
		/* Non-present source: swap or migration entry. */
		entry = pte_to_swp_entry(orig_src_pte);
		if (non_swap_entry(entry)) {
			if (is_migration_entry(entry)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* wait for migration to finish, then retry */
				migration_entry_wait(mm, src_pmd, src_addr);
				err = -EAGAIN;
			} else
				err = -EFAULT;
			goto out;
		}

		err = move_swap_pte(mm, dst_addr, src_addr,
				    dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte,
				    dst_ptl, src_ptl);
	}

out:
	/* Drop everything acquired above, in reverse acquisition order. */
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	if (dst_pte)
		pte_unmap(dst_pte);
	if (src_pte)
		pte_unmap(src_pte);
	mmu_notifier_invalidate_range_end(&range);

	return err;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * True when a PMD-mapped THP cannot be moved whole: either endpoint is
 * not PMD-aligned, or fewer than HPAGE_PMD_SIZE bytes remain in the
 * requested source range.
 */
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

/* True if @vma's flags permit moving its pages with UFFDIO_MOVE. */
static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

/*
 * Validate that @src_vma and @dst_vma form a legal UFFDIO_MOVE pair for
 * @ctx.  Returns 0 if the move is allowed, -EINVAL otherwise.
 */
static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}

/*
 * Look up the vmas covering @dst_start and @src_start with mmap_lock
 * already held, preparing anon_vma for the destination.  Returns 0 and
 * fills *@dst_vmap / *@src_vmap, or a negative error.
 */
static __always_inline
int find_vmas_mm_locked(struct mm_struct *mm,
			unsigned long dst_start,
			unsigned long src_start,
			struct vm_area_struct **dst_vmap,
			struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma_and_prepare_anon(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/* Skip finding src_vma if src_start is in dst_vma */
	if (src_start >= vma->vm_start && src_start < vma->vm_end)
		goto out_success;

	vma = vma_lookup(mm, src_start);
	if (!vma)
		return -ENOENT;
out_success:
	*src_vmap = vma;
	return 0;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Lock (for read) the vmas containing @dst_start and @src_start without
 * holding mmap_lock on return.  On success both *@dst_vmap and
 * *@src_vmap are read-locked (possibly the same vma).
 */
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;
	int err;

	vma = uffd_lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;

	/* Undo any locking and retry in mmap_lock critical section */
	vma_end_read(*dst_vmap);

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (!err) {
		/*
		 * See comment in uffd_lock_vma() as to why not using
		 * vma_start_read() here.
		 */
		down_read(&(*dst_vmap)->vm_lock->lock);
		if (*dst_vmap != *src_vmap)
			down_read_nested(&(*src_vmap)->vm_lock->lock,
					 SINGLE_DEPTH_NESTING);
	}
	mmap_read_unlock(mm);
	return err;
}

/* Drop the per-vma read locks taken by uffd_move_lock(). */
static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}

#else

/*
 * !CONFIG_PER_VMA_LOCK fallback: take mmap_lock for read and look up
 * both vmas under it.  On success mmap_lock stays held for the caller;
 * on error it is released here.
 */
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}

/* Release the mmap_lock taken by the fallback uffd_move_lock(). */
static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	/* Both vmas belong to the same mm; one unlock suffices. */
	mmap_assert_locked(src_vma->vm_mm);
	mmap_read_unlock(dst_vma->vm_mm);
}
#endif

/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * avoids move_pages() to fail with -EBUSY if the process forks before
 * move_pages() is called), then it will call move_pages() to map the
 * page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical from the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero.
Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors to materialize if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepage
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), they will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed they
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(len & ~PAGE_MASK))
		goto out;

	/* Does the address range wrap, or is the span zero-sized? */
	if (WARN_ON_ONCE(src_start + len <= src_start) ||
	    WARN_ON_ONCE(dst_start + len <= dst_start))
		goto out;

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	/*
	 * NOTE(review): likely() on what should be the rare
	 * "mmap is changing" bail-out reads inverted; it only affects
	 * branch prediction, not correctness — confirm the hint against
	 * upstream.
	 */
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	/* Walk the range one PMD (or one PTE) at a time. */
	for (src_addr = src_start, dst_addr = dst_start;
	     src_addr < src_start + len;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Source is mapped by a huge pmd. */
			if (pmd_devmap(*src_pmd)) {
				spin_unlock(ptl);
				err = -ENOENT;
				break;
			}

			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				struct folio *folio = pmd_folio(*src_pmd);

				if (!folio || (!is_huge_zero_folio(folio) &&
					       !PageAnonExclusive(&folio->page))) {
					spin_unlock(ptl);
					err = -EBUSY;
					break;
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_pte() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			/* Source is (or will be) mapped at PTE granularity. */
			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			err = move_pages_pte(mm, dst_pmd, src_pmd,
					     dst_vma, src_vma,
					     dst_addr, src_addr, mode);
			step_size = PAGE_SIZE;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			/* -EAGAIN means retry the same address. */
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON(moved < 0);
	VM_WARN_ON(err > 0);
	/* Either some bytes moved or a negative error — never both zero. */
	VM_WARN_ON(!moved && !err);
	return moved ? moved : err;
}