// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock. The vma should be unlocked
 * after use with vma_end_read().
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *lock_vma(struct mm_struct *mm,
				       unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * lock_vma_under_rcu() only checks anon_vma for private
		 * anonymous mappings. But we need to ensure it is assigned in
		 * private file-backed vmas as well.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		/*
		 * We cannot use vma_start_read() as it may fail due to a
		 * false locked result (see comment in vma_start_read()). We
		 * can avoid that by directly locking vm_lock under
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
		down_read(&vma->vm_lock->lock);
	}

	mmap_read_unlock(mm);
	return vma;
}

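/*
 * uffd_mfill_lock() - lookup and lock the destination vma for an mfill
 * operation, making sure the whole range [dst_start, dst_start + len) lies
 * within a single uffd-registered vma. Returns the locked vma on success or
 * an ERR_PTR() on failure; the vma must be released with uffd_mfill_unlock().
 */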
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	bool page_in_cache = folio_mapping(folio);

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow to overwrite a pte marker: consider when both MISSING|WP
	 * registered, we first wr-protect a none pte which has no page cache
	 * page backing it, then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

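/*
 * Handle MFILL_ATOMIC_COPY (UFFDIO_COPY) for anonymous and MAP_PRIVATE shmem
 * VMAs: allocate a fresh anonymous folio, copy the source page from userspace
 * with page faults disabled, and install it. Returns -ENOENT with *foliop set
 * when the copy faulted and must be retried by the caller after dropping the
 * lock.
 */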
static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr, false);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the
		 * mmap_lock being read-recursive, a deadlock is still
		 * possible if a writer has taken a lock. For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

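/*
 * Handle MFILL_ATOMIC_ZEROPAGE (UFFDIO_ZEROPAGE) for private VMAs by
 * installing a read-only special mapping of the shared zero page.
 */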
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

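/*
 * Walk, and allocate if necessary, the page tables down to the PMD level for
 * @address. Returns NULL if an intermediate level could not be allocated.
 */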
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas. Note that this routine is
 * called with either the vma lock or mmap_lock held; it will release the
 * lock before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb. A PMD_SIZE huge page may exist as used
	 * by THP. Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set. If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds. fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

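/*
 * Handle a single page of an mfill request: dispatch to the handler for the
 * requested mode (copy, zeropage, continue or poison) and for the type of the
 * destination VMA (anonymous/private vs. shmem-backed shared).
 */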
static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}

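/*
 * Common implementation of the UFFDIO_COPY/ZEROPAGE/CONTINUE/POISON ioctls:
 * validate and lock the destination range, hand off hugetlb VMAs to
 * mfill_atomic_hugetlb(), and otherwise fill the range one PTE at a time,
 * dropping the locks and retrying whenever the source page must first be
 * faulted in.
 */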
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mm_struct *dst_mm = ctx->mm;
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

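/*
 * The mfill_atomic_*() wrappers below are the entry points for the
 * UFFDIO_COPY, UFFDIO_ZEROPAGE, UFFDIO_CONTINUE and UFFDIO_POISON ioctls;
 * each one selects the corresponding mfill mode and calls mfill_atomic().
 */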
ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

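/*
 * Implement UFFDIO_WRITEPROTECT: walk every uffd-wp registered vma in
 * [start, start + len) and write-protect (or un-protect) the range via
 * uffd_wp_range().
 */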
int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	spinlock_t *ptl_tmp;

	if (ptl1 > ptl2) {
		/* exchange ptl1 and ptl2 */
		ptl_tmp = ptl1;
		ptl1 = ptl2;
		ptl2 = ptl_tmp;
	}
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}


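/*
 * Move a present, exclusive anonymous PTE (and its folio) from src to dst for
 * UFFDIO_MOVE. Both PTEs must still match their originally observed values
 * once both page-table locks are taken; otherwise -EAGAIN is returned so the
 * caller can retry.
 */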
static int move_present_pte(struct mm_struct *mm,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long dst_addr, unsigned long src_addr,
			    pte_t *dst_pte, pte_t *src_pte,
			    pte_t orig_dst_pte, pte_t orig_src_pte,
			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
			    struct folio *src_folio)
{
	int err = 0;

	double_pt_lock(dst_ptl, src_ptl);

	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		err = -EAGAIN;
		goto out;
	}
	if (folio_test_large(src_folio) ||
	    folio_maybe_dma_pinned(src_folio) ||
	    !PageAnonExclusive(&src_folio->page)) {
		err = -EBUSY;
		goto out;
	}

	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
	/* Folio got pinned from under us. Put it back and fail the move. */
	if (folio_maybe_dma_pinned(src_folio)) {
		set_pte_at(mm, src_addr, src_pte, orig_src_pte);
		err = -EBUSY;
		goto out;
	}

	folio_move_anon_rmap(src_folio, dst_vma);
	src_folio->index = linear_page_index(dst_vma, dst_addr);

	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
	/* Follow mremap() behavior and treat the entry dirty after the move */
	orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);

	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
	double_pt_unlock(dst_ptl, src_ptl);
	return err;
}

static int move_swap_pte(struct mm_struct *mm,
			 unsigned long dst_addr, unsigned long src_addr,
			 pte_t *dst_pte, pte_t *src_pte,
			 pte_t orig_dst_pte, pte_t orig_src_pte,
			 spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	if (!pte_swp_exclusive(orig_src_pte))
		return -EBUSY;

	double_pt_lock(dst_ptl, src_ptl);

	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}

static int move_zeropage_pte(struct mm_struct *mm,
			     struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma,
			     unsigned long dst_addr, unsigned long src_addr,
			     pte_t *dst_pte, pte_t *src_pte,
			     pte_t orig_dst_pte, pte_t orig_src_pte,
			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}


/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible, and return 0 on success or a
 * negative error code on failure.
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			  struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma,
			  unsigned long dst_addr, unsigned long src_addr,
			  __u64 mode)
{
	swp_entry_t entry;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;

	struct folio *src_folio = NULL;
	struct anon_vma *src_anon_vma = NULL;
	struct mmu_notifier_range range;
	int err = 0;

	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
retry:
	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		err = -EAGAIN;
		goto out;
	}

	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		err = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
		err = -EINVAL;
		goto out;
	}

	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		err = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			err = -ENOENT;
		else /* nothing to do to move a hole */
			err = 0;
		goto out;
	}

	/* If PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		err = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			err = move_zeropage_pte(mm, dst_vma, src_vma,
						dst_addr, src_addr, dst_pte, src_pte,
						orig_dst_pte, orig_src_pte,
						dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
		 * unmap the ptes, obtain the lock and retry.
		 */
		if (!src_folio) {
			struct folio *folio;

			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
			spin_lock(src_ptl);
			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
				spin_unlock(src_ptl);
				err = -EAGAIN;
				goto out;
			}

			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
			if (!folio || !PageAnonExclusive(&folio->page)) {
				spin_unlock(src_ptl);
				err = -EBUSY;
				goto out;
			}

			folio_get(folio);
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			spin_unlock(src_ptl);

			if (!folio_trylock(src_folio)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}

			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
				err = -EBUSY;
				goto out;
			}
		}

		/* at this point we have src_folio locked */
		if (folio_test_large(src_folio)) {
			/* split_folio() can block */
			pte_unmap(&orig_src_pte);
			pte_unmap(&orig_dst_pte);
			src_pte = dst_pte = NULL;
			err = split_folio(src_folio);
			if (err)
				goto out;
			/* have to reacquire the folio after it got split */
			folio_unlock(src_folio);
			folio_put(src_folio);
			src_folio = NULL;
			goto retry;
		}

		if (!src_anon_vma) {
			/*
			 * folio_referenced walks the anon_vma chain
			 * without the folio lock. Serialize against it with
			 * the anon_vma lock, the folio lock is not enough.
			 */
			src_anon_vma = folio_get_anon_vma(src_folio);
			if (!src_anon_vma) {
				/* page was unmapped from under us */
				err = -EAGAIN;
				goto out;
			}
			if (!anon_vma_trylock_write(src_anon_vma)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				anon_vma_lock_write(src_anon_vma);
				goto retry;
			}
		}

		err = move_present_pte(mm, dst_vma, src_vma,
				       dst_addr, src_addr, dst_pte, src_pte,
				       orig_dst_pte, orig_src_pte,
				       dst_ptl, src_ptl, src_folio);
	} else {
		entry = pte_to_swp_entry(orig_src_pte);
		if (non_swap_entry(entry)) {
			if (is_migration_entry(entry)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				migration_entry_wait(mm, src_pmd, src_addr);
				err = -EAGAIN;
			} else
				err = -EFAULT;
			goto out;
		}

		err = move_swap_pte(mm, dst_addr, src_addr,
				    dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte,
				    dst_ptl, src_ptl);
	}

out:
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	if (dst_pte)
		pte_unmap(dst_pte);
	if (src_pte)
		pte_unmap(src_pte);
	mmu_notifier_invalidate_range_end(&range);

	return err;
}

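/*
 * move_splits_huge_pmd() - return true when moving [src_addr, src_end) to
 * dst_addr would require splitting a PMD-mapped THP, i.e. when either address
 * is not PMD-aligned or less than a full PMD-sized chunk remains.
 */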
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}

static __always_inline
int find_vmas_mm_locked(struct mm_struct *mm,
			unsigned long dst_start,
			unsigned long src_start,
			struct vm_area_struct **dst_vmap,
			struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma_and_prepare_anon(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/* Skip finding src_vma if src_start is in dst_vma */
	if (src_start >= vma->vm_start && src_start < vma->vm_end)
		goto out_success;

	vma = vma_lookup(mm, src_start);
	if (!vma)
		return -ENOENT;
out_success:
	*src_vmap = vma;
	return 0;
}

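/*
 * uffd_move_lock() - lock the dst and src vmas for a UFFDIO_MOVE operation.
 * The per-VMA-lock variant below first takes the destination vma lock
 * (preparing its anon_vma if needed) and then the source vma lock, falling
 * back to a short mmap_lock critical section when the lockless lookup fails.
 * On success both vmas are locked (they may be the same vma) and must be
 * released with uffd_move_unlock().
 */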
#ifdef CONFIG_PER_VMA_LOCK
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;
	int err;

	vma = lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using lock_vma() to get src_vma can lead to following deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;

	/* Undo any locking and retry in mmap_lock critical section */
	vma_end_read(*dst_vmap);

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (!err) {
		/*
		 * See comment in lock_vma() as to why not using
		 * vma_start_read() here.
		 */
		down_read(&(*dst_vmap)->vm_lock->lock);
		if (*dst_vmap != *src_vmap)
			down_read_nested(&(*src_vmap)->vm_lock->lock,
					 SINGLE_DEPTH_NESTING);
	}
	mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}

#else

static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	mmap_assert_locked(src_vma->vm_mm);
	mmap_read_unlock(dst_vma->vm_mm);
}
#endif

/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * prevents move_pages() from failing with -EBUSY if the process forks
 * before move_pages() is called), then it will call move_pages() to map
 * the page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical from the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), they will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed they
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(len & ~PAGE_MASK))
		goto out;

	/* Does the address range wrap, or is the span zero-sized? */
	if (WARN_ON_ONCE(src_start + len <= src_start) ||
	    WARN_ON_ONCE(dst_start + len <= dst_start))
		goto out;

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start;
	     src_addr < src_start + len;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			if (pmd_devmap(*src_pmd)) {
				spin_unlock(ptl);
				err = -ENOENT;
				break;
			}

			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				struct folio *folio = pmd_folio(*src_pmd);

				if (!folio || (!is_huge_zero_folio(folio) &&
					       !PageAnonExclusive(&folio->page))) {
					spin_unlock(ptl);
					err = -EBUSY;
					break;
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_pte() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			err = move_pages_pte(mm, dst_pmd, src_pmd,
					     dst_vma, src_vma,
					     dst_addr, src_addr, mode);
			step_size = PAGE_SIZE;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON(moved < 0);
	VM_WARN_ON(err > 0);
	VM_WARN_ON(!moved && !err);
	return moved ? moved : err;
}