// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock. The returned vma must be
 * unlocked with vma_end_read() after use.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *lock_vma(struct mm_struct *mm,
				       unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * lock_vma_under_rcu() only checks anon_vma for private
		 * anonymous mappings. But we need to ensure it is assigned in
		 * private file-backed vmas as well.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		/*
		 * We cannot use vma_start_read() as it may fail due to a
		 * false locked result (see comment in vma_start_read()). We
		 * can avoid that by directly locking vm_lock under
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
		down_read(&vma->vm_lock->lock);
	}

	mmap_read_unlock(mm);
	return vma;
}

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif
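
/*
 * Both flavours of uffd_mfill_lock() return either a locked, validated
 * dst_vma or an ERR_PTR(), and every caller is expected to bracket its work
 * roughly as:
 *
 *	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
 *	if (IS_ERR(dst_vma))
 *		return PTR_ERR(dst_vma);
 *	... fill pages within [dst_start, dst_start + len) ...
 *	uffd_mfill_unlock(dst_vma);
 *
 * With CONFIG_PER_VMA_LOCK only the per-VMA read lock is held in between;
 * otherwise mmap_lock is held for reading.
 */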

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	bool page_in_cache = page_mapping(page);
	spinlock_t *ptl;
	struct folio *folio;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow overwriting a pte marker: consider the case where both
	 * MISSING and WP are registered; we first write-protect a none pte
	 * (which has no page cache page backing it), then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	folio = page_folio(page);
	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
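
/*
 * Handle a UFFDIO_COPY-style fill of a single PTE in a private anonymous
 * mapping. On the first attempt *foliop is NULL and the source page is
 * copied with page faults disabled; if that copy cannot complete, the
 * freshly allocated folio is handed back through *foliop and -ENOENT is
 * returned so that the caller can redo the copy outside of the locks and
 * then retry with the prefilled folio.
 */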
static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
				 struct vm_area_struct *dst_vma,
				 unsigned long dst_addr,
				 unsigned long src_addr,
				 uffd_flags_t flags,
				 struct folio **foliop)
{
	void *kaddr;
	int ret;
	struct folio *folio;

	if (!*foliop) {
		ret = -ENOMEM;
		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr, false);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here. Despite the mmap_lock
		 * being read-recursive, a deadlock is still possible if a
		 * writer has taken a lock. For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
		pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
				     PAGE_SIZE);
		pagefault_enable();
		kunmap_local(kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
		}

		flush_dcache_folio(folio);
	} else {
		folio = *foliop;
		*foliop = NULL;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}

static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret = -ENOMEM;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return ret;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_put;

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
	__folio_mark_uptodate(folio);

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		goto out_put;

	return 0;
out_put:
	folio_put(folio);
	return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm))
		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find the folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

	folio_unlock(folio);
	ret = 0;
out:
	return ret;
out_release:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we are not necessarily here because the pmd was
	 * missing: the *pmd may already be established, and it may
	 * even be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas. Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb. PMD_SIZE huge pages may exist, as used by
	 * THP. Since we cannot reliably insert a zero page, this feature
	 * is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set. If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds. fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						uffd_flags_t flags,
						struct folio **foliop)
{
	ssize_t err;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
						 dst_addr, flags);
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
					       dst_addr, flags);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
							dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}

	return err;
}
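
/*
 * Common driver for UFFDIO_COPY, UFFDIO_ZEROPAGE, UFFDIO_CONTINUE and
 * UFFDIO_POISON on non-hugetlb ranges. The overall flow is roughly: lock and
 * validate dst_vma, walk the range one PAGE_SIZE step at a time, allocate
 * the page-table levels as needed and hand each address to
 * mfill_atomic_pte(). A -ENOENT from the copy path means the source page
 * could not be copied with page faults disabled; in that case all locks are
 * dropped, the source is copied into the folio with page faults enabled, and
 * the operation restarts from the retry: label with the prefilled folio.
 * Returns the number of bytes filled, or a negative error if nothing was
 * filled.
 */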
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mm_struct *dst_mm = ctx->mm;
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
				       src_addr, flags, &folio);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *kaddr;

			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			BUG_ON(!folio);

			kaddr = kmap_local_folio(folio, 0);
			err = copy_from_user(kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap_local(kaddr);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			flush_dcache_folio(folio);
			goto retry;
		} else
			BUG_ON(folio);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_mfill_unlock(dst_vma);
out:
	if (folio)
		folio_put(folio);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
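
/*
 * The entry points below are called from the userfaultfd ioctl handlers.
 * For illustration, a fault-handling thread typically resolves a missing
 * page fault with something along the lines of:
 *
 *	struct uffdio_copy copy = {
 *		.dst = fault_addr & ~(page_size - 1),
 *		.src = (unsigned long)local_page_buf,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);
 *
 * which ends up in mfill_atomic_copy() below. local_page_buf, page_size and
 * fault_addr stand in for the caller's own variables; the uffdio_copy layout
 * follows the userfaultfd uAPI in <linux/userfaultfd.h>.
 */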
ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
			  unsigned long src_start, unsigned long len,
			  uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
			      unsigned long start,
			      unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
			      unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
			    unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
			"The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected by default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	spinlock_t *ptl_tmp;

	if (ptl1 > ptl2) {
		/* exchange ptl1 and ptl2 */
		ptl_tmp = ptl1;
		ptl1 = ptl2;
		ptl2 = ptl_tmp;
	}
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}
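
/*
 * Move a present, exclusive anonymous page from src_addr to dst_addr for
 * UFFDIO_MOVE. Both page-table locks are taken and both PTEs are re-checked
 * against the values the caller sampled; large, DMA-pinned or non-exclusive
 * folios are rejected with -EBUSY so the caller can split or fail. On
 * success the folio's anon_vma and index are switched over to dst_vma and a
 * writable, dirty PTE is installed at dst_addr.
 */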
static int move_present_pte(struct mm_struct *mm,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long dst_addr, unsigned long src_addr,
			    pte_t *dst_pte, pte_t *src_pte,
			    pte_t orig_dst_pte, pte_t orig_src_pte,
			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
			    struct folio *src_folio)
{
	int err = 0;

	double_pt_lock(dst_ptl, src_ptl);

	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		err = -EAGAIN;
		goto out;
	}
	if (folio_test_large(src_folio) ||
	    folio_maybe_dma_pinned(src_folio) ||
	    !PageAnonExclusive(&src_folio->page)) {
		err = -EBUSY;
		goto out;
	}

	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
	/* Folio got pinned from under us. Put it back and fail the move. */
	if (folio_maybe_dma_pinned(src_folio)) {
		set_pte_at(mm, src_addr, src_pte, orig_src_pte);
		err = -EBUSY;
		goto out;
	}

	folio_move_anon_rmap(src_folio, dst_vma);
	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));

	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
	/* Follow mremap() behavior and mark the entry dirty after the move */
	orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);

	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
	double_pt_unlock(dst_ptl, src_ptl);
	return err;
}
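
/*
 * Move a swapped-out anonymous page by transferring its swap entry from
 * src_addr to dst_addr. Only swap PTEs that are marked exclusive can be
 * moved; anything else is refused with -EBUSY.
 */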
static int move_swap_pte(struct mm_struct *mm,
			 unsigned long dst_addr, unsigned long src_addr,
			 pte_t *dst_pte, pte_t *src_pte,
			 pte_t orig_dst_pte, pte_t orig_src_pte,
			 spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	if (!pte_swp_exclusive(orig_src_pte))
		return -EBUSY;

	double_pt_lock(dst_ptl, src_ptl);

	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}

static int move_zeropage_pte(struct mm_struct *mm,
			     struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma,
			     unsigned long dst_addr, unsigned long src_addr,
			     pte_t *dst_pte, pte_t *src_pte,
			     pte_t orig_dst_pte, pte_t orig_src_pte,
			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
	pte_t zero_pte;

	double_pt_lock(dst_ptl, src_ptl);
	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
		double_pt_unlock(dst_ptl, src_ptl);
		return -EAGAIN;
	}

	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep_clear_flush(src_vma, src_addr, src_pte);
	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
	double_pt_unlock(dst_ptl, src_ptl);

	return 0;
}


/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible, and return 0 if it succeeded
 * in moving the page.
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			  struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma,
			  unsigned long dst_addr, unsigned long src_addr,
			  __u64 mode)
{
	swp_entry_t entry;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;

	struct folio *src_folio = NULL;
	struct anon_vma *src_anon_vma = NULL;
	struct mmu_notifier_range range;
	int err = 0;

	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				src_addr, src_addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
retry:
	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);

	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		err = -EAGAIN;
		goto out;
	}

	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);

	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		err = -EAGAIN;
		goto out;
	}

	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
		err = -EINVAL;
		goto out;
	}

	spin_lock(dst_ptl);
	orig_dst_pte = ptep_get(dst_pte);
	spin_unlock(dst_ptl);
	if (!pte_none(orig_dst_pte)) {
		err = -EEXIST;
		goto out;
	}

	spin_lock(src_ptl);
	orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			err = -ENOENT;
		else /* nothing to do to move a hole */
			err = 0;
		goto out;
	}

	/* If PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		err = -EAGAIN;
		goto out;
	}

	if (pte_present(orig_src_pte)) {
		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
			err = move_zeropage_pte(mm, dst_vma, src_vma,
						dst_addr, src_addr, dst_pte, src_pte,
						orig_dst_pte, orig_src_pte,
						dst_ptl, src_ptl);
			goto out;
		}

		/*
		 * Pin and lock both source folio and anon_vma. Since we are in
		 * RCU read section, we can't block, so on contention have to
		 * unmap the ptes, obtain the lock and retry.
		 */
		if (!src_folio) {
			struct folio *folio;

			/*
			 * Pin the page while holding the lock to be sure the
			 * page isn't freed under us
			 */
			spin_lock(src_ptl);
			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
				spin_unlock(src_ptl);
				err = -EAGAIN;
				goto out;
			}

			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
			if (!folio || !PageAnonExclusive(&folio->page)) {
				spin_unlock(src_ptl);
				err = -EBUSY;
				goto out;
			}

			folio_get(folio);
			src_folio = folio;
			src_folio_pte = orig_src_pte;
			spin_unlock(src_ptl);

			if (!folio_trylock(src_folio)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
			}

			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
				err = -EBUSY;
				goto out;
			}
		}

		/* at this point we have src_folio locked */
		if (folio_test_large(src_folio)) {
			/* split_folio() can block */
			pte_unmap(&orig_src_pte);
			pte_unmap(&orig_dst_pte);
			src_pte = dst_pte = NULL;
			err = split_folio(src_folio);
			if (err)
				goto out;
			/* have to reacquire the folio after it got split */
			folio_unlock(src_folio);
			folio_put(src_folio);
			src_folio = NULL;
			goto retry;
		}

		if (!src_anon_vma) {
			/*
			 * folio_referenced walks the anon_vma chain
			 * without the folio lock. Serialize against it with
			 * the anon_vma lock, the folio lock is not enough.
			 */
			src_anon_vma = folio_get_anon_vma(src_folio);
			if (!src_anon_vma) {
				/* page was unmapped from under us */
				err = -EAGAIN;
				goto out;
			}
			if (!anon_vma_trylock_write(src_anon_vma)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				/* now we can block and wait */
				anon_vma_lock_write(src_anon_vma);
				goto retry;
			}
		}

		err = move_present_pte(mm, dst_vma, src_vma,
				       dst_addr, src_addr, dst_pte, src_pte,
				       orig_dst_pte, orig_src_pte,
				       dst_ptl, src_ptl, src_folio);
	} else {
		entry = pte_to_swp_entry(orig_src_pte);
		if (non_swap_entry(entry)) {
			if (is_migration_entry(entry)) {
				pte_unmap(&orig_src_pte);
				pte_unmap(&orig_dst_pte);
				src_pte = dst_pte = NULL;
				migration_entry_wait(mm, src_pmd, src_addr);
				err = -EAGAIN;
			} else
				err = -EFAULT;
			goto out;
		}

		err = move_swap_pte(mm, dst_addr, src_addr,
				    dst_pte, src_pte,
				    orig_dst_pte, orig_src_pte,
				    dst_ptl, src_ptl);
	}

out:
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
	if (src_folio) {
		folio_unlock(src_folio);
		folio_put(src_folio);
	}
	if (dst_pte)
		pte_unmap(dst_pte);
	if (src_pte)
		pte_unmap(src_pte);
	mmu_notifier_invalidate_range_end(&range);

	return err;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif

static inline bool vma_move_compatible(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
}

static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}

static __always_inline
int find_vmas_mm_locked(struct mm_struct *mm,
			unsigned long dst_start,
			unsigned long src_start,
			struct vm_area_struct **dst_vmap,
			struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma_and_prepare_anon(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/* Skip finding src_vma if src_start is in dst_vma */
	if (src_start >= vma->vm_start && src_start < vma->vm_end)
		goto out_success;

	vma = vma_lookup(mm, src_start);
	if (!vma)
		return -ENOENT;
out_success:
	*src_vmap = vma;
	return 0;
}

#ifdef CONFIG_PER_VMA_LOCK
static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	struct vm_area_struct *vma;
	int err;

	vma = lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using lock_vma() to get src_vma can lead to following deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;

	/* Undo any locking and retry in mmap_lock critical section */
	vma_end_read(*dst_vmap);

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (!err) {
		/*
		 * See comment in lock_vma() as to why not using
		 * vma_start_read() here.
		 */
		down_read(&(*dst_vmap)->vm_lock->lock);
		if (*dst_vmap != *src_vmap)
			down_read_nested(&(*src_vmap)->vm_lock->lock,
					 SINGLE_DEPTH_NESTING);
	}
	mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	vma_end_read(src_vma);
	if (src_vma != dst_vma)
		vma_end_read(dst_vma);
}

#else

static int uffd_move_lock(struct mm_struct *mm,
			  unsigned long dst_start,
			  unsigned long src_start,
			  struct vm_area_struct **dst_vmap,
			  struct vm_area_struct **src_vmap)
{
	int err;

	mmap_read_lock(mm);
	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
	if (err)
		mmap_read_unlock(mm);
	return err;
}

static void uffd_move_unlock(struct vm_area_struct *dst_vma,
			     struct vm_area_struct *src_vma)
{
	mmap_assert_locked(src_vma->vm_mm);
	mmap_read_unlock(dst_vma->vm_mm);
}
#endif
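
/*
 * For illustration, userspace reaches move_pages() below through the
 * UFFDIO_MOVE ioctl, roughly as:
 *
 *	struct uffdio_move move = {
 *		.dst = dst_addr,
 *		.src = src_addr,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_MOVE, &move);
 *
 * dst_addr, src_addr and page_size stand in for the caller's own values; the
 * uffdio_move layout follows the userfaultfd uAPI in <linux/userfaultfd.h>,
 * and move.move reports the number of bytes moved back to the caller.
 */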

/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * prevents move_pages() from failing with -EBUSY if the process forks
 * before move_pages() is called), then it will call move_pages() to map
 * the page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical in the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will be "len" if the command is successful. The
 * command however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), they will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed they
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
	    WARN_ON_ONCE(len & ~PAGE_MASK))
		goto out;

	/* Does the address range wrap, or is the span zero-sized? */
	if (WARN_ON_ONCE(src_start + len <= src_start) ||
	    WARN_ON_ONCE(dst_start + len <= dst_start))
		goto out;

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (unlikely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

	for (src_addr = src_start, dst_addr = dst_start;
	     src_addr < src_start + len;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			if (pmd_devmap(*src_pmd)) {
				spin_unlock(ptl);
				err = -ENOENT;
				break;
			}

			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));

				if (!folio || (!is_huge_zero_page(&folio->page) &&
					       !PageAnonExclusive(&folio->page))) {
					spin_unlock(ptl);
					err = -EBUSY;
					break;
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_pte() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

			err = move_pages_pte(mm, dst_pmd, src_pmd,
					     dst_vma, src_vma,
					     dst_addr, src_addr, mode);
			step_size = PAGE_SIZE;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON(moved < 0);
	VM_WARN_ON(err > 0);
	VM_WARN_ON(!moved && !err);
	return moved ? moved : err;
}