// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * Some part derived from fs/eventfd.c (anon inode setup) and
 * mm/ksm.c (mm hashing).
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/list.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>
#include <linux/file.h>
#include <linux/cleanup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"

/*
 * State carried through one mfill_atomic() call.
 *
 * The first group of fields is the request as supplied by the caller and is
 * filled in by the initializer in mfill_atomic(). The second group is working
 * state: @vma is set by mfill_get_vma() and cleared by mfill_put_vma(),
 * @src_addr/@dst_addr are the addresses of the page currently being
 * processed, and @pmd is set by mfill_establish_pmd().
 */
struct mfill_state {
	struct userfaultfd_ctx *ctx;
	unsigned long src_start;
	unsigned long dst_start;
	unsigned long len;
	uffd_flags_t flags;

	struct vm_area_struct *vma;
	unsigned long src_addr;
	unsigned long dst_addr;
	pmd_t *pmd;
};

/*
 * ->can_userfault() for anonymous VMAs: every registration mode is accepted
 * except MINOR. @vma is not consulted.
 */
static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	/* anonymous memory does not support MINOR mode */
	if (vm_flags & VM_UFFD_MINOR)
		return false;
	return true;
}

/*
 * ->alloc_folio() for anonymous memory: allocate a folio for @addr in @vma
 * (order argument 0) and charge it against vma->vm_mm.
 *
 * Return: the folio, or NULL if either the allocation or the memcg charge
 * failed (the folio is released again in the latter case).
 */
static struct folio *anon_alloc_folio(struct vm_area_struct *vma,
				      unsigned long addr)
{
	struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
					      addr);

	if (!folio)
		return NULL;

	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
		folio_put(folio);
		return NULL;
	}

	return folio;
}

/*
 * uffd operations for anonymous memory. No ->get_folio_noalloc is provided,
 * so mfill_get_vma() rejects MFILL_ATOMIC_CONTINUE on VMAs using these ops;
 * ->filemap_add/->filemap_remove are left unset as well.
 */
static const struct vm_uffd_ops anon_uffd_ops = {
	.can_userfault	= anon_can_userfault,
	.alloc_folio	= anon_alloc_folio,
};

/*
 * Return the uffd operations for @vma: the built-in anonymous ops for an
 * anonymous VMA, otherwise whatever vma->vm_ops->uffd_ops holds. The result
 * may be NULL for non-anonymous VMAs; callers check for that.
 */
static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return &anon_uffd_ops;
	return vma->vm_ops->uffd_ops;
}

/*
 * Check that @dst_vma extends at least to @dst_end and is registered with a
 * userfaultfd context.
 *
 * Return: true if both conditions hold.
 */
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}

/*
 * Look up the VMA containing @addr; the caller must hold mmap_lock. For a
 * VMA without VM_SHARED, anon_vma_prepare() is called as well.
 *
 * Return: the VMA, ERR_PTR(-ENOENT) if no VMA contains @addr, or
 * ERR_PTR(-ENOMEM) if anon_vma_prepare() failed.
 */
static __always_inline
struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
						 unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		vma = ERR_PTR(-ENOENT);
	else if (!(vma->vm_flags & VM_SHARED) &&
		 unlikely(anon_vma_prepare(vma)))
		vma = ERR_PTR(-ENOMEM);

	return vma;
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * The lockless lookup is tried first. If it finds nothing, or finds a
 * non-shared vma that has no anon_vma yet, the lookup is repeated under
 * mmap_lock, which is dropped again before returning.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found,
 * -ENOMEM if anon_vma couldn't be allocated, or -EAGAIN if the vma read lock
 * could not be taken while holding mmap_lock.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
					    unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}

	/* Slow path: redo the lookup with mmap_lock held for read. */
	mmap_read_lock(mm);
	vma = find_vma_and_prepare_anon(mm, address);
	if (!IS_ERR(vma)) {
		bool locked = vma_start_read_locked(vma);

		if (!locked)
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);
	return vma;
}

/*
 * Lock the vma containing @dst_start and validate it against the range
 * ending at @dst_start + @len.
 *
 * Return: the read-locked vma on success, the ERR_PTR from uffd_lock_vma()
 * if the lookup failed, or ERR_PTR(-ENOENT) if validate_dst_vma() rejected
 * the vma (the vma lock is dropped first in that case).
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	dst_vma = uffd_lock_vma(dst_mm, dst_start);
	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	vma_end_read(dst_vma);
	return ERR_PTR(-ENOENT);
}

/* Release the vma read lock taken by uffd_mfill_lock(). */
static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	vma_end_read(vma);
}

#else

/*
 * Variant without per-VMA locks: look up and validate the vma under
 * mmap_lock.
 *
 * Return: the vma with mmap_lock still held for read on success; otherwise an
 * ERR_PTR (-ENOENT, or the error from find_vma_and_prepare_anon()) with
 * mmap_lock released.
 */
static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long len)
{
	struct vm_area_struct *dst_vma;

	mmap_read_lock(dst_mm);
	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
	if (IS_ERR(dst_vma))
		goto out_unlock;

	if (validate_dst_vma(dst_vma, dst_start + len))
		return dst_vma;

	dst_vma = ERR_PTR(-ENOENT);
out_unlock:
	mmap_read_unlock(dst_mm);
	return dst_vma;
}

/* Release the mmap_lock read lock taken by uffd_mfill_lock(). */
static void uffd_mfill_unlock(struct vm_area_struct *vma)
{
	mmap_read_unlock(vma->vm_mm);
}
#endif

/*
 * Undo mfill_get_vma(): drop map_changing_lock, then the lock taken by
 * uffd_mfill_lock(), and clear state->vma. Does nothing if state->vma is
 * already NULL, so it is safe to call more than once.
 */
static void mfill_put_vma(struct mfill_state *state)
{
	if (!state->vma)
		return;

	up_read(&state->ctx->map_changing_lock);
	uffd_mfill_unlock(state->vma);
	state->vma = NULL;
}

/*
 * Lock and validate the destination vma for the request in @state.
 *
 * On success state->vma is set and both the lock taken by uffd_mfill_lock()
 * and ctx->map_changing_lock (read) are held; release them with
 * mfill_put_vma().
 *
 * Return: 0 on success. On failure no locks are held and the result is the
 * error from uffd_mfill_lock(), -EAGAIN if ctx->mmap_changing is set, or
 * -EINVAL if the vma is unsuitable for the requested mode.
 */
static int mfill_get_vma(struct mfill_state *state)
{
	struct userfaultfd_ctx *ctx = state->ctx;
	uffd_flags_t flags = state->flags;
	struct vm_area_struct *dst_vma;
	const struct vm_uffd_ops *ops;
	int err;

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len);
	if (IS_ERR(dst_vma))
		return PTR_ERR(dst_vma);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	/* From here on mfill_put_vma() knows what to release. */
	state->vma = dst_vma;
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;

	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
			 dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/* hugetlb vmas skip the uffd_ops checks below. */
	if (is_vm_hugetlb_page(dst_vma))
		return 0;

	ops = vma_uffd_ops(dst_vma);
	if (!ops)
		goto out_unlock;

	/* CONTINUE needs a way to look up an existing folio. */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
	    !ops->get_folio_noalloc)
		goto out_unlock;

	return 0;

out_unlock:
	mfill_put_vma(state);
	return err;
}

/*
 * Walk the page tables of @mm down to the PMD level for @address, allocating
 * the P4D, PUD and PMD levels as needed.
 *
 * Return: pointer to the PMD entry, or NULL if an allocation failed.
 */
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

/*
 * Make sure a PTE page table exists for state->dst_addr and store the PMD
 * pointer in state->pmd.
 *
 * Return: 0 on success, -ENOMEM if a page-table allocation failed, -EEXIST
 * if the PMD is not present or is a leaf entry, -EFAULT if the PMD is bad.
 */
static int mfill_establish_pmd(struct mfill_state *state)
{
	struct mm_struct *dst_mm = state->ctx->mm;
	pmd_t *dst_pmd, dst_pmdval;

	dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
	if (unlikely(!dst_pmd))
		return -ENOMEM;

	dst_pmdval = pmdp_get_lockless(dst_pmd);
	if (unlikely(pmd_none(dst_pmdval)) &&
	    unlikely(__pte_alloc(dst_mm, dst_pmd)))
		return -ENOMEM;

	/* Re-read: the entry may have changed since the first sample. */
	dst_pmdval = pmdp_get_lockless(dst_pmd);
	/*
	 * If the dst_pmd is THP don't override it and just be strict.
	 * (This includes the case where the PMD used to be THP and
	 * changed back to none after __pte_alloc().)
	 */
	if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval)))
		return -EEXIST;
	if (unlikely(pmd_bad(dst_pmdval)))
		return -EFAULT;

	state->pmd = dst_pmd;
	return 0;
}

/*
 * Check if dst_addr is outside of file's size. Must be called with ptl held.
 * Always false for a vma that has no backing file.
 */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;

	if (!dst_vma->vm_file)
		return false;

	inode = dst_vma->vm_file->f_inode;
	offset = linear_page_index(dst_vma, dst_addr);
	/* First page index that lies entirely beyond i_size. */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	return offset >= max_off;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
*/
static int mfill_atomic_install_pte(pmd_t *dst_pmd,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_addr, struct page *page,
				    uffd_flags_t flags)
{
	/*
	 * Returns 0 on success, -EAGAIN if the PTE table could not be mapped
	 * and locked, -EFAULT if dst_addr lies beyond the backing file's
	 * size, or -EEXIST if the PTE is already populated with anything
	 * other than a uffd marker.
	 */
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	/* A folio with a mapping is treated as a page cache folio. */
	bool page_in_cache = folio_mapping(folio);
	pte_t dst_ptep;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	/* Never map a page cache folio writable into a non-shared vma. */
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;

	dst_ptep = ptep_get(dst_pte);

	/*
	 * We are allowed to overwrite a UFFD pte marker: consider when both
	 * MISSING|WP registered, we firstly wr-protect a none pte which has no
	 * page cache page backing it, then access the page.
	 */
	if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
		goto out_unlock;

	if (page_in_cache) {
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(folio));

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/*
	 * NOTE(review): page cache folios appear to be handed in locked (the
	 * callers' error paths unlock or remove them) - confirm. On success
	 * the folio lock is released here.
	 */
	if (page_in_cache)
		folio_unlock(folio);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/*
 * Copy PAGE_SIZE bytes from the user address @src_addr into the first page of
 * @folio, with page faults disabled.
 *
 * Return: 0 on success, -EFAULT if copy_from_user() left any bytes uncopied.
 * The caller is expected to retry via mfill_copy_folio_retry() in that case.
 */
static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
{
	void *kaddr;
	int ret;

	kaddr = kmap_local_folio(folio, 0);
	/*
	 * The read mmap_lock is held here.  Despite the
	 * mmap_lock being read recursive a deadlock is still
	 * possible if a writer has taken a lock.  For example:
	 *
	 * process A thread 1 takes read lock on own mmap_lock
	 * process A thread 2 calls mmap, blocks taking write lock
	 * process B thread 1 takes page fault, read lock on own mmap lock
	 * process B thread 2 calls mmap, blocks taking write lock
	 * process A thread 1 blocks taking read lock on process B
	 * process B thread 1 blocks taking read lock on process A
	 *
	 * Disable page faults to prevent potential deadlock
	 * and retry the copy outside the mmap_lock.
	 */
	pagefault_disable();
	ret = copy_from_user(kaddr, (const void __user *) src_addr,
			     PAGE_SIZE);
	pagefault_enable();
	kunmap_local(kaddr);

	if (ret)
		return -EFAULT;

	flush_dcache_folio(folio);
	/* ret is 0 here. */
	return ret;
}

/*
 * VMA flags that mfill_retry_state_save() records and
 * mfill_retry_state_changed() compares: __VMA_UFFD_FLAGS plus the shared bit.
 */
#define MFILL_RETRY_STATE_VMA_FLAGS \
	append_vma_flags(__VMA_UFFD_FLAGS, VMA_SHARED_BIT)

/*
 * VMA state saved before dropping the locks in mfill_copy_folio_retry().
 * Used to detect VMA replacement or incompatible changes after reacquiring the
 * locks.
*/
struct mfill_retry_state {
	const struct vm_uffd_ops *ops;	/* result of vma_uffd_ops() */
	struct file *file;		/* referenced vm_file, NULL if none */
	vma_flags_t flags;		/* masked by MFILL_RETRY_STATE_VMA_FLAGS */
	pgoff_t pgoff;			/* vma->vm_pgoff */
};

/*
 * Record the properties of @vma that must be unchanged after the locks are
 * retaken. A reference is taken on vma->vm_file if there is one; release it
 * with mfill_retry_state_put().
 *
 * @s must be zero-initialised by the caller: ->file is only written when the
 * vma has a file.
 */
static void mfill_retry_state_save(struct mfill_retry_state *s,
				   struct vm_area_struct *vma)
{
	s->flags = vma_flags_and_mask(&vma->flags, MFILL_RETRY_STATE_VMA_FLAGS);
	s->ops = vma_uffd_ops(vma);
	s->pgoff = vma->vm_pgoff;

	if (vma->vm_file)
		s->file = get_file(vma->vm_file);
}

/*
 * Compare @vma against the snapshot in @state.
 *
 * Return: true if the masked flags, the uffd ops, the anonymous/file-backed
 * nature, the file, its inode or the page offset differ; false otherwise.
 */
static bool mfill_retry_state_changed(struct mfill_retry_state *state,
				      struct vm_area_struct *vma)
{
	vma_flags_t flags = vma_flags_and_mask(&vma->flags,
					       MFILL_RETRY_STATE_VMA_FLAGS);

	/* Have any UFFD flags (missing, WP, minor) changed? */
	if (!vma_flags_same_pair(&state->flags, &flags))
		return true;

	/* VMA type or effective uffd_ops changed while the lock was dropped */
	if (state->ops != vma_uffd_ops(vma))
		return true;

	/* VMA was anonymous before; changed only if it no longer is */
	if (!state->file)
		return !vma_is_anonymous(vma);

	/* VMA was file backed, but file, inode or offset has changed */
	if (!vma->vm_file || vma->vm_file->f_inode != state->file->f_inode ||
	    state->file != vma->vm_file || vma->vm_pgoff != state->pgoff)
		return true;

	return false;
}

/* Drop the file reference taken by mfill_retry_state_save(), if any. */
static void mfill_retry_state_put(struct mfill_retry_state *s)
{
	if (s->file)
		fput(s->file);
}

/* Scope-based cleanup hook: calls mfill_retry_state_put() on a non-NULL pointer. */
DEFINE_FREE(retry_put, struct mfill_retry_state *,
	    if (_T) mfill_retry_state_put(_T));

/*
 * Second attempt at filling @folio from userspace, used after
 * mfill_copy_folio_locked() failed: the locks are dropped, the copy is done
 * with page faults allowed, then the vma is looked up and validated again.
 *
 * Return: 0 with the locks retaken and state->pmd re-established. Otherwise
 * -EFAULT if the copy failed (no locks held), the error from mfill_get_vma()
 * (no locks held), -EAGAIN if the vma changed incompatibly, or the error from
 * mfill_establish_pmd(). In the last two cases the locks taken by
 * mfill_get_vma() are still held and the caller must mfill_put_vma().
 */
static int mfill_copy_folio_retry(struct mfill_state *mfill_state,
				  struct folio *folio)
{
	struct mfill_retry_state retry_state = { 0 };
	/* Runs mfill_retry_state_put(&retry_state) on every return path. */
	struct mfill_retry_state *for_free __free(retry_put) = &retry_state;
	unsigned long src_addr = mfill_state->src_addr;
	void *kaddr;
	int err;

	mfill_retry_state_save(&retry_state, mfill_state->vma);

	/* retry copying with mm_lock dropped */
	mfill_put_vma(mfill_state);

	kaddr = kmap_local_folio(folio, 0);
	err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE);
	kunmap_local(kaddr);
	if (unlikely(err))
		return -EFAULT;

	flush_dcache_folio(folio);

	/* reget VMA and PMD, they could change underneath us */
	err = mfill_get_vma(mfill_state);
	if (err)
		return err;

	if (mfill_retry_state_changed(&retry_state, mfill_state->vma))
		return -EAGAIN;

	err = mfill_establish_pmd(mfill_state);
	if (err)
		return err;

	return 0;
}

/*
 * Common implementation of COPY and zeroed-folio ZEROPAGE for one page:
 * allocate a folio through @ops, fill it, optionally add it to the file
 * mapping through @ops->filemap_add, and install the PTE.
 *
 * Return: 0 on success; -EOPNOTSUPP if @ops is NULL, -ENOMEM if no folio
 * could be allocated, or the error from the copy retry, ->filemap_add() or
 * mfill_atomic_install_pte(). The folio reference is dropped on failure.
 */
static int __mfill_atomic_pte(struct mfill_state *state,
			      const struct vm_uffd_ops *ops)
{
	unsigned long dst_addr = state->dst_addr;
	unsigned long src_addr = state->src_addr;
	uffd_flags_t flags = state->flags;
	struct folio *folio;
	int ret;

	if (!ops) {
		VM_WARN_ONCE(1, "UFFDIO_COPY for unsupported VMA");
		return -EOPNOTSUPP;
	}

	folio = ops->alloc_folio(state->vma, state->dst_addr);
	if (!folio)
		return -ENOMEM;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
		ret = mfill_copy_folio_locked(folio, src_addr);
		/*
		 * Fallback to copy_from_user outside mmap_lock.
		 * If retry is successful, mfill_copy_folio_retry() returns
		 * with locks retaken by mfill_get_vma().
		 * If there was an error, we must mfill_put_vma() anyway and it
		 * will take care of unlocking if needed.
		 */
		if (unlikely(ret)) {
			ret = mfill_copy_folio_retry(state, folio);
			if (ret)
				goto err_folio_put;
		}
	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		clear_user_highpage(&folio->page, state->dst_addr);
	} else {
		VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__folio_mark_uptodate(folio);

	/*
	 * state->vma and state->pmd are re-read below rather than cached
	 * above: the retry path may have replaced them.
	 */
	if (ops->filemap_add) {
		ret = ops->filemap_add(folio, state->vma, state->dst_addr);
		if (ret)
			goto err_folio_put;
	}

	ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr,
				       &folio->page, flags);
	if (ret)
		goto err_filemap_remove;

	return 0;

err_filemap_remove:
	if (ops->filemap_remove)
		ops->filemap_remove(folio, state->vma);
err_folio_put:
	folio_put(folio);
	return ret;
}

/*
 * Handle MFILL_ATOMIC_COPY for one page. Non-shared vmas always use the
 * anonymous ops regardless of what backs the vma.
 */
static int mfill_atomic_pte_copy(struct mfill_state *state)
{
	const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);

	/*
	 * The normal page fault path for a MAP_PRIVATE mapping in a
	 * file-backed VMA will invoke the fault, fill the hole in the file and
	 * COW it right away. The result generates plain anonymous memory.
	 * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll
	 * generate anonymous memory directly without actually filling the
	 * hole. For the MAP_PRIVATE case the robustness check only happens in
	 * the pagetable (to verify it's still none) and not in the page cache.
	 */
	if (!(state->vma->vm_flags & VM_SHARED))
		ops = &anon_uffd_ops;

	return __mfill_atomic_pte(state, ops);
}

/*
 * ZEROPAGE fallback that allocates a real folio through the vma's own uffd
 * ops and clears it (see the MFILL_ATOMIC_ZEROPAGE branch of
 * __mfill_atomic_pte()).
 */
static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state)
{
	const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);

	return __mfill_atomic_pte(state, ops);
}

/*
 * Handle MFILL_ATOMIC_ZEROPAGE for one page. Non-shared vmas in an mm that
 * permits it get a special PTE pointing at the zero pfn; shared vmas, and mms
 * for which mm_forbids_zeropage() is true, get a freshly zeroed folio instead.
 *
 * Return: 0 on success, -EAGAIN if the PTE table could not be mapped and
 * locked, -EFAULT if dst_addr lies beyond the backing file's size, -EEXIST
 * if the PTE is not none, or the result of mfill_atomic_pte_zeroed_folio().
 */
static int mfill_atomic_pte_zeropage(struct mfill_state *state)
{
	struct vm_area_struct *dst_vma = state->vma;
	unsigned long dst_addr = state->dst_addr;
	pmd_t *dst_pmd = state->pmd;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm) ||
	    (dst_vma->vm_flags & VM_SHARED))
		return mfill_atomic_pte_zeroed_folio(state);

	_dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}
	ret = -EEXIST;
	/* Unlike mfill_atomic_install_pte(), a uffd marker is not overwritten. */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private).
*/
static int mfill_atomic_pte_continue(struct mfill_state *state)
{
	/*
	 * Returns 0 on success, -EOPNOTSUPP if the vma has no uffd ops,
	 * -EFAULT if no folio exists at the file offset, -EIO if the page is
	 * hardware-poisoned, or the error from mfill_atomic_install_pte().
	 */
	struct vm_area_struct *dst_vma = state->vma;
	const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma);
	unsigned long dst_addr = state->dst_addr;
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct inode *inode = file_inode(dst_vma->vm_file);
	uffd_flags_t flags = state->flags;
	pmd_t *dst_pmd = state->pmd;
	struct folio *folio;
	struct page *page;
	int ret;

	if (!ops) {
		VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA");
		return -EOPNOTSUPP;
	}

	folio = ops->get_folio_noalloc(inode, pgoff);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (IS_ERR_OR_NULL(folio))
		return -EFAULT;

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
	}

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, flags);
	if (ret)
		goto out_release;

	return 0;

out_release:
	/*
	 * NOTE(review): this implies ->get_folio_noalloc() returns the folio
	 * locked and with a reference held - confirm against the ops
	 * implementations.
	 */
	folio_unlock(folio);
	folio_put(folio);
	return ret;
}

/*
 * Handles UFFDIO_POISON for all non-hugetlb VMAs: installs a
 * PTE_MARKER_POISONED marker at state->dst_addr.
 *
 * Return: 0 on success, -EAGAIN if the PTE table could not be mapped and
 * locked, -EFAULT if dst_addr lies beyond the backing file's size, -EEXIST
 * if the PTE is not none.
 */
static int mfill_atomic_pte_poison(struct mfill_state *state)
{
	struct vm_area_struct *dst_vma = state->vma;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	unsigned long dst_addr = state->dst_addr;
	pmd_t *dst_pmd = state->pmd;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 *
 * ctx->map_changing_lock is expected to be held for read on entry as well
 * and is likewise released on every return path.
 *
 * Return: the number of bytes processed if any huge page was handled,
 * otherwise a negative error code.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	folio = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		/* The vma must still be hugetlb with the same page size. */
		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		VM_WARN_ON_ONCE(dst_addr >= dst_start + len);

		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
		idx = hugetlb_linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		hugetlb_vma_lock_read(dst_vma);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			hugetlb_vma_unlock_read(dst_vma);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		/*
		 * Except for CONTINUE, the destination must be empty or hold
		 * only a uffd marker.
		 */
		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
			const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);

			if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
				err = -EEXIST;
				hugetlb_vma_unlock_read(dst_vma);
				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				goto out_unlock;
			}
		}

		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
					       src_addr, flags, &folio);

		hugetlb_vma_unlock_read(dst_vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		cond_resched();

		/*
		 * -ENOENT with a folio handed back: drop the locks, fill the
		 * folio from userspace without them, then look the vma up
		 * again and retry the same address.
		 */
		if (unlikely(err == -ENOENT)) {
			up_read(&ctx->map_changing_lock);
			uffd_mfill_unlock(dst_vma);
			VM_WARN_ON_ONCE(!folio);

			err = copy_folio_from_user(folio,
						   (const void __user *)src_addr, true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}

			dst_vma = NULL;
			goto retry;
		} else
			VM_WARN_ON_ONCE(folio);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
out_unlock_vma:
	uffd_mfill_unlock(dst_vma);
out:
	/* A folio left over from an aborted retry is released here. */
	if (folio)
		folio_put(folio);
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_start,
				    unsigned long src_start,
				    unsigned long len,
				    uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Dispatch one page of work to the handler for the mode encoded in
 * state->flags.
 *
 * Return: the handler's result, or -EOPNOTSUPP for an unrecognised mode.
 */
static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
	uffd_flags_t flags = state->flags;

	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		return mfill_atomic_pte_continue(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
		return mfill_atomic_pte_poison(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
		return mfill_atomic_pte_copy(state);
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE))
		return mfill_atomic_pte_zeropage(state);

	VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
	return -EOPNOTSUPP;
}

/*
 * Core of all mfill operations: process [dst_start, dst_start + len) one page
 * at a time in the mode encoded in @flags, reading from @src_start for COPY.
 * hugetlb vmas are handed off to mfill_atomic_hugetlb().
 *
 * Return: the number of bytes processed if any page was handled, otherwise a
 * negative error code.
 */
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
					    unsigned long dst_start,
					    unsigned long src_start,
					    unsigned long len,
					    uffd_flags_t flags)
{
	struct mfill_state state = (struct mfill_state){
		.ctx = ctx,
		.dst_start = dst_start,
		.src_start = src_start,
		.flags = flags,
		.len = len,
		.src_addr = src_start,
		.dst_addr = dst_start,
	};
	long copied = 0;
	ssize_t err;

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);

	err = mfill_get_vma(&state);
	if (err)
		goto out;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 * (which releases the locks taken above before it returns).
	 */
	if (is_vm_hugetlb_page(state.vma))
		return mfill_atomic_hugetlb(ctx, state.vma, dst_start,
					    src_start, len, flags);

	while (state.src_addr < src_start + len) {
		VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);

		err = mfill_establish_pmd(&state);
		if (err)
			break;

		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

		err = mfill_atomic_pte(&state);
		cond_resched();

		if (!err) {
			state.dst_addr += PAGE_SIZE;
			state.src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

	/* No-op if a failed copy retry already dropped the locks. */
	mfill_put_vma(&state);
out:
	VM_WARN_ON_ONCE(copied < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!copied && !err);
	return copied ? copied : err;
}

/* mfill_atomic() in MFILL_ATOMIC_COPY mode; other bits of @flags are kept. */
static ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
				 unsigned long src_start, unsigned long len,
				 uffd_flags_t flags)
{
	return mfill_atomic(ctx, dst_start, src_start, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

/* mfill_atomic() in MFILL_ATOMIC_ZEROPAGE mode with no other flags and no source. */
static ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
				     unsigned long start,
				     unsigned long len)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

/* mfill_atomic() in MFILL_ATOMIC_CONTINUE mode, preceded by a write barrier. */
static ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
				     unsigned long len, uffd_flags_t flags)
{

	/*
	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
	 * subsequent loads from the page through the newly mapped address range.
	 */
	smp_wmb();

	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

/* mfill_atomic() in MFILL_ATOMIC_POISON mode; other bits of @flags are kept. */
static ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
				   unsigned long len, uffd_flags_t flags)
{
	return mfill_atomic(ctx, start, 0, len,
			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

/*
 * Set (@enable_wp) or resolve (!@enable_wp) uffd write protection on
 * [@start, @start + @len) of @dst_vma via change_protection(). The range is
 * expected to lie within @dst_vma; a violation only triggers a warning.
 *
 * Return: the value returned by change_protection().
 */
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
		     "The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}

/*
 * Apply uffd_wp_range() to every vma overlapping [@start, @start + @len),
 * under mmap_lock and ctx->map_changing_lock (both read).
 *
 * Return: 0 on success; -EAGAIN if ctx->mmap_changing is set; -ENOENT if no
 * vma overlaps the range or a vma in it is not registered for uffd-wp;
 * -EINVAL if a hugetlb vma is hit and @start or @len is not aligned to its
 * page size; or a negative error from uffd_wp_range(). Processing stops at
 * the first failure, so earlier vmas may already have been changed.
 */
static int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			       unsigned long len, bool enable_wp)
{
	struct mm_struct *dst_mm = ctx->mm;
	unsigned long end = start + len;
	unsigned long _start, _end;
	struct vm_area_struct *dst_vma;
	unsigned long page_mask;
	long err;
	VMA_ITERATOR(vmi, dst_mm, start);

	/*
	 * Sanitize the command parameters:
	 */
	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	/* Stays -ENOENT if the loop body never runs. */
	err = -ENOENT;
	for_each_vma_range(vmi, dst_vma, end) {

		if (!userfaultfd_wp(dst_vma)) {
			err = -ENOENT;
			break;
		}

		if (is_vm_hugetlb_page(dst_vma)) {
			err = -EINVAL;
			page_mask = vma_kernel_pagesize(dst_vma) - 1;
			if ((start & page_mask) || (len & page_mask))
				break;
		}

		/* Clamp the request to the part covered by this vma. */
		_start = max(dst_vma->vm_start, start);
		_end = min(dst_vma->vm_end, end);

		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);

		/* Return 0 on success, <0 on failures */
		if (err < 0)
			break;
		err = 0;
	}
out_unlock:
	up_read(&ctx->map_changing_lock);
	mmap_read_unlock(dst_mm);
	return err;
}


/*
 * Take two page-table locks. The locks are ordered by comparing the lock
 * pointers so that concurrent callers agree on the order; @ptl1 and @ptl2 may
 * refer to the same lock, in which case it is taken once.
 */
void double_pt_lock(spinlock_t *ptl1,
		    spinlock_t *ptl2)
	__acquires(ptl1)
	__acquires(ptl2)
{
	if (ptl1 > ptl2)
		swap(ptl1, ptl2);
	/* lock in virtual address order to avoid lock inversion */
	spin_lock(ptl1);
	if (ptl1 != ptl2)
		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
	else
		__acquire(ptl2);
}

/*
 * Release the locks taken by double_pt_lock(); handles the case where both
 * arguments are the same lock.
 */
void double_pt_unlock(spinlock_t *ptl1,
		      spinlock_t *ptl2)
	__releases(ptl1)
	__releases(ptl2)
{
	spin_unlock(ptl1);
	if (ptl1 != ptl2)
		spin_unlock(ptl2);
	else
		__release(ptl2);
}

/*
 * Re-check that the source PTE, the destination PTE and the destination PMD
 * still hold the values that were sampled earlier.
 *
 * Return: true if all three are unchanged.
 */
static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
				       pte_t orig_dst_pte, pte_t orig_src_pte,
				       pmd_t *dst_pmd, pmd_t dst_pmdval)
{
	return pte_same(ptep_get(src_pte), orig_src_pte) &&
	       pte_same(ptep_get(dst_pte), orig_dst_pte) &&
	       pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}

/*
 * Checks if the two ptes and the corresponding folio are eligible for batched
 * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
 *
 * Eligible means: the destination PTE is none; the source PTE is present and
 * does not map the zero pfn; it maps a normal folio whose lock could be taken
 * with folio_trylock(); and that folio is small and PageAnonExclusive.
 *
 * NOTE: folio's reference is not required as the whole operation is within
 * PTL's critical section.
 */
static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
						 unsigned long src_addr,
						 pte_t *src_pte, pte_t *dst_pte)
{
	pte_t orig_dst_pte, orig_src_pte;
	struct folio *folio;

	orig_dst_pte = ptep_get(dst_pte);
	if (!pte_none(orig_dst_pte))
		return NULL;

	orig_src_pte = ptep_get(src_pte);
	if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
		return NULL;

	folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
	if (!folio || !folio_trylock(folio))
		return NULL;
	/* Checked under the folio lock; unlock again if not eligible. */
	if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
		folio_unlock(folio);
		return NULL;
	}
	return folio;
}

/*
 * Moves src folios to dst in a batch as long as they are not large, and can
 * successfully take the lock via folio_trylock().
1239 */ 1240 static long move_present_ptes(struct mm_struct *mm, 1241 struct vm_area_struct *dst_vma, 1242 struct vm_area_struct *src_vma, 1243 unsigned long dst_addr, unsigned long src_addr, 1244 pte_t *dst_pte, pte_t *src_pte, 1245 pte_t orig_dst_pte, pte_t orig_src_pte, 1246 pmd_t *dst_pmd, pmd_t dst_pmdval, 1247 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1248 struct folio **first_src_folio, unsigned long len) 1249 { 1250 int err = 0; 1251 struct folio *src_folio = *first_src_folio; 1252 unsigned long src_start = src_addr; 1253 unsigned long src_end; 1254 1255 len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; 1256 src_end = pmd_addr_end(src_addr, src_addr + len); 1257 flush_cache_range(src_vma, src_addr, src_end); 1258 double_pt_lock(dst_ptl, src_ptl); 1259 1260 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1261 dst_pmd, dst_pmdval)) { 1262 err = -EAGAIN; 1263 goto out; 1264 } 1265 if (folio_test_large(src_folio) || 1266 folio_maybe_dma_pinned(src_folio) || 1267 !PageAnonExclusive(&src_folio->page)) { 1268 err = -EBUSY; 1269 goto out; 1270 } 1271 /* It's safe to drop the reference now as the page-table is holding one. */ 1272 folio_put(*first_src_folio); 1273 *first_src_folio = NULL; 1274 lazy_mmu_mode_enable(); 1275 1276 while (true) { 1277 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1278 /* Folio got pinned from under us. Put it back and fail the move. 
*/ 1279 if (folio_maybe_dma_pinned(src_folio)) { 1280 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1281 err = -EBUSY; 1282 break; 1283 } 1284 1285 folio_move_anon_rmap(src_folio, dst_vma); 1286 src_folio->index = linear_page_index(dst_vma, dst_addr); 1287 1288 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); 1289 /* Set soft dirty bit so userspace can notice the pte was moved */ 1290 if (pgtable_supports_soft_dirty()) 1291 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); 1292 if (pte_dirty(orig_src_pte)) 1293 orig_dst_pte = pte_mkdirty(orig_dst_pte); 1294 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); 1295 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1296 1297 src_addr += PAGE_SIZE; 1298 if (src_addr == src_end) 1299 break; 1300 dst_addr += PAGE_SIZE; 1301 dst_pte++; 1302 src_pte++; 1303 1304 folio_unlock(src_folio); 1305 src_folio = check_ptes_for_batched_move(src_vma, src_addr, 1306 src_pte, dst_pte); 1307 if (!src_folio) 1308 break; 1309 } 1310 1311 lazy_mmu_mode_disable(); 1312 if (src_addr > src_start) 1313 flush_tlb_range(src_vma, src_start, src_addr); 1314 1315 if (src_folio) 1316 folio_unlock(src_folio); 1317 out: 1318 double_pt_unlock(dst_ptl, src_ptl); 1319 return src_addr > src_start ? src_addr - src_start : err; 1320 } 1321 1322 static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, 1323 unsigned long dst_addr, unsigned long src_addr, 1324 pte_t *dst_pte, pte_t *src_pte, 1325 pte_t orig_dst_pte, pte_t orig_src_pte, 1326 pmd_t *dst_pmd, pmd_t dst_pmdval, 1327 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1328 struct folio *src_folio, 1329 struct swap_info_struct *si, swp_entry_t entry) 1330 { 1331 /* 1332 * Check if the folio still belongs to the target swap entry after 1333 * acquiring the lock. Folio can be freed in the swap cache while 1334 * not locked. 
1335 */ 1336 if (src_folio && unlikely(!folio_test_swapcache(src_folio) || 1337 entry.val != src_folio->swap.val)) 1338 return -EAGAIN; 1339 1340 double_pt_lock(dst_ptl, src_ptl); 1341 1342 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1343 dst_pmd, dst_pmdval)) { 1344 double_pt_unlock(dst_ptl, src_ptl); 1345 return -EAGAIN; 1346 } 1347 1348 /* 1349 * The src_folio resides in the swapcache, requiring an update to its 1350 * index and mapping to align with the dst_vma, where a swap-in may 1351 * occur and hit the swapcache after moving the PTE. 1352 */ 1353 if (src_folio) { 1354 folio_move_anon_rmap(src_folio, dst_vma); 1355 src_folio->index = linear_page_index(dst_vma, dst_addr); 1356 } else { 1357 /* 1358 * Check if the swap entry is cached after acquiring the src_pte 1359 * lock. Otherwise, we might miss a newly loaded swap cache folio. 1360 * 1361 * We are trying to catch newly added swap cache, the only possible case is 1362 * when a folio is swapped in and out again staying in swap cache, using the 1363 * same entry before the PTE check above. The PTL is acquired and released 1364 * twice, each time after updating the swap table. So holding 1365 * the PTL here ensures we see the updated value. 
1366 */ 1367 if (swap_cache_has_folio(entry)) { 1368 double_pt_unlock(dst_ptl, src_ptl); 1369 return -EAGAIN; 1370 } 1371 } 1372 1373 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1374 if (pgtable_supports_soft_dirty()) 1375 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); 1376 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1377 double_pt_unlock(dst_ptl, src_ptl); 1378 1379 return PAGE_SIZE; 1380 } 1381 1382 static int move_zeropage_pte(struct mm_struct *mm, 1383 struct vm_area_struct *dst_vma, 1384 struct vm_area_struct *src_vma, 1385 unsigned long dst_addr, unsigned long src_addr, 1386 pte_t *dst_pte, pte_t *src_pte, 1387 pte_t orig_dst_pte, pte_t orig_src_pte, 1388 pmd_t *dst_pmd, pmd_t dst_pmdval, 1389 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1390 { 1391 pte_t zero_pte; 1392 1393 double_pt_lock(dst_ptl, src_ptl); 1394 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1395 dst_pmd, dst_pmdval)) { 1396 double_pt_unlock(dst_ptl, src_ptl); 1397 return -EAGAIN; 1398 } 1399 1400 zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 1401 dst_vma->vm_page_prot)); 1402 ptep_clear_flush(src_vma, src_addr, src_pte); 1403 set_pte_at(mm, dst_addr, dst_pte, zero_pte); 1404 double_pt_unlock(dst_ptl, src_ptl); 1405 1406 return PAGE_SIZE; 1407 } 1408 1409 1410 /* 1411 * The mmap_lock for reading is held by the caller. Just move the page(s) 1412 * from src_pmd to dst_pmd if possible, and return number of bytes moved. 1413 * On failure, an error code is returned. 
1414 */ 1415 static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, 1416 struct vm_area_struct *dst_vma, 1417 struct vm_area_struct *src_vma, 1418 unsigned long dst_addr, unsigned long src_addr, 1419 unsigned long len, __u64 mode) 1420 { 1421 struct swap_info_struct *si = NULL; 1422 pte_t orig_src_pte, orig_dst_pte; 1423 pte_t src_folio_pte; 1424 spinlock_t *src_ptl, *dst_ptl; 1425 pte_t *src_pte = NULL; 1426 pte_t *dst_pte = NULL; 1427 pmd_t dummy_pmdval; 1428 pmd_t dst_pmdval; 1429 struct folio *src_folio = NULL; 1430 struct mmu_notifier_range range; 1431 long ret = 0; 1432 1433 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1434 src_addr, src_addr + len); 1435 mmu_notifier_invalidate_range_start(&range); 1436 retry: 1437 /* 1438 * Use the maywrite version to indicate that dst_pte will be modified, 1439 * since dst_pte needs to be none, the subsequent pte_same() check 1440 * cannot prevent the dst_pte page from being freed concurrently, so we 1441 * also need to obtain dst_pmdval and recheck pmd_same() later. 1442 */ 1443 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, 1444 &dst_ptl); 1445 1446 /* Retry if a huge pmd materialized from under us */ 1447 if (unlikely(!dst_pte)) { 1448 ret = -EAGAIN; 1449 goto out; 1450 } 1451 1452 /* 1453 * Unlike dst_pte, the subsequent pte_same() check can ensure the 1454 * stability of the src_pte page, so there is no need to get pmdval, 1455 * just pass a dummy variable to it. 1456 */ 1457 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, 1458 &src_ptl); 1459 1460 /* 1461 * We held the mmap_lock for reading so MADV_DONTNEED 1462 * can zap transparent huge pages under us, or the 1463 * transparent huge page fault can establish new 1464 * transparent huge pages under us. 
1465 */ 1466 if (unlikely(!src_pte)) { 1467 ret = -EAGAIN; 1468 goto out; 1469 } 1470 1471 /* Sanity checks before the operation */ 1472 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || 1473 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { 1474 ret = -EINVAL; 1475 goto out; 1476 } 1477 1478 spin_lock(dst_ptl); 1479 orig_dst_pte = ptep_get(dst_pte); 1480 spin_unlock(dst_ptl); 1481 if (!pte_none(orig_dst_pte)) { 1482 ret = -EEXIST; 1483 goto out; 1484 } 1485 1486 spin_lock(src_ptl); 1487 orig_src_pte = ptep_get(src_pte); 1488 spin_unlock(src_ptl); 1489 if (pte_none(orig_src_pte)) { 1490 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) 1491 ret = -ENOENT; 1492 else /* nothing to do to move a hole */ 1493 ret = PAGE_SIZE; 1494 goto out; 1495 } 1496 1497 /* If PTE changed after we locked the folio then start over */ 1498 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { 1499 ret = -EAGAIN; 1500 goto out; 1501 } 1502 1503 if (pte_present(orig_src_pte)) { 1504 if (is_zero_pfn(pte_pfn(orig_src_pte))) { 1505 ret = move_zeropage_pte(mm, dst_vma, src_vma, 1506 dst_addr, src_addr, dst_pte, src_pte, 1507 orig_dst_pte, orig_src_pte, 1508 dst_pmd, dst_pmdval, dst_ptl, src_ptl); 1509 goto out; 1510 } 1511 1512 /* 1513 * Pin and lock source folio. Since we are in RCU read section, 1514 * we can't block, so on contention have to unmap the ptes, 1515 * obtain the lock and retry. 
1516 */ 1517 if (!src_folio) { 1518 struct folio *folio; 1519 bool locked; 1520 1521 /* 1522 * Pin the page while holding the lock to be sure the 1523 * page isn't freed under us 1524 */ 1525 spin_lock(src_ptl); 1526 if (!pte_same(orig_src_pte, ptep_get(src_pte))) { 1527 spin_unlock(src_ptl); 1528 ret = -EAGAIN; 1529 goto out; 1530 } 1531 1532 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1533 if (!folio || !PageAnonExclusive(&folio->page)) { 1534 spin_unlock(src_ptl); 1535 ret = -EBUSY; 1536 goto out; 1537 } 1538 1539 locked = folio_trylock(folio); 1540 /* 1541 * We avoid waiting for folio lock with a raised 1542 * refcount for large folios because extra refcounts 1543 * will result in split_folio() failing later and 1544 * retrying. If multiple tasks are trying to move a 1545 * large folio we can end up livelocking. 1546 */ 1547 if (!locked && folio_test_large(folio)) { 1548 spin_unlock(src_ptl); 1549 ret = -EAGAIN; 1550 goto out; 1551 } 1552 1553 folio_get(folio); 1554 src_folio = folio; 1555 src_folio_pte = orig_src_pte; 1556 spin_unlock(src_ptl); 1557 1558 if (!locked) { 1559 pte_unmap(src_pte); 1560 pte_unmap(dst_pte); 1561 src_pte = dst_pte = NULL; 1562 /* now we can block and wait */ 1563 folio_lock(src_folio); 1564 goto retry; 1565 } 1566 1567 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1568 ret = -EBUSY; 1569 goto out; 1570 } 1571 } 1572 1573 /* at this point we have src_folio locked */ 1574 if (folio_test_large(src_folio)) { 1575 /* split_folio() can block */ 1576 pte_unmap(src_pte); 1577 pte_unmap(dst_pte); 1578 src_pte = dst_pte = NULL; 1579 ret = split_folio(src_folio); 1580 if (ret) 1581 goto out; 1582 /* have to reacquire the folio after it got split */ 1583 folio_unlock(src_folio); 1584 folio_put(src_folio); 1585 src_folio = NULL; 1586 goto retry; 1587 } 1588 1589 ret = move_present_ptes(mm, dst_vma, src_vma, 1590 dst_addr, src_addr, dst_pte, src_pte, 1591 orig_dst_pte, orig_src_pte, dst_pmd, 1592 dst_pmdval, dst_ptl, src_ptl, 
&src_folio, 1593 len); 1594 } else { /* !pte_present() */ 1595 struct folio *folio = NULL; 1596 const softleaf_t entry = softleaf_from_pte(orig_src_pte); 1597 1598 if (softleaf_is_migration(entry)) { 1599 pte_unmap(src_pte); 1600 pte_unmap(dst_pte); 1601 src_pte = dst_pte = NULL; 1602 migration_entry_wait(mm, src_pmd, src_addr); 1603 1604 ret = -EAGAIN; 1605 goto out; 1606 } else if (!softleaf_is_swap(entry)) { 1607 ret = -EFAULT; 1608 goto out; 1609 } 1610 1611 if (!pte_swp_exclusive(orig_src_pte)) { 1612 ret = -EBUSY; 1613 goto out; 1614 } 1615 1616 si = get_swap_device(entry); 1617 if (unlikely(!si)) { 1618 ret = -EAGAIN; 1619 goto out; 1620 } 1621 /* 1622 * Verify the existence of the swapcache. If present, the folio's 1623 * index and mapping must be updated even when the PTE is a swap 1624 * entry. The anon_vma lock is not taken during this process since 1625 * the folio has already been unmapped, and the swap entry is 1626 * exclusive, preventing rmap walks. 1627 * 1628 * For large folios, return -EBUSY immediately, as split_folio() 1629 * also returns -EBUSY when attempting to split unmapped large 1630 * folios in the swapcache. This issue needs to be resolved 1631 * separately to allow proper handling. 
1632 */ 1633 if (!src_folio) 1634 folio = swap_cache_get_folio(entry); 1635 if (folio) { 1636 if (folio_test_large(folio)) { 1637 ret = -EBUSY; 1638 folio_put(folio); 1639 goto out; 1640 } 1641 src_folio = folio; 1642 src_folio_pte = orig_src_pte; 1643 if (!folio_trylock(src_folio)) { 1644 pte_unmap(src_pte); 1645 pte_unmap(dst_pte); 1646 src_pte = dst_pte = NULL; 1647 put_swap_device(si); 1648 si = NULL; 1649 /* now we can block and wait */ 1650 folio_lock(src_folio); 1651 goto retry; 1652 } 1653 } 1654 ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, 1655 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, 1656 dst_ptl, src_ptl, src_folio, si, entry); 1657 } 1658 1659 out: 1660 if (src_folio) { 1661 folio_unlock(src_folio); 1662 folio_put(src_folio); 1663 } 1664 /* 1665 * Unmap in reverse order (LIFO) to maintain proper kmap_local 1666 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte 1667 * first, then src_pte, so we must unmap src_pte first, then dst_pte. 
1668 */ 1669 if (src_pte) 1670 pte_unmap(src_pte); 1671 if (dst_pte) 1672 pte_unmap(dst_pte); 1673 mmu_notifier_invalidate_range_end(&range); 1674 if (si) 1675 put_swap_device(si); 1676 1677 return ret; 1678 } 1679 1680 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1681 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1682 unsigned long src_addr, 1683 unsigned long src_end) 1684 { 1685 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || 1686 src_end - src_addr < HPAGE_PMD_SIZE; 1687 } 1688 #else 1689 static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1690 unsigned long src_addr, 1691 unsigned long src_end) 1692 { 1693 /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ 1694 return false; 1695 } 1696 #endif 1697 1698 static inline bool vma_move_compatible(struct vm_area_struct *vma) 1699 { 1700 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | 1701 VM_MIXEDMAP | VM_SHADOW_STACK)); 1702 } 1703 1704 static int validate_move_areas(struct userfaultfd_ctx *ctx, 1705 struct vm_area_struct *src_vma, 1706 struct vm_area_struct *dst_vma) 1707 { 1708 /* Only allow moving if both have the same access and protection */ 1709 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || 1710 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) 1711 return -EINVAL; 1712 1713 /* Only allow moving if both are mlocked or both aren't */ 1714 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) 1715 return -EINVAL; 1716 1717 /* 1718 * For now, we keep it simple and only move between writable VMAs. 1719 * Access flags are equal, therefore checking only the source is enough. 
1720 */ 1721 if (!(src_vma->vm_flags & VM_WRITE)) 1722 return -EINVAL; 1723 1724 /* Check if vma flags indicate content which can be moved */ 1725 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1726 return -EINVAL; 1727 1728 /* Ensure dst_vma is registered in uffd we are operating on */ 1729 if (!dst_vma->vm_userfaultfd_ctx.ctx || 1730 dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1731 return -EINVAL; 1732 1733 /* Only allow moving across anonymous vmas */ 1734 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1735 return -EINVAL; 1736 1737 return 0; 1738 } 1739 1740 static __always_inline 1741 int find_vmas_mm_locked(struct mm_struct *mm, 1742 unsigned long dst_start, 1743 unsigned long src_start, 1744 struct vm_area_struct **dst_vmap, 1745 struct vm_area_struct **src_vmap) 1746 { 1747 struct vm_area_struct *vma; 1748 1749 mmap_assert_locked(mm); 1750 vma = find_vma_and_prepare_anon(mm, dst_start); 1751 if (IS_ERR(vma)) 1752 return PTR_ERR(vma); 1753 1754 *dst_vmap = vma; 1755 /* Skip finding src_vma if src_start is in dst_vma */ 1756 if (src_start >= vma->vm_start && src_start < vma->vm_end) 1757 goto out_success; 1758 1759 vma = vma_lookup(mm, src_start); 1760 if (!vma) 1761 return -ENOENT; 1762 out_success: 1763 *src_vmap = vma; 1764 return 0; 1765 } 1766 1767 #ifdef CONFIG_PER_VMA_LOCK 1768 static int uffd_move_lock(struct mm_struct *mm, 1769 unsigned long dst_start, 1770 unsigned long src_start, 1771 struct vm_area_struct **dst_vmap, 1772 struct vm_area_struct **src_vmap) 1773 { 1774 struct vm_area_struct *vma; 1775 int err; 1776 1777 vma = uffd_lock_vma(mm, dst_start); 1778 if (IS_ERR(vma)) 1779 return PTR_ERR(vma); 1780 1781 *dst_vmap = vma; 1782 /* 1783 * Skip finding src_vma if src_start is in dst_vma. This also ensures 1784 * that we don't lock the same vma twice. 
1785 */ 1786 if (src_start >= vma->vm_start && src_start < vma->vm_end) { 1787 *src_vmap = vma; 1788 return 0; 1789 } 1790 1791 /* 1792 * Using uffd_lock_vma() to get src_vma can lead to following deadlock: 1793 * 1794 * Thread1 Thread2 1795 * ------- ------- 1796 * vma_start_read(dst_vma) 1797 * mmap_write_lock(mm) 1798 * vma_start_write(src_vma) 1799 * vma_start_read(src_vma) 1800 * mmap_read_lock(mm) 1801 * vma_start_write(dst_vma) 1802 */ 1803 *src_vmap = lock_vma_under_rcu(mm, src_start); 1804 if (likely(*src_vmap)) 1805 return 0; 1806 1807 /* Undo any locking and retry in mmap_lock critical section */ 1808 vma_end_read(*dst_vmap); 1809 1810 mmap_read_lock(mm); 1811 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1812 if (err) 1813 goto out; 1814 1815 if (!vma_start_read_locked(*dst_vmap)) { 1816 err = -EAGAIN; 1817 goto out; 1818 } 1819 1820 /* Nothing further to do if both vmas are locked. */ 1821 if (*dst_vmap == *src_vmap) 1822 goto out; 1823 1824 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { 1825 /* Undo dst_vmap locking if src_vmap failed to lock */ 1826 vma_end_read(*dst_vmap); 1827 err = -EAGAIN; 1828 } 1829 out: 1830 mmap_read_unlock(mm); 1831 return err; 1832 } 1833 1834 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1835 struct vm_area_struct *src_vma) 1836 { 1837 vma_end_read(src_vma); 1838 if (src_vma != dst_vma) 1839 vma_end_read(dst_vma); 1840 } 1841 1842 #else 1843 1844 static int uffd_move_lock(struct mm_struct *mm, 1845 unsigned long dst_start, 1846 unsigned long src_start, 1847 struct vm_area_struct **dst_vmap, 1848 struct vm_area_struct **src_vmap) 1849 { 1850 int err; 1851 1852 mmap_read_lock(mm); 1853 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1854 if (err) 1855 mmap_read_unlock(mm); 1856 return err; 1857 } 1858 1859 static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1860 struct vm_area_struct *src_vma) 1861 { 1862 
mmap_assert_locked(src_vma->vm_mm); 1863 mmap_read_unlock(dst_vma->vm_mm); 1864 } 1865 #endif 1866 1867 /** 1868 * move_pages - move arbitrary anonymous pages of an existing vma 1869 * @ctx: pointer to the userfaultfd context 1870 * @dst_start: start of the destination virtual memory range 1871 * @src_start: start of the source virtual memory range 1872 * @len: length of the virtual memory range 1873 * @mode: flags from uffdio_move.mode 1874 * 1875 * It will either use the mmap_lock in read mode or per-vma locks 1876 * 1877 * move_pages() remaps arbitrary anonymous pages atomically in zero 1878 * copy. It only works on non shared anonymous pages because those can 1879 * be relocated without generating non linear anon_vmas in the rmap 1880 * code. 1881 * 1882 * It provides a zero copy mechanism to handle userspace page faults. 1883 * The source vma pages should have mapcount == 1, which can be 1884 * enforced by using madvise(MADV_DONTFORK) on src vma. 1885 * 1886 * The thread receiving the page during the userland page fault 1887 * will receive the faulting page in the source vma through the network, 1888 * storage or any other I/O device (MADV_DONTFORK in the source vma 1889 * avoids move_pages() to fail with -EBUSY if the process forks before 1890 * move_pages() is called), then it will call move_pages() to map the 1891 * page in the faulting address in the destination vma. 1892 * 1893 * This userfaultfd command works purely via pagetables, so it's the 1894 * most efficient way to move physical non shared anonymous pages 1895 * across different virtual addresses. Unlike mremap()/mmap()/munmap() 1896 * it does not create any new vmas. The mapping in the destination 1897 * address is atomic. 1898 * 1899 * It only works if the vma protection bits are identical from the 1900 * source and destination vma. 1901 * 1902 * It can remap non shared anonymous pages within the same vma too. 
1903 * 1904 * If the source virtual memory range has any unmapped holes, or if 1905 * the destination virtual memory range is not a whole unmapped hole, 1906 * move_pages() will fail respectively with -ENOENT or -EEXIST. This 1907 * provides a very strict behavior to avoid any chance of memory 1908 * corruption going unnoticed if there are userland race conditions. 1909 * Only one thread should resolve the userland page fault at any given 1910 * time for any given faulting address. This means that if two threads 1911 * try to both call move_pages() on the same destination address at the 1912 * same time, the second thread will get an explicit error from this 1913 * command. 1914 * 1915 * The command retval will return "len" if successful. The command 1916 * however can be interrupted by fatal signals or errors. If 1917 * interrupted it will return the number of bytes successfully 1918 * remapped before the interruption if any, or the negative error if 1919 * none. It will never return zero. Either it will return an error or 1920 * an amount of bytes successfully moved. If the retval reports a 1921 * "short" remap, the move_pages() command should be repeated by 1922 * userland with src+retval, dst+retval, len-retval if it wants to know 1923 * about the error that interrupted it. 1924 * 1925 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to 1926 * prevent -ENOENT errors from materializing if there are holes in the 1927 * source virtual range that is being remapped. The holes will be 1928 * accounted as successfully remapped in the retval of the 1929 * command. This is mostly useful to remap hugepage naturally aligned 1930 * virtual regions without knowing if there are transparent hugepage 1931 * in the regions or not, but preventing the risk of having to split 1932 * the hugepmd during the remap. 
1933 */ 1934 static ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, 1935 unsigned long src_start, unsigned long len, __u64 mode) 1936 { 1937 struct mm_struct *mm = ctx->mm; 1938 struct vm_area_struct *src_vma, *dst_vma; 1939 unsigned long src_addr, dst_addr, src_end; 1940 pmd_t *src_pmd, *dst_pmd; 1941 long err = -EINVAL; 1942 ssize_t moved = 0; 1943 1944 /* Sanitize the command parameters. */ 1945 VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); 1946 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); 1947 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 1948 1949 /* Does the address range wrap, or is the span zero-sized? */ 1950 VM_WARN_ON_ONCE(src_start + len < src_start); 1951 VM_WARN_ON_ONCE(dst_start + len < dst_start); 1952 1953 err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); 1954 if (err) 1955 goto out; 1956 1957 /* Re-check after taking map_changing_lock */ 1958 err = -EAGAIN; 1959 down_read(&ctx->map_changing_lock); 1960 if (likely(atomic_read(&ctx->mmap_changing))) 1961 goto out_unlock; 1962 /* 1963 * Make sure the vma is not shared, that the src and dst remap 1964 * ranges are both valid and fully within a single existing 1965 * vma. 1966 */ 1967 err = -EINVAL; 1968 if (src_vma->vm_flags & VM_SHARED) 1969 goto out_unlock; 1970 if (src_start + len > src_vma->vm_end) 1971 goto out_unlock; 1972 1973 if (dst_vma->vm_flags & VM_SHARED) 1974 goto out_unlock; 1975 if (dst_start + len > dst_vma->vm_end) 1976 goto out_unlock; 1977 1978 err = validate_move_areas(ctx, src_vma, dst_vma); 1979 if (err) 1980 goto out_unlock; 1981 1982 for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; 1983 src_addr < src_end;) { 1984 spinlock_t *ptl; 1985 pmd_t dst_pmdval; 1986 unsigned long step_size; 1987 1988 /* 1989 * Below works because anonymous area would not have a 1990 * transparent huge PUD. If file-backed support is added, 1991 * that case would need to be handled here. 
1992 */ 1993 src_pmd = mm_find_pmd(mm, src_addr); 1994 if (unlikely(!src_pmd)) { 1995 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1996 err = -ENOENT; 1997 break; 1998 } 1999 src_pmd = mm_alloc_pmd(mm, src_addr); 2000 if (unlikely(!src_pmd)) { 2001 err = -ENOMEM; 2002 break; 2003 } 2004 } 2005 dst_pmd = mm_alloc_pmd(mm, dst_addr); 2006 if (unlikely(!dst_pmd)) { 2007 err = -ENOMEM; 2008 break; 2009 } 2010 2011 dst_pmdval = pmdp_get_lockless(dst_pmd); 2012 /* 2013 * If the dst_pmd is mapped as THP don't override it and just 2014 * be strict. If dst_pmd changes into TPH after this check, the 2015 * move_pages_huge_pmd() will detect the change and retry 2016 * while move_pages_pte() will detect the change and fail. 2017 */ 2018 if (unlikely(pmd_trans_huge(dst_pmdval))) { 2019 err = -EEXIST; 2020 break; 2021 } 2022 2023 ptl = pmd_trans_huge_lock(src_pmd, src_vma); 2024 if (ptl) { 2025 /* Check if we can move the pmd without splitting it. */ 2026 if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || 2027 !pmd_none(dst_pmdval)) { 2028 /* Can be a migration entry */ 2029 if (pmd_present(*src_pmd)) { 2030 struct folio *folio = pmd_folio(*src_pmd); 2031 2032 if (!is_huge_zero_folio(folio) && 2033 !PageAnonExclusive(&folio->page)) { 2034 spin_unlock(ptl); 2035 err = -EBUSY; 2036 break; 2037 } 2038 } 2039 2040 spin_unlock(ptl); 2041 split_huge_pmd(src_vma, src_pmd, src_addr); 2042 /* The folio will be split by move_pages_pte() */ 2043 continue; 2044 } 2045 2046 err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, 2047 dst_pmdval, dst_vma, src_vma, 2048 dst_addr, src_addr); 2049 step_size = HPAGE_PMD_SIZE; 2050 } else { 2051 long ret; 2052 2053 if (pmd_none(*src_pmd)) { 2054 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 2055 err = -ENOENT; 2056 break; 2057 } 2058 if (unlikely(__pte_alloc(mm, src_pmd))) { 2059 err = -ENOMEM; 2060 break; 2061 } 2062 } 2063 2064 if (unlikely(pte_alloc(mm, dst_pmd))) { 2065 err = -ENOMEM; 2066 break; 2067 } 2068 2069 ret = 
move_pages_ptes(mm, dst_pmd, src_pmd, 2070 dst_vma, src_vma, dst_addr, 2071 src_addr, src_end - src_addr, mode); 2072 if (ret < 0) 2073 err = ret; 2074 else 2075 step_size = ret; 2076 } 2077 2078 cond_resched(); 2079 2080 if (fatal_signal_pending(current)) { 2081 /* Do not override an error */ 2082 if (!err || err == -EAGAIN) 2083 err = -EINTR; 2084 break; 2085 } 2086 2087 if (err) { 2088 if (err == -EAGAIN) 2089 continue; 2090 break; 2091 } 2092 2093 /* Proceed to the next page */ 2094 dst_addr += step_size; 2095 src_addr += step_size; 2096 moved += step_size; 2097 } 2098 2099 out_unlock: 2100 up_read(&ctx->map_changing_lock); 2101 uffd_move_unlock(dst_vma, src_vma); 2102 out: 2103 VM_WARN_ON_ONCE(moved < 0); 2104 VM_WARN_ON_ONCE(err > 0); 2105 VM_WARN_ON_ONCE(!moved && !err); 2106 return moved ? moved : err; 2107 } 2108 2109 static bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, 2110 bool wp_async) 2111 { 2112 const struct vm_uffd_ops *ops = vma_uffd_ops(vma); 2113 2114 if (vma->vm_flags & VM_DROPPABLE) 2115 return false; 2116 2117 vm_flags &= __VM_UFFD_FLAGS; 2118 2119 /* 2120 * If WP is the only mode enabled and context is wp async, allow any 2121 * memory type. 
2122 */ 2123 if (wp_async && (vm_flags == VM_UFFD_WP)) 2124 return true; 2125 2126 /* For any other mode reject VMAs that don't implement vm_uffd_ops */ 2127 if (!ops) 2128 return false; 2129 2130 /* 2131 * If user requested uffd-wp but not enabled pte markers for 2132 * uffd-wp, then only anonymous memory is supported 2133 */ 2134 if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && 2135 !vma_is_anonymous(vma)) 2136 return false; 2137 2138 return ops->can_userfault(vma, vm_flags); 2139 } 2140 2141 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, 2142 vm_flags_t vm_flags) 2143 { 2144 const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; 2145 2146 vm_flags_reset(vma, vm_flags); 2147 /* 2148 * For shared mappings, we want to enable writenotify while 2149 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply 2150 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. 2151 */ 2152 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) 2153 vma_set_page_prot(vma); 2154 } 2155 2156 static void userfaultfd_set_ctx(struct vm_area_struct *vma, 2157 struct userfaultfd_ctx *ctx, 2158 vm_flags_t vm_flags) 2159 { 2160 vma_start_write(vma); 2161 vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; 2162 userfaultfd_set_vm_flags(vma, 2163 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); 2164 } 2165 2166 static void userfaultfd_reset_ctx(struct vm_area_struct *vma) 2167 { 2168 userfaultfd_set_ctx(vma, NULL, 0); 2169 } 2170 2171 static struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, 2172 struct vm_area_struct *prev, 2173 struct vm_area_struct *vma, 2174 unsigned long start, 2175 unsigned long end) 2176 { 2177 struct vm_area_struct *ret; 2178 bool give_up_on_oom = false; 2179 vma_flags_t new_vma_flags = vma->flags; 2180 2181 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); 2182 2183 /* 2184 * If we are modifying only and not splitting, just give up on the merge 2185 * if OOM prevents 
us from merging successfully. 2186 */ 2187 if (start == vma->vm_start && end == vma->vm_end) 2188 give_up_on_oom = true; 2189 2190 /* Reset ptes for the whole vma range if wr-protected */ 2191 if (userfaultfd_wp(vma)) 2192 uffd_wp_range(vma, start, end - start, false); 2193 2194 ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, 2195 &new_vma_flags, NULL_VM_UFFD_CTX, 2196 give_up_on_oom); 2197 2198 /* 2199 * In the vma_merge() successful mprotect-like case 8: 2200 * the next vma was merged into the current one and 2201 * the current one has not been updated yet. 2202 */ 2203 if (!IS_ERR(ret)) 2204 userfaultfd_reset_ctx(ret); 2205 2206 return ret; 2207 } 2208 2209 /* Assumes mmap write lock taken, and mm_struct pinned. */ 2210 static int userfaultfd_register_range(struct userfaultfd_ctx *ctx, 2211 struct vm_area_struct *vma, 2212 vm_flags_t vm_flags, 2213 unsigned long start, unsigned long end, 2214 bool wp_async) 2215 { 2216 vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); 2217 VMA_ITERATOR(vmi, ctx->mm, start); 2218 struct vm_area_struct *prev = vma_prev(&vmi); 2219 unsigned long vma_end; 2220 vma_flags_t new_vma_flags; 2221 2222 if (vma->vm_start < start) 2223 prev = vma; 2224 2225 for_each_vma_range(vmi, vma, end) { 2226 cond_resched(); 2227 2228 VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); 2229 VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && 2230 vma->vm_userfaultfd_ctx.ctx != ctx); 2231 VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); 2232 2233 /* 2234 * Nothing to do: this vma is already registered into this 2235 * userfaultfd and with the right tracking mode too. 
2236 */ 2237 if (vma->vm_userfaultfd_ctx.ctx == ctx && 2238 vma_test_all_mask(vma, vma_flags)) 2239 goto skip; 2240 2241 if (vma->vm_start > start) 2242 start = vma->vm_start; 2243 vma_end = min(end, vma->vm_end); 2244 2245 new_vma_flags = vma->flags; 2246 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); 2247 vma_flags_set_mask(&new_vma_flags, vma_flags); 2248 2249 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, 2250 &new_vma_flags, 2251 (struct vm_userfaultfd_ctx){ctx}, 2252 /* give_up_on_oom = */false); 2253 if (IS_ERR(vma)) 2254 return PTR_ERR(vma); 2255 2256 /* 2257 * In the vma_merge() successful mprotect-like case 8: 2258 * the next vma was merged into the current one and 2259 * the current one has not been updated yet. 2260 */ 2261 userfaultfd_set_ctx(vma, ctx, vm_flags); 2262 2263 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) 2264 hugetlb_unshare_all_pmds(vma); 2265 2266 skip: 2267 prev = vma; 2268 start = vma->vm_end; 2269 } 2270 2271 return 0; 2272 } 2273 2274 static void userfaultfd_release_new(struct userfaultfd_ctx *ctx) 2275 { 2276 struct mm_struct *mm = ctx->mm; 2277 struct vm_area_struct *vma; 2278 VMA_ITERATOR(vmi, mm, 0); 2279 2280 /* the various vma->vm_userfaultfd_ctx still points to it */ 2281 mmap_write_lock(mm); 2282 for_each_vma(vmi, vma) { 2283 if (vma->vm_userfaultfd_ctx.ctx == ctx) 2284 userfaultfd_reset_ctx(vma); 2285 } 2286 mmap_write_unlock(mm); 2287 } 2288 2289 static void userfaultfd_release_all(struct mm_struct *mm, 2290 struct userfaultfd_ctx *ctx) 2291 { 2292 struct vm_area_struct *vma, *prev; 2293 VMA_ITERATOR(vmi, mm, 0); 2294 2295 if (!mmget_not_zero(mm)) 2296 return; 2297 2298 /* 2299 * Flush page faults out of all CPUs. NOTE: all page faults 2300 * must be retried without returning VM_FAULT_SIGBUS if 2301 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx 2302 * changes while handle_userfault released the mmap_lock. 
So 2303 * it's critical that released is set to true (above), before 2304 * taking the mmap_lock for writing. 2305 */ 2306 mmap_write_lock(mm); 2307 prev = NULL; 2308 for_each_vma(vmi, vma) { 2309 cond_resched(); 2310 VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ 2311 !!(vma->vm_flags & __VM_UFFD_FLAGS)); 2312 if (vma->vm_userfaultfd_ctx.ctx != ctx) { 2313 prev = vma; 2314 continue; 2315 } 2316 2317 vma = userfaultfd_clear_vma(&vmi, prev, vma, 2318 vma->vm_start, vma->vm_end); 2319 prev = vma; 2320 } 2321 mmap_write_unlock(mm); 2322 mmput(mm); 2323 } 2324 2325 static int sysctl_unprivileged_userfaultfd __read_mostly; 2326 2327 #ifdef CONFIG_SYSCTL 2328 static const struct ctl_table vm_userfaultfd_table[] = { 2329 { 2330 .procname = "unprivileged_userfaultfd", 2331 .data = &sysctl_unprivileged_userfaultfd, 2332 .maxlen = sizeof(sysctl_unprivileged_userfaultfd), 2333 .mode = 0644, 2334 .proc_handler = proc_dointvec_minmax, 2335 .extra1 = SYSCTL_ZERO, 2336 .extra2 = SYSCTL_ONE, 2337 }, 2338 }; 2339 #endif 2340 2341 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; 2342 2343 struct userfaultfd_fork_ctx { 2344 struct userfaultfd_ctx *orig; 2345 struct userfaultfd_ctx *new; 2346 struct list_head list; 2347 }; 2348 2349 struct userfaultfd_unmap_ctx { 2350 struct userfaultfd_ctx *ctx; 2351 unsigned long start; 2352 unsigned long end; 2353 struct list_head list; 2354 }; 2355 2356 struct userfaultfd_wait_queue { 2357 struct uffd_msg msg; 2358 wait_queue_entry_t wq; 2359 struct userfaultfd_ctx *ctx; 2360 bool waken; 2361 }; 2362 2363 struct userfaultfd_wake_range { 2364 unsigned long start; 2365 unsigned long len; 2366 }; 2367 2368 /* internal indication that UFFD_API ioctl was successfully executed */ 2369 #define UFFD_FEATURE_INITIALIZED (1u << 31) 2370 2371 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) 2372 { 2373 return ctx->features & UFFD_FEATURE_INITIALIZED; 2374 } 2375 2376 static bool userfaultfd_wp_async_ctx(struct 
userfaultfd_ctx *ctx) 2377 { 2378 return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); 2379 } 2380 2381 /* 2382 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only 2383 * meaningful when userfaultfd_wp()==true on the vma and when it's 2384 * anonymous. 2385 */ 2386 bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) 2387 { 2388 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; 2389 2390 if (!ctx) 2391 return false; 2392 2393 return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; 2394 } 2395 2396 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, 2397 int wake_flags, void *key) 2398 { 2399 struct userfaultfd_wake_range *range = key; 2400 int ret; 2401 struct userfaultfd_wait_queue *uwq; 2402 unsigned long start, len; 2403 2404 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 2405 ret = 0; 2406 /* len == 0 means wake all */ 2407 start = range->start; 2408 len = range->len; 2409 if (len && (start > uwq->msg.arg.pagefault.address || 2410 start + len <= uwq->msg.arg.pagefault.address)) 2411 goto out; 2412 WRITE_ONCE(uwq->waken, true); 2413 /* 2414 * The Program-Order guarantees provided by the scheduler 2415 * ensure uwq->waken is visible before the task is woken. 2416 */ 2417 ret = wake_up_state(wq->private, mode); 2418 if (ret) { 2419 /* 2420 * Wake only once, autoremove behavior. 2421 * 2422 * After the effect of list_del_init is visible to the other 2423 * CPUs, the waitqueue may disappear from under us, see the 2424 * !list_empty_careful() in handle_userfault(). 2425 * 2426 * try_to_wake_up() has an implicit smp_mb(), and the 2427 * wq->private is read before calling the extern function 2428 * "wake_up_state" (which in turns calls try_to_wake_up). 2429 */ 2430 list_del_init(&wq->entry); 2431 } 2432 out: 2433 return ret; 2434 } 2435 2436 /** 2437 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd 2438 * context. 2439 * @ctx: [in] Pointer to the userfaultfd context. 
2440 */ 2441 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) 2442 { 2443 refcount_inc(&ctx->refcount); 2444 } 2445 2446 /** 2447 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd 2448 * context. 2449 * @ctx: [in] Pointer to userfaultfd context. 2450 * 2451 * The userfaultfd context reference must have been previously acquired either 2452 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). 2453 */ 2454 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) 2455 { 2456 if (refcount_dec_and_test(&ctx->refcount)) { 2457 VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); 2458 VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); 2459 VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); 2460 VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); 2461 VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); 2462 VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); 2463 VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); 2464 VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); 2465 mmdrop(ctx->mm); 2466 kmem_cache_free(userfaultfd_ctx_cachep, ctx); 2467 } 2468 } 2469 2470 static inline void msg_init(struct uffd_msg *msg) 2471 { 2472 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); 2473 /* 2474 * Must use memset to zero out the paddings or kernel data is 2475 * leaked to userland. 2476 */ 2477 memset(msg, 0, sizeof(struct uffd_msg)); 2478 } 2479 2480 static inline struct uffd_msg userfault_msg(unsigned long address, 2481 unsigned long real_address, 2482 unsigned int flags, 2483 unsigned long reason, 2484 unsigned int features) 2485 { 2486 struct uffd_msg msg; 2487 2488 msg_init(&msg); 2489 msg.event = UFFD_EVENT_PAGEFAULT; 2490 2491 msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? 2492 real_address : address; 2493 2494 /* 2495 * These flags indicate why the userfault occurred: 2496 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. 2497 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. 
2498 * - Neither of these flags being set indicates a MISSING fault. 2499 * 2500 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write 2501 * fault. Otherwise, it was a read fault. 2502 */ 2503 if (flags & FAULT_FLAG_WRITE) 2504 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; 2505 if (reason & VM_UFFD_WP) 2506 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; 2507 if (reason & VM_UFFD_MINOR) 2508 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; 2509 if (features & UFFD_FEATURE_THREAD_ID) 2510 msg.arg.pagefault.feat.ptid = task_pid_vnr(current); 2511 return msg; 2512 } 2513 2514 #ifdef CONFIG_HUGETLB_PAGE 2515 /* 2516 * Same functionality as userfaultfd_must_wait below with modifications for 2517 * hugepmd ranges. 2518 */ 2519 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, 2520 struct vm_fault *vmf, 2521 unsigned long reason) 2522 { 2523 struct vm_area_struct *vma = vmf->vma; 2524 pte_t *ptep, pte; 2525 2526 assert_fault_locked(vmf); 2527 2528 ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); 2529 if (!ptep) 2530 return true; 2531 2532 pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); 2533 2534 /* 2535 * Lockless access: we're in a wait_event so it's ok if it 2536 * changes under us. 2537 */ 2538 2539 /* Entry is still missing, wait for userspace to resolve the fault. */ 2540 if (huge_pte_none(pte)) 2541 return true; 2542 /* UFFD PTE markers require userspace to resolve the fault. */ 2543 if (pte_is_uffd_marker(pte)) 2544 return true; 2545 /* 2546 * Concurrent migration may have replaced the present PTE with a 2547 * non-marker swap entry between fault delivery and this lockless 2548 * re-check. huge_pte_write() on a swap entry decodes random offset 2549 * bits, so gate it on pte_present(). The migration completion path 2550 * will re-deliver the fault if it still needs userspace. 
2551 */ 2552 if (!pte_present(pte)) 2553 return false; 2554 /* 2555 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to 2556 * resolve the fault. 2557 */ 2558 if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) 2559 return true; 2560 2561 return false; 2562 } 2563 #else 2564 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, 2565 struct vm_fault *vmf, 2566 unsigned long reason) 2567 { 2568 /* Should never get here. */ 2569 VM_WARN_ON_ONCE(1); 2570 return false; 2571 } 2572 #endif /* CONFIG_HUGETLB_PAGE */ 2573 2574 /* 2575 * Verify the pagetables are still not ok after having registered into 2576 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any 2577 * userfault that has already been resolved, if userfaultfd_read_iter and 2578 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different 2579 * threads. 2580 */ 2581 static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, 2582 struct vm_fault *vmf, 2583 unsigned long reason) 2584 { 2585 struct mm_struct *mm = ctx->mm; 2586 unsigned long address = vmf->address; 2587 pgd_t *pgd; 2588 p4d_t *p4d; 2589 pud_t *pud; 2590 pmd_t *pmd, _pmd; 2591 pte_t *pte; 2592 pte_t ptent; 2593 bool ret; 2594 2595 assert_fault_locked(vmf); 2596 2597 pgd = pgd_offset(mm, address); 2598 if (!pgd_present(*pgd)) 2599 return true; 2600 p4d = p4d_offset(pgd, address); 2601 if (!p4d_present(*p4d)) 2602 return true; 2603 pud = pud_offset(p4d, address); 2604 if (!pud_present(*pud)) 2605 return true; 2606 pmd = pmd_offset(pud, address); 2607 again: 2608 _pmd = pmdp_get_lockless(pmd); 2609 if (pmd_none(_pmd)) 2610 return true; 2611 2612 /* 2613 * A race could arise which would result in a softleaf entry such as 2614 * migration entry unexpectedly being present in the PMD, so explicitly 2615 * check for this and bail out if so. 
2616 */ 2617 if (!pmd_present(_pmd)) 2618 return false; 2619 2620 if (pmd_trans_huge(_pmd)) 2621 return !pmd_write(_pmd) && (reason & VM_UFFD_WP); 2622 2623 pte = pte_offset_map(pmd, address); 2624 if (!pte) 2625 goto again; 2626 2627 /* 2628 * Lockless access: we're in a wait_event so it's ok if it 2629 * changes under us. 2630 */ 2631 ptent = ptep_get(pte); 2632 2633 ret = true; 2634 /* Entry is still missing, wait for userspace to resolve the fault. */ 2635 if (pte_none(ptent)) 2636 goto out; 2637 /* UFFD PTE markers require userspace to resolve the fault. */ 2638 if (pte_is_uffd_marker(ptent)) 2639 goto out; 2640 /* 2641 * Concurrent swap-out / migration may have replaced the present PTE 2642 * with a non-marker swap entry between fault delivery and this 2643 * lockless re-check. pte_write() on a swap entry decodes random 2644 * offset bits, so gate it on pte_present(). The page-in path will 2645 * re-deliver the fault if it still needs userspace. 2646 */ 2647 if (!pte_present(ptent)) { 2648 ret = false; 2649 goto out; 2650 } 2651 /* 2652 * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to 2653 * resolve the fault. 2654 */ 2655 if (!pte_write(ptent) && (reason & VM_UFFD_WP)) 2656 goto out; 2657 2658 ret = false; 2659 out: 2660 pte_unmap(pte); 2661 return ret; 2662 } 2663 2664 static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) 2665 { 2666 if (flags & FAULT_FLAG_INTERRUPTIBLE) 2667 return TASK_INTERRUPTIBLE; 2668 2669 if (flags & FAULT_FLAG_KILLABLE) 2670 return TASK_KILLABLE; 2671 2672 return TASK_UNINTERRUPTIBLE; 2673 } 2674 2675 /* 2676 * The locking rules involved in returning VM_FAULT_RETRY depending on 2677 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and 2678 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" 2679 * recommendation in __lock_page_or_retry is not an understatement. 
2680 * 2681 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released 2682 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is 2683 * not set. 2684 * 2685 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not 2686 * set, VM_FAULT_RETRY can still be returned if and only if there are 2687 * fatal_signal_pending()s, and the mmap_lock must be released before 2688 * returning it. 2689 */ 2690 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) 2691 { 2692 struct vm_area_struct *vma = vmf->vma; 2693 struct mm_struct *mm = vma->vm_mm; 2694 struct userfaultfd_ctx *ctx; 2695 struct userfaultfd_wait_queue uwq; 2696 vm_fault_t ret = VM_FAULT_SIGBUS; 2697 bool must_wait; 2698 unsigned int blocking_state; 2699 2700 /* 2701 * We don't do userfault handling for the final child pid update 2702 * and when coredumping (faults triggered by get_dump_page()). 2703 */ 2704 if (current->flags & (PF_EXITING|PF_DUMPCORE)) 2705 goto out; 2706 2707 assert_fault_locked(vmf); 2708 2709 ctx = vma->vm_userfaultfd_ctx.ctx; 2710 if (!ctx) 2711 goto out; 2712 2713 VM_WARN_ON_ONCE(ctx->mm != mm); 2714 2715 /* Any unrecognized flag is a bug. */ 2716 VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); 2717 /* 0 or > 1 flags set is a bug; we expect exactly 1. */ 2718 VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); 2719 2720 if (ctx->features & UFFD_FEATURE_SIGBUS) 2721 goto out; 2722 if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) 2723 goto out; 2724 2725 /* 2726 * Check that we can return VM_FAULT_RETRY. 2727 * 2728 * NOTE: it should become possible to return VM_FAULT_RETRY 2729 * even if FAULT_FLAG_TRIED is set without leading to gup() 2730 * -EBUSY failures, if the userfaultfd is to be extended for 2731 * VM_UFFD_WP tracking and we intend to arm the userfault 2732 * without first stopping userland access to the memory. For 2733 * VM_UFFD_MISSING userfaults this is enough for now. 
2734 */ 2735 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { 2736 /* 2737 * Validate the invariant that nowait must allow retry 2738 * to be sure not to return SIGBUS erroneously on 2739 * nowait invocations. 2740 */ 2741 VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); 2742 #ifdef CONFIG_DEBUG_VM 2743 if (printk_ratelimit()) { 2744 pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", 2745 vmf->flags); 2746 dump_stack(); 2747 } 2748 #endif 2749 goto out; 2750 } 2751 2752 /* 2753 * Handle nowait, not much to do other than tell it to retry 2754 * and wait. 2755 */ 2756 ret = VM_FAULT_RETRY; 2757 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) 2758 goto out; 2759 2760 if (unlikely(READ_ONCE(ctx->released))) { 2761 /* 2762 * If a concurrent release is detected, do not return 2763 * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always 2764 * return VM_FAULT_RETRY with lock released proactively. 2765 * 2766 * If we were to return VM_FAULT_SIGBUS here, the non 2767 * cooperative manager would be instead forced to 2768 * always call UFFDIO_UNREGISTER before it can safely 2769 * close the uffd, to avoid involuntary SIGBUS triggered. 2770 * 2771 * If we were to return VM_FAULT_NOPAGE, it would work for 2772 * the fault path, in which the lock will be released 2773 * later. However for GUP, faultin_page() does nothing 2774 * special on NOPAGE, so GUP would spin retrying without 2775 * releasing the mmap read lock, causing possible livelock. 2776 * 2777 * Here only VM_FAULT_RETRY would make sure the mmap lock 2778 * be released immediately, so that the thread concurrently 2779 * releasing the userfault would always make progress. 
2780 */ 2781 release_fault_lock(vmf); 2782 goto out; 2783 } 2784 2785 /* take the reference before dropping the mmap_lock */ 2786 userfaultfd_ctx_get(ctx); 2787 2788 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 2789 uwq.wq.private = current; 2790 uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, 2791 reason, ctx->features); 2792 uwq.ctx = ctx; 2793 uwq.waken = false; 2794 2795 blocking_state = userfaultfd_get_blocking_state(vmf->flags); 2796 2797 /* 2798 * Take the vma lock now, in order to safely call 2799 * userfaultfd_huge_must_wait() later. Since acquiring the 2800 * (sleepable) vma lock can modify the current task state, that 2801 * must be before explicitly calling set_current_state(). 2802 */ 2803 if (is_vm_hugetlb_page(vma)) 2804 hugetlb_vma_lock_read(vma); 2805 2806 spin_lock_irq(&ctx->fault_pending_wqh.lock); 2807 /* 2808 * After the __add_wait_queue the uwq is visible to userland 2809 * through poll/read(). 2810 */ 2811 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); 2812 /* 2813 * The smp_mb() after __set_current_state prevents the reads 2814 * following the spin_unlock to happen before the list_add in 2815 * __add_wait_queue. 2816 */ 2817 set_current_state(blocking_state); 2818 spin_unlock_irq(&ctx->fault_pending_wqh.lock); 2819 2820 if (is_vm_hugetlb_page(vma)) { 2821 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); 2822 hugetlb_vma_unlock_read(vma); 2823 } else { 2824 must_wait = userfaultfd_must_wait(ctx, vmf, reason); 2825 } 2826 2827 release_fault_lock(vmf); 2828 2829 if (likely(must_wait && !READ_ONCE(ctx->released))) { 2830 wake_up_poll(&ctx->fd_wqh, EPOLLIN); 2831 schedule(); 2832 } 2833 2834 __set_current_state(TASK_RUNNING); 2835 2836 /* 2837 * Here we race with the list_del; list_add in 2838 * userfaultfd_ctx_read(), however because we don't ever run 2839 * list_del_init() to refile across the two lists, the prev 2840 * and next pointers will never point to self. 
list_add also 2841 * would never let any of the two pointers to point to 2842 * self. So list_empty_careful won't risk to see both pointers 2843 * pointing to self at any time during the list refile. The 2844 * only case where list_del_init() is called is the full 2845 * removal in the wake function and there we don't re-list_add 2846 * and it's fine not to block on the spinlock. The uwq on this 2847 * kernel stack can be released after the list_del_init. 2848 */ 2849 if (!list_empty_careful(&uwq.wq.entry)) { 2850 spin_lock_irq(&ctx->fault_pending_wqh.lock); 2851 /* 2852 * No need of list_del_init(), the uwq on the stack 2853 * will be freed shortly anyway. 2854 */ 2855 list_del(&uwq.wq.entry); 2856 spin_unlock_irq(&ctx->fault_pending_wqh.lock); 2857 } 2858 2859 /* 2860 * ctx may go away after this if the userfault pseudo fd is 2861 * already released. 2862 */ 2863 userfaultfd_ctx_put(ctx); 2864 2865 out: 2866 return ret; 2867 } 2868 2869 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, 2870 struct userfaultfd_wait_queue *ewq) 2871 { 2872 struct userfaultfd_ctx *release_new_ctx; 2873 2874 if (WARN_ON_ONCE(current->flags & PF_EXITING)) 2875 goto out; 2876 2877 ewq->ctx = ctx; 2878 init_waitqueue_entry(&ewq->wq, current); 2879 release_new_ctx = NULL; 2880 2881 spin_lock_irq(&ctx->event_wqh.lock); 2882 /* 2883 * After the __add_wait_queue the uwq is visible to userland 2884 * through poll/read(). 2885 */ 2886 __add_wait_queue(&ctx->event_wqh, &ewq->wq); 2887 for (;;) { 2888 set_current_state(TASK_KILLABLE); 2889 if (ewq->msg.event == 0) 2890 break; 2891 if (READ_ONCE(ctx->released) || 2892 fatal_signal_pending(current)) { 2893 /* 2894 * &ewq->wq may be queued in fork_event, but 2895 * __remove_wait_queue ignores the head 2896 * parameter. It would be a problem if it 2897 * didn't. 
2898 */ 2899 __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 2900 if (ewq->msg.event == UFFD_EVENT_FORK) { 2901 struct userfaultfd_ctx *new; 2902 2903 new = (struct userfaultfd_ctx *) 2904 (unsigned long) 2905 ewq->msg.arg.reserved.reserved1; 2906 release_new_ctx = new; 2907 } 2908 break; 2909 } 2910 2911 spin_unlock_irq(&ctx->event_wqh.lock); 2912 2913 wake_up_poll(&ctx->fd_wqh, EPOLLIN); 2914 schedule(); 2915 2916 spin_lock_irq(&ctx->event_wqh.lock); 2917 } 2918 __set_current_state(TASK_RUNNING); 2919 spin_unlock_irq(&ctx->event_wqh.lock); 2920 2921 if (release_new_ctx) { 2922 userfaultfd_release_new(release_new_ctx); 2923 userfaultfd_ctx_put(release_new_ctx); 2924 } 2925 2926 /* 2927 * ctx may go away after this if the userfault pseudo fd is 2928 * already released. 2929 */ 2930 out: 2931 atomic_dec(&ctx->mmap_changing); 2932 VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); 2933 userfaultfd_ctx_put(ctx); 2934 } 2935 2936 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, 2937 struct userfaultfd_wait_queue *ewq) 2938 { 2939 ewq->msg.event = 0; 2940 wake_up_locked(&ctx->event_wqh); 2941 __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 2942 } 2943 2944 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) 2945 { 2946 struct userfaultfd_ctx *ctx = NULL, *octx; 2947 struct userfaultfd_fork_ctx *fctx; 2948 2949 octx = vma->vm_userfaultfd_ctx.ctx; 2950 if (!octx) 2951 return 0; 2952 2953 if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { 2954 userfaultfd_reset_ctx(vma); 2955 return 0; 2956 } 2957 2958 list_for_each_entry(fctx, fcs, list) 2959 if (fctx->orig == octx) { 2960 ctx = fctx->new; 2961 break; 2962 } 2963 2964 if (!ctx) { 2965 fctx = kmalloc_obj(*fctx); 2966 if (!fctx) 2967 return -ENOMEM; 2968 2969 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); 2970 if (!ctx) { 2971 kfree(fctx); 2972 return -ENOMEM; 2973 } 2974 2975 refcount_set(&ctx->refcount, 1); 2976 ctx->flags = octx->flags; 2977 ctx->features = 
octx->features; 2978 ctx->released = false; 2979 init_rwsem(&ctx->map_changing_lock); 2980 atomic_set(&ctx->mmap_changing, 0); 2981 ctx->mm = vma->vm_mm; 2982 mmgrab(ctx->mm); 2983 2984 userfaultfd_ctx_get(octx); 2985 down_write(&octx->map_changing_lock); 2986 atomic_inc(&octx->mmap_changing); 2987 up_write(&octx->map_changing_lock); 2988 fctx->orig = octx; 2989 fctx->new = ctx; 2990 list_add_tail(&fctx->list, fcs); 2991 } 2992 2993 vma->vm_userfaultfd_ctx.ctx = ctx; 2994 return 0; 2995 } 2996 2997 static void dup_fctx(struct userfaultfd_fork_ctx *fctx) 2998 { 2999 struct userfaultfd_ctx *ctx = fctx->orig; 3000 struct userfaultfd_wait_queue ewq; 3001 3002 msg_init(&ewq.msg); 3003 3004 ewq.msg.event = UFFD_EVENT_FORK; 3005 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; 3006 3007 userfaultfd_event_wait_completion(ctx, &ewq); 3008 } 3009 3010 void dup_userfaultfd_complete(struct list_head *fcs) 3011 { 3012 struct userfaultfd_fork_ctx *fctx, *n; 3013 3014 list_for_each_entry_safe(fctx, n, fcs, list) { 3015 dup_fctx(fctx); 3016 list_del(&fctx->list); 3017 kfree(fctx); 3018 } 3019 } 3020 3021 void dup_userfaultfd_fail(struct list_head *fcs) 3022 { 3023 struct userfaultfd_fork_ctx *fctx, *n; 3024 3025 /* 3026 * An error has occurred on fork, we will tear memory down, but have 3027 * allocated memory for fctx's and raised reference counts for both the 3028 * original and child contexts (and on the mm for each as a result). 3029 * 3030 * These would ordinarily be taken care of by a user handling the event, 3031 * but we are no longer doing so, so manually clean up here. 3032 * 3033 * mm tear down will take care of cleaning up VMA contexts. 
3034 */ 3035 list_for_each_entry_safe(fctx, n, fcs, list) { 3036 struct userfaultfd_ctx *octx = fctx->orig; 3037 struct userfaultfd_ctx *ctx = fctx->new; 3038 3039 atomic_dec(&octx->mmap_changing); 3040 VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); 3041 userfaultfd_ctx_put(octx); 3042 userfaultfd_ctx_put(ctx); 3043 3044 list_del(&fctx->list); 3045 kfree(fctx); 3046 } 3047 } 3048 3049 void mremap_userfaultfd_prep(struct vm_area_struct *vma, 3050 struct vm_userfaultfd_ctx *vm_ctx) 3051 { 3052 struct userfaultfd_ctx *ctx; 3053 3054 ctx = vma->vm_userfaultfd_ctx.ctx; 3055 3056 if (!ctx) 3057 return; 3058 3059 if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { 3060 vm_ctx->ctx = ctx; 3061 userfaultfd_ctx_get(ctx); 3062 down_write(&ctx->map_changing_lock); 3063 atomic_inc(&ctx->mmap_changing); 3064 up_write(&ctx->map_changing_lock); 3065 } else { 3066 /* Drop uffd context if remap feature not enabled */ 3067 userfaultfd_reset_ctx(vma); 3068 } 3069 } 3070 3071 void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, 3072 unsigned long from, unsigned long to, 3073 unsigned long len) 3074 { 3075 struct userfaultfd_ctx *ctx = vm_ctx->ctx; 3076 struct userfaultfd_wait_queue ewq; 3077 3078 if (!ctx) 3079 return; 3080 3081 msg_init(&ewq.msg); 3082 3083 ewq.msg.event = UFFD_EVENT_REMAP; 3084 ewq.msg.arg.remap.from = from; 3085 ewq.msg.arg.remap.to = to; 3086 ewq.msg.arg.remap.len = len; 3087 3088 userfaultfd_event_wait_completion(ctx, &ewq); 3089 } 3090 3091 void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) 3092 { 3093 struct userfaultfd_ctx *ctx = vm_ctx->ctx; 3094 3095 if (!ctx) 3096 return; 3097 3098 atomic_dec(&ctx->mmap_changing); 3099 VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); 3100 userfaultfd_ctx_put(ctx); 3101 } 3102 3103 bool userfaultfd_remove(struct vm_area_struct *vma, 3104 unsigned long start, unsigned long end) 3105 { 3106 struct mm_struct *mm = vma->vm_mm; 3107 struct userfaultfd_ctx *ctx; 3108 struct 
userfaultfd_wait_queue ewq; 3109 3110 ctx = vma->vm_userfaultfd_ctx.ctx; 3111 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) 3112 return true; 3113 3114 userfaultfd_ctx_get(ctx); 3115 down_write(&ctx->map_changing_lock); 3116 atomic_inc(&ctx->mmap_changing); 3117 up_write(&ctx->map_changing_lock); 3118 mmap_read_unlock(mm); 3119 3120 msg_init(&ewq.msg); 3121 3122 ewq.msg.event = UFFD_EVENT_REMOVE; 3123 ewq.msg.arg.remove.start = start; 3124 ewq.msg.arg.remove.end = end; 3125 3126 userfaultfd_event_wait_completion(ctx, &ewq); 3127 3128 return false; 3129 } 3130 3131 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, 3132 unsigned long start, unsigned long end) 3133 { 3134 struct userfaultfd_unmap_ctx *unmap_ctx; 3135 3136 list_for_each_entry(unmap_ctx, unmaps, list) 3137 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && 3138 unmap_ctx->end == end) 3139 return true; 3140 3141 return false; 3142 } 3143 3144 int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, 3145 unsigned long end, struct list_head *unmaps) 3146 { 3147 struct userfaultfd_unmap_ctx *unmap_ctx; 3148 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; 3149 3150 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || 3151 has_unmap_ctx(ctx, unmaps, start, end)) 3152 return 0; 3153 3154 unmap_ctx = kzalloc_obj(*unmap_ctx); 3155 if (!unmap_ctx) 3156 return -ENOMEM; 3157 3158 userfaultfd_ctx_get(ctx); 3159 down_write(&ctx->map_changing_lock); 3160 atomic_inc(&ctx->mmap_changing); 3161 up_write(&ctx->map_changing_lock); 3162 unmap_ctx->ctx = ctx; 3163 unmap_ctx->start = start; 3164 unmap_ctx->end = end; 3165 list_add_tail(&unmap_ctx->list, unmaps); 3166 3167 return 0; 3168 } 3169 3170 void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) 3171 { 3172 struct userfaultfd_unmap_ctx *ctx, *n; 3173 struct userfaultfd_wait_queue ewq; 3174 3175 list_for_each_entry_safe(ctx, n, uf, list) { 3176 msg_init(&ewq.msg); 
3177 3178 ewq.msg.event = UFFD_EVENT_UNMAP; 3179 ewq.msg.arg.remove.start = ctx->start; 3180 ewq.msg.arg.remove.end = ctx->end; 3181 3182 userfaultfd_event_wait_completion(ctx->ctx, &ewq); 3183 3184 list_del(&ctx->list); 3185 kfree(ctx); 3186 } 3187 } 3188 3189 static int userfaultfd_release(struct inode *inode, struct file *file) 3190 { 3191 struct userfaultfd_ctx *ctx = file->private_data; 3192 struct mm_struct *mm = ctx->mm; 3193 /* len == 0 means wake all */ 3194 struct userfaultfd_wake_range range = { .len = 0, }; 3195 3196 WRITE_ONCE(ctx->released, true); 3197 3198 userfaultfd_release_all(mm, ctx); 3199 3200 /* 3201 * After no new page faults can wait on this fault_*wqh, flush 3202 * the last page faults that may have been already waiting on 3203 * the fault_*wqh. 3204 */ 3205 spin_lock_irq(&ctx->fault_pending_wqh.lock); 3206 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); 3207 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); 3208 spin_unlock_irq(&ctx->fault_pending_wqh.lock); 3209 3210 /* Flush pending events that may still wait on event_wqh */ 3211 wake_up_all(&ctx->event_wqh); 3212 3213 wake_up_poll(&ctx->fd_wqh, EPOLLHUP); 3214 userfaultfd_ctx_put(ctx); 3215 return 0; 3216 } 3217 3218 /* fault_pending_wqh.lock must be hold by the caller */ 3219 static inline struct userfaultfd_wait_queue *find_userfault_in( 3220 wait_queue_head_t *wqh) 3221 { 3222 wait_queue_entry_t *wq; 3223 struct userfaultfd_wait_queue *uwq; 3224 3225 lockdep_assert_held(&wqh->lock); 3226 3227 uwq = NULL; 3228 if (!waitqueue_active(wqh)) 3229 goto out; 3230 /* walk in reverse to provide FIFO behavior to read userfaults */ 3231 wq = list_last_entry(&wqh->head, typeof(*wq), entry); 3232 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 3233 out: 3234 return uwq; 3235 } 3236 3237 static inline struct userfaultfd_wait_queue *find_userfault( 3238 struct userfaultfd_ctx *ctx) 3239 { 3240 return find_userfault_in(&ctx->fault_pending_wqh); 3241 } 3242 3243 
static inline struct userfaultfd_wait_queue *find_userfault_evt( 3244 struct userfaultfd_ctx *ctx) 3245 { 3246 return find_userfault_in(&ctx->event_wqh); 3247 } 3248 3249 static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) 3250 { 3251 struct userfaultfd_ctx *ctx = file->private_data; 3252 __poll_t ret; 3253 3254 poll_wait(file, &ctx->fd_wqh, wait); 3255 3256 if (!userfaultfd_is_initialized(ctx)) 3257 return EPOLLERR; 3258 3259 /* 3260 * poll() never guarantees that read won't block. 3261 * userfaults can be waken before they're read(). 3262 */ 3263 if (unlikely(!(file->f_flags & O_NONBLOCK))) 3264 return EPOLLERR; 3265 /* 3266 * lockless access to see if there are pending faults 3267 * __pollwait last action is the add_wait_queue but 3268 * the spin_unlock would allow the waitqueue_active to 3269 * pass above the actual list_add inside 3270 * add_wait_queue critical section. So use a full 3271 * memory barrier to serialize the list_add write of 3272 * add_wait_queue() with the waitqueue_active read 3273 * below. 
3274 */ 3275 ret = 0; 3276 smp_mb(); 3277 if (waitqueue_active(&ctx->fault_pending_wqh)) 3278 ret = EPOLLIN; 3279 else if (waitqueue_active(&ctx->event_wqh)) 3280 ret = EPOLLIN; 3281 3282 return ret; 3283 } 3284 3285 static const struct file_operations userfaultfd_fops; 3286 3287 static int resolve_userfault_fork(struct userfaultfd_ctx *new, 3288 struct inode *inode, 3289 struct uffd_msg *msg) 3290 { 3291 int fd; 3292 3293 fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, 3294 O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); 3295 if (fd < 0) 3296 return fd; 3297 3298 msg->arg.reserved.reserved1 = 0; 3299 msg->arg.fork.ufd = fd; 3300 return 0; 3301 } 3302 3303 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, 3304 struct uffd_msg *msg, struct inode *inode) 3305 { 3306 ssize_t ret; 3307 DECLARE_WAITQUEUE(wait, current); 3308 struct userfaultfd_wait_queue *uwq; 3309 /* 3310 * Handling fork event requires sleeping operations, so 3311 * we drop the event_wqh lock, then do these ops, then 3312 * lock it back and wake up the waiter. While the lock is 3313 * dropped the ewq may go away so we keep track of it 3314 * carefully. 3315 */ 3316 LIST_HEAD(fork_event); 3317 struct userfaultfd_ctx *fork_nctx = NULL; 3318 3319 /* always take the fd_wqh lock before the fault_pending_wqh lock */ 3320 spin_lock_irq(&ctx->fd_wqh.lock); 3321 __add_wait_queue(&ctx->fd_wqh, &wait); 3322 for (;;) { 3323 set_current_state(TASK_INTERRUPTIBLE); 3324 spin_lock(&ctx->fault_pending_wqh.lock); 3325 uwq = find_userfault(ctx); 3326 if (uwq) { 3327 /* 3328 * Use a seqcount to repeat the lockless check 3329 * in wake_userfault() to avoid missing 3330 * wakeups because during the refile both 3331 * waitqueue could become empty if this is the 3332 * only userfault. 3333 */ 3334 write_seqcount_begin(&ctx->refile_seq); 3335 3336 /* 3337 * The fault_pending_wqh.lock prevents the uwq 3338 * to disappear from under us. 
3339 * 3340 * Refile this userfault from 3341 * fault_pending_wqh to fault_wqh, it's not 3342 * pending anymore after we read it. 3343 * 3344 * Use list_del() by hand (as 3345 * userfaultfd_wake_function also uses 3346 * list_del_init() by hand) to be sure nobody 3347 * changes __remove_wait_queue() to use 3348 * list_del_init() in turn breaking the 3349 * !list_empty_careful() check in 3350 * handle_userfault(). The uwq->wq.head list 3351 * must never be empty at any time during the 3352 * refile, or the waitqueue could disappear 3353 * from under us. The "wait_queue_head_t" 3354 * parameter of __remove_wait_queue() is unused 3355 * anyway. 3356 */ 3357 list_del(&uwq->wq.entry); 3358 add_wait_queue(&ctx->fault_wqh, &uwq->wq); 3359 3360 write_seqcount_end(&ctx->refile_seq); 3361 3362 /* careful to always initialize msg if ret == 0 */ 3363 *msg = uwq->msg; 3364 spin_unlock(&ctx->fault_pending_wqh.lock); 3365 ret = 0; 3366 break; 3367 } 3368 spin_unlock(&ctx->fault_pending_wqh.lock); 3369 3370 spin_lock(&ctx->event_wqh.lock); 3371 uwq = find_userfault_evt(ctx); 3372 if (uwq) { 3373 *msg = uwq->msg; 3374 3375 if (uwq->msg.event == UFFD_EVENT_FORK) { 3376 fork_nctx = (struct userfaultfd_ctx *) 3377 (unsigned long) 3378 uwq->msg.arg.reserved.reserved1; 3379 list_move(&uwq->wq.entry, &fork_event); 3380 /* 3381 * fork_nctx can be freed as soon as 3382 * we drop the lock, unless we take a 3383 * reference on it. 
3384 */ 3385 userfaultfd_ctx_get(fork_nctx); 3386 spin_unlock(&ctx->event_wqh.lock); 3387 ret = 0; 3388 break; 3389 } 3390 3391 userfaultfd_event_complete(ctx, uwq); 3392 spin_unlock(&ctx->event_wqh.lock); 3393 ret = 0; 3394 break; 3395 } 3396 spin_unlock(&ctx->event_wqh.lock); 3397 3398 if (signal_pending(current)) { 3399 ret = -ERESTARTSYS; 3400 break; 3401 } 3402 if (no_wait) { 3403 ret = -EAGAIN; 3404 break; 3405 } 3406 spin_unlock_irq(&ctx->fd_wqh.lock); 3407 schedule(); 3408 spin_lock_irq(&ctx->fd_wqh.lock); 3409 } 3410 __remove_wait_queue(&ctx->fd_wqh, &wait); 3411 __set_current_state(TASK_RUNNING); 3412 spin_unlock_irq(&ctx->fd_wqh.lock); 3413 3414 if (!ret && msg->event == UFFD_EVENT_FORK) { 3415 ret = resolve_userfault_fork(fork_nctx, inode, msg); 3416 spin_lock_irq(&ctx->event_wqh.lock); 3417 if (!list_empty(&fork_event)) { 3418 /* 3419 * The fork thread didn't abort, so we can 3420 * drop the temporary refcount. 3421 */ 3422 userfaultfd_ctx_put(fork_nctx); 3423 3424 uwq = list_first_entry(&fork_event, 3425 typeof(*uwq), 3426 wq.entry); 3427 /* 3428 * If fork_event list wasn't empty and in turn 3429 * the event wasn't already released by fork 3430 * (the event is allocated on fork kernel 3431 * stack), put the event back to its place in 3432 * the event_wq. fork_event head will be freed 3433 * as soon as we return so the event cannot 3434 * stay queued there no matter the current 3435 * "ret" value. 3436 */ 3437 list_del(&uwq->wq.entry); 3438 __add_wait_queue(&ctx->event_wqh, &uwq->wq); 3439 3440 /* 3441 * Leave the event in the waitqueue and report 3442 * error to userland if we failed to resolve 3443 * the userfault fork. 3444 */ 3445 if (likely(!ret)) 3446 userfaultfd_event_complete(ctx, uwq); 3447 } else { 3448 /* 3449 * Here the fork thread aborted and the 3450 * refcount from the fork thread on fork_nctx 3451 * has already been released. We still hold 3452 * the reference we took before releasing the 3453 * lock above. 
If resolve_userfault_fork 3454 * failed we've to drop it because the 3455 * fork_nctx has to be freed in such case. If 3456 * it succeeded we'll hold it because the new 3457 * uffd references it. 3458 */ 3459 if (ret) 3460 userfaultfd_ctx_put(fork_nctx); 3461 } 3462 spin_unlock_irq(&ctx->event_wqh.lock); 3463 } 3464 3465 return ret; 3466 } 3467 3468 static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) 3469 { 3470 struct file *file = iocb->ki_filp; 3471 struct userfaultfd_ctx *ctx = file->private_data; 3472 ssize_t _ret, ret = 0; 3473 struct uffd_msg msg; 3474 struct inode *inode = file_inode(file); 3475 bool no_wait; 3476 3477 if (!userfaultfd_is_initialized(ctx)) 3478 return -EINVAL; 3479 3480 no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; 3481 for (;;) { 3482 if (iov_iter_count(to) < sizeof(msg)) 3483 return ret ? ret : -EINVAL; 3484 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); 3485 if (_ret < 0) 3486 return ret ? ret : _ret; 3487 _ret = !copy_to_iter_full(&msg, sizeof(msg), to); 3488 if (_ret) 3489 return ret ? ret : -EFAULT; 3490 ret += sizeof(msg); 3491 /* 3492 * Allow to read more than one fault at time but only 3493 * block if waiting for the very first one. 
3494 */ 3495 no_wait = true; 3496 } 3497 } 3498 3499 static void __wake_userfault(struct userfaultfd_ctx *ctx, 3500 struct userfaultfd_wake_range *range) 3501 { 3502 spin_lock_irq(&ctx->fault_pending_wqh.lock); 3503 /* wake all in the range and autoremove */ 3504 if (waitqueue_active(&ctx->fault_pending_wqh)) 3505 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 3506 range); 3507 if (waitqueue_active(&ctx->fault_wqh)) 3508 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); 3509 spin_unlock_irq(&ctx->fault_pending_wqh.lock); 3510 } 3511 3512 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, 3513 struct userfaultfd_wake_range *range) 3514 { 3515 unsigned seq; 3516 bool need_wakeup; 3517 3518 /* 3519 * To be sure waitqueue_active() is not reordered by the CPU 3520 * before the pagetable update, use an explicit SMP memory 3521 * barrier here. PT lock release or mmap_read_unlock(mm) still 3522 * have release semantics that can allow the 3523 * waitqueue_active() to be reordered before the pte update. 3524 */ 3525 smp_mb(); 3526 3527 /* 3528 * Use waitqueue_active because it's very frequent to 3529 * change the address space atomically even if there are no 3530 * userfaults yet. So we take the spinlock only when we're 3531 * sure we've userfaults to wake. 
3532 */ 3533 do { 3534 seq = read_seqcount_begin(&ctx->refile_seq); 3535 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || 3536 waitqueue_active(&ctx->fault_wqh); 3537 cond_resched(); 3538 } while (read_seqcount_retry(&ctx->refile_seq, seq)); 3539 if (need_wakeup) 3540 __wake_userfault(ctx, range); 3541 } 3542 3543 static __always_inline int validate_unaligned_range( 3544 struct mm_struct *mm, __u64 start, __u64 len) 3545 { 3546 __u64 task_size = mm->task_size; 3547 3548 if (len & ~PAGE_MASK) 3549 return -EINVAL; 3550 if (!len) 3551 return -EINVAL; 3552 if (start >= task_size) 3553 return -EINVAL; 3554 if (len > task_size - start) 3555 return -EINVAL; 3556 if (start + len <= start) 3557 return -EINVAL; 3558 return 0; 3559 } 3560 3561 static __always_inline int validate_range(struct mm_struct *mm, 3562 __u64 start, __u64 len) 3563 { 3564 if (start & ~PAGE_MASK) 3565 return -EINVAL; 3566 3567 return validate_unaligned_range(mm, start, len); 3568 } 3569 3570 static int userfaultfd_register(struct userfaultfd_ctx *ctx, 3571 unsigned long arg) 3572 { 3573 struct mm_struct *mm = ctx->mm; 3574 struct vm_area_struct *vma, *cur; 3575 int ret; 3576 struct uffdio_register uffdio_register; 3577 struct uffdio_register __user *user_uffdio_register; 3578 vm_flags_t vm_flags; 3579 bool found; 3580 bool basic_ioctls; 3581 unsigned long start, end; 3582 struct vma_iterator vmi; 3583 bool wp_async = userfaultfd_wp_async_ctx(ctx); 3584 3585 user_uffdio_register = (struct uffdio_register __user *) arg; 3586 3587 ret = -EFAULT; 3588 if (copy_from_user(&uffdio_register, user_uffdio_register, 3589 sizeof(uffdio_register)-sizeof(__u64))) 3590 goto out; 3591 3592 ret = -EINVAL; 3593 if (!uffdio_register.mode) 3594 goto out; 3595 if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) 3596 goto out; 3597 vm_flags = 0; 3598 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) 3599 vm_flags |= VM_UFFD_MISSING; 3600 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { 3601 if 
(!pgtable_supports_uffd_wp()) 3602 goto out; 3603 3604 vm_flags |= VM_UFFD_WP; 3605 } 3606 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { 3607 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 3608 goto out; 3609 #endif 3610 vm_flags |= VM_UFFD_MINOR; 3611 } 3612 3613 ret = validate_range(mm, uffdio_register.range.start, 3614 uffdio_register.range.len); 3615 if (ret) 3616 goto out; 3617 3618 start = uffdio_register.range.start; 3619 end = start + uffdio_register.range.len; 3620 3621 ret = -ENOMEM; 3622 if (!mmget_not_zero(mm)) 3623 goto out; 3624 3625 ret = -EINVAL; 3626 mmap_write_lock(mm); 3627 vma_iter_init(&vmi, mm, start); 3628 vma = vma_find(&vmi, end); 3629 if (!vma) 3630 goto out_unlock; 3631 3632 /* 3633 * If the first vma contains huge pages, make sure start address 3634 * is aligned to huge page size. 3635 */ 3636 if (is_vm_hugetlb_page(vma)) { 3637 unsigned long vma_hpagesize = vma_kernel_pagesize(vma); 3638 3639 if (start & (vma_hpagesize - 1)) 3640 goto out_unlock; 3641 } 3642 3643 /* 3644 * Search for not compatible vmas. 3645 */ 3646 found = false; 3647 basic_ioctls = false; 3648 cur = vma; 3649 do { 3650 cond_resched(); 3651 3652 VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ 3653 !!(cur->vm_flags & __VM_UFFD_FLAGS)); 3654 3655 /* check not compatible vmas */ 3656 ret = -EINVAL; 3657 if (!vma_can_userfault(cur, vm_flags, wp_async)) 3658 goto out_unlock; 3659 3660 /* 3661 * UFFDIO_COPY will fill file holes even without 3662 * PROT_WRITE. This check enforces that if this is a 3663 * MAP_SHARED, the process has write permission to the backing 3664 * file. If VM_MAYWRITE is set it also enforces that on a 3665 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further 3666 * F_WRITE_SEAL can be taken until the vma is destroyed. 3667 */ 3668 ret = -EPERM; 3669 if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) 3670 goto out_unlock; 3671 3672 /* 3673 * If this vma contains ending address, and huge pages 3674 * check alignment. 
3675 */ 3676 if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && 3677 end > cur->vm_start) { 3678 unsigned long vma_hpagesize = vma_kernel_pagesize(cur); 3679 3680 ret = -EINVAL; 3681 3682 if (end & (vma_hpagesize - 1)) 3683 goto out_unlock; 3684 } 3685 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) 3686 goto out_unlock; 3687 3688 /* 3689 * Check that this vma isn't already owned by a 3690 * different userfaultfd. We can't allow more than one 3691 * userfaultfd to own a single vma simultaneously or we 3692 * wouldn't know which one to deliver the userfaults to. 3693 */ 3694 ret = -EBUSY; 3695 if (cur->vm_userfaultfd_ctx.ctx && 3696 cur->vm_userfaultfd_ctx.ctx != ctx) 3697 goto out_unlock; 3698 3699 /* 3700 * Note vmas containing huge pages 3701 */ 3702 if (is_vm_hugetlb_page(cur)) 3703 basic_ioctls = true; 3704 3705 found = true; 3706 } for_each_vma_range(vmi, cur, end); 3707 VM_WARN_ON_ONCE(!found); 3708 3709 ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, 3710 wp_async); 3711 3712 out_unlock: 3713 mmap_write_unlock(mm); 3714 mmput(mm); 3715 if (!ret) { 3716 __u64 ioctls_out; 3717 3718 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : 3719 UFFD_API_RANGE_IOCTLS; 3720 3721 /* 3722 * Declare the WP ioctl only if the WP mode is 3723 * specified and all checks passed with the range 3724 */ 3725 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) 3726 ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); 3727 3728 /* CONTINUE ioctl is only supported for MINOR ranges. */ 3729 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) 3730 ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); 3731 3732 /* 3733 * Now that we scanned all vmas we can already tell 3734 * userland which ioctls methods are guaranteed to 3735 * succeed on this range. 
3736 */ 3737 if (put_user(ioctls_out, &user_uffdio_register->ioctls)) 3738 ret = -EFAULT; 3739 } 3740 out: 3741 return ret; 3742 } 3743 3744 static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, 3745 unsigned long arg) 3746 { 3747 struct mm_struct *mm = ctx->mm; 3748 struct vm_area_struct *vma, *prev, *cur; 3749 int ret; 3750 struct uffdio_range uffdio_unregister; 3751 bool found; 3752 unsigned long start, end, vma_end; 3753 const void __user *buf = (void __user *)arg; 3754 struct vma_iterator vmi; 3755 bool wp_async = userfaultfd_wp_async_ctx(ctx); 3756 3757 ret = -EFAULT; 3758 if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) 3759 goto out; 3760 3761 ret = validate_range(mm, uffdio_unregister.start, 3762 uffdio_unregister.len); 3763 if (ret) 3764 goto out; 3765 3766 start = uffdio_unregister.start; 3767 end = start + uffdio_unregister.len; 3768 3769 ret = -ENOMEM; 3770 if (!mmget_not_zero(mm)) 3771 goto out; 3772 3773 mmap_write_lock(mm); 3774 ret = -EINVAL; 3775 vma_iter_init(&vmi, mm, start); 3776 vma = vma_find(&vmi, end); 3777 if (!vma) 3778 goto out_unlock; 3779 3780 /* 3781 * If the first vma contains huge pages, make sure start address 3782 * is aligned to huge page size. 3783 */ 3784 if (is_vm_hugetlb_page(vma)) { 3785 unsigned long vma_hpagesize = vma_kernel_pagesize(vma); 3786 3787 if (start & (vma_hpagesize - 1)) 3788 goto out_unlock; 3789 } 3790 3791 /* 3792 * Search for not compatible vmas. 3793 */ 3794 found = false; 3795 cur = vma; 3796 do { 3797 cond_resched(); 3798 3799 VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ 3800 !!(cur->vm_flags & __VM_UFFD_FLAGS)); 3801 3802 /* 3803 * Prevent unregistering through a different userfaultfd than 3804 * the one used for registration. 
3805 */ 3806 if (cur->vm_userfaultfd_ctx.ctx && 3807 cur->vm_userfaultfd_ctx.ctx != ctx) 3808 goto out_unlock; 3809 3810 /* 3811 * Check not compatible vmas, not strictly required 3812 * here as not compatible vmas cannot have an 3813 * userfaultfd_ctx registered on them, but this 3814 * provides for more strict behavior to notice 3815 * unregistration errors. 3816 */ 3817 if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) 3818 goto out_unlock; 3819 3820 found = true; 3821 } for_each_vma_range(vmi, cur, end); 3822 VM_WARN_ON_ONCE(!found); 3823 3824 vma_iter_set(&vmi, start); 3825 prev = vma_prev(&vmi); 3826 if (vma->vm_start < start) 3827 prev = vma; 3828 3829 ret = 0; 3830 for_each_vma_range(vmi, vma, end) { 3831 cond_resched(); 3832 3833 /* VMA not registered with userfaultfd. */ 3834 if (!vma->vm_userfaultfd_ctx.ctx) 3835 goto skip; 3836 3837 VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); 3838 VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); 3839 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); 3840 3841 if (vma->vm_start > start) 3842 start = vma->vm_start; 3843 vma_end = min(end, vma->vm_end); 3844 3845 if (userfaultfd_missing(vma)) { 3846 /* 3847 * Wake any concurrent pending userfault while 3848 * we unregister, so they will not hang 3849 * permanently and it avoids userland to call 3850 * UFFDIO_WAKE explicitly. 3851 */ 3852 struct userfaultfd_wake_range range; 3853 range.start = start; 3854 range.len = vma_end - start; 3855 wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); 3856 } 3857 3858 vma = userfaultfd_clear_vma(&vmi, prev, vma, 3859 start, vma_end); 3860 if (IS_ERR(vma)) { 3861 ret = PTR_ERR(vma); 3862 break; 3863 } 3864 3865 skip: 3866 prev = vma; 3867 start = vma->vm_end; 3868 } 3869 3870 out_unlock: 3871 mmap_write_unlock(mm); 3872 mmput(mm); 3873 out: 3874 return ret; 3875 } 3876 3877 /* 3878 * userfaultfd_wake may be used in combination with the 3879 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 
3880 */ 3881 static int userfaultfd_wake(struct userfaultfd_ctx *ctx, 3882 unsigned long arg) 3883 { 3884 int ret; 3885 struct uffdio_range uffdio_wake; 3886 struct userfaultfd_wake_range range; 3887 const void __user *buf = (void __user *)arg; 3888 3889 ret = -EFAULT; 3890 if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) 3891 goto out; 3892 3893 ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); 3894 if (ret) 3895 goto out; 3896 3897 range.start = uffdio_wake.start; 3898 range.len = uffdio_wake.len; 3899 3900 /* 3901 * len == 0 means wake all and we don't want to wake all here, 3902 * so check it again to be sure. 3903 */ 3904 VM_WARN_ON_ONCE(!range.len); 3905 3906 wake_userfault(ctx, &range); 3907 ret = 0; 3908 3909 out: 3910 return ret; 3911 } 3912 3913 static int userfaultfd_copy(struct userfaultfd_ctx *ctx, 3914 unsigned long arg) 3915 { 3916 __s64 ret; 3917 struct uffdio_copy uffdio_copy; 3918 struct uffdio_copy __user *user_uffdio_copy; 3919 struct userfaultfd_wake_range range; 3920 uffd_flags_t flags = 0; 3921 3922 user_uffdio_copy = (struct uffdio_copy __user *) arg; 3923 3924 ret = -EAGAIN; 3925 if (unlikely(atomic_read(&ctx->mmap_changing))) { 3926 if (unlikely(put_user(ret, &user_uffdio_copy->copy))) 3927 return -EFAULT; 3928 goto out; 3929 } 3930 3931 ret = -EFAULT; 3932 if (copy_from_user(&uffdio_copy, user_uffdio_copy, 3933 /* don't copy "copy" last field */ 3934 sizeof(uffdio_copy)-sizeof(__s64))) 3935 goto out; 3936 3937 ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, 3938 uffdio_copy.len); 3939 if (ret) 3940 goto out; 3941 ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); 3942 if (ret) 3943 goto out; 3944 3945 ret = -EINVAL; 3946 if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) 3947 goto out; 3948 if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) 3949 flags |= MFILL_ATOMIC_WP; 3950 if (mmget_not_zero(ctx->mm)) { 3951 ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, 
3952 uffdio_copy.len, flags); 3953 mmput(ctx->mm); 3954 } else { 3955 return -ESRCH; 3956 } 3957 if (unlikely(put_user(ret, &user_uffdio_copy->copy))) 3958 return -EFAULT; 3959 if (ret < 0) 3960 goto out; 3961 VM_WARN_ON_ONCE(!ret); 3962 /* len == 0 would wake all */ 3963 range.len = ret; 3964 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { 3965 range.start = uffdio_copy.dst; 3966 wake_userfault(ctx, &range); 3967 } 3968 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; 3969 out: 3970 return ret; 3971 } 3972 3973 static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, 3974 unsigned long arg) 3975 { 3976 __s64 ret; 3977 struct uffdio_zeropage uffdio_zeropage; 3978 struct uffdio_zeropage __user *user_uffdio_zeropage; 3979 struct userfaultfd_wake_range range; 3980 3981 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; 3982 3983 ret = -EAGAIN; 3984 if (unlikely(atomic_read(&ctx->mmap_changing))) { 3985 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) 3986 return -EFAULT; 3987 goto out; 3988 } 3989 3990 ret = -EFAULT; 3991 if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, 3992 /* don't copy "zeropage" last field */ 3993 sizeof(uffdio_zeropage)-sizeof(__s64))) 3994 goto out; 3995 3996 ret = validate_range(ctx->mm, uffdio_zeropage.range.start, 3997 uffdio_zeropage.range.len); 3998 if (ret) 3999 goto out; 4000 ret = -EINVAL; 4001 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) 4002 goto out; 4003 4004 if (mmget_not_zero(ctx->mm)) { 4005 ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, 4006 uffdio_zeropage.range.len); 4007 mmput(ctx->mm); 4008 } else { 4009 return -ESRCH; 4010 } 4011 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) 4012 return -EFAULT; 4013 if (ret < 0) 4014 goto out; 4015 /* len == 0 would wake all */ 4016 VM_WARN_ON_ONCE(!ret); 4017 range.len = ret; 4018 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { 4019 range.start = uffdio_zeropage.range.start; 4020 
wake_userfault(ctx, &range); 4021 } 4022 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; 4023 out: 4024 return ret; 4025 } 4026 4027 static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, 4028 unsigned long arg) 4029 { 4030 int ret; 4031 struct uffdio_writeprotect uffdio_wp; 4032 struct uffdio_writeprotect __user *user_uffdio_wp; 4033 struct userfaultfd_wake_range range; 4034 bool mode_wp, mode_dontwake; 4035 4036 if (atomic_read(&ctx->mmap_changing)) 4037 return -EAGAIN; 4038 4039 user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; 4040 4041 if (copy_from_user(&uffdio_wp, user_uffdio_wp, 4042 sizeof(struct uffdio_writeprotect))) 4043 return -EFAULT; 4044 4045 ret = validate_range(ctx->mm, uffdio_wp.range.start, 4046 uffdio_wp.range.len); 4047 if (ret) 4048 return ret; 4049 4050 if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | 4051 UFFDIO_WRITEPROTECT_MODE_WP)) 4052 return -EINVAL; 4053 4054 mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; 4055 mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; 4056 4057 if (mode_wp && mode_dontwake) 4058 return -EINVAL; 4059 4060 if (mmget_not_zero(ctx->mm)) { 4061 ret = mwriteprotect_range(ctx, uffdio_wp.range.start, 4062 uffdio_wp.range.len, mode_wp); 4063 mmput(ctx->mm); 4064 } else { 4065 return -ESRCH; 4066 } 4067 4068 if (ret) 4069 return ret; 4070 4071 if (!mode_wp && !mode_dontwake) { 4072 range.start = uffdio_wp.range.start; 4073 range.len = uffdio_wp.range.len; 4074 wake_userfault(ctx, &range); 4075 } 4076 return ret; 4077 } 4078 4079 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) 4080 { 4081 __s64 ret; 4082 struct uffdio_continue uffdio_continue; 4083 struct uffdio_continue __user *user_uffdio_continue; 4084 struct userfaultfd_wake_range range; 4085 uffd_flags_t flags = 0; 4086 4087 user_uffdio_continue = (struct uffdio_continue __user *)arg; 4088 4089 ret = -EAGAIN; 4090 if (unlikely(atomic_read(&ctx->mmap_changing))) { 
4091 if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) 4092 return -EFAULT; 4093 goto out; 4094 } 4095 4096 ret = -EFAULT; 4097 if (copy_from_user(&uffdio_continue, user_uffdio_continue, 4098 /* don't copy the output fields */ 4099 sizeof(uffdio_continue) - (sizeof(__s64)))) 4100 goto out; 4101 4102 ret = validate_range(ctx->mm, uffdio_continue.range.start, 4103 uffdio_continue.range.len); 4104 if (ret) 4105 goto out; 4106 4107 ret = -EINVAL; 4108 if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | 4109 UFFDIO_CONTINUE_MODE_WP)) 4110 goto out; 4111 if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) 4112 flags |= MFILL_ATOMIC_WP; 4113 4114 if (mmget_not_zero(ctx->mm)) { 4115 ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, 4116 uffdio_continue.range.len, flags); 4117 mmput(ctx->mm); 4118 } else { 4119 return -ESRCH; 4120 } 4121 4122 if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) 4123 return -EFAULT; 4124 if (ret < 0) 4125 goto out; 4126 4127 /* len == 0 would wake all */ 4128 VM_WARN_ON_ONCE(!ret); 4129 range.len = ret; 4130 if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { 4131 range.start = uffdio_continue.range.start; 4132 wake_userfault(ctx, &range); 4133 } 4134 ret = range.len == uffdio_continue.range.len ? 
0 : -EAGAIN; 4135 4136 out: 4137 return ret; 4138 } 4139 4140 static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) 4141 { 4142 __s64 ret; 4143 struct uffdio_poison uffdio_poison; 4144 struct uffdio_poison __user *user_uffdio_poison; 4145 struct userfaultfd_wake_range range; 4146 4147 user_uffdio_poison = (struct uffdio_poison __user *)arg; 4148 4149 ret = -EAGAIN; 4150 if (unlikely(atomic_read(&ctx->mmap_changing))) { 4151 if (unlikely(put_user(ret, &user_uffdio_poison->updated))) 4152 return -EFAULT; 4153 goto out; 4154 } 4155 4156 ret = -EFAULT; 4157 if (copy_from_user(&uffdio_poison, user_uffdio_poison, 4158 /* don't copy the output fields */ 4159 sizeof(uffdio_poison) - (sizeof(__s64)))) 4160 goto out; 4161 4162 ret = validate_range(ctx->mm, uffdio_poison.range.start, 4163 uffdio_poison.range.len); 4164 if (ret) 4165 goto out; 4166 4167 ret = -EINVAL; 4168 if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) 4169 goto out; 4170 4171 if (mmget_not_zero(ctx->mm)) { 4172 ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, 4173 uffdio_poison.range.len, 0); 4174 mmput(ctx->mm); 4175 } else { 4176 return -ESRCH; 4177 } 4178 4179 if (unlikely(put_user(ret, &user_uffdio_poison->updated))) 4180 return -EFAULT; 4181 if (ret < 0) 4182 goto out; 4183 4184 /* len == 0 would wake all */ 4185 VM_WARN_ON_ONCE(!ret); 4186 range.len = ret; 4187 if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { 4188 range.start = uffdio_poison.range.start; 4189 wake_userfault(ctx, &range); 4190 } 4191 ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; 4192 4193 out: 4194 return ret; 4195 } 4196 4197 bool userfaultfd_wp_async(struct vm_area_struct *vma) 4198 { 4199 return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); 4200 } 4201 4202 static inline unsigned int uffd_ctx_features(__u64 user_features) 4203 { 4204 /* 4205 * For the current set of features the bits just coincide. 
Set 4206 * UFFD_FEATURE_INITIALIZED to mark the features as enabled. 4207 */ 4208 return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; 4209 } 4210 4211 static int userfaultfd_move(struct userfaultfd_ctx *ctx, 4212 unsigned long arg) 4213 { 4214 __s64 ret; 4215 struct uffdio_move uffdio_move; 4216 struct uffdio_move __user *user_uffdio_move; 4217 struct userfaultfd_wake_range range; 4218 struct mm_struct *mm = ctx->mm; 4219 4220 user_uffdio_move = (struct uffdio_move __user *) arg; 4221 4222 ret = -EAGAIN; 4223 if (unlikely(atomic_read(&ctx->mmap_changing))) { 4224 if (unlikely(put_user(ret, &user_uffdio_move->move))) 4225 return -EFAULT; 4226 goto out; 4227 } 4228 4229 if (copy_from_user(&uffdio_move, user_uffdio_move, 4230 /* don't copy "move" last field */ 4231 sizeof(uffdio_move)-sizeof(__s64))) 4232 return -EFAULT; 4233 4234 /* Do not allow cross-mm moves. */ 4235 if (mm != current->mm) 4236 return -EINVAL; 4237 4238 ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); 4239 if (ret) 4240 return ret; 4241 4242 ret = validate_range(mm, uffdio_move.src, uffdio_move.len); 4243 if (ret) 4244 return ret; 4245 4246 if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| 4247 UFFDIO_MOVE_MODE_DONTWAKE)) 4248 return -EINVAL; 4249 4250 if (mmget_not_zero(mm)) { 4251 ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, 4252 uffdio_move.len, uffdio_move.mode); 4253 mmput(mm); 4254 } else { 4255 return -ESRCH; 4256 } 4257 4258 if (unlikely(put_user(ret, &user_uffdio_move->move))) 4259 return -EFAULT; 4260 if (ret < 0) 4261 goto out; 4262 4263 /* len == 0 would wake all */ 4264 VM_WARN_ON(!ret); 4265 range.len = ret; 4266 if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { 4267 range.start = uffdio_move.dst; 4268 wake_userfault(ctx, &range); 4269 } 4270 ret = range.len == uffdio_move.len ? 
0 : -EAGAIN; 4271 4272 out: 4273 return ret; 4274 } 4275 4276 /* 4277 * userland asks for a certain API version and we return which bits 4278 * and ioctl commands are implemented in this kernel for such API 4279 * version or -EINVAL if unknown. 4280 */ 4281 static int userfaultfd_api(struct userfaultfd_ctx *ctx, 4282 unsigned long arg) 4283 { 4284 struct uffdio_api uffdio_api; 4285 void __user *buf = (void __user *)arg; 4286 unsigned int ctx_features; 4287 int ret; 4288 __u64 features; 4289 4290 ret = -EFAULT; 4291 if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) 4292 goto out; 4293 features = uffdio_api.features; 4294 ret = -EINVAL; 4295 if (uffdio_api.api != UFFD_API) 4296 goto err_out; 4297 ret = -EPERM; 4298 if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) 4299 goto err_out; 4300 4301 /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ 4302 if (features & UFFD_FEATURE_WP_ASYNC) 4303 features |= UFFD_FEATURE_WP_UNPOPULATED; 4304 4305 /* report all available features and ioctls to userland */ 4306 uffdio_api.features = UFFD_API_FEATURES; 4307 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 4308 uffdio_api.features &= 4309 ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); 4310 #endif 4311 if (!pgtable_supports_uffd_wp()) 4312 uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; 4313 4314 if (!uffd_supports_wp_marker()) { 4315 uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; 4316 uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; 4317 uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; 4318 } 4319 4320 ret = -EINVAL; 4321 if (features & ~uffdio_api.features) 4322 goto err_out; 4323 4324 uffdio_api.ioctls = UFFD_API_IOCTLS; 4325 ret = -EFAULT; 4326 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) 4327 goto out; 4328 4329 /* only enable the requested features for this uffd context */ 4330 ctx_features = uffd_ctx_features(features); 4331 ret = -EINVAL; 4332 if (cmpxchg(&ctx->features, 0, ctx_features) != 
0) 4333 goto err_out; 4334 4335 ret = 0; 4336 out: 4337 return ret; 4338 err_out: 4339 memset(&uffdio_api, 0, sizeof(uffdio_api)); 4340 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) 4341 ret = -EFAULT; 4342 goto out; 4343 } 4344 4345 static long userfaultfd_ioctl(struct file *file, unsigned cmd, 4346 unsigned long arg) 4347 { 4348 int ret = -EINVAL; 4349 struct userfaultfd_ctx *ctx = file->private_data; 4350 4351 if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) 4352 return -EINVAL; 4353 4354 switch (cmd) { 4355 case UFFDIO_API: 4356 ret = userfaultfd_api(ctx, arg); 4357 break; 4358 case UFFDIO_REGISTER: 4359 ret = userfaultfd_register(ctx, arg); 4360 break; 4361 case UFFDIO_UNREGISTER: 4362 ret = userfaultfd_unregister(ctx, arg); 4363 break; 4364 case UFFDIO_WAKE: 4365 ret = userfaultfd_wake(ctx, arg); 4366 break; 4367 case UFFDIO_COPY: 4368 ret = userfaultfd_copy(ctx, arg); 4369 break; 4370 case UFFDIO_ZEROPAGE: 4371 ret = userfaultfd_zeropage(ctx, arg); 4372 break; 4373 case UFFDIO_MOVE: 4374 ret = userfaultfd_move(ctx, arg); 4375 break; 4376 case UFFDIO_WRITEPROTECT: 4377 ret = userfaultfd_writeprotect(ctx, arg); 4378 break; 4379 case UFFDIO_CONTINUE: 4380 ret = userfaultfd_continue(ctx, arg); 4381 break; 4382 case UFFDIO_POISON: 4383 ret = userfaultfd_poison(ctx, arg); 4384 break; 4385 } 4386 return ret; 4387 } 4388 4389 #ifdef CONFIG_PROC_FS 4390 static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) 4391 { 4392 struct userfaultfd_ctx *ctx = f->private_data; 4393 wait_queue_entry_t *wq; 4394 unsigned long pending = 0, total = 0; 4395 4396 spin_lock_irq(&ctx->fault_pending_wqh.lock); 4397 list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { 4398 pending++; 4399 total++; 4400 } 4401 list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { 4402 total++; 4403 } 4404 spin_unlock_irq(&ctx->fault_pending_wqh.lock); 4405 4406 /* 4407 * If more protocols will be added, there will be all shown 4408 * separated by a space. 
Like this: 4409 * protocols: aa:... bb:... 4410 */ 4411 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", 4412 pending, total, UFFD_API, ctx->features, 4413 UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); 4414 } 4415 #endif 4416 4417 static const struct file_operations userfaultfd_fops = { 4418 #ifdef CONFIG_PROC_FS 4419 .show_fdinfo = userfaultfd_show_fdinfo, 4420 #endif 4421 .release = userfaultfd_release, 4422 .poll = userfaultfd_poll, 4423 .read_iter = userfaultfd_read_iter, 4424 .unlocked_ioctl = userfaultfd_ioctl, 4425 .compat_ioctl = compat_ptr_ioctl, 4426 .llseek = noop_llseek, 4427 }; 4428 4429 static void init_once_userfaultfd_ctx(void *mem) 4430 { 4431 struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; 4432 4433 init_waitqueue_head(&ctx->fault_pending_wqh); 4434 init_waitqueue_head(&ctx->fault_wqh); 4435 init_waitqueue_head(&ctx->event_wqh); 4436 init_waitqueue_head(&ctx->fd_wqh); 4437 seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); 4438 } 4439 4440 static int new_userfaultfd(int flags) 4441 { 4442 struct userfaultfd_ctx *ctx __free(kfree) = NULL; 4443 4444 VM_WARN_ON_ONCE(!current->mm); 4445 4446 /* Check the UFFD_* constants for consistency. 
*/ 4447 BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); 4448 4449 if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) 4450 return -EINVAL; 4451 4452 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); 4453 if (!ctx) 4454 return -ENOMEM; 4455 4456 refcount_set(&ctx->refcount, 1); 4457 ctx->flags = flags; 4458 ctx->features = 0; 4459 ctx->released = false; 4460 init_rwsem(&ctx->map_changing_lock); 4461 atomic_set(&ctx->mmap_changing, 0); 4462 ctx->mm = current->mm; 4463 4464 FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, 4465 anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, 4466 O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), 4467 NULL)); 4468 if (fdf.err) 4469 return fdf.err; 4470 4471 /* prevent the mm struct to be freed */ 4472 mmgrab(ctx->mm); 4473 fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; 4474 retain_and_null_ptr(ctx); 4475 return fd_publish(fdf); 4476 } 4477 4478 static inline bool userfaultfd_syscall_allowed(int flags) 4479 { 4480 /* Userspace-only page faults are always allowed */ 4481 if (flags & UFFD_USER_MODE_ONLY) 4482 return true; 4483 4484 /* 4485 * The user is requesting a userfaultfd which can handle kernel faults. 4486 * Privileged users are always allowed to do this. 4487 */ 4488 if (capable(CAP_SYS_PTRACE)) 4489 return true; 4490 4491 /* Otherwise, access to kernel fault handling is sysctl controlled. 
*/ 4492 return sysctl_unprivileged_userfaultfd; 4493 } 4494 4495 SYSCALL_DEFINE1(userfaultfd, int, flags) 4496 { 4497 if (!userfaultfd_syscall_allowed(flags)) 4498 return -EPERM; 4499 4500 return new_userfaultfd(flags); 4501 } 4502 4503 static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) 4504 { 4505 if (cmd != USERFAULTFD_IOC_NEW) 4506 return -EINVAL; 4507 4508 return new_userfaultfd(flags); 4509 } 4510 4511 static const struct file_operations userfaultfd_dev_fops = { 4512 .unlocked_ioctl = userfaultfd_dev_ioctl, 4513 .compat_ioctl = userfaultfd_dev_ioctl, 4514 .owner = THIS_MODULE, 4515 .llseek = noop_llseek, 4516 }; 4517 4518 static struct miscdevice userfaultfd_misc = { 4519 .minor = MISC_DYNAMIC_MINOR, 4520 .name = "userfaultfd", 4521 .fops = &userfaultfd_dev_fops 4522 }; 4523 4524 static int __init userfaultfd_init(void) 4525 { 4526 int ret; 4527 4528 ret = misc_register(&userfaultfd_misc); 4529 if (ret) 4530 return ret; 4531 4532 userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", 4533 sizeof(struct userfaultfd_ctx), 4534 0, 4535 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4536 init_once_userfaultfd_ctx); 4537 #ifdef CONFIG_SYSCTL 4538 register_sysctl_init("vm", vm_userfaultfd_table); 4539 #endif 4540 return 0; 4541 } 4542 __initcall(userfaultfd_init); 4543