1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * mm/userfaultfd.c
4 *
5 * Copyright (C) 2015 Red Hat, Inc.
6 */
7
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/pagemap.h>
11 #include <linux/rmap.h>
12 #include <linux/swap.h>
13 #include <linux/leafops.h>
14 #include <linux/userfaultfd_k.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/hugetlb.h>
17 #include <linux/shmem_fs.h>
18 #include <asm/tlbflush.h>
19 #include <asm/tlb.h>
20 #include "internal.h"
21 #include "swap.h"
22
23 static __always_inline
24 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
25 {
26 /* Make sure that the dst range is fully within dst_vma. */
27 if (dst_end > dst_vma->vm_end)
28 return false;
29
30 /*
31 * Check the vma is registered in uffd, this is required to
32 * enforce the VM_MAYWRITE check done at uffd registration
33 * time.
34 */
35 if (!dst_vma->vm_userfaultfd_ctx.ctx)
36 return false;
37
38 return true;
39 }
40
41 static __always_inline
42 struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
43 unsigned long addr)
44 {
45 struct vm_area_struct *vma;
46
47 mmap_assert_locked(mm);
48 vma = vma_lookup(mm, addr);
49 if (!vma)
50 vma = ERR_PTR(-ENOENT);
51 else if (!(vma->vm_flags & VM_SHARED) &&
52 unlikely(anon_vma_prepare(vma)))
53 vma = ERR_PTR(-ENOMEM);
54
55 return vma;
56 }
57
58 #ifdef CONFIG_PER_VMA_LOCK
59 /*
60 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
61 * @mm: mm to search vma in.
62 * @address: address that the vma should contain.
63 *
64 * Should be called without holding mmap_lock.
65 *
66 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
67 * -ENOMEM if anon_vma couldn't be allocated.
68 */
69 static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
70 unsigned long address)
71 {
72 struct vm_area_struct *vma;
73
74 vma = lock_vma_under_rcu(mm, address);
75 if (vma) {
76 /*
77 * We know we're going to need to use anon_vma, so check
78 * that early.
79 */
80 if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
81 vma_end_read(vma);
82 else
83 return vma;
84 }
85
86 mmap_read_lock(mm);
87 vma = find_vma_and_prepare_anon(mm, address);
88 if (!IS_ERR(vma)) {
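/* Take the per-VMA read lock while mmap_lock is still held, so the vma stays locked after mmap_lock is dropped. */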
89 bool locked = vma_start_read_locked(vma);
90
91 if (!locked)
92 vma = ERR_PTR(-EAGAIN);
93 }
94
95 mmap_read_unlock(mm);
96 return vma;
97 }
98
99 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
100 unsigned long dst_start,
101 unsigned long len)
102 {
103 struct vm_area_struct *dst_vma;
104
105 dst_vma = uffd_lock_vma(dst_mm, dst_start);
106 if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
107 return dst_vma;
108
109 vma_end_read(dst_vma);
110 return ERR_PTR(-ENOENT);
111 }
112
113 static void uffd_mfill_unlock(struct vm_area_struct *vma)
114 {
115 vma_end_read(vma);
116 }
117
118 #else
119
120 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
121 unsigned long dst_start,
122 unsigned long len)
123 {
124 struct vm_area_struct *dst_vma;
125
126 mmap_read_lock(dst_mm);
127 dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
128 if (IS_ERR(dst_vma))
129 goto out_unlock;
130
131 if (validate_dst_vma(dst_vma, dst_start + len))
132 return dst_vma;
133
134 dst_vma = ERR_PTR(-ENOENT);
135 out_unlock:
136 mmap_read_unlock(dst_mm);
137 return dst_vma;
138 }
139
140 static void uffd_mfill_unlock(struct vm_area_struct *vma)
141 {
142 mmap_read_unlock(vma->vm_mm);
143 }
144 #endif
145
146 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
147 static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
148 unsigned long dst_addr)
149 {
150 struct inode *inode;
151 pgoff_t offset, max_off;
152
153 if (!dst_vma->vm_file)
154 return false;
155
156 inode = dst_vma->vm_file->f_inode;
157 offset = linear_page_index(dst_vma, dst_addr);
158 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
159 return offset >= max_off;
160 }
161
162 /*
163 * Install PTEs, to map dst_addr (within dst_vma) to page.
164 *
165 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
166 * and anon, and for both shared and private VMAs.
167 */
168 int mfill_atomic_install_pte(pmd_t *dst_pmd,
169 struct vm_area_struct *dst_vma,
170 unsigned long dst_addr, struct page *page,
171 bool newly_allocated, uffd_flags_t flags)
172 {
173 int ret;
174 struct mm_struct *dst_mm = dst_vma->vm_mm;
175 pte_t _dst_pte, *dst_pte;
176 bool writable = dst_vma->vm_flags & VM_WRITE;
177 bool vm_shared = dst_vma->vm_flags & VM_SHARED;
178 spinlock_t *ptl;
179 struct folio *folio = page_folio(page);
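/* folio_mapping() is non-NULL only for page-cache (shmem) folios; anon folios return NULL. */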
180 bool page_in_cache = folio_mapping(folio);
181 pte_t dst_ptep;
182
183 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
184 _dst_pte = pte_mkdirty(_dst_pte);
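/* A private mapping must never map the shared page-cache page writable; writes go through CoW. */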
185 if (page_in_cache && !vm_shared)
186 writable = false;
187 if (writable)
188 _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
189 if (flags & MFILL_ATOMIC_WP)
190 _dst_pte = pte_mkuffd_wp(_dst_pte);
191
192 ret = -EAGAIN;
193 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
194 if (!dst_pte)
195 goto out;
196
197 if (mfill_file_over_size(dst_vma, dst_addr)) {
198 ret = -EFAULT;
199 goto out_unlock;
200 }
201
202 ret = -EEXIST;
203
204 dst_ptep = ptep_get(dst_pte);
205
206 /*
207 * We are allowed to overwrite a UFFD pte marker: consider when both
208 * MISSING|WP registered, we first wr-protect a none pte which has no
209 * page cache page backing it, then access the page.
210 */
211 if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
212 goto out_unlock;
213
214 if (page_in_cache) {
215 /* Usually, cache pages are already added to LRU */
216 if (newly_allocated)
217 folio_add_lru(folio);
218 folio_add_file_rmap_pte(folio, page, dst_vma);
219 } else {
220 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
221 folio_add_lru_vma(folio, dst_vma);
222 }
223
224 /*
225 * Must happen after rmap, as mm_counter() checks mapping (via
226 * PageAnon()), which is set by __page_set_anon_rmap().
227 */
228 inc_mm_counter(dst_mm, mm_counter(folio));
229
230 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
231
232 /* No need to invalidate - it was non-present before */
233 update_mmu_cache(dst_vma, dst_addr, dst_pte);
234 ret = 0;
235 out_unlock:
236 pte_unmap_unlock(dst_pte, ptl);
237 out:
238 return ret;
239 }
240
241 static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
242 struct vm_area_struct *dst_vma,
243 unsigned long dst_addr,
244 unsigned long src_addr,
245 uffd_flags_t flags,
246 struct folio **foliop)
247 {
248 void *kaddr;
249 int ret;
250 struct folio *folio;
251
252 if (!*foliop) {
253 ret = -ENOMEM;
254 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
255 dst_addr);
256 if (!folio)
257 goto out;
258
259 kaddr = kmap_local_folio(folio, 0);
260 /*
261 * The read mmap_lock is held here. Despite the
262 * mmap_lock being read-recursive, a deadlock is still
263 * possible if a writer has taken a lock. For example:
264 *
265 * process A thread 1 takes read lock on own mmap_lock
266 * process A thread 2 calls mmap, blocks taking write lock
267 * process B thread 1 takes page fault, read lock on own mmap lock
268 * process B thread 2 calls mmap, blocks taking write lock
269 * process A thread 1 blocks taking read lock on process B
270 * process B thread 1 blocks taking read lock on process A
271 *
272 * Disable page faults to prevent potential deadlock
273 * and retry the copy outside the mmap_lock.
274 */
275 pagefault_disable();
276 ret = copy_from_user(kaddr, (const void __user *) src_addr,
277 PAGE_SIZE);
278 pagefault_enable();
279 kunmap_local(kaddr);
280
281 /* fallback to copy_from_user outside mmap_lock */
282 if (unlikely(ret)) {
283 ret = -ENOENT;
284 *foliop = folio;
285 /* don't free the page */
286 goto out;
287 }
288
289 flush_dcache_folio(folio);
290 } else {
291 folio = *foliop;
292 *foliop = NULL;
293 }
294
295 /*
296 * The memory barrier inside __folio_mark_uptodate makes sure that
297 * preceding stores to the page contents become visible before
298 * the set_pte_at() write.
299 */
300 __folio_mark_uptodate(folio);
301
302 ret = -ENOMEM;
303 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
304 goto out_release;
305
306 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
307 &folio->page, true, flags);
308 if (ret)
309 goto out_release;
310 out:
311 return ret;
312 out_release:
313 folio_put(folio);
314 goto out;
315 }
316
317 static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
318 struct vm_area_struct *dst_vma,
319 unsigned long dst_addr)
320 {
321 struct folio *folio;
322 int ret = -ENOMEM;
323
324 folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
325 if (!folio)
326 return ret;
327
328 if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
329 goto out_put;
330
331 /*
332 * The memory barrier inside __folio_mark_uptodate makes sure that
333 * zeroing out the folio becomes visible before mapping the page
334 * using set_pte_at(). See do_anonymous_page().
335 */
336 __folio_mark_uptodate(folio);
337
338 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
339 &folio->page, true, 0);
340 if (ret)
341 goto out_put;
342
343 return 0;
344 out_put:
345 folio_put(folio);
346 return ret;
347 }
348
349 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
350 struct vm_area_struct *dst_vma,
351 unsigned long dst_addr)
352 {
353 pte_t _dst_pte, *dst_pte;
354 spinlock_t *ptl;
355 int ret;
356
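/* Some setups (e.g. s390 KVM guests) forbid the shared zeropage; fall back to a private zeroed folio. */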
357 if (mm_forbids_zeropage(dst_vma->vm_mm))
358 return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
359
360 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
361 dst_vma->vm_page_prot));
362 ret = -EAGAIN;
363 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
364 if (!dst_pte)
365 goto out;
366 if (mfill_file_over_size(dst_vma, dst_addr)) {
367 ret = -EFAULT;
368 goto out_unlock;
369 }
370 ret = -EEXIST;
371 if (!pte_none(ptep_get(dst_pte)))
372 goto out_unlock;
373 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
374 /* No need to invalidate - it was non-present before */
375 update_mmu_cache(dst_vma, dst_addr, dst_pte);
376 ret = 0;
377 out_unlock:
378 pte_unmap_unlock(dst_pte, ptl);
379 out:
380 return ret;
381 }
382
383 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
384 static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
385 struct vm_area_struct *dst_vma,
386 unsigned long dst_addr,
387 uffd_flags_t flags)
388 {
389 struct inode *inode = file_inode(dst_vma->vm_file);
390 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
391 struct folio *folio;
392 struct page *page;
393 int ret;
394
395 ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
396 /* Our caller expects us to return -EFAULT if we failed to find folio */
397 if (ret == -ENOENT)
398 ret = -EFAULT;
399 if (ret)
400 goto out;
401 if (!folio) {
402 ret = -EFAULT;
403 goto out;
404 }
405
406 page = folio_file_page(folio, pgoff);
407 if (PageHWPoison(page)) {
408 ret = -EIO;
409 goto out_release;
410 }
411
412 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
413 page, false, flags);
414 if (ret)
415 goto out_release;
416
417 folio_unlock(folio);
418 ret = 0;
419 out:
420 return ret;
421 out_release:
422 folio_unlock(folio);
423 folio_put(folio);
424 goto out;
425 }
426
427 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
428 static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
429 struct vm_area_struct *dst_vma,
430 unsigned long dst_addr,
431 uffd_flags_t flags)
432 {
433 int ret;
434 struct mm_struct *dst_mm = dst_vma->vm_mm;
435 pte_t _dst_pte, *dst_pte;
436 spinlock_t *ptl;
437
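/* Install a poison PTE marker: later faults on this address return VM_FAULT_HWPOISON. */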
438 _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
439 ret = -EAGAIN;
440 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
441 if (!dst_pte)
442 goto out;
443
444 if (mfill_file_over_size(dst_vma, dst_addr)) {
445 ret = -EFAULT;
446 goto out_unlock;
447 }
448
449 ret = -EEXIST;
450 /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
451 if (!pte_none(ptep_get(dst_pte)))
452 goto out_unlock;
453
454 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
455
456 /* No need to invalidate - it was non-present before */
457 update_mmu_cache(dst_vma, dst_addr, dst_pte);
458 ret = 0;
459 out_unlock:
460 pte_unmap_unlock(dst_pte, ptl);
461 out:
462 return ret;
463 }
464
465 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
466 {
467 pgd_t *pgd;
468 p4d_t *p4d;
469 pud_t *pud;
470
471 pgd = pgd_offset(mm, address);
472 p4d = p4d_alloc(mm, pgd, address);
473 if (!p4d)
474 return NULL;
475 pud = pud_alloc(mm, p4d, address);
476 if (!pud)
477 return NULL;
478 /*
479 * Note that we didn't run this only because the pmd was
480 * missing; the *pmd may already be established and in
481 * turn it may also be a trans_huge_pmd.
482 */
483 return pmd_alloc(mm, pud, address);
484 }
485
486 #ifdef CONFIG_HUGETLB_PAGE
487 /*
488 * mfill_atomic processing for HUGETLB vmas. Note that this routine is
489 * called with either vma-lock or mmap_lock held, it will release the lock
490 * before returning.
491 */
492 static __always_inline ssize_t mfill_atomic_hugetlb(
493 struct userfaultfd_ctx *ctx,
494 struct vm_area_struct *dst_vma,
495 unsigned long dst_start,
496 unsigned long src_start,
497 unsigned long len,
498 uffd_flags_t flags)
499 {
500 struct mm_struct *dst_mm = dst_vma->vm_mm;
501 ssize_t err;
502 pte_t *dst_pte;
503 unsigned long src_addr, dst_addr;
504 long copied;
505 struct folio *folio;
506 unsigned long vma_hpagesize;
507 pgoff_t idx;
508 u32 hash;
509 struct address_space *mapping;
510
511 /*
512 * There is no default zero huge page for all huge page sizes as
513 * supported by hugetlb. A PMD_SIZE huge page may exist as used
514 * by THP. Since we can not reliably insert a zero page, this
515 * feature is not supported.
516 */
517 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
518 up_read(&ctx->map_changing_lock);
519 uffd_mfill_unlock(dst_vma);
520 return -EINVAL;
521 }
522
523 src_addr = src_start;
524 dst_addr = dst_start;
525 copied = 0;
526 folio = NULL;
527 vma_hpagesize = vma_kernel_pagesize(dst_vma);
528
529 /*
530 * Validate alignment based on huge page size
531 */
532 err = -EINVAL;
533 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
534 goto out_unlock;
535
536 retry:
537 /*
538 * On routine entry dst_vma is set. If we had to drop mmap_lock and
539 * retry, dst_vma will be set to NULL and we must lookup again.
540 */
541 if (!dst_vma) {
542 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
543 if (IS_ERR(dst_vma)) {
544 err = PTR_ERR(dst_vma);
545 goto out;
546 }
547
548 err = -ENOENT;
549 if (!is_vm_hugetlb_page(dst_vma))
550 goto out_unlock_vma;
551
552 err = -EINVAL;
553 if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
554 goto out_unlock_vma;
555
556 /*
557 * If memory mappings are changing because of non-cooperative
558 * operation (e.g. mremap) running in parallel, bail out and
559 * request the user to retry later
560 */
561 down_read(&ctx->map_changing_lock);
562 err = -EAGAIN;
563 if (atomic_read(&ctx->mmap_changing))
564 goto out_unlock;
565 }
566
567 while (src_addr < src_start + len) {
568 VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
569
570 /*
571 * Serialize via vma_lock and hugetlb_fault_mutex.
572 * vma_lock ensures the dst_pte remains valid even
573 * in the case of shared pmds. fault mutex prevents
574 * races with other faulting threads.
575 */
576 idx = linear_page_index(dst_vma, dst_addr);
577 mapping = dst_vma->vm_file->f_mapping;
578 hash = hugetlb_fault_mutex_hash(mapping, idx);
579 mutex_lock(&hugetlb_fault_mutex_table[hash]);
580 hugetlb_vma_lock_read(dst_vma);
581
582 err = -ENOMEM;
583 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
584 if (!dst_pte) {
585 hugetlb_vma_unlock_read(dst_vma);
586 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
587 goto out_unlock;
588 }
589
590 if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
591 const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
592
593 if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
594 err = -EEXIST;
595 hugetlb_vma_unlock_read(dst_vma);
596 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
597 goto out_unlock;
598 }
599 }
600
601 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
602 src_addr, flags, &folio);
603
604 hugetlb_vma_unlock_read(dst_vma);
605 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
606
607 cond_resched();
608
609 if (unlikely(err == -ENOENT)) {
610 up_read(&ctx->map_changing_lock);
611 uffd_mfill_unlock(dst_vma);
612 VM_WARN_ON_ONCE(!folio);
613
614 err = copy_folio_from_user(folio,
615 (const void __user *)src_addr, true);
616 if (unlikely(err)) {
617 err = -EFAULT;
618 goto out;
619 }
620
621 dst_vma = NULL;
622 goto retry;
623 } else
624 VM_WARN_ON_ONCE(folio);
625
626 if (!err) {
627 dst_addr += vma_hpagesize;
628 src_addr += vma_hpagesize;
629 copied += vma_hpagesize;
630
631 if (fatal_signal_pending(current))
632 err = -EINTR;
633 }
634 if (err)
635 break;
636 }
637
638 out_unlock:
639 up_read(&ctx->map_changing_lock);
640 out_unlock_vma:
641 uffd_mfill_unlock(dst_vma);
642 out:
643 if (folio)
644 folio_put(folio);
645 VM_WARN_ON_ONCE(copied < 0);
646 VM_WARN_ON_ONCE(err > 0);
647 VM_WARN_ON_ONCE(!copied && !err);
648 return copied ? copied : err;
649 }
650 #else /* !CONFIG_HUGETLB_PAGE */
651 /* fail at build time if gcc attempts to use this */
652 extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
653 struct vm_area_struct *dst_vma,
654 unsigned long dst_start,
655 unsigned long src_start,
656 unsigned long len,
657 uffd_flags_t flags);
658 #endif /* CONFIG_HUGETLB_PAGE */
659
660 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
661 struct vm_area_struct *dst_vma,
662 unsigned long dst_addr,
663 unsigned long src_addr,
664 uffd_flags_t flags,
665 struct folio **foliop)
666 {
667 ssize_t err;
668
669 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
670 return mfill_atomic_pte_continue(dst_pmd, dst_vma,
671 dst_addr, flags);
672 } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
673 return mfill_atomic_pte_poison(dst_pmd, dst_vma,
674 dst_addr, flags);
675 }
676
677 /*
678 * The normal page fault path for a shmem will invoke the
679 * fault, fill the hole in the file and COW it right away. The
680 * result generates plain anonymous memory. So when we are
681 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
682 * generate anonymous memory directly without actually filling
683 * the hole. For the MAP_PRIVATE case the robustness check
684 * only happens in the pagetable (to verify it's still none)
685 * and not in the radix tree.
686 */
687 if (!(dst_vma->vm_flags & VM_SHARED)) {
688 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
689 err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
690 dst_addr, src_addr,
691 flags, foliop);
692 else
693 err = mfill_atomic_pte_zeropage(dst_pmd,
694 dst_vma, dst_addr);
695 } else {
696 err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
697 dst_addr, src_addr,
698 flags, foliop);
699 }
700
701 return err;
702 }
703
704 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
705 unsigned long dst_start,
706 unsigned long src_start,
707 unsigned long len,
708 uffd_flags_t flags)
709 {
710 struct mm_struct *dst_mm = ctx->mm;
711 struct vm_area_struct *dst_vma;
712 ssize_t err;
713 pmd_t *dst_pmd;
714 unsigned long src_addr, dst_addr;
715 long copied;
716 struct folio *folio;
717
718 /*
719 * Sanitize the command parameters:
720 */
721 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
722 VM_WARN_ON_ONCE(len & ~PAGE_MASK);
723
724 /* Does the address range wrap, or is the span zero-sized? */
725 VM_WARN_ON_ONCE(src_start + len <= src_start);
726 VM_WARN_ON_ONCE(dst_start + len <= dst_start);
727
728 src_addr = src_start;
729 dst_addr = dst_start;
730 copied = 0;
731 folio = NULL;
732 retry:
733 /*
734 * Make sure the vma is not shared, that the dst range is
735 * both valid and fully within a single existing vma.
736 */
737 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
738 if (IS_ERR(dst_vma)) {
739 err = PTR_ERR(dst_vma);
740 goto out;
741 }
742
743 /*
744 * If memory mappings are changing because of non-cooperative
745 * operation (e.g. mremap) running in parallel, bail out and
746 * request the user to retry later
747 */
748 down_read(&ctx->map_changing_lock);
749 err = -EAGAIN;
750 if (atomic_read(&ctx->mmap_changing))
751 goto out_unlock;
752
753 err = -EINVAL;
754 /*
755 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
756 * it will overwrite vm_ops, so vma_is_anonymous must return false.
757 */
758 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
759 dst_vma->vm_flags & VM_SHARED))
760 goto out_unlock;
761
762 /*
763 * validate 'mode' now that we know the dst_vma: don't allow
764 * a wrprotect copy if the userfaultfd didn't register as WP.
765 */
766 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
767 goto out_unlock;
768
769 /*
770 * If this is a HUGETLB vma, pass off to appropriate routine
771 */
772 if (is_vm_hugetlb_page(dst_vma))
773 return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
774 src_start, len, flags);
775
776 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
777 goto out_unlock;
778 if (!vma_is_shmem(dst_vma) &&
779 uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
780 goto out_unlock;
781
782 while (src_addr < src_start + len) {
783 pmd_t dst_pmdval;
784
785 VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
786
787 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
788 if (unlikely(!dst_pmd)) {
789 err = -ENOMEM;
790 break;
791 }
792
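/* Allocate a PTE page table if the PMD is still empty, then re-read the PMD since it may have changed concurrently. */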
793 dst_pmdval = pmdp_get_lockless(dst_pmd);
794 if (unlikely(pmd_none(dst_pmdval)) &&
795 unlikely(__pte_alloc(dst_mm, dst_pmd))) {
796 err = -ENOMEM;
797 break;
798 }
799 dst_pmdval = pmdp_get_lockless(dst_pmd);
800 /*
801 * If the dst_pmd is THP don't override it and just be strict.
802 * (This includes the case where the PMD used to be THP and
803 * changed back to none after __pte_alloc().)
804 */
805 if (unlikely(!pmd_present(dst_pmdval) ||
806 pmd_trans_huge(dst_pmdval))) {
807 err = -EEXIST;
808 break;
809 }
810 if (unlikely(pmd_bad(dst_pmdval))) {
811 err = -EFAULT;
812 break;
813 }
814 /*
815 * For shmem mappings, khugepaged is allowed to remove page
816 * tables under us; pte_offset_map_lock() will deal with that.
817 */
818
819 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
820 src_addr, flags, &folio);
821 cond_resched();
822
823 if (unlikely(err == -ENOENT)) {
824 void *kaddr;
825
826 up_read(&ctx->map_changing_lock);
827 uffd_mfill_unlock(dst_vma);
828 VM_WARN_ON_ONCE(!folio);
829
830 kaddr = kmap_local_folio(folio, 0);
831 err = copy_from_user(kaddr,
832 (const void __user *) src_addr,
833 PAGE_SIZE);
834 kunmap_local(kaddr);
835 if (unlikely(err)) {
836 err = -EFAULT;
837 goto out;
838 }
839 flush_dcache_folio(folio);
840 goto retry;
841 } else
842 VM_WARN_ON_ONCE(folio);
843
844 if (!err) {
845 dst_addr += PAGE_SIZE;
846 src_addr += PAGE_SIZE;
847 copied += PAGE_SIZE;
848
849 if (fatal_signal_pending(current))
850 err = -EINTR;
851 }
852 if (err)
853 break;
854 }
855
856 out_unlock:
857 up_read(&ctx->map_changing_lock);
858 uffd_mfill_unlock(dst_vma);
859 out:
860 if (folio)
861 folio_put(folio);
862 VM_WARN_ON_ONCE(copied < 0);
863 VM_WARN_ON_ONCE(err > 0);
864 VM_WARN_ON_ONCE(!copied && !err);
865 return copied ? copied : err;
866 }
867
868 ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
869 unsigned long src_start, unsigned long len,
870 uffd_flags_t flags)
871 {
872 return mfill_atomic(ctx, dst_start, src_start, len,
873 uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
874 }
875
876 ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
877 unsigned long start,
878 unsigned long len)
879 {
880 return mfill_atomic(ctx, start, 0, len,
881 uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
882 }
883
884 ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
885 unsigned long len, uffd_flags_t flags)
886 {
887
888 /*
889 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
890 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
891 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
892 * subsequent loads from the page through the newly mapped address range.
893 */
894 smp_wmb();
895
896 return mfill_atomic(ctx, start, 0, len,
897 uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
898 }
899
900 ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
901 unsigned long len, uffd_flags_t flags)
902 {
903 return mfill_atomic(ctx, start, 0, len,
904 uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
905 }
906
907 long uffd_wp_range(struct vm_area_struct *dst_vma,
908 unsigned long start, unsigned long len, bool enable_wp)
909 {
910 unsigned int mm_cp_flags;
911 struct mmu_gather tlb;
912 long ret;
913
914 VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
915 "The address range exceeds VMA boundary.\n");
916 if (enable_wp)
917 mm_cp_flags = MM_CP_UFFD_WP;
918 else
919 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
920
921 /*
922 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
923 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
924 * to be write-protected as default whenever protection changes.
925 * Try upgrading write permissions manually.
926 */
927 if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
928 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
929 tlb_gather_mmu(&tlb, dst_vma->vm_mm);
930 ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
931 tlb_finish_mmu(&tlb);
932
933 return ret;
934 }
935
936 int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
937 unsigned long len, bool enable_wp)
938 {
939 struct mm_struct *dst_mm = ctx->mm;
940 unsigned long end = start + len;
941 unsigned long _start, _end;
942 struct vm_area_struct *dst_vma;
943 unsigned long page_mask;
944 long err;
945 VMA_ITERATOR(vmi, dst_mm, start);
946
947 /*
948 * Sanitize the command parameters:
949 */
950 VM_WARN_ON_ONCE(start & ~PAGE_MASK);
951 VM_WARN_ON_ONCE(len & ~PAGE_MASK);
952
953 /* Does the address range wrap, or is the span zero-sized? */
954 VM_WARN_ON_ONCE(start + len <= start);
955
956 mmap_read_lock(dst_mm);
957
958 /*
959 * If memory mappings are changing because of non-cooperative
960 * operation (e.g. mremap) running in parallel, bail out and
961 * request the user to retry later
962 */
963 down_read(&ctx->map_changing_lock);
964 err = -EAGAIN;
965 if (atomic_read(&ctx->mmap_changing))
966 goto out_unlock;
967
968 err = -ENOENT;
969 for_each_vma_range(vmi, dst_vma, end) {
970
971 if (!userfaultfd_wp(dst_vma)) {
972 err = -ENOENT;
973 break;
974 }
975
976 if (is_vm_hugetlb_page(dst_vma)) {
977 err = -EINVAL;
978 page_mask = vma_kernel_pagesize(dst_vma) - 1;
979 if ((start & page_mask) || (len & page_mask))
980 break;
981 }
982
983 _start = max(dst_vma->vm_start, start);
984 _end = min(dst_vma->vm_end, end);
985
986 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
987
988 /* Return 0 on success, <0 on failures */
989 if (err < 0)
990 break;
991 err = 0;
992 }
993 out_unlock:
994 up_read(&ctx->map_changing_lock);
995 mmap_read_unlock(dst_mm);
996 return err;
997 }
998
999
1000 void double_pt_lock(spinlock_t *ptl1,
1001 spinlock_t *ptl2)
1002 __acquires(ptl1)
1003 __acquires(ptl2)
1004 {
1005 if (ptl1 > ptl2)
1006 swap(ptl1, ptl2);
1007 /* lock in virtual address order to avoid lock inversion */
1008 spin_lock(ptl1);
1009 if (ptl1 != ptl2)
1010 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
1011 else
1012 __acquire(ptl2);
1013 }
1014
1015 void double_pt_unlock(spinlock_t *ptl1,
1016 spinlock_t *ptl2)
1017 __releases(ptl1)
1018 __releases(ptl2)
1019 {
1020 spin_unlock(ptl1);
1021 if (ptl1 != ptl2)
1022 spin_unlock(ptl2);
1023 else
1024 __release(ptl2);
1025 }
1026
1027 static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
1028 pte_t orig_dst_pte, pte_t orig_src_pte,
1029 pmd_t *dst_pmd, pmd_t dst_pmdval)
1030 {
1031 return pte_same(ptep_get(src_pte), orig_src_pte) &&
1032 pte_same(ptep_get(dst_pte), orig_dst_pte) &&
1033 pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
1034 }
1035
1036 /*
1037 * Checks if the two ptes and the corresponding folio are eligible for batched
1038 * move. If so, returns a pointer to the locked folio. Otherwise, returns NULL.
1039 *
1040 * NOTE: folio's reference is not required as the whole operation is within
1041 * PTL's critical section.
1042 */
1043 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
1044 unsigned long src_addr,
1045 pte_t *src_pte, pte_t *dst_pte)
1046 {
1047 pte_t orig_dst_pte, orig_src_pte;
1048 struct folio *folio;
1049
1050 orig_dst_pte = ptep_get(dst_pte);
1051 if (!pte_none(orig_dst_pte))
1052 return NULL;
1053
1054 orig_src_pte = ptep_get(src_pte);
1055 if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
1056 return NULL;
1057
1058 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1059 if (!folio || !folio_trylock(folio))
1060 return NULL;
1061 if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
1062 folio_unlock(folio);
1063 return NULL;
1064 }
1065 return folio;
1066 }
1067
1068 /*
1069 * Moves src folios to dst in a batch as long as they are not large, and can
1070 * successfully take the lock via folio_trylock().
1071 */
1072 static long move_present_ptes(struct mm_struct *mm,
1073 struct vm_area_struct *dst_vma,
1074 struct vm_area_struct *src_vma,
1075 unsigned long dst_addr, unsigned long src_addr,
1076 pte_t *dst_pte, pte_t *src_pte,
1077 pte_t orig_dst_pte, pte_t orig_src_pte,
1078 pmd_t *dst_pmd, pmd_t dst_pmdval,
1079 spinlock_t *dst_ptl, spinlock_t *src_ptl,
1080 struct folio **first_src_folio, unsigned long len)
1081 {
1082 int err = 0;
1083 struct folio *src_folio = *first_src_folio;
1084 unsigned long src_start = src_addr;
1085 unsigned long src_end;
1086
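/* Clamp the batch so it never crosses a PMD boundary on either the source or the destination side. */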
1087 len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
1088 src_end = pmd_addr_end(src_addr, src_addr + len);
1089 flush_cache_range(src_vma, src_addr, src_end);
1090 double_pt_lock(dst_ptl, src_ptl);
1091
1092 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1093 dst_pmd, dst_pmdval)) {
1094 err = -EAGAIN;
1095 goto out;
1096 }
1097 if (folio_test_large(src_folio) ||
1098 folio_maybe_dma_pinned(src_folio) ||
1099 !PageAnonExclusive(&src_folio->page)) {
1100 err = -EBUSY;
1101 goto out;
1102 }
1103 /* It's safe to drop the reference now as the page-table is holding one. */
1104 folio_put(*first_src_folio);
1105 *first_src_folio = NULL;
1106 arch_enter_lazy_mmu_mode();
1107
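/* Move PTEs one page at a time until the PMD-bounded batch ends or the next source folio is not eligible. */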
1108 while (true) {
1109 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1110 /* Folio got pinned from under us. Put it back and fail the move. */
1111 if (folio_maybe_dma_pinned(src_folio)) {
1112 set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1113 err = -EBUSY;
1114 break;
1115 }
1116
1117 folio_move_anon_rmap(src_folio, dst_vma);
1118 src_folio->index = linear_page_index(dst_vma, dst_addr);
1119
1120 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
1121 /* Set soft dirty bit so userspace can notice the pte was moved */
1122 if (pgtable_supports_soft_dirty())
1123 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
1124 if (pte_dirty(orig_src_pte))
1125 orig_dst_pte = pte_mkdirty(orig_dst_pte);
1126 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
1127 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1128
1129 src_addr += PAGE_SIZE;
1130 if (src_addr == src_end)
1131 break;
1132 dst_addr += PAGE_SIZE;
1133 dst_pte++;
1134 src_pte++;
1135
1136 folio_unlock(src_folio);
1137 src_folio = check_ptes_for_batched_move(src_vma, src_addr,
1138 src_pte, dst_pte);
1139 if (!src_folio)
1140 break;
1141 }
1142
1143 arch_leave_lazy_mmu_mode();
1144 if (src_addr > src_start)
1145 flush_tlb_range(src_vma, src_start, src_addr);
1146
1147 if (src_folio)
1148 folio_unlock(src_folio);
1149 out:
1150 double_pt_unlock(dst_ptl, src_ptl);
1151 return src_addr > src_start ? src_addr - src_start : err;
1152 }
1153
1154 static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
1155 unsigned long dst_addr, unsigned long src_addr,
1156 pte_t *dst_pte, pte_t *src_pte,
1157 pte_t orig_dst_pte, pte_t orig_src_pte,
1158 pmd_t *dst_pmd, pmd_t dst_pmdval,
1159 spinlock_t *dst_ptl, spinlock_t *src_ptl,
1160 struct folio *src_folio,
1161 struct swap_info_struct *si, swp_entry_t entry)
1162 {
1163 /*
1164 * Check if the folio still belongs to the target swap entry after
1165 * acquiring the lock. Folio can be freed in the swap cache while
1166 * not locked.
1167 */
1168 if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
1169 entry.val != src_folio->swap.val))
1170 return -EAGAIN;
1171
1172 double_pt_lock(dst_ptl, src_ptl);
1173
1174 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1175 dst_pmd, dst_pmdval)) {
1176 double_pt_unlock(dst_ptl, src_ptl);
1177 return -EAGAIN;
1178 }
1179
1180 /*
1181 * The src_folio resides in the swapcache, requiring an update to its
1182 * index and mapping to align with the dst_vma, where a swap-in may
1183 * occur and hit the swapcache after moving the PTE.
1184 */
1185 if (src_folio) {
1186 folio_move_anon_rmap(src_folio, dst_vma);
1187 src_folio->index = linear_page_index(dst_vma, dst_addr);
1188 } else {
1189 /*
1190 * Check if the swap entry is cached after acquiring the src_pte
1191 * lock. Otherwise, we might miss a newly loaded swap cache folio.
1192 *
1193 * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
1194 * We are trying to catch newly added swap cache, the only possible case is
1195 * when a folio is swapped in and out again staying in swap cache, using the
1196 * same entry before the PTE check above. The PTL is acquired and released
1197 * twice, each time after updating the swap_map's flag. So holding
1198 * the PTL here ensures we see the updated value. False positive is possible,
1199 * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
1200 * cache, or during the tiny synchronization window between swap cache and
1201 * swap_map, but it will be gone very quickly, worst result is retry jitters.
1202 */
1203 if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
1204 double_pt_unlock(dst_ptl, src_ptl);
1205 return -EAGAIN;
1206 }
1207 }
1208
1209 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1210 if (pgtable_supports_soft_dirty())
1211 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
1212 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1213 double_pt_unlock(dst_ptl, src_ptl);
1214
1215 return PAGE_SIZE;
1216 }
1217
1218 static int move_zeropage_pte(struct mm_struct *mm,
1219 struct vm_area_struct *dst_vma,
1220 struct vm_area_struct *src_vma,
1221 unsigned long dst_addr, unsigned long src_addr,
1222 pte_t *dst_pte, pte_t *src_pte,
1223 pte_t orig_dst_pte, pte_t orig_src_pte,
1224 pmd_t *dst_pmd, pmd_t dst_pmdval,
1225 spinlock_t *dst_ptl, spinlock_t *src_ptl)
1226 {
1227 pte_t zero_pte;
1228
1229 double_pt_lock(dst_ptl, src_ptl);
1230 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
1231 dst_pmd, dst_pmdval)) {
1232 double_pt_unlock(dst_ptl, src_ptl);
1233 return -EAGAIN;
1234 }
1235
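/* Both PTEs are unchanged: clear the source zero-page mapping and install a zero-page PTE at the destination. */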
1236 zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
1237 dst_vma->vm_page_prot));
1238 ptep_clear_flush(src_vma, src_addr, src_pte);
1239 set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1240 double_pt_unlock(dst_ptl, src_ptl);
1241
1242 return PAGE_SIZE;
1243 }
1244
1245
1246 /*
1247 * The mmap_lock for reading is held by the caller. Just move the page(s)
1248 * from src_pmd to dst_pmd if possible, and return number of bytes moved.
1249 * On failure, an error code is returned.
1250 */
1251 static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
1252 struct vm_area_struct *dst_vma,
1253 struct vm_area_struct *src_vma,
1254 unsigned long dst_addr, unsigned long src_addr,
1255 unsigned long len, __u64 mode)
1256 {
1257 struct swap_info_struct *si = NULL;
1258 pte_t orig_src_pte, orig_dst_pte;
1259 pte_t src_folio_pte;
1260 spinlock_t *src_ptl, *dst_ptl;
1261 pte_t *src_pte = NULL;
1262 pte_t *dst_pte = NULL;
1263 pmd_t dummy_pmdval;
1264 pmd_t dst_pmdval;
1265 struct folio *src_folio = NULL;
1266 struct mmu_notifier_range range;
1267 long ret = 0;
1268
1269 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1270 src_addr, src_addr + len);
1271 mmu_notifier_invalidate_range_start(&range);
1272 retry:
1273 /*
1274 * Use the maywrite version to indicate that dst_pte will be modified,
1275 * since dst_pte needs to be none, the subsequent pte_same() check
1276 * cannot prevent the dst_pte page from being freed concurrently, so we
1277 * also need to obtain dst_pmdval and recheck pmd_same() later.
1278 */
1279 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
1280 &dst_ptl);
1281
1282 /* Retry if a huge pmd materialized from under us */
1283 if (unlikely(!dst_pte)) {
1284 ret = -EAGAIN;
1285 goto out;
1286 }
1287
1288 /*
1289 * Unlike dst_pte, the subsequent pte_same() check can ensure the
1290 * stability of the src_pte page, so there is no need to get pmdval,
1291 * just pass a dummy variable to it.
1292 */
1293 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
1294 &src_ptl);
1295
1296 /*
1297 * We held the mmap_lock for reading so MADV_DONTNEED
1298 * can zap transparent huge pages under us, or the
1299 * transparent huge page fault can establish new
1300 * transparent huge pages under us.
1301 */
1302 if (unlikely(!src_pte)) {
1303 ret = -EAGAIN;
1304 goto out;
1305 }
1306
1307 /* Sanity checks before the operation */
1308 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
1309 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
1310 ret = -EINVAL;
1311 goto out;
1312 }
1313
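/* Snapshot both PTEs under their page table locks: the destination must be empty and the source populated. */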
1314 spin_lock(dst_ptl);
1315 orig_dst_pte = ptep_get(dst_pte);
1316 spin_unlock(dst_ptl);
1317 if (!pte_none(orig_dst_pte)) {
1318 ret = -EEXIST;
1319 goto out;
1320 }
1321
1322 spin_lock(src_ptl);
1323 orig_src_pte = ptep_get(src_pte);
1324 spin_unlock(src_ptl);
1325 if (pte_none(orig_src_pte)) {
1326 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1327 ret = -ENOENT;
1328 else /* nothing to do to move a hole */
1329 ret = PAGE_SIZE;
1330 goto out;
1331 }
1332
1333 /* If the PTE changed after we locked the folio then start over */
1334 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1335 ret = -EAGAIN;
1336 goto out;
1337 }
1338
1339 if (pte_present(orig_src_pte)) {
1340 if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1341 ret = move_zeropage_pte(mm, dst_vma, src_vma,
1342 dst_addr, src_addr, dst_pte, src_pte,
1343 orig_dst_pte, orig_src_pte,
1344 dst_pmd, dst_pmdval, dst_ptl, src_ptl);
1345 goto out;
1346 }
1347
1348 /*
1349 * Pin and lock source folio. Since we are in RCU read section,
1350 * we can't block, so on contention have to unmap the ptes,
1351 * obtain the lock and retry.
1352 */
1353 if (!src_folio) {
1354 struct folio *folio;
1355 bool locked;
1356
1357 /*
1358 * Pin the page while holding the lock to be sure the
1359 * page isn't freed under us
1360 */
1361 spin_lock(src_ptl);
1362 if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1363 spin_unlock(src_ptl);
1364 ret = -EAGAIN;
1365 goto out;
1366 }
1367
1368 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1369 if (!folio || !PageAnonExclusive(&folio->page)) {
1370 spin_unlock(src_ptl);
1371 ret = -EBUSY;
1372 goto out;
1373 }
1374
1375 locked = folio_trylock(folio);
1376 /*
1377 * We avoid waiting for folio lock with a raised
1378 * refcount for large folios because extra refcounts
1379 * will result in split_folio() failing later and
1380 * retrying. If multiple tasks are trying to move a
1381 * large folio we can end up livelocking.
1382 */
1383 if (!locked && folio_test_large(folio)) {
1384 spin_unlock(src_ptl);
1385 ret = -EAGAIN;
1386 goto out;
1387 }
1388
1389 folio_get(folio);
1390 src_folio = folio;
1391 src_folio_pte = orig_src_pte;
1392 spin_unlock(src_ptl);
1393
1394 if (!locked) {
1395 pte_unmap(src_pte);
1396 pte_unmap(dst_pte);
1397 src_pte = dst_pte = NULL;
1398 /* now we can block and wait */
1399 folio_lock(src_folio);
1400 goto retry;
1401 }
1402
1403 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1404 ret = -EBUSY;
1405 goto out;
1406 }
1407 }
1408
1409 /* at this point we have src_folio locked */
1410 if (folio_test_large(src_folio)) {
1411 /* split_folio() can block */
1412 pte_unmap(src_pte);
1413 pte_unmap(dst_pte);
1414 src_pte = dst_pte = NULL;
1415 ret = split_folio(src_folio);
1416 if (ret)
1417 goto out;
1418 /* have to reacquire the folio after it got split */
1419 folio_unlock(src_folio);
1420 folio_put(src_folio);
1421 src_folio = NULL;
1422 goto retry;
1423 }
1424
1425 ret = move_present_ptes(mm, dst_vma, src_vma,
1426 dst_addr, src_addr, dst_pte, src_pte,
1427 orig_dst_pte, orig_src_pte, dst_pmd,
1428 dst_pmdval, dst_ptl, src_ptl, &src_folio,
1429 len);
1430 } else { /* !pte_present() */
1431 struct folio *folio = NULL;
1432 const softleaf_t entry = softleaf_from_pte(orig_src_pte);
1433
1434 if (softleaf_is_migration(entry)) {
1435 pte_unmap(src_pte);
1436 pte_unmap(dst_pte);
1437 src_pte = dst_pte = NULL;
1438 migration_entry_wait(mm, src_pmd, src_addr);
1439
1440 ret = -EAGAIN;
1441 goto out;
1442 } else if (!softleaf_is_swap(entry)) {
1443 ret = -EFAULT;
1444 goto out;
1445 }
1446
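/* Only an exclusive swap entry (the page was anon-exclusive when unmapped) may be moved. */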
1447 if (!pte_swp_exclusive(orig_src_pte)) {
1448 ret = -EBUSY;
1449 goto out;
1450 }
1451
1452 si = get_swap_device(entry);
1453 if (unlikely(!si)) {
1454 ret = -EAGAIN;
1455 goto out;
1456 }
1457 /*
1458 * Verify the existence of the swapcache. If present, the folio's
1459 * index and mapping must be updated even when the PTE is a swap
1460 * entry. The anon_vma lock is not taken during this process since
1461 * the folio has already been unmapped, and the swap entry is
1462 * exclusive, preventing rmap walks.
1463 *
1464 * For large folios, return -EBUSY immediately, as split_folio()
1465 * also returns -EBUSY when attempting to split unmapped large
1466 * folios in the swapcache. This issue needs to be resolved
1467 * separately to allow proper handling.
1468 */
1469 if (!src_folio)
1470 folio = swap_cache_get_folio(entry);
1471 if (folio) {
1472 if (folio_test_large(folio)) {
1473 ret = -EBUSY;
1474 folio_put(folio);
1475 goto out;
1476 }
1477 src_folio = folio;
1478 src_folio_pte = orig_src_pte;
1479 if (!folio_trylock(src_folio)) {
1480 pte_unmap(src_pte);
1481 pte_unmap(dst_pte);
1482 src_pte = dst_pte = NULL;
1483 put_swap_device(si);
1484 si = NULL;
1485 /* now we can block and wait */
1486 folio_lock(src_folio);
1487 goto retry;
1488 }
1489 }
1490 ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
1491 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
1492 dst_ptl, src_ptl, src_folio, si, entry);
1493 }
1494
1495 out:
1496 if (src_folio) {
1497 folio_unlock(src_folio);
1498 folio_put(src_folio);
1499 }
1500 /*
1501 * Unmap in reverse order (LIFO) to maintain proper kmap_local
1502 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
1503 * first, then src_pte, so we must unmap src_pte first, then dst_pte.
1504 */
1505 if (src_pte)
1506 pte_unmap(src_pte);
1507 if (dst_pte)
1508 pte_unmap(dst_pte);
1509 mmu_notifier_invalidate_range_end(&range);
1510 if (si)
1511 put_swap_device(si);
1512
1513 return ret;
1514 }
1515
1516 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1517 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1518 unsigned long src_addr,
1519 unsigned long src_end)
1520 {
1521 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1522 src_end - src_addr < HPAGE_PMD_SIZE;
1523 }
1524 #else
1525 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1526 unsigned long src_addr,
1527 unsigned long src_end)
1528 {
1529 /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
1530 return false;
1531 }
1532 #endif
1533
1534 static inline bool vma_move_compatible(struct vm_area_struct *vma)
1535 {
1536 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
1537 VM_MIXEDMAP | VM_SHADOW_STACK));
1538 }
1539
1540 static int validate_move_areas(struct userfaultfd_ctx *ctx,
1541 struct vm_area_struct *src_vma,
1542 struct vm_area_struct *dst_vma)
1543 {
1544 /* Only allow moving if both have the same access and protection */
1545 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1546 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1547 return -EINVAL;
1548
1549 /* Only allow moving if both are mlocked or both aren't */
1550 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1551 return -EINVAL;
1552
1553 /*
1554 * For now, we keep it simple and only move between writable VMAs.
1555 * Access flags are equal, therefore checking only the source is enough.
1556 */
1557 if (!(src_vma->vm_flags & VM_WRITE))
1558 return -EINVAL;
1559
1560 /* Check if vma flags indicate content which can be moved */
1561 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1562 return -EINVAL;
1563
1564 /* Ensure dst_vma is registered in uffd we are operating on */
1565 if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1566 dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1567 return -EINVAL;
1568
1569 /* Only allow moving across anonymous vmas */
1570 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1571 return -EINVAL;
1572
1573 return 0;
1574 }
1575
1576 static __always_inline
1577 int find_vmas_mm_locked(struct mm_struct *mm,
1578 unsigned long dst_start,
1579 unsigned long src_start,
1580 struct vm_area_struct **dst_vmap,
1581 struct vm_area_struct **src_vmap)
1582 {
1583 struct vm_area_struct *vma;
1584
1585 mmap_assert_locked(mm);
1586 vma = find_vma_and_prepare_anon(mm, dst_start);
1587 if (IS_ERR(vma))
1588 return PTR_ERR(vma);
1589
1590 *dst_vmap = vma;
1591 /* Skip finding src_vma if src_start is in dst_vma */
1592 if (src_start >= vma->vm_start && src_start < vma->vm_end)
1593 goto out_success;
1594
1595 vma = vma_lookup(mm, src_start);
1596 if (!vma)
1597 return -ENOENT;
1598 out_success:
1599 *src_vmap = vma;
1600 return 0;
1601 }
1602
1603 #ifdef CONFIG_PER_VMA_LOCK
1604 static int uffd_move_lock(struct mm_struct *mm,
1605 unsigned long dst_start,
1606 unsigned long src_start,
1607 struct vm_area_struct **dst_vmap,
1608 struct vm_area_struct **src_vmap)
1609 {
1610 struct vm_area_struct *vma;
1611 int err;
1612
1613 vma = uffd_lock_vma(mm, dst_start);
1614 if (IS_ERR(vma))
1615 return PTR_ERR(vma);
1616
1617 *dst_vmap = vma;
1618 /*
1619 * Skip finding src_vma if src_start is in dst_vma. This also ensures
1620 * that we don't lock the same vma twice.
1621 */
1622 if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1623 *src_vmap = vma;
1624 return 0;
1625 }
1626
1627 /*
1628 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
1629 *
1630 * Thread1 Thread2
1631 * ------- -------
1632 * vma_start_read(dst_vma)
1633 * mmap_write_lock(mm)
1634 * vma_start_write(src_vma)
1635 * vma_start_read(src_vma)
1636 * mmap_read_lock(mm)
1637 * vma_start_write(dst_vma)
1638 */
1639 *src_vmap = lock_vma_under_rcu(mm, src_start);
1640 if (likely(*src_vmap))
1641 return 0;
1642
1643 /* Undo any locking and retry in mmap_lock critical section */
1644 vma_end_read(*dst_vmap);
1645
1646 mmap_read_lock(mm);
1647 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1648 if (err)
1649 goto out;
1650
1651 if (!vma_start_read_locked(*dst_vmap)) {
1652 err = -EAGAIN;
1653 goto out;
1654 }
1655
1656 /* Nothing further to do if both vmas are locked. */
1657 if (*dst_vmap == *src_vmap)
1658 goto out;
1659
1660 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
1661 /* Undo dst_vmap locking if src_vmap failed to lock */
1662 vma_end_read(*dst_vmap);
1663 err = -EAGAIN;
1664 }
1665 out:
1666 mmap_read_unlock(mm);
1667 return err;
1668 }
1669
1670 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1671 struct vm_area_struct *src_vma)
1672 {
1673 vma_end_read(src_vma);
1674 if (src_vma != dst_vma)
1675 vma_end_read(dst_vma);
1676 }
1677
1678 #else
1679
1680 static int uffd_move_lock(struct mm_struct *mm,
1681 unsigned long dst_start,
1682 unsigned long src_start,
1683 struct vm_area_struct **dst_vmap,
1684 struct vm_area_struct **src_vmap)
1685 {
1686 int err;
1687
1688 mmap_read_lock(mm);
1689 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1690 if (err)
1691 mmap_read_unlock(mm);
1692 return err;
1693 }
1694
1695 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1696 struct vm_area_struct *src_vma)
1697 {
1698 mmap_assert_locked(src_vma->vm_mm);
1699 mmap_read_unlock(dst_vma->vm_mm);
1700 }
1701 #endif
1702
1703 /**
1704 * move_pages - move arbitrary anonymous pages of an existing vma
1705 * @ctx: pointer to the userfaultfd context
1706 * @dst_start: start of the destination virtual memory range
1707 * @src_start: start of the source virtual memory range
1708 * @len: length of the virtual memory range
1709 * @mode: flags from uffdio_move.mode
1710 *
1711 * It will either use the mmap_lock in read mode or per-vma locks
1712 *
1713 * move_pages() remaps arbitrary anonymous pages atomically in zero
1714 * copy. It only works on non shared anonymous pages because those can
1715 * be relocated without generating non linear anon_vmas in the rmap
1716 * code.
1717 *
1718 * It provides a zero copy mechanism to handle userspace page faults.
1719 * The source vma pages should have mapcount == 1, which can be
1720 * enforced by using madvise(MADV_DONTFORK) on src vma.
1721 *
1722 * The thread receiving the page during the userland page fault
1723 * will receive the faulting page in the source vma through the network,
1724 * storage or any other I/O device (MADV_DONTFORK in the source vma
1725 * prevents move_pages() from failing with -EBUSY if the process forks before
1726 * move_pages() is called), then it will call move_pages() to map the
1727 * page in the faulting address in the destination vma.
1728 *
1729 * This userfaultfd command works purely via pagetables, so it's the
1730 * most efficient way to move physical non shared anonymous pages
1731 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
1732 * it does not create any new vmas. The mapping in the destination
1733 * address is atomic.
1734 *
1735 * It only works if the vma protection bits are identical from the
1736 * source and destination vma.
1737 *
1738 * It can remap non shared anonymous pages within the same vma too.
1739 *
1740 * If the source virtual memory range has any unmapped holes, or if
1741 * the destination virtual memory range is not a whole unmapped hole,
1742 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
1743 * provides a very strict behavior to avoid any chance of memory
1744 * corruption going unnoticed if there are userland race conditions.
1745 * Only one thread should resolve the userland page fault at any given
1746 * time for any given faulting address. This means that if two threads
1747 * try to both call move_pages() on the same destination address at the
1748 * same time, the second thread will get an explicit error from this
1749 * command.
1750 *
1751 * The command retval will return "len" if successful. The command
1752 * however can be interrupted by fatal signals or errors. If
1753 * interrupted it will return the number of bytes successfully
1754 * remapped before the interruption if any, or the negative error if
1755 * none. It will never return zero. Either it will return an error or
1756 * an amount of bytes successfully moved. If the retval reports a
1757 * "short" remap, the move_pages() command should be repeated by
1758 * userland with src+retval, dst+reval, len-retval if it wants to know
1759 * about the error that interrupted it.
1760 *
1761 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
1762 * prevent -ENOENT errors to materialize if there are holes in the
1763 * source virtual range that is being remapped. The holes will be
1764 * accounted as successfully remapped in the retval of the
1765 * command. This is mostly useful to remap hugepage naturally aligned
1766 * virtual regions without knowing if there are transparent hugepage
1767 * in the regions or not, but preventing the risk of having to split
1768 * the hugepmd during the remap.
1769 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr, src_end;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
	ssize_t moved = 0;

	/* Sanitize the command parameters. */
	VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
	VM_WARN_ON_ONCE(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len < src_start);
	VM_WARN_ON_ONCE(dst_start + len < dst_start);

	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
	if (err)
		goto out;

	/* Re-check after taking map_changing_lock */
	err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;
	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;

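	/*
	 * Walk the range one PMD at a time: huge PMDs are moved (or
	 * split) as a whole, everything else falls back to the
	 * per-PTE path via move_pages_ptes().
	 */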
	for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
	     src_addr < src_end;) {
		spinlock_t *ptl;
		pmd_t dst_pmdval;
		unsigned long step_size;

		/*
		 * Below works because anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}
		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_ptes() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}

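		/*
		 * pmd_trans_huge_lock() returns the held ptl only when
		 * src_pmd currently maps a huge PMD; otherwise fall
		 * through to the per-PTE path below.
		 */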
		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				/* Can be a migration entry */
				if (pmd_present(*src_pmd)) {
					struct folio *folio = pmd_folio(*src_pmd);

					if (!is_huge_zero_folio(folio) &&
					    !PageAnonExclusive(&folio->page)) {
						spin_unlock(ptl);
						err = -EBUSY;
						break;
					}
				}

				spin_unlock(ptl);
				split_huge_pmd(src_vma, src_pmd, src_addr);
				/* The folio will be split by move_pages_ptes() */
				continue;
			}

			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
						  dst_pmdval, dst_vma, src_vma,
						  dst_addr, src_addr);
			step_size = HPAGE_PMD_SIZE;
		} else {
			long ret;

			if (pmd_none(*src_pmd)) {
				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
					err = -ENOENT;
					break;
				}
				if (unlikely(__pte_alloc(mm, src_pmd))) {
					err = -ENOMEM;
					break;
				}
			}

			if (unlikely(pte_alloc(mm, dst_pmd))) {
				err = -ENOMEM;
				break;
			}

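			/*
			 * move_pages_ptes() moves PTE-mapped pages under the
			 * current src/dst PMDs and returns the number of
			 * bytes moved, or a negative error code.
			 */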
			ret = move_pages_ptes(mm, dst_pmd, src_pmd,
					      dst_vma, src_vma, dst_addr,
					      src_addr, src_end - src_addr, mode);
			if (ret < 0)
				err = ret;
			else
				step_size = ret;
		}

		cond_resched();

		if (fatal_signal_pending(current)) {
			/* Do not override an error */
			if (!err || err == -EAGAIN)
				err = -EINTR;
			break;
		}

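		/*
		 * -EAGAIN means the page table entry changed under us
		 * (e.g. a concurrent fault or split); retry the same
		 * src/dst addresses.
		 */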
		if (err) {
			if (err == -EAGAIN)
				continue;
			break;
		}

		/* Proceed to the next page */
		dst_addr += step_size;
		src_addr += step_size;
		moved += step_size;
	}

out_unlock:
	up_read(&ctx->map_changing_lock);
	uffd_move_unlock(dst_vma, src_vma);
out:
	VM_WARN_ON_ONCE(moved < 0);
	VM_WARN_ON_ONCE(err > 0);
	VM_WARN_ON_ONCE(!moved && !err);
	return moved ? moved : err;
}

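/*
 * Illustrative userspace sketch (not kernel code, not built here) of the
 * "short move" retry contract documented above move_pages(). It assumes a
 * userfaultfd file descriptor @uffd already registered over the destination
 * range; uffd_move_range() and its error handling are made-up names for
 * illustration only. Only UAPI definitions from <linux/userfaultfd.h> are
 * used: on a short move the kernel stores the bytes moved in uffdio_move.move
 * and the caller repeats the call with src+move, dst+move, len-move.
 *
 *	#include <errno.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/userfaultfd.h>
 *
 *	static int uffd_move_range(int uffd, unsigned long src,
 *				   unsigned long dst, unsigned long len)
 *	{
 *		while (len) {
 *			struct uffdio_move mv = {
 *				.dst = dst,
 *				.src = src,
 *				.len = len,
 *				.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
 *			};
 *
 *			if (ioctl(uffd, UFFDIO_MOVE, &mv) == 0)
 *				return 0;	// whole range moved
 *			if (mv.move <= 0)
 *				return -errno;	// nothing was moved; report errno
 *			// Short move: retry the remainder as documented above.
 *			src += mv.move;
 *			dst += mv.move;
 *			len -= mv.move;
 *		}
 *		return 0;
 *	}
 */
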
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t vm_flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;

	vm_flags_reset(vma, vm_flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static void userfaultfd_set_ctx(struct vm_area_struct *vma,
				struct userfaultfd_ctx *ctx,
				vm_flags_t vm_flags)
{
	vma_start_write(vma);
	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
	userfaultfd_set_vm_flags(vma,
				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}

void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
	userfaultfd_set_ctx(vma, NULL, 0);
}

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end)
{
	struct vm_area_struct *ret;
	bool give_up_on_oom = false;

	/*
	 * If we are modifying only and not splitting, just give up on the merge
	 * if OOM prevents us from merging successfully.
	 */
	if (start == vma->vm_start && end == vma->vm_end)
		give_up_on_oom = true;

	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);

	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
				    vma->vm_flags & ~__VM_UFFD_FLAGS,
				    NULL_VM_UFFD_CTX, give_up_on_oom);

	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	return ret;
}

/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       vm_flags_t vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async)
{
	VMA_ITERATOR(vmi, ctx->mm, start);
	struct vm_area_struct *prev = vma_prev(&vmi);
	unsigned long vma_end;
	vm_flags_t new_flags;

	if (vma->vm_start < start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
		VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
				vma->vm_userfaultfd_ctx.ctx != ctx);
		VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx},
					    /* give_up_on_oom = */false);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		userfaultfd_set_ctx(vma, ctx, vm_flags);

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

skip:
		prev = vma;
		start = vma->vm_end;
	}

	return 0;
}

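/*
 * Illustrative userspace sketch (not kernel code, not built here) of the
 * UFFDIO_REGISTER path that ends up in userfaultfd_register_range() above.
 * The fd creation via the userfaultfd(2) syscall and the MISSING-only mode
 * are just one possible setup; uffd_register_missing() is a made-up helper
 * name. Only UAPI definitions from <linux/userfaultfd.h> are used.
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/userfaultfd.h>
 *
 *	static int uffd_register_missing(void *area, unsigned long len)
 *	{
 *		struct uffdio_api api = { .api = UFFD_API };
 *		struct uffdio_register reg = {
 *			.range = { .start = (unsigned long)area, .len = len },
 *			.mode = UFFDIO_REGISTER_MODE_MISSING,
 *		};
 *		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
 *		    ioctl(uffd, UFFDIO_REGISTER, &reg))
 *			return -1;
 *		return uffd;	// reg.ioctls now tells which ioctls are valid
 *	}
 */
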
void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	/* the various vma->vm_userfaultfd_ctx still point to it */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
	}
	mmap_write_unlock(mm);
}

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx)
{
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, mm, 0);

	if (!mmget_not_zero(mm))
		return;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that 'released' is set to true (by the caller),
	 * before taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
				!!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}

		vma = userfaultfd_clear_vma(&vmi, prev, vma,
					    vma->vm_start, vma->vm_end);
		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
}