xref: /linux/mm/userfaultfd.c (revision a4c43b8a09805a7b9b39344c1ba304a5641aca77)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  mm/userfaultfd.c
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  */
7 
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/pagemap.h>
11 #include <linux/rmap.h>
12 #include <linux/swap.h>
13 #include <linux/swapops.h>
14 #include <linux/userfaultfd_k.h>
15 #include <linux/mmu_notifier.h>
16 #include <linux/hugetlb.h>
17 #include <linux/shmem_fs.h>
18 #include <asm/tlbflush.h>
19 #include <asm/tlb.h>
20 #include "internal.h"
21 
22 static __always_inline
23 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
24 {
25 	/* Make sure that the dst range is fully within dst_vma. */
26 	if (dst_end > dst_vma->vm_end)
27 		return false;
28 
29 	/*
30 	 * Check that the vma is registered in uffd; this is required to
31 	 * enforce the VM_MAYWRITE check done at uffd registration
32 	 * time.
33 	 */
34 	if (!dst_vma->vm_userfaultfd_ctx.ctx)
35 		return false;
36 
37 	return true;
38 }
39 
40 static __always_inline
41 struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
42 						 unsigned long addr)
43 {
44 	struct vm_area_struct *vma;
45 
46 	mmap_assert_locked(mm);
47 	vma = vma_lookup(mm, addr);
48 	if (!vma)
49 		vma = ERR_PTR(-ENOENT);
50 	else if (!(vma->vm_flags & VM_SHARED) &&
51 		 unlikely(anon_vma_prepare(vma)))
52 		vma = ERR_PTR(-ENOMEM);
53 
54 	return vma;
55 }
56 
57 #ifdef CONFIG_PER_VMA_LOCK
58 /*
59  * uffd_lock_vma() - Look up and lock the vma corresponding to @address.
60  * @mm: mm to search vma in.
61  * @address: address that the vma should contain.
62  *
63  * Should be called without holding mmap_lock.
64  *
65  * Return: A locked vma containing @address, -ENOENT if no vma is found, or
66  * -ENOMEM if anon_vma couldn't be allocated.
67  */
68 static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
69 				       unsigned long address)
70 {
71 	struct vm_area_struct *vma;
72 
73 	vma = lock_vma_under_rcu(mm, address);
74 	if (vma) {
75 		/*
76 		 * We know we're going to need to use anon_vma, so check
77 		 * that early.
78 		 */
79 		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
80 			vma_end_read(vma);
81 		else
82 			return vma;
83 	}
84 
85 	mmap_read_lock(mm);
86 	vma = find_vma_and_prepare_anon(mm, address);
87 	if (!IS_ERR(vma)) {
88 		/*
89 		 * We cannot use vma_start_read() as it may fail due to a
90 		 * false-locked result (see comment in vma_start_read()). We
91 		 * can avoid that by directly locking vm_lock under
92 		 * mmap_lock, which guarantees that nobody can lock the
93 		 * vma for write (vma_start_write()) under us.
94 		 */
95 		down_read(&vma->vm_lock->lock);
96 	}
97 
98 	mmap_read_unlock(mm);
99 	return vma;
100 }
101 
102 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
103 					      unsigned long dst_start,
104 					      unsigned long len)
105 {
106 	struct vm_area_struct *dst_vma;
107 
108 	dst_vma = uffd_lock_vma(dst_mm, dst_start);
109 	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
110 		return dst_vma;
111 
112 	vma_end_read(dst_vma);
113 	return ERR_PTR(-ENOENT);
114 }
115 
116 static void uffd_mfill_unlock(struct vm_area_struct *vma)
117 {
118 	vma_end_read(vma);
119 }
120 
121 #else
122 
123 static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
124 					      unsigned long dst_start,
125 					      unsigned long len)
126 {
127 	struct vm_area_struct *dst_vma;
128 
129 	mmap_read_lock(dst_mm);
130 	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
131 	if (IS_ERR(dst_vma))
132 		goto out_unlock;
133 
134 	if (validate_dst_vma(dst_vma, dst_start + len))
135 		return dst_vma;
136 
137 	dst_vma = ERR_PTR(-ENOENT);
138 out_unlock:
139 	mmap_read_unlock(dst_mm);
140 	return dst_vma;
141 }
142 
143 static void uffd_mfill_unlock(struct vm_area_struct *vma)
144 {
145 	mmap_read_unlock(vma->vm_mm);
146 }
147 #endif
148 
149 /* Check if dst_addr is outside of the file's size. Must be called with ptl held. */
150 static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
151 				 unsigned long dst_addr)
152 {
153 	struct inode *inode;
154 	pgoff_t offset, max_off;
155 
156 	if (!dst_vma->vm_file)
157 		return false;
158 
159 	inode = dst_vma->vm_file->f_inode;
160 	offset = linear_page_index(dst_vma, dst_addr);
161 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
162 	return offset >= max_off;
163 }
164 
165 /*
166  * Install PTEs to map dst_addr (within dst_vma) to page.
167  *
168  * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
169  * and anon, and for both shared and private VMAs.
170  */
171 int mfill_atomic_install_pte(pmd_t *dst_pmd,
172 			     struct vm_area_struct *dst_vma,
173 			     unsigned long dst_addr, struct page *page,
174 			     bool newly_allocated, uffd_flags_t flags)
175 {
176 	int ret;
177 	struct mm_struct *dst_mm = dst_vma->vm_mm;
178 	pte_t _dst_pte, *dst_pte;
179 	bool writable = dst_vma->vm_flags & VM_WRITE;
180 	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
181 	spinlock_t *ptl;
182 	struct folio *folio = page_folio(page);
183 	bool page_in_cache = folio_mapping(folio);
184 
185 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
186 	_dst_pte = pte_mkdirty(_dst_pte);
187 	if (page_in_cache && !vm_shared)
188 		writable = false;
189 	if (writable)
190 		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
191 	if (flags & MFILL_ATOMIC_WP)
192 		_dst_pte = pte_mkuffd_wp(_dst_pte);
193 
194 	ret = -EAGAIN;
195 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
196 	if (!dst_pte)
197 		goto out;
198 
199 	if (mfill_file_over_size(dst_vma, dst_addr)) {
200 		ret = -EFAULT;
201 		goto out_unlock;
202 	}
203 
204 	ret = -EEXIST;
205 	/*
206 	 * We allow overwriting a pte marker: consider the case where both
207 	 * MISSING|WP are registered: we first wr-protect a none pte which has
208 	 * no page cache page backing it, then access the page.
209 	 */
210 	if (!pte_none_mostly(ptep_get(dst_pte)))
211 		goto out_unlock;
212 
213 	if (page_in_cache) {
214 		/* Usually, cache pages are already added to LRU */
215 		if (newly_allocated)
216 			folio_add_lru(folio);
217 		folio_add_file_rmap_pte(folio, page, dst_vma);
218 	} else {
219 		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
220 		folio_add_lru_vma(folio, dst_vma);
221 	}
222 
223 	/*
224 	 * Must happen after rmap, as mm_counter() checks mapping (via
225 	 * PageAnon()), which is set by __page_set_anon_rmap().
226 	 */
227 	inc_mm_counter(dst_mm, mm_counter(folio));
228 
229 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
230 
231 	/* No need to invalidate - it was non-present before */
232 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
233 	ret = 0;
234 out_unlock:
235 	pte_unmap_unlock(dst_pte, ptl);
236 out:
237 	return ret;
238 }
239 
240 static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
241 				 struct vm_area_struct *dst_vma,
242 				 unsigned long dst_addr,
243 				 unsigned long src_addr,
244 				 uffd_flags_t flags,
245 				 struct folio **foliop)
246 {
247 	void *kaddr;
248 	int ret;
249 	struct folio *folio;
250 
251 	if (!*foliop) {
252 		ret = -ENOMEM;
253 		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
254 					dst_addr, false);
255 		if (!folio)
256 			goto out;
257 
258 		kaddr = kmap_local_folio(folio, 0);
259 		/*
260 		 * The read mmap_lock is held here.  Even though the
261 		 * mmap_lock is read recursive, a deadlock is still
262 		 * possible if a writer has taken the lock.  For example:
263 		 *
264 		 * process A thread 1 takes read lock on own mmap_lock
265 		 * process A thread 2 calls mmap, blocks taking write lock
266 		 * process B thread 1 takes page fault, read lock on own mmap lock
267 		 * process B thread 2 calls mmap, blocks taking write lock
268 		 * process A thread 1 blocks taking read lock on process B
269 		 * process B thread 1 blocks taking read lock on process A
270 		 *
271 		 * Disable page faults to prevent potential deadlock
272 		 * and retry the copy outside the mmap_lock.
273 		 */
274 		pagefault_disable();
275 		ret = copy_from_user(kaddr, (const void __user *) src_addr,
276 				     PAGE_SIZE);
277 		pagefault_enable();
278 		kunmap_local(kaddr);
279 
280 		/* fallback to copy_from_user outside mmap_lock */
281 		if (unlikely(ret)) {
282 			ret = -ENOENT;
283 			*foliop = folio;
284 			/* don't free the page */
285 			goto out;
286 		}
287 
288 		flush_dcache_folio(folio);
289 	} else {
290 		folio = *foliop;
291 		*foliop = NULL;
292 	}
293 
294 	/*
295 	 * The memory barrier inside __folio_mark_uptodate makes sure that
296 	 * preceding stores to the page contents become visible before
297 	 * the set_pte_at() write.
298 	 */
299 	__folio_mark_uptodate(folio);
300 
301 	ret = -ENOMEM;
302 	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
303 		goto out_release;
304 
305 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
306 				       &folio->page, true, flags);
307 	if (ret)
308 		goto out_release;
309 out:
310 	return ret;
311 out_release:
312 	folio_put(folio);
313 	goto out;
314 }
315 
316 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
317 				     struct vm_area_struct *dst_vma,
318 				     unsigned long dst_addr)
319 {
320 	pte_t _dst_pte, *dst_pte;
321 	spinlock_t *ptl;
322 	int ret;
323 
324 	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
325 					 dst_vma->vm_page_prot));
326 	ret = -EAGAIN;
327 	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
328 	if (!dst_pte)
329 		goto out;
330 	if (mfill_file_over_size(dst_vma, dst_addr)) {
331 		ret = -EFAULT;
332 		goto out_unlock;
333 	}
334 	ret = -EEXIST;
335 	if (!pte_none(ptep_get(dst_pte)))
336 		goto out_unlock;
337 	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
338 	/* No need to invalidate - it was non-present before */
339 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
340 	ret = 0;
341 out_unlock:
342 	pte_unmap_unlock(dst_pte, ptl);
343 out:
344 	return ret;
345 }
346 
347 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
348 static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
349 				     struct vm_area_struct *dst_vma,
350 				     unsigned long dst_addr,
351 				     uffd_flags_t flags)
352 {
353 	struct inode *inode = file_inode(dst_vma->vm_file);
354 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
355 	struct folio *folio;
356 	struct page *page;
357 	int ret;
358 
359 	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
360 	/* Our caller expects us to return -EFAULT if we failed to find the folio */
361 	if (ret == -ENOENT)
362 		ret = -EFAULT;
363 	if (ret)
364 		goto out;
365 	if (!folio) {
366 		ret = -EFAULT;
367 		goto out;
368 	}
369 
370 	page = folio_file_page(folio, pgoff);
371 	if (PageHWPoison(page)) {
372 		ret = -EIO;
373 		goto out_release;
374 	}
375 
376 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
377 				       page, false, flags);
378 	if (ret)
379 		goto out_release;
380 
381 	folio_unlock(folio);
382 	ret = 0;
383 out:
384 	return ret;
385 out_release:
386 	folio_unlock(folio);
387 	folio_put(folio);
388 	goto out;
389 }
390 
391 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
392 static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
393 				   struct vm_area_struct *dst_vma,
394 				   unsigned long dst_addr,
395 				   uffd_flags_t flags)
396 {
397 	int ret;
398 	struct mm_struct *dst_mm = dst_vma->vm_mm;
399 	pte_t _dst_pte, *dst_pte;
400 	spinlock_t *ptl;
401 
402 	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
403 	ret = -EAGAIN;
404 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
405 	if (!dst_pte)
406 		goto out;
407 
408 	if (mfill_file_over_size(dst_vma, dst_addr)) {
409 		ret = -EFAULT;
410 		goto out_unlock;
411 	}
412 
413 	ret = -EEXIST;
414 	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
415 	if (!pte_none(ptep_get(dst_pte)))
416 		goto out_unlock;
417 
418 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
419 
420 	/* No need to invalidate - it was non-present before */
421 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
422 	ret = 0;
423 out_unlock:
424 	pte_unmap_unlock(dst_pte, ptl);
425 out:
426 	return ret;
427 }
428 
429 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
430 {
431 	pgd_t *pgd;
432 	p4d_t *p4d;
433 	pud_t *pud;
434 
435 	pgd = pgd_offset(mm, address);
436 	p4d = p4d_alloc(mm, pgd, address);
437 	if (!p4d)
438 		return NULL;
439 	pud = pud_alloc(mm, p4d, address);
440 	if (!pud)
441 		return NULL;
442 	/*
443 	 * Note that this is not necessarily run because the pmd was
444 	 * missing: the *pmd may already be established and in
445 	 * turn it may also be a trans_huge_pmd.
446 	 */
447 	return pmd_alloc(mm, pud, address);
448 }
449 
450 #ifdef CONFIG_HUGETLB_PAGE
451 /*
452  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
453  * called with either the vma-lock or mmap_lock held; it will release the lock
454  * before returning.
455  */
456 static __always_inline ssize_t mfill_atomic_hugetlb(
457 					      struct userfaultfd_ctx *ctx,
458 					      struct vm_area_struct *dst_vma,
459 					      unsigned long dst_start,
460 					      unsigned long src_start,
461 					      unsigned long len,
462 					      uffd_flags_t flags)
463 {
464 	struct mm_struct *dst_mm = dst_vma->vm_mm;
465 	ssize_t err;
466 	pte_t *dst_pte;
467 	unsigned long src_addr, dst_addr;
468 	long copied;
469 	struct folio *folio;
470 	unsigned long vma_hpagesize;
471 	pgoff_t idx;
472 	u32 hash;
473 	struct address_space *mapping;
474 
475 	/*
476 	 * There is no default zero huge page for all huge page sizes as
477 	 * supported by hugetlb.  A PMD_SIZE zero huge page may exist, as used
478 	 * by THP.  Since we cannot reliably insert a zero page, this
479 	 * feature is not supported.
480 	 */
481 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
482 		up_read(&ctx->map_changing_lock);
483 		uffd_mfill_unlock(dst_vma);
484 		return -EINVAL;
485 	}
486 
487 	src_addr = src_start;
488 	dst_addr = dst_start;
489 	copied = 0;
490 	folio = NULL;
491 	vma_hpagesize = vma_kernel_pagesize(dst_vma);
492 
493 	/*
494 	 * Validate alignment based on huge page size
495 	 */
496 	err = -EINVAL;
497 	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
498 		goto out_unlock;
499 
500 retry:
501 	/*
502 	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
503 	 * retry, dst_vma will be set to NULL and we must look it up again.
504 	 */
505 	if (!dst_vma) {
506 		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
507 		if (IS_ERR(dst_vma)) {
508 			err = PTR_ERR(dst_vma);
509 			goto out;
510 		}
511 
512 		err = -ENOENT;
513 		if (!is_vm_hugetlb_page(dst_vma))
514 			goto out_unlock_vma;
515 
516 		err = -EINVAL;
517 		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
518 			goto out_unlock_vma;
519 
520 		/*
521 		 * If memory mappings are changing because of a non-cooperative
522 		 * operation (e.g. mremap) running in parallel, bail out and
523 		 * request the user to retry later.
524 		 */
525 		down_read(&ctx->map_changing_lock);
526 		err = -EAGAIN;
527 		if (atomic_read(&ctx->mmap_changing))
528 			goto out_unlock;
529 	}
530 
531 	while (src_addr < src_start + len) {
532 		BUG_ON(dst_addr >= dst_start + len);
533 
534 		/*
535 		 * Serialize via vma_lock and hugetlb_fault_mutex.
536 		 * vma_lock ensures the dst_pte remains valid even
537 		 * in the case of shared pmds.  The fault mutex prevents
538 		 * races with other faulting threads.
539 		 */
540 		idx = linear_page_index(dst_vma, dst_addr);
541 		mapping = dst_vma->vm_file->f_mapping;
542 		hash = hugetlb_fault_mutex_hash(mapping, idx);
543 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
544 		hugetlb_vma_lock_read(dst_vma);
545 
546 		err = -ENOMEM;
547 		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
548 		if (!dst_pte) {
549 			hugetlb_vma_unlock_read(dst_vma);
550 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
551 			goto out_unlock;
552 		}
553 
554 		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
555 		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
556 			err = -EEXIST;
557 			hugetlb_vma_unlock_read(dst_vma);
558 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
559 			goto out_unlock;
560 		}
561 
562 		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
563 					       src_addr, flags, &folio);
564 
565 		hugetlb_vma_unlock_read(dst_vma);
566 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
567 
568 		cond_resched();
569 
570 		if (unlikely(err == -ENOENT)) {
571 			up_read(&ctx->map_changing_lock);
572 			uffd_mfill_unlock(dst_vma);
573 			BUG_ON(!folio);
574 
575 			err = copy_folio_from_user(folio,
576 						   (const void __user *)src_addr, true);
577 			if (unlikely(err)) {
578 				err = -EFAULT;
579 				goto out;
580 			}
581 
582 			dst_vma = NULL;
583 			goto retry;
584 		} else
585 			BUG_ON(folio);
586 
587 		if (!err) {
588 			dst_addr += vma_hpagesize;
589 			src_addr += vma_hpagesize;
590 			copied += vma_hpagesize;
591 
592 			if (fatal_signal_pending(current))
593 				err = -EINTR;
594 		}
595 		if (err)
596 			break;
597 	}
598 
599 out_unlock:
600 	up_read(&ctx->map_changing_lock);
601 out_unlock_vma:
602 	uffd_mfill_unlock(dst_vma);
603 out:
604 	if (folio)
605 		folio_put(folio);
606 	BUG_ON(copied < 0);
607 	BUG_ON(err > 0);
608 	BUG_ON(!copied && !err);
609 	return copied ? copied : err;
610 }
611 #else /* !CONFIG_HUGETLB_PAGE */
612 /* fail at build time if gcc attempts to use this */
613 extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
614 				    struct vm_area_struct *dst_vma,
615 				    unsigned long dst_start,
616 				    unsigned long src_start,
617 				    unsigned long len,
618 				    uffd_flags_t flags);
619 #endif /* CONFIG_HUGETLB_PAGE */
620 
621 static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
622 						struct vm_area_struct *dst_vma,
623 						unsigned long dst_addr,
624 						unsigned long src_addr,
625 						uffd_flags_t flags,
626 						struct folio **foliop)
627 {
628 	ssize_t err;
629 
630 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
631 		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
632 						 dst_addr, flags);
633 	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
634 		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
635 					       dst_addr, flags);
636 	}
637 
638 	/*
639 	 * The normal page fault path for a shmem mapping will invoke the
640 	 * fault, fill the hole in the file and COW it right away. The
641 	 * result generates plain anonymous memory. So when we are
642 	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
643 	 * generate anonymous memory directly without actually filling
644 	 * the hole. For the MAP_PRIVATE case the robustness check
645 	 * only happens in the pagetable (to verify it's still none)
646 	 * and not in the radix tree.
647 	 */
648 	if (!(dst_vma->vm_flags & VM_SHARED)) {
649 		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
650 			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
651 						    dst_addr, src_addr,
652 						    flags, foliop);
653 		else
654 			err = mfill_atomic_pte_zeropage(dst_pmd,
655 						 dst_vma, dst_addr);
656 	} else {
657 		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
658 					     dst_addr, src_addr,
659 					     flags, foliop);
660 	}
661 
662 	return err;
663 }
664 
665 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
666 					    unsigned long dst_start,
667 					    unsigned long src_start,
668 					    unsigned long len,
669 					    uffd_flags_t flags)
670 {
671 	struct mm_struct *dst_mm = ctx->mm;
672 	struct vm_area_struct *dst_vma;
673 	ssize_t err;
674 	pmd_t *dst_pmd;
675 	unsigned long src_addr, dst_addr;
676 	long copied;
677 	struct folio *folio;
678 
679 	/*
680 	 * Sanitize the command parameters:
681 	 */
682 	BUG_ON(dst_start & ~PAGE_MASK);
683 	BUG_ON(len & ~PAGE_MASK);
684 
685 	/* Does the address range wrap, or is the span zero-sized? */
686 	BUG_ON(src_start + len <= src_start);
687 	BUG_ON(dst_start + len <= dst_start);
688 
689 	src_addr = src_start;
690 	dst_addr = dst_start;
691 	copied = 0;
692 	folio = NULL;
693 retry:
694 	/*
695 	 * Make sure the vma is not shared and that the dst range is
696 	 * both valid and fully within a single existing vma.
697 	 */
698 	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
699 	if (IS_ERR(dst_vma)) {
700 		err = PTR_ERR(dst_vma);
701 		goto out;
702 	}
703 
704 	/*
705 	 * If memory mappings are changing because of a non-cooperative
706 	 * operation (e.g. mremap) running in parallel, bail out and
707 	 * request the user to retry later.
708 	 */
709 	down_read(&ctx->map_changing_lock);
710 	err = -EAGAIN;
711 	if (atomic_read(&ctx->mmap_changing))
712 		goto out_unlock;
713 
714 	err = -EINVAL;
715 	/*
716 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
717 	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
718 	 */
719 	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
720 	    dst_vma->vm_flags & VM_SHARED))
721 		goto out_unlock;
722 
723 	/*
724 	 * validate 'mode' now that we know the dst_vma: don't allow
725 	 * a wrprotect copy if the userfaultfd didn't register as WP.
726 	 */
727 	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
728 		goto out_unlock;
729 
730 	/*
731 	 * If this is a HUGETLB vma, pass off to the appropriate routine.
732 	 */
733 	if (is_vm_hugetlb_page(dst_vma))
734 		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
735 					     src_start, len, flags);
736 
737 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
738 		goto out_unlock;
739 	if (!vma_is_shmem(dst_vma) &&
740 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
741 		goto out_unlock;
742 
743 	while (src_addr < src_start + len) {
744 		pmd_t dst_pmdval;
745 
746 		BUG_ON(dst_addr >= dst_start + len);
747 
748 		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
749 		if (unlikely(!dst_pmd)) {
750 			err = -ENOMEM;
751 			break;
752 		}
753 
754 		dst_pmdval = pmdp_get_lockless(dst_pmd);
755 		/*
756 		 * If the dst_pmd is mapped as THP, don't
757 		 * override it and just be strict.
758 		 */
759 		if (unlikely(pmd_trans_huge(dst_pmdval))) {
760 			err = -EEXIST;
761 			break;
762 		}
763 		if (unlikely(pmd_none(dst_pmdval)) &&
764 		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
765 			err = -ENOMEM;
766 			break;
767 		}
768 		/* If a huge pmd materialized from under us, fail */
769 		if (unlikely(pmd_trans_huge(*dst_pmd))) {
770 			err = -EFAULT;
771 			break;
772 		}
773 
774 		BUG_ON(pmd_none(*dst_pmd));
775 		BUG_ON(pmd_trans_huge(*dst_pmd));
776 
777 		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
778 				       src_addr, flags, &folio);
779 		cond_resched();
780 
781 		if (unlikely(err == -ENOENT)) {
782 			void *kaddr;
783 
784 			up_read(&ctx->map_changing_lock);
785 			uffd_mfill_unlock(dst_vma);
786 			BUG_ON(!folio);
787 
788 			kaddr = kmap_local_folio(folio, 0);
789 			err = copy_from_user(kaddr,
790 					     (const void __user *) src_addr,
791 					     PAGE_SIZE);
792 			kunmap_local(kaddr);
793 			if (unlikely(err)) {
794 				err = -EFAULT;
795 				goto out;
796 			}
797 			flush_dcache_folio(folio);
798 			goto retry;
799 		} else
800 			BUG_ON(folio);
801 
802 		if (!err) {
803 			dst_addr += PAGE_SIZE;
804 			src_addr += PAGE_SIZE;
805 			copied += PAGE_SIZE;
806 
807 			if (fatal_signal_pending(current))
808 				err = -EINTR;
809 		}
810 		if (err)
811 			break;
812 	}
813 
814 out_unlock:
815 	up_read(&ctx->map_changing_lock);
816 	uffd_mfill_unlock(dst_vma);
817 out:
818 	if (folio)
819 		folio_put(folio);
820 	BUG_ON(copied < 0);
821 	BUG_ON(err > 0);
822 	BUG_ON(!copied && !err);
823 	return copied ? copied : err;
824 }
825 
826 ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
827 			  unsigned long src_start, unsigned long len,
828 			  uffd_flags_t flags)
829 {
830 	return mfill_atomic(ctx, dst_start, src_start, len,
831 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
832 }
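
/*
 * Illustrative userspace sketch (not part of this file): mfill_atomic_copy()
 * is what ultimately services the UFFDIO_COPY ioctl.  A fault-handling
 * thread would typically resolve a missing-page fault roughly like this,
 * where "uffd", "fault_addr", "src_buf" and "page_size" are assumed to be
 * set up by the caller:
 *
 *	struct uffdio_copy copy = {
 *		.dst  = fault_addr & ~(page_size - 1),
 *		.src  = (unsigned long)src_buf,
 *		.len  = page_size,
 *		.mode = 0,			// or UFFDIO_COPY_MODE_WP
 *	};
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
 *		// On EAGAIN, copy.copy reports how many bytes (if any)
 *		// were copied before the kernel bailed out.
 *	}
 */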
833 
834 ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
835 			      unsigned long start,
836 			      unsigned long len)
837 {
838 	return mfill_atomic(ctx, start, 0, len,
839 			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
840 }
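
/*
 * Illustrative userspace sketch (not part of this file): mfill_atomic_zeropage()
 * backs UFFDIO_ZEROPAGE, which resolves a missing-page fault with zero-filled
 * memory instead of copied data ("uffd", "addr" and "page_size" are assumed):
 *
 *	struct uffdio_zeropage zp = {
 *		.range = { .start = addr, .len = page_size },
 *		.mode  = 0,
 *	};
 *	ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 */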
841 
842 ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
843 			      unsigned long len, uffd_flags_t flags)
844 {
845 
846 	/*
847 	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
848 	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
849 	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
850 	 * subsequent loads from the page through the newly mapped address range.
851 	 */
852 	smp_wmb();
853 
854 	return mfill_atomic(ctx, start, 0, len,
855 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
856 }
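
/*
 * Illustrative userspace sketch (not part of this file): mfill_atomic_continue()
 * backs UFFDIO_CONTINUE, which resolves a minor fault by mapping the page that
 * already exists in the page cache.  Assuming the handler has already written
 * the page contents through a second mapping of the same file, something like
 * the following wakes the faulting thread ("uffd", "fault_addr" and
 * "page_size" are assumed):
 *
 *	struct uffdio_continue cont = {
 *		.range = {
 *			.start = fault_addr & ~(page_size - 1),
 *			.len   = page_size,
 *		},
 *		.mode  = 0,
 *	};
 *	ioctl(uffd, UFFDIO_CONTINUE, &cont);
 *
 * The smp_wmb() above pairs with this usage: writes done to the page before
 * the ioctl are visible to loads through the newly mapped range.
 */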
857 
858 ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
859 			    unsigned long len, uffd_flags_t flags)
860 {
861 	return mfill_atomic(ctx, start, 0, len,
862 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
863 }
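
/*
 * Illustrative userspace sketch (not part of this file): mfill_atomic_poison()
 * backs UFFDIO_POISON, which installs PTE markers so that later accesses to
 * the range raise SIGBUS instead of faulting into userfaultfd again (useful,
 * e.g., when a migration source reports the page contents as lost).
 * "uffd", "addr" and "len" are assumed to be page-aligned values set up by
 * the caller:
 *
 *	struct uffdio_poison poison = {
 *		.range = { .start = addr, .len = len },
 *		.mode  = 0,
 *	};
 *	ioctl(uffd, UFFDIO_POISON, &poison);
 */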
864 
865 long uffd_wp_range(struct vm_area_struct *dst_vma,
866 		   unsigned long start, unsigned long len, bool enable_wp)
867 {
868 	unsigned int mm_cp_flags;
869 	struct mmu_gather tlb;
870 	long ret;
871 
872 	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
873 			"The address range exceeds VMA boundary.\n");
874 	if (enable_wp)
875 		mm_cp_flags = MM_CP_UFFD_WP;
876 	else
877 		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
878 
879 	/*
880 	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
881 	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
882 	 * to be write-protected by default whenever protection changes.
883 	 * Try upgrading write permissions manually.
884 	 */
885 	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
886 		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
887 	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
888 	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
889 	tlb_finish_mmu(&tlb);
890 
891 	return ret;
892 }
893 
894 int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
895 			unsigned long len, bool enable_wp)
896 {
897 	struct mm_struct *dst_mm = ctx->mm;
898 	unsigned long end = start + len;
899 	unsigned long _start, _end;
900 	struct vm_area_struct *dst_vma;
901 	unsigned long page_mask;
902 	long err;
903 	VMA_ITERATOR(vmi, dst_mm, start);
904 
905 	/*
906 	 * Sanitize the command parameters:
907 	 */
908 	BUG_ON(start & ~PAGE_MASK);
909 	BUG_ON(len & ~PAGE_MASK);
910 
911 	/* Does the address range wrap, or is the span zero-sized? */
912 	BUG_ON(start + len <= start);
913 
914 	mmap_read_lock(dst_mm);
915 
916 	/*
917 	 * If memory mappings are changing because of a non-cooperative
918 	 * operation (e.g. mremap) running in parallel, bail out and
919 	 * request the user to retry later.
920 	 */
921 	down_read(&ctx->map_changing_lock);
922 	err = -EAGAIN;
923 	if (atomic_read(&ctx->mmap_changing))
924 		goto out_unlock;
925 
926 	err = -ENOENT;
927 	for_each_vma_range(vmi, dst_vma, end) {
928 
929 		if (!userfaultfd_wp(dst_vma)) {
930 			err = -ENOENT;
931 			break;
932 		}
933 
934 		if (is_vm_hugetlb_page(dst_vma)) {
935 			err = -EINVAL;
936 			page_mask = vma_kernel_pagesize(dst_vma) - 1;
937 			if ((start & page_mask) || (len & page_mask))
938 				break;
939 		}
940 
941 		_start = max(dst_vma->vm_start, start);
942 		_end = min(dst_vma->vm_end, end);
943 
944 		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
945 
946 		/* Return 0 on success, <0 on failure */
947 		if (err < 0)
948 			break;
949 		err = 0;
950 	}
951 out_unlock:
952 	up_read(&ctx->map_changing_lock);
953 	mmap_read_unlock(dst_mm);
954 	return err;
955 }
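
/*
 * Illustrative userspace sketch (not part of this file): mwriteprotect_range()
 * backs the UFFDIO_WRITEPROTECT ioctl.  Write-protecting and later resolving
 * a range registered with UFFDIO_REGISTER_MODE_WP might look like this
 * ("uffd", "addr" and "len" are assumed to be page-aligned):
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = addr, .len = len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	// enable_wp == true above
 *
 *	wp.mode = 0;				// resolve: enable_wp == false
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */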
956 
957 
958 void double_pt_lock(spinlock_t *ptl1,
959 		    spinlock_t *ptl2)
960 	__acquires(ptl1)
961 	__acquires(ptl2)
962 {
963 	spinlock_t *ptl_tmp;
964 
965 	if (ptl1 > ptl2) {
966 		/* exchange ptl1 and ptl2 */
967 		ptl_tmp = ptl1;
968 		ptl1 = ptl2;
969 		ptl2 = ptl_tmp;
970 	}
971 	/* lock in virtual address order to avoid lock inversion */
972 	spin_lock(ptl1);
973 	if (ptl1 != ptl2)
974 		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
975 	else
976 		__acquire(ptl2);
977 }
978 
979 void double_pt_unlock(spinlock_t *ptl1,
980 		      spinlock_t *ptl2)
981 	__releases(ptl1)
982 	__releases(ptl2)
983 {
984 	spin_unlock(ptl1);
985 	if (ptl1 != ptl2)
986 		spin_unlock(ptl2);
987 	else
988 		__release(ptl2);
989 }
990 
991 
992 static int move_present_pte(struct mm_struct *mm,
993 			    struct vm_area_struct *dst_vma,
994 			    struct vm_area_struct *src_vma,
995 			    unsigned long dst_addr, unsigned long src_addr,
996 			    pte_t *dst_pte, pte_t *src_pte,
997 			    pte_t orig_dst_pte, pte_t orig_src_pte,
998 			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
999 			    struct folio *src_folio)
1000 {
1001 	int err = 0;
1002 
1003 	double_pt_lock(dst_ptl, src_ptl);
1004 
1005 	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1006 	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1007 		err = -EAGAIN;
1008 		goto out;
1009 	}
1010 	if (folio_test_large(src_folio) ||
1011 	    folio_maybe_dma_pinned(src_folio) ||
1012 	    !PageAnonExclusive(&src_folio->page)) {
1013 		err = -EBUSY;
1014 		goto out;
1015 	}
1016 
1017 	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
1018 	/* Folio got pinned from under us. Put it back and fail the move. */
1019 	if (folio_maybe_dma_pinned(src_folio)) {
1020 		set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1021 		err = -EBUSY;
1022 		goto out;
1023 	}
1024 
1025 	folio_move_anon_rmap(src_folio, dst_vma);
1026 	src_folio->index = linear_page_index(dst_vma, dst_addr);
1027 
1028 	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
1029 	/* Follow mremap() behavior and treat the entry as dirty after the move */
1030 	orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
1031 
1032 	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1033 out:
1034 	double_pt_unlock(dst_ptl, src_ptl);
1035 	return err;
1036 }
1037 
1038 static int move_swap_pte(struct mm_struct *mm,
1039 			 unsigned long dst_addr, unsigned long src_addr,
1040 			 pte_t *dst_pte, pte_t *src_pte,
1041 			 pte_t orig_dst_pte, pte_t orig_src_pte,
1042 			 spinlock_t *dst_ptl, spinlock_t *src_ptl)
1043 {
1044 	if (!pte_swp_exclusive(orig_src_pte))
1045 		return -EBUSY;
1046 
1047 	double_pt_lock(dst_ptl, src_ptl);
1048 
1049 	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1050 	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1051 		double_pt_unlock(dst_ptl, src_ptl);
1052 		return -EAGAIN;
1053 	}
1054 
1055 	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1056 	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1057 	double_pt_unlock(dst_ptl, src_ptl);
1058 
1059 	return 0;
1060 }
1061 
1062 static int move_zeropage_pte(struct mm_struct *mm,
1063 			     struct vm_area_struct *dst_vma,
1064 			     struct vm_area_struct *src_vma,
1065 			     unsigned long dst_addr, unsigned long src_addr,
1066 			     pte_t *dst_pte, pte_t *src_pte,
1067 			     pte_t orig_dst_pte, pte_t orig_src_pte,
1068 			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
1069 {
1070 	pte_t zero_pte;
1071 
1072 	double_pt_lock(dst_ptl, src_ptl);
1073 	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1074 	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1075 		double_pt_unlock(dst_ptl, src_ptl);
1076 		return -EAGAIN;
1077 	}
1078 
1079 	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
1080 					 dst_vma->vm_page_prot));
1081 	ptep_clear_flush(src_vma, src_addr, src_pte);
1082 	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1083 	double_pt_unlock(dst_ptl, src_ptl);
1084 
1085 	return 0;
1086 }
1087 
1088 
1089 /*
1090  * The mmap_lock for reading is held by the caller. Just move the page
1091  * from src_pmd to dst_pmd if possible, and return 0 if the page was
1092  * moved, or a negative error code otherwise.
1093  */
1094 static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
1095 			  struct vm_area_struct *dst_vma,
1096 			  struct vm_area_struct *src_vma,
1097 			  unsigned long dst_addr, unsigned long src_addr,
1098 			  __u64 mode)
1099 {
1100 	swp_entry_t entry;
1101 	pte_t orig_src_pte, orig_dst_pte;
1102 	pte_t src_folio_pte;
1103 	spinlock_t *src_ptl, *dst_ptl;
1104 	pte_t *src_pte = NULL;
1105 	pte_t *dst_pte = NULL;
1106 
1107 	struct folio *src_folio = NULL;
1108 	struct anon_vma *src_anon_vma = NULL;
1109 	struct mmu_notifier_range range;
1110 	int err = 0;
1111 
1112 	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
1113 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1114 				src_addr, src_addr + PAGE_SIZE);
1115 	mmu_notifier_invalidate_range_start(&range);
1116 retry:
1117 	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
1118 
1119 	/* Retry if a huge pmd materialized from under us */
1120 	if (unlikely(!dst_pte)) {
1121 		err = -EAGAIN;
1122 		goto out;
1123 	}
1124 
1125 	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
1126 
1127 	/*
1128 	 * We only hold the mmap_lock for reading, so MADV_DONTNEED
1129 	 * can zap transparent huge pages under us, or the
1130 	 * transparent huge page fault can establish new
1131 	 * transparent huge pages under us.
1132 	 */
1133 	if (unlikely(!src_pte)) {
1134 		err = -EAGAIN;
1135 		goto out;
1136 	}
1137 
1138 	/* Sanity checks before the operation */
1139 	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
1140 	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
1141 		err = -EINVAL;
1142 		goto out;
1143 	}
1144 
1145 	spin_lock(dst_ptl);
1146 	orig_dst_pte = ptep_get(dst_pte);
1147 	spin_unlock(dst_ptl);
1148 	if (!pte_none(orig_dst_pte)) {
1149 		err = -EEXIST;
1150 		goto out;
1151 	}
1152 
1153 	spin_lock(src_ptl);
1154 	orig_src_pte = ptep_get(src_pte);
1155 	spin_unlock(src_ptl);
1156 	if (pte_none(orig_src_pte)) {
1157 		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1158 			err = -ENOENT;
1159 		else /* nothing to do to move a hole */
1160 			err = 0;
1161 		goto out;
1162 	}
1163 
1164 	/* If the PTE changed after we locked the folio then start over */
1165 	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1166 		err = -EAGAIN;
1167 		goto out;
1168 	}
1169 
1170 	if (pte_present(orig_src_pte)) {
1171 		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1172 			err = move_zeropage_pte(mm, dst_vma, src_vma,
1173 					       dst_addr, src_addr, dst_pte, src_pte,
1174 					       orig_dst_pte, orig_src_pte,
1175 					       dst_ptl, src_ptl);
1176 			goto out;
1177 		}
1178 
1179 		/*
1180 		 * Pin and lock both the source folio and anon_vma. Since we are
1181 		 * in an RCU read section, we can't block, so on contention we have to
1182 		 * unmap the ptes, obtain the lock and retry.
1183 		 */
1184 		if (!src_folio) {
1185 			struct folio *folio;
1186 
1187 			/*
1188 			 * Pin the page while holding the lock to be sure the
1189 			 * page isn't freed under us
1190 			 */
1191 			spin_lock(src_ptl);
1192 			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1193 				spin_unlock(src_ptl);
1194 				err = -EAGAIN;
1195 				goto out;
1196 			}
1197 
1198 			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1199 			if (!folio || !PageAnonExclusive(&folio->page)) {
1200 				spin_unlock(src_ptl);
1201 				err = -EBUSY;
1202 				goto out;
1203 			}
1204 
1205 			folio_get(folio);
1206 			src_folio = folio;
1207 			src_folio_pte = orig_src_pte;
1208 			spin_unlock(src_ptl);
1209 
1210 			if (!folio_trylock(src_folio)) {
1211 				pte_unmap(src_pte);
1212 				pte_unmap(dst_pte);
1213 				src_pte = dst_pte = NULL;
1214 				/* now we can block and wait */
1215 				folio_lock(src_folio);
1216 				goto retry;
1217 			}
1218 
1219 			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1220 				err = -EBUSY;
1221 				goto out;
1222 			}
1223 		}
1224 
1225 		/* at this point we have src_folio locked */
1226 		if (folio_test_large(src_folio)) {
1227 			/* split_folio() can block */
1228 			pte_unmap(src_pte);
1229 			pte_unmap(dst_pte);
1230 			src_pte = dst_pte = NULL;
1231 			err = split_folio(src_folio);
1232 			if (err)
1233 				goto out;
1234 			/* have to reacquire the folio after it got split */
1235 			folio_unlock(src_folio);
1236 			folio_put(src_folio);
1237 			src_folio = NULL;
1238 			goto retry;
1239 		}
1240 
1241 		if (!src_anon_vma) {
1242 			/*
1243 			 * folio_referenced walks the anon_vma chain
1244 			 * without the folio lock. Serialize against it with
1245 			 * the anon_vma lock; the folio lock is not enough.
1246 			 */
1247 			src_anon_vma = folio_get_anon_vma(src_folio);
1248 			if (!src_anon_vma) {
1249 				/* page was unmapped from under us */
1250 				err = -EAGAIN;
1251 				goto out;
1252 			}
1253 			if (!anon_vma_trylock_write(src_anon_vma)) {
1254 				pte_unmap(src_pte);
1255 				pte_unmap(dst_pte);
1256 				src_pte = dst_pte = NULL;
1257 				/* now we can block and wait */
1258 				anon_vma_lock_write(src_anon_vma);
1259 				goto retry;
1260 			}
1261 		}
1262 
1263 		err = move_present_pte(mm, dst_vma, src_vma,
1264 				       dst_addr, src_addr, dst_pte, src_pte,
1265 				       orig_dst_pte, orig_src_pte,
1266 				       dst_ptl, src_ptl, src_folio);
1267 	} else {
1268 		entry = pte_to_swp_entry(orig_src_pte);
1269 		if (non_swap_entry(entry)) {
1270 			if (is_migration_entry(entry)) {
1271 				pte_unmap(src_pte);
1272 				pte_unmap(dst_pte);
1273 				src_pte = dst_pte = NULL;
1274 				migration_entry_wait(mm, src_pmd, src_addr);
1275 				err = -EAGAIN;
1276 			} else
1277 				err = -EFAULT;
1278 			goto out;
1279 		}
1280 
1281 		err = move_swap_pte(mm, dst_addr, src_addr,
1282 				    dst_pte, src_pte,
1283 				    orig_dst_pte, orig_src_pte,
1284 				    dst_ptl, src_ptl);
1285 	}
1286 
1287 out:
1288 	if (src_anon_vma) {
1289 		anon_vma_unlock_write(src_anon_vma);
1290 		put_anon_vma(src_anon_vma);
1291 	}
1292 	if (src_folio) {
1293 		folio_unlock(src_folio);
1294 		folio_put(src_folio);
1295 	}
1296 	if (dst_pte)
1297 		pte_unmap(dst_pte);
1298 	if (src_pte)
1299 		pte_unmap(src_pte);
1300 	mmu_notifier_invalidate_range_end(&range);
1301 
1302 	return err;
1303 }
1304 
1305 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1306 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1307 					unsigned long src_addr,
1308 					unsigned long src_end)
1309 {
1310 	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1311 		src_end - src_addr < HPAGE_PMD_SIZE;
1312 }
1313 #else
1314 static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1315 					unsigned long src_addr,
1316 					unsigned long src_end)
1317 {
1318 	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
1319 	return false;
1320 }
1321 #endif
1322 
1323 static inline bool vma_move_compatible(struct vm_area_struct *vma)
1324 {
1325 	return !(vma->vm_flags & (VM_PFNMAP | VM_IO |  VM_HUGETLB |
1326 				  VM_MIXEDMAP | VM_SHADOW_STACK));
1327 }
1328 
1329 static int validate_move_areas(struct userfaultfd_ctx *ctx,
1330 			       struct vm_area_struct *src_vma,
1331 			       struct vm_area_struct *dst_vma)
1332 {
1333 	/* Only allow moving if both have the same access and protection */
1334 	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1335 	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1336 		return -EINVAL;
1337 
1338 	/* Only allow moving if both are mlocked or both aren't */
1339 	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1340 		return -EINVAL;
1341 
1342 	/*
1343 	 * For now, we keep it simple and only move between writable VMAs.
1344 	 * Access flags are equal, therefore cheching only the source is enough.
1345 	 */
1346 	if (!(src_vma->vm_flags & VM_WRITE))
1347 		return -EINVAL;
1348 
1349 	/* Check if vma flags indicate content which can be moved */
1350 	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1351 		return -EINVAL;
1352 
1353 	/* Ensure dst_vma is registered in uffd we are operating on */
1354 	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1355 	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1356 		return -EINVAL;
1357 
1358 	/* Only allow moving across anonymous vmas */
1359 	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1360 		return -EINVAL;
1361 
1362 	return 0;
1363 }
1364 
1365 static __always_inline
1366 int find_vmas_mm_locked(struct mm_struct *mm,
1367 			unsigned long dst_start,
1368 			unsigned long src_start,
1369 			struct vm_area_struct **dst_vmap,
1370 			struct vm_area_struct **src_vmap)
1371 {
1372 	struct vm_area_struct *vma;
1373 
1374 	mmap_assert_locked(mm);
1375 	vma = find_vma_and_prepare_anon(mm, dst_start);
1376 	if (IS_ERR(vma))
1377 		return PTR_ERR(vma);
1378 
1379 	*dst_vmap = vma;
1380 	/* Skip finding src_vma if src_start is in dst_vma */
1381 	if (src_start >= vma->vm_start && src_start < vma->vm_end)
1382 		goto out_success;
1383 
1384 	vma = vma_lookup(mm, src_start);
1385 	if (!vma)
1386 		return -ENOENT;
1387 out_success:
1388 	*src_vmap = vma;
1389 	return 0;
1390 }
1391 
1392 #ifdef CONFIG_PER_VMA_LOCK
1393 static int uffd_move_lock(struct mm_struct *mm,
1394 			  unsigned long dst_start,
1395 			  unsigned long src_start,
1396 			  struct vm_area_struct **dst_vmap,
1397 			  struct vm_area_struct **src_vmap)
1398 {
1399 	struct vm_area_struct *vma;
1400 	int err;
1401 
1402 	vma = uffd_lock_vma(mm, dst_start);
1403 	if (IS_ERR(vma))
1404 		return PTR_ERR(vma);
1405 
1406 	*dst_vmap = vma;
1407 	/*
1408 	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
1409 	 * that we don't lock the same vma twice.
1410 	 */
1411 	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1412 		*src_vmap = vma;
1413 		return 0;
1414 	}
1415 
1416 	/*
1417 	 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
1418 	 *
1419 	 * Thread1				Thread2
1420 	 * -------				-------
1421 	 * vma_start_read(dst_vma)
1422 	 *					mmap_write_lock(mm)
1423 	 *					vma_start_write(src_vma)
1424 	 * vma_start_read(src_vma)
1425 	 * mmap_read_lock(mm)
1426 	 *					vma_start_write(dst_vma)
1427 	 */
1428 	*src_vmap = lock_vma_under_rcu(mm, src_start);
1429 	if (likely(*src_vmap))
1430 		return 0;
1431 
1432 	/* Undo any locking and retry in the mmap_lock critical section */
1433 	vma_end_read(*dst_vmap);
1434 
1435 	mmap_read_lock(mm);
1436 	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1437 	if (!err) {
1438 		/*
1439 		 * See comment in uffd_lock_vma() as to why not using
1440 		 * vma_start_read() here.
1441 		 */
1442 		down_read(&(*dst_vmap)->vm_lock->lock);
1443 		if (*dst_vmap != *src_vmap)
1444 			down_read_nested(&(*src_vmap)->vm_lock->lock,
1445 					 SINGLE_DEPTH_NESTING);
1446 	}
1447 	mmap_read_unlock(mm);
1448 	return err;
1449 }
1450 
1451 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1452 			     struct vm_area_struct *src_vma)
1453 {
1454 	vma_end_read(src_vma);
1455 	if (src_vma != dst_vma)
1456 		vma_end_read(dst_vma);
1457 }
1458 
1459 #else
1460 
1461 static int uffd_move_lock(struct mm_struct *mm,
1462 			  unsigned long dst_start,
1463 			  unsigned long src_start,
1464 			  struct vm_area_struct **dst_vmap,
1465 			  struct vm_area_struct **src_vmap)
1466 {
1467 	int err;
1468 
1469 	mmap_read_lock(mm);
1470 	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1471 	if (err)
1472 		mmap_read_unlock(mm);
1473 	return err;
1474 }
1475 
1476 static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1477 			     struct vm_area_struct *src_vma)
1478 {
1479 	mmap_assert_locked(src_vma->vm_mm);
1480 	mmap_read_unlock(dst_vma->vm_mm);
1481 }
1482 #endif
1483 
1484 /**
1485  * move_pages - move arbitrary anonymous pages of an existing vma
1486  * @ctx: pointer to the userfaultfd context
1487  * @dst_start: start of the destination virtual memory range
1488  * @src_start: start of the source virtual memory range
1489  * @len: length of the virtual memory range
1490  * @mode: flags from uffdio_move.mode
1491  *
1492  * It will either use the mmap_lock in read mode or per-vma locks
1493  *
1494  * move_pages() remaps arbitrary anonymous pages atomically in zero
1495  * copy. It only works on non shared anonymous pages because those can
1496  * be relocated without generating non linear anon_vmas in the rmap
1497  * code.
1498  *
1499  * It provides a zero copy mechanism to handle userspace page faults.
1500  * The source vma pages should have mapcount == 1, which can be
1501  * enforced by using madvise(MADV_DONTFORK) on src vma.
1502  *
1503  * The thread resolving the userland page fault will receive the
1504  * faulting page in the source vma through the network, storage or
1505  * any other I/O device (MADV_DONTFORK in the source vma prevents
1506  * move_pages() from failing with -EBUSY if the process forks before
1507  * move_pages() is called), then it will call move_pages() to map the
1508  * page at the faulting address in the destination vma.
1509  *
1510  * This userfaultfd command works purely via pagetables, so it's the
1511  * most efficient way to move physical non shared anonymous pages
1512  * across different virtual addresses. Unlike mremap()/mmap()/munmap()
1513  * it does not create any new vmas. The mapping in the destination
1514  * address is atomic.
1515  *
1516  * It only works if the vma protection bits are identical from the
1517  * source and destination vma.
1518  *
1519  * It can remap non shared anonymous pages within the same vma too.
1520  *
1521  * If the source virtual memory range has any unmapped holes, or if
1522  * the destination virtual memory range is not a whole unmapped hole,
1523  * move_pages() will fail respectively with -ENOENT or -EEXIST. This
1524  * provides a very strict behavior to avoid any chance of memory
1525  * corruption going unnoticed if there are userland race conditions.
1526  * Only one thread should resolve the userland page fault at any given
1527  * time for any given faulting address. This means that if two threads
1528  * try to both call move_pages() on the same destination address at the
1529  * same time, the second thread will get an explicit error from this
1530  * command.
1531  *
1532  * The command retval will be "len" if successful. The command
1533  * however can be interrupted by fatal signals or errors. If
1534  * interrupted it will return the number of bytes successfully
1535  * remapped before the interruption if any, or the negative error if
1536  * none. It will never return zero. Either it will return an error or
1537  * an amount of bytes successfully moved. If the retval reports a
1538  * "short" remap, the move_pages() command should be repeated by
1539  * userland with src+retval, dst+retval, len-retval if it wants to know
1540  * about the error that interrupted it.
1541  *
1542  * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
1543  * prevent -ENOENT errors from materializing if there are holes in the
1544  * source virtual range that is being remapped. The holes will be
1545  * accounted as successfully remapped in the retval of the
1546  * command. This is mostly useful to remap hugepage naturally aligned
1547  * virtual regions without knowing if there are transparent hugepages
1548  * in the regions or not, but preventing the risk of having to split
1549  * the hugepmd during the remap.
1550  *
1551  * If there's any rmap walk that is taking the anon_vma locks without
1552  * first obtaining the folio lock (the only current instance is
1553  * folio_referenced), they will have to verify if the folio->mapping
1554  * has changed after taking the anon_vma lock. If it changed they
1555  * should release the lock and retry obtaining a new anon_vma, because
1556  * it means the anon_vma was changed by move_pages() before the lock
1557  * could be obtained. This is the only additional complexity added to
1558  * the rmap code to provide this anonymous page remapping functionality.
1559  */
1560 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
1561 		   unsigned long src_start, unsigned long len, __u64 mode)
1562 {
1563 	struct mm_struct *mm = ctx->mm;
1564 	struct vm_area_struct *src_vma, *dst_vma;
1565 	unsigned long src_addr, dst_addr;
1566 	pmd_t *src_pmd, *dst_pmd;
1567 	long err = -EINVAL;
1568 	ssize_t moved = 0;
1569 
1570 	/* Sanitize the command parameters. */
1571 	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
1572 	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
1573 	    WARN_ON_ONCE(len & ~PAGE_MASK))
1574 		goto out;
1575 
1576 	/* Does the address range wrap, or is the span zero-sized? */
1577 	if (WARN_ON_ONCE(src_start + len <= src_start) ||
1578 	    WARN_ON_ONCE(dst_start + len <= dst_start))
1579 		goto out;
1580 
1581 	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
1582 	if (err)
1583 		goto out;
1584 
1585 	/* Re-check after taking map_changing_lock */
1586 	err = -EAGAIN;
1587 	down_read(&ctx->map_changing_lock);
1588 	if (unlikely(atomic_read(&ctx->mmap_changing)))
1589 		goto out_unlock;
1590 	/*
1591 	 * Make sure the vma is not shared and that the src and dst remap
1592 	 * ranges are both valid and fully within a single existing
1593 	 * vma.
1594 	 */
1595 	err = -EINVAL;
1596 	if (src_vma->vm_flags & VM_SHARED)
1597 		goto out_unlock;
1598 	if (src_start + len > src_vma->vm_end)
1599 		goto out_unlock;
1600 
1601 	if (dst_vma->vm_flags & VM_SHARED)
1602 		goto out_unlock;
1603 	if (dst_start + len > dst_vma->vm_end)
1604 		goto out_unlock;
1605 
1606 	err = validate_move_areas(ctx, src_vma, dst_vma);
1607 	if (err)
1608 		goto out_unlock;
1609 
1610 	for (src_addr = src_start, dst_addr = dst_start;
1611 	     src_addr < src_start + len;) {
1612 		spinlock_t *ptl;
1613 		pmd_t dst_pmdval;
1614 		unsigned long step_size;
1615 
1616 		/*
1617 		 * The below works because an anonymous area would not have a
1618 		 * transparent huge PUD. If file-backed support is added,
1619 		 * that case would need to be handled here.
1620 		 */
1621 		src_pmd = mm_find_pmd(mm, src_addr);
1622 		if (unlikely(!src_pmd)) {
1623 			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1624 				err = -ENOENT;
1625 				break;
1626 			}
1627 			src_pmd = mm_alloc_pmd(mm, src_addr);
1628 			if (unlikely(!src_pmd)) {
1629 				err = -ENOMEM;
1630 				break;
1631 			}
1632 		}
1633 		dst_pmd = mm_alloc_pmd(mm, dst_addr);
1634 		if (unlikely(!dst_pmd)) {
1635 			err = -ENOMEM;
1636 			break;
1637 		}
1638 
1639 		dst_pmdval = pmdp_get_lockless(dst_pmd);
1640 		/*
1641 		 * If the dst_pmd is mapped as THP, don't override it and just
1642 		 * be strict. If dst_pmd changes into THP after this check,
1643 		 * move_pages_huge_pmd() will detect the change and retry
1644 		 * while move_pages_pte() will detect the change and fail.
1645 		 */
1646 		if (unlikely(pmd_trans_huge(dst_pmdval))) {
1647 			err = -EEXIST;
1648 			break;
1649 		}
1650 
1651 		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1652 		if (ptl) {
1653 			if (pmd_devmap(*src_pmd)) {
1654 				spin_unlock(ptl);
1655 				err = -ENOENT;
1656 				break;
1657 			}
1658 
1659 			/* Check if we can move the pmd without splitting it. */
1660 			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1661 			    !pmd_none(dst_pmdval)) {
1662 				struct folio *folio = pmd_folio(*src_pmd);
1663 
1664 				if (!folio || (!is_huge_zero_folio(folio) &&
1665 					       !PageAnonExclusive(&folio->page))) {
1666 					spin_unlock(ptl);
1667 					err = -EBUSY;
1668 					break;
1669 				}
1670 
1671 				spin_unlock(ptl);
1672 				split_huge_pmd(src_vma, src_pmd, src_addr);
1673 				/* The folio will be split by move_pages_pte() */
1674 				continue;
1675 			}
1676 
1677 			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1678 						  dst_pmdval, dst_vma, src_vma,
1679 						  dst_addr, src_addr);
1680 			step_size = HPAGE_PMD_SIZE;
1681 		} else {
1682 			if (pmd_none(*src_pmd)) {
1683 				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1684 					err = -ENOENT;
1685 					break;
1686 				}
1687 				if (unlikely(__pte_alloc(mm, src_pmd))) {
1688 					err = -ENOMEM;
1689 					break;
1690 				}
1691 			}
1692 
1693 			if (unlikely(pte_alloc(mm, dst_pmd))) {
1694 				err = -ENOMEM;
1695 				break;
1696 			}
1697 
1698 			err = move_pages_pte(mm, dst_pmd, src_pmd,
1699 					     dst_vma, src_vma,
1700 					     dst_addr, src_addr, mode);
1701 			step_size = PAGE_SIZE;
1702 		}
1703 
1704 		cond_resched();
1705 
1706 		if (fatal_signal_pending(current)) {
1707 			/* Do not override an error */
1708 			if (!err || err == -EAGAIN)
1709 				err = -EINTR;
1710 			break;
1711 		}
1712 
1713 		if (err) {
1714 			if (err == -EAGAIN)
1715 				continue;
1716 			break;
1717 		}
1718 
1719 		/* Proceed to the next page */
1720 		dst_addr += step_size;
1721 		src_addr += step_size;
1722 		moved += step_size;
1723 	}
1724 
1725 out_unlock:
1726 	up_read(&ctx->map_changing_lock);
1727 	uffd_move_unlock(dst_vma, src_vma);
1728 out:
1729 	VM_WARN_ON(moved < 0);
1730 	VM_WARN_ON(err > 0);
1731 	VM_WARN_ON(!moved && !err);
1732 	return moved ? moved : err;
1733 }
1734
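/*
 * Illustrative userspace sketch (not part of this file): move_pages() backs
 * the UFFDIO_MOVE ioctl.  As described in the comment above, a "short" move
 * should be retried with adjusted src/dst/len.  "uffd", "src", "dst" and
 * "len" are assumed to be page-aligned values set up by the caller:
 *
 *	struct uffdio_move mv = {
 *		.src  = src,
 *		.dst  = dst,
 *		.len  = len,
 *		.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
 *	};
 *	while (ioctl(uffd, UFFDIO_MOVE, &mv) == -1 && errno == EAGAIN &&
 *	       mv.move > 0) {
 *		// mv.move holds the bytes moved before the interruption.
 *		mv.src += mv.move;
 *		mv.dst += mv.move;
 *		mv.len -= mv.move;
 *	}
 */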