xref: /linux/fs/userfaultfd.c (revision ff9a79307f89563da6d841da8b7cc4a0afceb0e2)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  fs/userfaultfd.c
4  *
5  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
6  *  Copyright (C) 2008-2009 Red Hat, Inc.
7  *  Copyright (C) 2015  Red Hat, Inc.
8  *
9  *  Some part derived from fs/eventfd.c (anon inode setup) and
10  *  mm/ksm.c (mm hashing).
11  */
12 
13 #include <linux/list.h>
14 #include <linux/hashtable.h>
15 #include <linux/sched/signal.h>
16 #include <linux/sched/mm.h>
17 #include <linux/mm.h>
18 #include <linux/mm_inline.h>
19 #include <linux/mmu_notifier.h>
20 #include <linux/poll.h>
21 #include <linux/slab.h>
22 #include <linux/seq_file.h>
23 #include <linux/file.h>
24 #include <linux/bug.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/syscalls.h>
27 #include <linux/userfaultfd_k.h>
28 #include <linux/mempolicy.h>
29 #include <linux/ioctl.h>
30 #include <linux/security.h>
31 #include <linux/hugetlb.h>
32 #include <linux/swapops.h>
33 #include <linux/miscdevice.h>
34 #include <linux/uio.h>
35 
36 static int sysctl_unprivileged_userfaultfd __read_mostly;
37 
38 #ifdef CONFIG_SYSCTL
39 static struct ctl_table vm_userfaultfd_table[] = {
40 	{
41 		.procname	= "unprivileged_userfaultfd",
42 		.data		= &sysctl_unprivileged_userfaultfd,
43 		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
44 		.mode		= 0644,
45 		.proc_handler	= proc_dointvec_minmax,
46 		.extra1		= SYSCTL_ZERO,
47 		.extra2		= SYSCTL_ONE,
48 	},
49 };
50 #endif
51 
52 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
53 
54 struct userfaultfd_fork_ctx {
55 	struct userfaultfd_ctx *orig;
56 	struct userfaultfd_ctx *new;
57 	struct list_head list;
58 };
59 
60 struct userfaultfd_unmap_ctx {
61 	struct userfaultfd_ctx *ctx;
62 	unsigned long start;
63 	unsigned long end;
64 	struct list_head list;
65 };
66 
67 struct userfaultfd_wait_queue {
68 	struct uffd_msg msg;
69 	wait_queue_entry_t wq;
70 	struct userfaultfd_ctx *ctx;
71 	bool waken;
72 };
73 
74 struct userfaultfd_wake_range {
75 	unsigned long start;
76 	unsigned long len;
77 };
78 
79 /* internal indication that UFFD_API ioctl was successfully executed */
80 #define UFFD_FEATURE_INITIALIZED		(1u << 31)
81 
82 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
83 {
84 	return ctx->features & UFFD_FEATURE_INITIALIZED;
85 }
86 
87 static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
88 {
89 	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
90 }
91 
92 /*
93  * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
94  * meaningful when userfaultfd_wp()==true on the vma and when it's
95  * anonymous.
96  */
97 bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
98 {
99 	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
100 
101 	if (!ctx)
102 		return false;
103 
104 	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
105 }
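
/*
 * Illustrative userspace sketch, not part of the kernel source: the
 * ctx->features bits tested by the helpers above are negotiated through
 * the UFFDIO_API handshake.  A minimal example, assuming the running
 * kernel supports the requested feature (the ioctl fails with EINVAL
 * otherwise):
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/userfaultfd.h>
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	struct uffdio_api api = {
 *		.api = UFFD_API,
 *		.features = UFFD_FEATURE_WP_UNPOPULATED,
 *	};
 *	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) == -1)
 *		perror("userfaultfd/UFFDIO_API");
 *	// on success, api.features reports every feature the kernel supports
 */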
106 
107 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
108 				     vm_flags_t flags)
109 {
110 	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
111 
112 	vm_flags_reset(vma, flags);
113 	/*
114 	 * For shared mappings, we want to enable writenotify while
115 	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
116 	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
117 	 */
118 	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
119 		vma_set_page_prot(vma);
120 }
121 
122 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
123 				     int wake_flags, void *key)
124 {
125 	struct userfaultfd_wake_range *range = key;
126 	int ret;
127 	struct userfaultfd_wait_queue *uwq;
128 	unsigned long start, len;
129 
130 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
131 	ret = 0;
132 	/* len == 0 means wake all */
133 	start = range->start;
134 	len = range->len;
135 	if (len && (start > uwq->msg.arg.pagefault.address ||
136 		    start + len <= uwq->msg.arg.pagefault.address))
137 		goto out;
138 	WRITE_ONCE(uwq->waken, true);
139 	/*
140 	 * The Program-Order guarantees provided by the scheduler
141 	 * ensure uwq->waken is visible before the task is woken.
142 	 */
143 	ret = wake_up_state(wq->private, mode);
144 	if (ret) {
145 		/*
146 		 * Wake only once, autoremove behavior.
147 		 *
148 		 * After the effect of list_del_init is visible to the other
149 		 * CPUs, the waitqueue may disappear from under us, see the
150 		 * !list_empty_careful() in handle_userfault().
151 		 *
152 		 * try_to_wake_up() has an implicit smp_mb(), and the
153 		 * wq->private is read before calling the extern function
154 		 * "wake_up_state" (which in turns calls try_to_wake_up).
155 		 */
156 		list_del_init(&wq->entry);
157 	}
158 out:
159 	return ret;
160 }
161 
162 /**
163  * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
164  * context.
165  * @ctx: [in] Pointer to the userfaultfd context.
166  */
167 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
168 {
169 	refcount_inc(&ctx->refcount);
170 }
171 
172 /**
173  * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
174  * context.
175  * @ctx: [in] Pointer to userfaultfd context.
176  *
177  * The userfaultfd context reference must have been previously acquired either
178  * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
179  */
180 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
181 {
182 	if (refcount_dec_and_test(&ctx->refcount)) {
183 		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
184 		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
185 		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
186 		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
187 		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
188 		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
189 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
190 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
191 		mmdrop(ctx->mm);
192 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
193 	}
194 }
195 
196 static inline void msg_init(struct uffd_msg *msg)
197 {
198 	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
199 	/*
200 	 * Must use memset to zero out the padding, or kernel data is
201 	 * leaked to userland.
202 	 */
203 	memset(msg, 0, sizeof(struct uffd_msg));
204 }
205 
206 static inline struct uffd_msg userfault_msg(unsigned long address,
207 					    unsigned long real_address,
208 					    unsigned int flags,
209 					    unsigned long reason,
210 					    unsigned int features)
211 {
212 	struct uffd_msg msg;
213 
214 	msg_init(&msg);
215 	msg.event = UFFD_EVENT_PAGEFAULT;
216 
217 	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
218 				    real_address : address;
219 
220 	/*
221 	 * These flags indicate why the userfault occurred:
222 	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
223 	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
224 	 * - Neither of these flags being set indicates a MISSING fault.
225 	 *
226 	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
227 	 * fault. Otherwise, it was a read fault.
228 	 */
229 	if (flags & FAULT_FLAG_WRITE)
230 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
231 	if (reason & VM_UFFD_WP)
232 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
233 	if (reason & VM_UFFD_MINOR)
234 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
235 	if (features & UFFD_FEATURE_THREAD_ID)
236 		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
237 	return msg;
238 }
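
/*
 * Illustrative userspace sketch, not part of the kernel source: a monitor
 * decodes the message built above roughly as follows; a MISSING fault is
 * implied when neither WP nor MINOR is set ("uffd" is assumed to be a
 * registered userfaultfd, includes omitted):
 *
 *	struct uffd_msg msg;
 *	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
 *	    msg.event == UFFD_EVENT_PAGEFAULT) {
 *		if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)
 *			;	// write-protect fault -> UFFDIO_WRITEPROTECT
 *		else if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR)
 *			;	// minor fault -> UFFDIO_CONTINUE
 *		else
 *			;	// missing fault -> UFFDIO_COPY / UFFDIO_ZEROPAGE
 *	}
 */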
239 
240 #ifdef CONFIG_HUGETLB_PAGE
241 /*
242  * Same functionality as userfaultfd_must_wait below with modifications for
243  * hugepmd ranges.
244  */
245 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
246 					      struct vm_fault *vmf,
247 					      unsigned long reason)
248 {
249 	struct vm_area_struct *vma = vmf->vma;
250 	pte_t *ptep, pte;
251 	bool ret = true;
252 
253 	assert_fault_locked(vmf);
254 
255 	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
256 	if (!ptep)
257 		goto out;
258 
259 	ret = false;
260 	pte = huge_ptep_get(ptep);
261 
262 	/*
263 	 * Lockless access: we're in a wait_event so it's ok if it
264 	 * changes under us.  PTE markers should be handled the same as none
265 	 * ptes here.
266 	 */
267 	if (huge_pte_none_mostly(pte))
268 		ret = true;
269 	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
270 		ret = true;
271 out:
272 	return ret;
273 }
274 #else
275 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
276 					      struct vm_fault *vmf,
277 					      unsigned long reason)
278 {
279 	return false;	/* should never get here */
280 }
281 #endif /* CONFIG_HUGETLB_PAGE */
282 
283 /*
284  * Verify the pagetables are still not ok after having registered into
285  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
286  * userfault that has already been resolved, if userfaultfd_read_iter and
287  * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
288  * threads.
289  */
290 static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
291 					 struct vm_fault *vmf,
292 					 unsigned long reason)
293 {
294 	struct mm_struct *mm = ctx->mm;
295 	unsigned long address = vmf->address;
296 	pgd_t *pgd;
297 	p4d_t *p4d;
298 	pud_t *pud;
299 	pmd_t *pmd, _pmd;
300 	pte_t *pte;
301 	pte_t ptent;
302 	bool ret = true;
303 
304 	assert_fault_locked(vmf);
305 
306 	pgd = pgd_offset(mm, address);
307 	if (!pgd_present(*pgd))
308 		goto out;
309 	p4d = p4d_offset(pgd, address);
310 	if (!p4d_present(*p4d))
311 		goto out;
312 	pud = pud_offset(p4d, address);
313 	if (!pud_present(*pud))
314 		goto out;
315 	pmd = pmd_offset(pud, address);
316 again:
317 	_pmd = pmdp_get_lockless(pmd);
318 	if (pmd_none(_pmd))
319 		goto out;
320 
321 	ret = false;
322 	if (!pmd_present(_pmd) || pmd_devmap(_pmd))
323 		goto out;
324 
325 	if (pmd_trans_huge(_pmd)) {
326 		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
327 			ret = true;
328 		goto out;
329 	}
330 
331 	pte = pte_offset_map(pmd, address);
332 	if (!pte) {
333 		ret = true;
334 		goto again;
335 	}
336 	/*
337 	 * Lockless access: we're in a wait_event so it's ok if it
338 	 * changes under us.  PTE markers should be handled the same as none
339 	 * ptes here.
340 	 */
341 	ptent = ptep_get(pte);
342 	if (pte_none_mostly(ptent))
343 		ret = true;
344 	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
345 		ret = true;
346 	pte_unmap(pte);
347 
348 out:
349 	return ret;
350 }
351 
352 static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
353 {
354 	if (flags & FAULT_FLAG_INTERRUPTIBLE)
355 		return TASK_INTERRUPTIBLE;
356 
357 	if (flags & FAULT_FLAG_KILLABLE)
358 		return TASK_KILLABLE;
359 
360 	return TASK_UNINTERRUPTIBLE;
361 }
362 
363 /*
364  * The locking rules involved in returning VM_FAULT_RETRY depending on
365  * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
366  * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
367  * recommendation in __lock_page_or_retry is not an understatement.
368  *
369  * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
370  * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
371  * not set.
372  *
373  * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
374  * set, VM_FAULT_RETRY can still be returned if and only if there are
375  * fatal_signal_pending()s, and the mmap_lock must be released before
376  * returning it.
377  */
378 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
379 {
380 	struct vm_area_struct *vma = vmf->vma;
381 	struct mm_struct *mm = vma->vm_mm;
382 	struct userfaultfd_ctx *ctx;
383 	struct userfaultfd_wait_queue uwq;
384 	vm_fault_t ret = VM_FAULT_SIGBUS;
385 	bool must_wait;
386 	unsigned int blocking_state;
387 
388 	/*
389 	 * We don't do userfault handling for the final child pid update.
390 	 *
391 	 * We also don't do userfault handling during
392 	 * coredumping. hugetlbfs has the special
393 	 * hugetlb_follow_page_mask() to skip missing pages in the
394 	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
395 	 * the no_page_table() helper in follow_page_mask(), but the
396 	 * shmem_vm_ops->fault method is invoked even during
397 	 * coredumping and it ends up here.
398 	 */
399 	if (current->flags & (PF_EXITING|PF_DUMPCORE))
400 		goto out;
401 
402 	assert_fault_locked(vmf);
403 
404 	ctx = vma->vm_userfaultfd_ctx.ctx;
405 	if (!ctx)
406 		goto out;
407 
408 	BUG_ON(ctx->mm != mm);
409 
410 	/* Any unrecognized flag is a bug. */
411 	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
412 	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
413 	VM_BUG_ON(!reason || (reason & (reason - 1)));
414 
415 	if (ctx->features & UFFD_FEATURE_SIGBUS)
416 		goto out;
417 	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
418 		goto out;
419 
420 	/*
421 	 * If it's already released don't get it. This avoids looping
422 	 * in __get_user_pages if userfaultfd_release waits on the
423 	 * caller of handle_userfault to release the mmap_lock.
424 	 */
425 	if (unlikely(READ_ONCE(ctx->released))) {
426 		/*
427 		 * Don't return VM_FAULT_SIGBUS in this case, so a non
428 		 * cooperative manager can close the uffd after the
429 		 * last UFFDIO_COPY, without the risk of triggering an
430 		 * involuntary SIGBUS if the process was starting the
431 		 * userfaultfd while the userfaultfd was still armed
432 		 * (but after the last UFFDIO_COPY). If the uffd
433 		 * wasn't already closed when the userfault reached
434 		 * this point, that would normally be solved by
435 		 * userfaultfd_must_wait returning 'false'.
436 		 *
437 		 * If we were to return VM_FAULT_SIGBUS here, the non
438 		 * cooperative manager would be instead forced to
439 		 * always call UFFDIO_UNREGISTER before it can safely
440 		 * close the uffd.
441 		 */
442 		ret = VM_FAULT_NOPAGE;
443 		goto out;
444 	}
445 
446 	/*
447 	 * Check that we can return VM_FAULT_RETRY.
448 	 *
449 	 * NOTE: it should become possible to return VM_FAULT_RETRY
450 	 * even if FAULT_FLAG_TRIED is set without leading to gup()
451 	 * -EBUSY failures, if the userfaultfd is to be extended for
452 	 * VM_UFFD_WP tracking and we intend to arm the userfault
453 	 * without first stopping userland access to the memory. For
454 	 * VM_UFFD_MISSING userfaults this is enough for now.
455 	 */
456 	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
457 		/*
458 		 * Validate the invariant that nowait must allow retry
459 		 * to be sure not to return SIGBUS erroneously on
460 		 * nowait invocations.
461 		 */
462 		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
463 #ifdef CONFIG_DEBUG_VM
464 		if (printk_ratelimit()) {
465 			printk(KERN_WARNING
466 			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
467 			       vmf->flags);
468 			dump_stack();
469 		}
470 #endif
471 		goto out;
472 	}
473 
474 	/*
475 	 * Handle nowait, not much to do other than tell it to retry
476 	 * and wait.
477 	 */
478 	ret = VM_FAULT_RETRY;
479 	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
480 		goto out;
481 
482 	/* take the reference before dropping the mmap_lock */
483 	userfaultfd_ctx_get(ctx);
484 
485 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
486 	uwq.wq.private = current;
487 	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
488 				reason, ctx->features);
489 	uwq.ctx = ctx;
490 	uwq.waken = false;
491 
492 	blocking_state = userfaultfd_get_blocking_state(vmf->flags);
493 
494 	/*
495 	 * Take the vma lock now, in order to safely call
496 	 * userfaultfd_huge_must_wait() later. Since acquiring the
497 	 * (sleepable) vma lock can modify the current task state, that
498 	 * must be before explicitly calling set_current_state().
499 	 */
500 	if (is_vm_hugetlb_page(vma))
501 		hugetlb_vma_lock_read(vma);
502 
503 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
504 	/*
505 	 * After the __add_wait_queue the uwq is visible to userland
506 	 * through poll/read().
507 	 */
508 	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
509 	/*
510 	 * The smp_mb() after __set_current_state prevents the reads
511 	 * following the spin_unlock to happen before the list_add in
512 	 * __add_wait_queue.
513 	 */
514 	set_current_state(blocking_state);
515 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
516 
517 	if (!is_vm_hugetlb_page(vma))
518 		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
519 	else
520 		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
521 	if (is_vm_hugetlb_page(vma))
522 		hugetlb_vma_unlock_read(vma);
523 	release_fault_lock(vmf);
524 
525 	if (likely(must_wait && !READ_ONCE(ctx->released))) {
526 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
527 		schedule();
528 	}
529 
530 	__set_current_state(TASK_RUNNING);
531 
532 	/*
533 	 * Here we race with the list_del; list_add in
534 	 * userfaultfd_ctx_read(), however because we don't ever run
535 	 * list_del_init() to refile across the two lists, the prev
536 	 * and next pointers will never point to self. list_add also
537 	 * would never let either of the two pointers point to
538 	 * self. So list_empty_careful won't risk seeing both pointers
539 	 * pointing to self at any time during the list refile. The
540 	 * only case where list_del_init() is called is the full
541 	 * removal in the wake function and there we don't re-list_add
542 	 * and it's fine not to block on the spinlock. The uwq on this
543 	 * kernel stack can be released after the list_del_init.
544 	 */
545 	if (!list_empty_careful(&uwq.wq.entry)) {
546 		spin_lock_irq(&ctx->fault_pending_wqh.lock);
547 		/*
548 		 * No need of list_del_init(), the uwq on the stack
549 		 * will be freed shortly anyway.
550 		 */
551 		list_del(&uwq.wq.entry);
552 		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
553 	}
554 
555 	/*
556 	 * ctx may go away after this if the userfault pseudo fd is
557 	 * already released.
558 	 */
559 	userfaultfd_ctx_put(ctx);
560 
561 out:
562 	return ret;
563 }
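
/*
 * Illustrative userspace sketch, not part of the kernel source: the
 * counterpart to the blocking above is a monitor thread that polls the
 * uffd (opened with O_NONBLOCK), reads the fault and resolves it; the
 * resolving ioctl then wakes the faulting task through wake_userfault().
 * "page_size" and "page_buf" are assumed to be set up by the monitor:
 *
 *	#include <poll.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include <linux/userfaultfd.h>
 *
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *	while (poll(&pfd, 1, -1) > 0) {
 *		struct uffd_msg msg;
 *		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
 *			continue;
 *		if (msg.event != UFFD_EVENT_PAGEFAULT)
 *			continue;
 *		struct uffdio_copy copy = {
 *			.dst = msg.arg.pagefault.address & ~(page_size - 1),
 *			.src = (unsigned long)page_buf,
 *			.len = page_size,
 *			.mode = 0,
 *		};
 *		ioctl(uffd, UFFDIO_COPY, &copy);	// implicitly wakes the fault
 *	}
 */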
564 
565 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
566 					      struct userfaultfd_wait_queue *ewq)
567 {
568 	struct userfaultfd_ctx *release_new_ctx;
569 
570 	if (WARN_ON_ONCE(current->flags & PF_EXITING))
571 		goto out;
572 
573 	ewq->ctx = ctx;
574 	init_waitqueue_entry(&ewq->wq, current);
575 	release_new_ctx = NULL;
576 
577 	spin_lock_irq(&ctx->event_wqh.lock);
578 	/*
579 	 * After the __add_wait_queue the uwq is visible to userland
580 	 * through poll/read().
581 	 */
582 	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
583 	for (;;) {
584 		set_current_state(TASK_KILLABLE);
585 		if (ewq->msg.event == 0)
586 			break;
587 		if (READ_ONCE(ctx->released) ||
588 		    fatal_signal_pending(current)) {
589 			/*
590 			 * &ewq->wq may be queued in fork_event, but
591 			 * __remove_wait_queue ignores the head
592 			 * parameter. It would be a problem if it
593 			 * didn't.
594 			 */
595 			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
596 			if (ewq->msg.event == UFFD_EVENT_FORK) {
597 				struct userfaultfd_ctx *new;
598 
599 				new = (struct userfaultfd_ctx *)
600 					(unsigned long)
601 					ewq->msg.arg.reserved.reserved1;
602 				release_new_ctx = new;
603 			}
604 			break;
605 		}
606 
607 		spin_unlock_irq(&ctx->event_wqh.lock);
608 
609 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
610 		schedule();
611 
612 		spin_lock_irq(&ctx->event_wqh.lock);
613 	}
614 	__set_current_state(TASK_RUNNING);
615 	spin_unlock_irq(&ctx->event_wqh.lock);
616 
617 	if (release_new_ctx) {
618 		struct vm_area_struct *vma;
619 		struct mm_struct *mm = release_new_ctx->mm;
620 		VMA_ITERATOR(vmi, mm, 0);
621 
622 		/* the various vma->vm_userfaultfd_ctx still point to it */
623 		mmap_write_lock(mm);
624 		for_each_vma(vmi, vma) {
625 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
626 				vma_start_write(vma);
627 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
628 				userfaultfd_set_vm_flags(vma,
629 							 vma->vm_flags & ~__VM_UFFD_FLAGS);
630 			}
631 		}
632 		mmap_write_unlock(mm);
633 
634 		userfaultfd_ctx_put(release_new_ctx);
635 	}
636 
637 	/*
638 	 * ctx may go away after this if the userfault pseudo fd is
639 	 * already released.
640 	 */
641 out:
642 	atomic_dec(&ctx->mmap_changing);
643 	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
644 	userfaultfd_ctx_put(ctx);
645 }
646 
647 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
648 				       struct userfaultfd_wait_queue *ewq)
649 {
650 	ewq->msg.event = 0;
651 	wake_up_locked(&ctx->event_wqh);
652 	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
653 }
654 
655 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
656 {
657 	struct userfaultfd_ctx *ctx = NULL, *octx;
658 	struct userfaultfd_fork_ctx *fctx;
659 
660 	octx = vma->vm_userfaultfd_ctx.ctx;
661 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
662 		vma_start_write(vma);
663 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
664 		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
665 		return 0;
666 	}
667 
668 	list_for_each_entry(fctx, fcs, list)
669 		if (fctx->orig == octx) {
670 			ctx = fctx->new;
671 			break;
672 		}
673 
674 	if (!ctx) {
675 		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
676 		if (!fctx)
677 			return -ENOMEM;
678 
679 		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
680 		if (!ctx) {
681 			kfree(fctx);
682 			return -ENOMEM;
683 		}
684 
685 		refcount_set(&ctx->refcount, 1);
686 		ctx->flags = octx->flags;
687 		ctx->features = octx->features;
688 		ctx->released = false;
689 		init_rwsem(&ctx->map_changing_lock);
690 		atomic_set(&ctx->mmap_changing, 0);
691 		ctx->mm = vma->vm_mm;
692 		mmgrab(ctx->mm);
693 
694 		userfaultfd_ctx_get(octx);
695 		down_write(&octx->map_changing_lock);
696 		atomic_inc(&octx->mmap_changing);
697 		up_write(&octx->map_changing_lock);
698 		fctx->orig = octx;
699 		fctx->new = ctx;
700 		list_add_tail(&fctx->list, fcs);
701 	}
702 
703 	vma->vm_userfaultfd_ctx.ctx = ctx;
704 	return 0;
705 }
706 
707 static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
708 {
709 	struct userfaultfd_ctx *ctx = fctx->orig;
710 	struct userfaultfd_wait_queue ewq;
711 
712 	msg_init(&ewq.msg);
713 
714 	ewq.msg.event = UFFD_EVENT_FORK;
715 	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
716 
717 	userfaultfd_event_wait_completion(ctx, &ewq);
718 }
719 
720 void dup_userfaultfd_complete(struct list_head *fcs)
721 {
722 	struct userfaultfd_fork_ctx *fctx, *n;
723 
724 	list_for_each_entry_safe(fctx, n, fcs, list) {
725 		dup_fctx(fctx);
726 		list_del(&fctx->list);
727 		kfree(fctx);
728 	}
729 }
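
/*
 * Illustrative userspace sketch, not part of the kernel source: with
 * UFFD_FEATURE_EVENT_FORK enabled, the monitor receives the child's
 * context as a brand new file descriptor when it reads the event:
 *
 *	struct uffd_msg msg;
 *	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
 *	    msg.event == UFFD_EVENT_FORK) {
 *		int child_uffd = msg.arg.fork.ufd;
 *		// child_uffd delivers the child's userfaults from now on;
 *		// it needs its own reader and must be close()d eventually
 *	}
 */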
730 
731 void mremap_userfaultfd_prep(struct vm_area_struct *vma,
732 			     struct vm_userfaultfd_ctx *vm_ctx)
733 {
734 	struct userfaultfd_ctx *ctx;
735 
736 	ctx = vma->vm_userfaultfd_ctx.ctx;
737 
738 	if (!ctx)
739 		return;
740 
741 	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
742 		vm_ctx->ctx = ctx;
743 		userfaultfd_ctx_get(ctx);
744 		down_write(&ctx->map_changing_lock);
745 		atomic_inc(&ctx->mmap_changing);
746 		up_write(&ctx->map_changing_lock);
747 	} else {
748 		/* Drop uffd context if remap feature not enabled */
749 		vma_start_write(vma);
750 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
751 		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
752 	}
753 }
754 
755 void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
756 				 unsigned long from, unsigned long to,
757 				 unsigned long len)
758 {
759 	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
760 	struct userfaultfd_wait_queue ewq;
761 
762 	if (!ctx)
763 		return;
764 
765 	if (to & ~PAGE_MASK) {
766 		userfaultfd_ctx_put(ctx);
767 		return;
768 	}
769 
770 	msg_init(&ewq.msg);
771 
772 	ewq.msg.event = UFFD_EVENT_REMAP;
773 	ewq.msg.arg.remap.from = from;
774 	ewq.msg.arg.remap.to = to;
775 	ewq.msg.arg.remap.len = len;
776 
777 	userfaultfd_event_wait_completion(ctx, &ewq);
778 }
779 
780 bool userfaultfd_remove(struct vm_area_struct *vma,
781 			unsigned long start, unsigned long end)
782 {
783 	struct mm_struct *mm = vma->vm_mm;
784 	struct userfaultfd_ctx *ctx;
785 	struct userfaultfd_wait_queue ewq;
786 
787 	ctx = vma->vm_userfaultfd_ctx.ctx;
788 	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
789 		return true;
790 
791 	userfaultfd_ctx_get(ctx);
792 	down_write(&ctx->map_changing_lock);
793 	atomic_inc(&ctx->mmap_changing);
794 	up_write(&ctx->map_changing_lock);
795 	mmap_read_unlock(mm);
796 
797 	msg_init(&ewq.msg);
798 
799 	ewq.msg.event = UFFD_EVENT_REMOVE;
800 	ewq.msg.arg.remove.start = start;
801 	ewq.msg.arg.remove.end = end;
802 
803 	userfaultfd_event_wait_completion(ctx, &ewq);
804 
805 	return false;
806 }
807 
808 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
809 			  unsigned long start, unsigned long end)
810 {
811 	struct userfaultfd_unmap_ctx *unmap_ctx;
812 
813 	list_for_each_entry(unmap_ctx, unmaps, list)
814 		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
815 		    unmap_ctx->end == end)
816 			return true;
817 
818 	return false;
819 }
820 
821 int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
822 			   unsigned long end, struct list_head *unmaps)
823 {
824 	struct userfaultfd_unmap_ctx *unmap_ctx;
825 	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
826 
827 	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
828 	    has_unmap_ctx(ctx, unmaps, start, end))
829 		return 0;
830 
831 	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
832 	if (!unmap_ctx)
833 		return -ENOMEM;
834 
835 	userfaultfd_ctx_get(ctx);
836 	down_write(&ctx->map_changing_lock);
837 	atomic_inc(&ctx->mmap_changing);
838 	up_write(&ctx->map_changing_lock);
839 	unmap_ctx->ctx = ctx;
840 	unmap_ctx->start = start;
841 	unmap_ctx->end = end;
842 	list_add_tail(&unmap_ctx->list, unmaps);
843 
844 	return 0;
845 }
846 
847 void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
848 {
849 	struct userfaultfd_unmap_ctx *ctx, *n;
850 	struct userfaultfd_wait_queue ewq;
851 
852 	list_for_each_entry_safe(ctx, n, uf, list) {
853 		msg_init(&ewq.msg);
854 
855 		ewq.msg.event = UFFD_EVENT_UNMAP;
856 		ewq.msg.arg.remove.start = ctx->start;
857 		ewq.msg.arg.remove.end = ctx->end;
858 
859 		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
860 
861 		list_del(&ctx->list);
862 		kfree(ctx);
863 	}
864 }
865 
866 static int userfaultfd_release(struct inode *inode, struct file *file)
867 {
868 	struct userfaultfd_ctx *ctx = file->private_data;
869 	struct mm_struct *mm = ctx->mm;
870 	struct vm_area_struct *vma, *prev;
871 	/* len == 0 means wake all */
872 	struct userfaultfd_wake_range range = { .len = 0, };
873 	unsigned long new_flags;
874 	VMA_ITERATOR(vmi, mm, 0);
875 
876 	WRITE_ONCE(ctx->released, true);
877 
878 	if (!mmget_not_zero(mm))
879 		goto wakeup;
880 
881 	/*
882 	 * Flush page faults out of all CPUs. NOTE: all page faults
883 	 * must be retried without returning VM_FAULT_SIGBUS if
884 	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
885 	 * changes while handle_userfault released the mmap_lock. So
886 	 * it's critical that released is set to true (above), before
887 	 * taking the mmap_lock for writing.
888 	 */
889 	mmap_write_lock(mm);
890 	prev = NULL;
891 	for_each_vma(vmi, vma) {
892 		cond_resched();
893 		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
894 		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
895 		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
896 			prev = vma;
897 			continue;
898 		}
899 		/* Reset ptes for the whole vma range if wr-protected */
900 		if (userfaultfd_wp(vma))
901 			uffd_wp_range(vma, vma->vm_start,
902 				      vma->vm_end - vma->vm_start, false);
903 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
904 		vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
905 					    vma->vm_end, new_flags,
906 					    NULL_VM_UFFD_CTX);
907 
908 		vma_start_write(vma);
909 		userfaultfd_set_vm_flags(vma, new_flags);
910 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
911 
912 		prev = vma;
913 	}
914 	mmap_write_unlock(mm);
915 	mmput(mm);
916 wakeup:
917 	/*
918 	 * After no new page faults can wait on this fault_*wqh, flush
919 	 * the last page faults that may have been already waiting on
920 	 * the fault_*wqh.
921 	 */
922 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
923 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
924 	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
925 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
926 
927 	/* Flush pending events that may still wait on event_wqh */
928 	wake_up_all(&ctx->event_wqh);
929 
930 	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
931 	userfaultfd_ctx_put(ctx);
932 	return 0;
933 }
934 
935 /* fault_pending_wqh.lock must be held by the caller */
936 static inline struct userfaultfd_wait_queue *find_userfault_in(
937 		wait_queue_head_t *wqh)
938 {
939 	wait_queue_entry_t *wq;
940 	struct userfaultfd_wait_queue *uwq;
941 
942 	lockdep_assert_held(&wqh->lock);
943 
944 	uwq = NULL;
945 	if (!waitqueue_active(wqh))
946 		goto out;
947 	/* walk in reverse to provide FIFO behavior to read userfaults */
948 	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
949 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
950 out:
951 	return uwq;
952 }
953 
954 static inline struct userfaultfd_wait_queue *find_userfault(
955 		struct userfaultfd_ctx *ctx)
956 {
957 	return find_userfault_in(&ctx->fault_pending_wqh);
958 }
959 
960 static inline struct userfaultfd_wait_queue *find_userfault_evt(
961 		struct userfaultfd_ctx *ctx)
962 {
963 	return find_userfault_in(&ctx->event_wqh);
964 }
965 
966 static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
967 {
968 	struct userfaultfd_ctx *ctx = file->private_data;
969 	__poll_t ret;
970 
971 	poll_wait(file, &ctx->fd_wqh, wait);
972 
973 	if (!userfaultfd_is_initialized(ctx))
974 		return EPOLLERR;
975 
976 	/*
977 	 * poll() never guarantees that read won't block.
978 	 * userfaults can be woken before they're read().
979 	 */
980 	if (unlikely(!(file->f_flags & O_NONBLOCK)))
981 		return EPOLLERR;
982 	/*
983 	 * lockless access to see if there are pending faults
984 	 * __pollwait last action is the add_wait_queue but
985 	 * the spin_unlock would allow the waitqueue_active to
986 	 * pass above the actual list_add inside
987 	 * add_wait_queue critical section. So use a full
988 	 * memory barrier to serialize the list_add write of
989 	 * add_wait_queue() with the waitqueue_active read
990 	 * below.
991 	 */
992 	ret = 0;
993 	smp_mb();
994 	if (waitqueue_active(&ctx->fault_pending_wqh))
995 		ret = EPOLLIN;
996 	else if (waitqueue_active(&ctx->event_wqh))
997 		ret = EPOLLIN;
998 
999 	return ret;
1000 }
1001 
1002 static const struct file_operations userfaultfd_fops;
1003 
1004 static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1005 				  struct inode *inode,
1006 				  struct uffd_msg *msg)
1007 {
1008 	int fd;
1009 
1010 	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
1011 			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1012 	if (fd < 0)
1013 		return fd;
1014 
1015 	msg->arg.reserved.reserved1 = 0;
1016 	msg->arg.fork.ufd = fd;
1017 	return 0;
1018 }
1019 
1020 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1021 				    struct uffd_msg *msg, struct inode *inode)
1022 {
1023 	ssize_t ret;
1024 	DECLARE_WAITQUEUE(wait, current);
1025 	struct userfaultfd_wait_queue *uwq;
1026 	/*
1027 	 * Handling fork event requires sleeping operations, so
1028 	 * we drop the event_wqh lock, then do these ops, then
1029 	 * lock it back and wake up the waiter. While the lock is
1030 	 * dropped the ewq may go away so we keep track of it
1031 	 * carefully.
1032 	 */
1033 	LIST_HEAD(fork_event);
1034 	struct userfaultfd_ctx *fork_nctx = NULL;
1035 
1036 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
1037 	spin_lock_irq(&ctx->fd_wqh.lock);
1038 	__add_wait_queue(&ctx->fd_wqh, &wait);
1039 	for (;;) {
1040 		set_current_state(TASK_INTERRUPTIBLE);
1041 		spin_lock(&ctx->fault_pending_wqh.lock);
1042 		uwq = find_userfault(ctx);
1043 		if (uwq) {
1044 			/*
1045 			 * Use a seqcount to repeat the lockless check
1046 			 * in wake_userfault() to avoid missing
1047 			 * wakeups because during the refile both
1048 			 * waitqueues could become empty if this is the
1049 			 * only userfault.
1050 			 */
1051 			write_seqcount_begin(&ctx->refile_seq);
1052 
1053 			/*
1054 			 * The fault_pending_wqh.lock prevents the uwq
1055 			 * from disappearing from under us.
1056 			 *
1057 			 * Refile this userfault from
1058 			 * fault_pending_wqh to fault_wqh, it's not
1059 			 * pending anymore after we read it.
1060 			 *
1061 			 * Use list_del() by hand (as
1062 			 * userfaultfd_wake_function also uses
1063 			 * list_del_init() by hand) to be sure nobody
1064 			 * changes __remove_wait_queue() to use
1065 			 * list_del_init() in turn breaking the
1066 			 * !list_empty_careful() check in
1067 			 * handle_userfault(). The uwq->wq.head list
1068 			 * must never be empty at any time during the
1069 			 * refile, or the waitqueue could disappear
1070 			 * from under us. The "wait_queue_head_t"
1071 			 * parameter of __remove_wait_queue() is unused
1072 			 * anyway.
1073 			 */
1074 			list_del(&uwq->wq.entry);
1075 			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
1076 
1077 			write_seqcount_end(&ctx->refile_seq);
1078 
1079 			/* careful to always initialize msg if ret == 0 */
1080 			*msg = uwq->msg;
1081 			spin_unlock(&ctx->fault_pending_wqh.lock);
1082 			ret = 0;
1083 			break;
1084 		}
1085 		spin_unlock(&ctx->fault_pending_wqh.lock);
1086 
1087 		spin_lock(&ctx->event_wqh.lock);
1088 		uwq = find_userfault_evt(ctx);
1089 		if (uwq) {
1090 			*msg = uwq->msg;
1091 
1092 			if (uwq->msg.event == UFFD_EVENT_FORK) {
1093 				fork_nctx = (struct userfaultfd_ctx *)
1094 					(unsigned long)
1095 					uwq->msg.arg.reserved.reserved1;
1096 				list_move(&uwq->wq.entry, &fork_event);
1097 				/*
1098 				 * fork_nctx can be freed as soon as
1099 				 * we drop the lock, unless we take a
1100 				 * reference on it.
1101 				 */
1102 				userfaultfd_ctx_get(fork_nctx);
1103 				spin_unlock(&ctx->event_wqh.lock);
1104 				ret = 0;
1105 				break;
1106 			}
1107 
1108 			userfaultfd_event_complete(ctx, uwq);
1109 			spin_unlock(&ctx->event_wqh.lock);
1110 			ret = 0;
1111 			break;
1112 		}
1113 		spin_unlock(&ctx->event_wqh.lock);
1114 
1115 		if (signal_pending(current)) {
1116 			ret = -ERESTARTSYS;
1117 			break;
1118 		}
1119 		if (no_wait) {
1120 			ret = -EAGAIN;
1121 			break;
1122 		}
1123 		spin_unlock_irq(&ctx->fd_wqh.lock);
1124 		schedule();
1125 		spin_lock_irq(&ctx->fd_wqh.lock);
1126 	}
1127 	__remove_wait_queue(&ctx->fd_wqh, &wait);
1128 	__set_current_state(TASK_RUNNING);
1129 	spin_unlock_irq(&ctx->fd_wqh.lock);
1130 
1131 	if (!ret && msg->event == UFFD_EVENT_FORK) {
1132 		ret = resolve_userfault_fork(fork_nctx, inode, msg);
1133 		spin_lock_irq(&ctx->event_wqh.lock);
1134 		if (!list_empty(&fork_event)) {
1135 			/*
1136 			 * The fork thread didn't abort, so we can
1137 			 * drop the temporary refcount.
1138 			 */
1139 			userfaultfd_ctx_put(fork_nctx);
1140 
1141 			uwq = list_first_entry(&fork_event,
1142 					       typeof(*uwq),
1143 					       wq.entry);
1144 			/*
1145 			 * If fork_event list wasn't empty and in turn
1146 			 * the event wasn't already released by fork
1147 			 * (the event is allocated on fork kernel
1148 			 * stack), put the event back to its place in
1149 			 * the event_wq. fork_event head will be freed
1150 			 * as soon as we return so the event cannot
1151 			 * stay queued there no matter the current
1152 			 * "ret" value.
1153 			 */
1154 			list_del(&uwq->wq.entry);
1155 			__add_wait_queue(&ctx->event_wqh, &uwq->wq);
1156 
1157 			/*
1158 			 * Leave the event in the waitqueue and report
1159 			 * error to userland if we failed to resolve
1160 			 * the userfault fork.
1161 			 */
1162 			if (likely(!ret))
1163 				userfaultfd_event_complete(ctx, uwq);
1164 		} else {
1165 			/*
1166 			 * Here the fork thread aborted and the
1167 			 * refcount from the fork thread on fork_nctx
1168 			 * has already been released. We still hold
1169 			 * the reference we took before releasing the
1170 			 * lock above. If resolve_userfault_fork
1171 			 * failed we have to drop it because the
1172 			 * fork_nctx has to be freed in that case. If
1173 			 * it succeeded we'll hold it because the new
1174 			 * uffd references it.
1175 			 */
1176 			if (ret)
1177 				userfaultfd_ctx_put(fork_nctx);
1178 		}
1179 		spin_unlock_irq(&ctx->event_wqh.lock);
1180 	}
1181 
1182 	return ret;
1183 }
1184 
1185 static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
1186 {
1187 	struct file *file = iocb->ki_filp;
1188 	struct userfaultfd_ctx *ctx = file->private_data;
1189 	ssize_t _ret, ret = 0;
1190 	struct uffd_msg msg;
1191 	struct inode *inode = file_inode(file);
1192 	bool no_wait;
1193 
1194 	if (!userfaultfd_is_initialized(ctx))
1195 		return -EINVAL;
1196 
1197 	no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;
1198 	for (;;) {
1199 		if (iov_iter_count(to) < sizeof(msg))
1200 			return ret ? ret : -EINVAL;
1201 		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1202 		if (_ret < 0)
1203 			return ret ? ret : _ret;
1204 		_ret = !copy_to_iter_full(&msg, sizeof(msg), to);
1205 		if (_ret)
1206 			return ret ? ret : -EFAULT;
1207 		ret += sizeof(msg);
1208 		/*
1209 		 * Allow reading more than one fault at a time but only
1210 		 * block if waiting for the very first one.
1211 		 */
1212 		no_wait = true;
1213 	}
1214 }
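
/*
 * Illustrative userspace sketch, not part of the kernel source: since the
 * loop above keeps copying messages until the buffer is exhausted (only
 * the very first message may block), a monitor can drain several faults
 * per read().  "handle_one" is an assumed helper:
 *
 *	struct uffd_msg msgs[16];
 *	ssize_t n = read(uffd, msgs, sizeof(msgs));
 *	for (ssize_t i = 0; n > 0 && i < n / (ssize_t)sizeof(msgs[0]); i++)
 *		handle_one(&msgs[i]);
 */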
1215 
1216 static void __wake_userfault(struct userfaultfd_ctx *ctx,
1217 			     struct userfaultfd_wake_range *range)
1218 {
1219 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
1220 	/* wake all in the range and autoremove */
1221 	if (waitqueue_active(&ctx->fault_pending_wqh))
1222 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1223 				     range);
1224 	if (waitqueue_active(&ctx->fault_wqh))
1225 		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1226 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1227 }
1228 
1229 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1230 					   struct userfaultfd_wake_range *range)
1231 {
1232 	unsigned seq;
1233 	bool need_wakeup;
1234 
1235 	/*
1236 	 * To be sure waitqueue_active() is not reordered by the CPU
1237 	 * before the pagetable update, use an explicit SMP memory
1238 	 * barrier here. PT lock release or mmap_read_unlock(mm) still
1239 	 * have release semantics that can allow the
1240 	 * waitqueue_active() to be reordered before the pte update.
1241 	 */
1242 	smp_mb();
1243 
1244 	/*
1245 	 * Use waitqueue_active because it's very frequent to
1246 	 * change the address space atomically even if there are no
1247 	 * userfaults yet. So we take the spinlock only when we're
1248 	 * sure we have userfaults to wake.
1249 	 */
1250 	do {
1251 		seq = read_seqcount_begin(&ctx->refile_seq);
1252 		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1253 			waitqueue_active(&ctx->fault_wqh);
1254 		cond_resched();
1255 	} while (read_seqcount_retry(&ctx->refile_seq, seq));
1256 	if (need_wakeup)
1257 		__wake_userfault(ctx, range);
1258 }
1259 
1260 static __always_inline int validate_unaligned_range(
1261 	struct mm_struct *mm, __u64 start, __u64 len)
1262 {
1263 	__u64 task_size = mm->task_size;
1264 
1265 	if (len & ~PAGE_MASK)
1266 		return -EINVAL;
1267 	if (!len)
1268 		return -EINVAL;
1269 	if (start < mmap_min_addr)
1270 		return -EINVAL;
1271 	if (start >= task_size)
1272 		return -EINVAL;
1273 	if (len > task_size - start)
1274 		return -EINVAL;
1275 	if (start + len <= start)
1276 		return -EINVAL;
1277 	return 0;
1278 }
1279 
1280 static __always_inline int validate_range(struct mm_struct *mm,
1281 					  __u64 start, __u64 len)
1282 {
1283 	if (start & ~PAGE_MASK)
1284 		return -EINVAL;
1285 
1286 	return validate_unaligned_range(mm, start, len);
1287 }
1288 
1289 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1290 				unsigned long arg)
1291 {
1292 	struct mm_struct *mm = ctx->mm;
1293 	struct vm_area_struct *vma, *prev, *cur;
1294 	int ret;
1295 	struct uffdio_register uffdio_register;
1296 	struct uffdio_register __user *user_uffdio_register;
1297 	unsigned long vm_flags, new_flags;
1298 	bool found;
1299 	bool basic_ioctls;
1300 	unsigned long start, end, vma_end;
1301 	struct vma_iterator vmi;
1302 	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1303 
1304 	user_uffdio_register = (struct uffdio_register __user *) arg;
1305 
1306 	ret = -EFAULT;
1307 	if (copy_from_user(&uffdio_register, user_uffdio_register,
1308 			   sizeof(uffdio_register)-sizeof(__u64)))
1309 		goto out;
1310 
1311 	ret = -EINVAL;
1312 	if (!uffdio_register.mode)
1313 		goto out;
1314 	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1315 		goto out;
1316 	vm_flags = 0;
1317 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1318 		vm_flags |= VM_UFFD_MISSING;
1319 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1320 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1321 		goto out;
1322 #endif
1323 		vm_flags |= VM_UFFD_WP;
1324 	}
1325 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1326 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1327 		goto out;
1328 #endif
1329 		vm_flags |= VM_UFFD_MINOR;
1330 	}
1331 
1332 	ret = validate_range(mm, uffdio_register.range.start,
1333 			     uffdio_register.range.len);
1334 	if (ret)
1335 		goto out;
1336 
1337 	start = uffdio_register.range.start;
1338 	end = start + uffdio_register.range.len;
1339 
1340 	ret = -ENOMEM;
1341 	if (!mmget_not_zero(mm))
1342 		goto out;
1343 
1344 	ret = -EINVAL;
1345 	mmap_write_lock(mm);
1346 	vma_iter_init(&vmi, mm, start);
1347 	vma = vma_find(&vmi, end);
1348 	if (!vma)
1349 		goto out_unlock;
1350 
1351 	/*
1352 	 * If the first vma contains huge pages, make sure start address
1353 	 * is aligned to huge page size.
1354 	 */
1355 	if (is_vm_hugetlb_page(vma)) {
1356 		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1357 
1358 		if (start & (vma_hpagesize - 1))
1359 			goto out_unlock;
1360 	}
1361 
1362 	/*
1363 	 * Search for incompatible vmas.
1364 	 */
1365 	found = false;
1366 	basic_ioctls = false;
1367 	cur = vma;
1368 	do {
1369 		cond_resched();
1370 
1371 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1372 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
1373 
1374 		/* check for incompatible vmas */
1375 		ret = -EINVAL;
1376 		if (!vma_can_userfault(cur, vm_flags, wp_async))
1377 			goto out_unlock;
1378 
1379 		/*
1380 		 * UFFDIO_COPY will fill file holes even without
1381 		 * PROT_WRITE. This check enforces that if this is a
1382 		 * MAP_SHARED, the process has write permission to the backing
1383 		 * file. If VM_MAYWRITE is set it also enforces that on a
1384 		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1385 		 * F_WRITE_SEAL can be taken until the vma is destroyed.
1386 		 */
1387 		ret = -EPERM;
1388 		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1389 			goto out_unlock;
1390 
1391 		/*
1392 		 * If this vma contains the ending address and maps huge
1393 		 * pages, check the end alignment.
1394 		 */
1395 		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1396 		    end > cur->vm_start) {
1397 			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1398 
1399 			ret = -EINVAL;
1400 
1401 			if (end & (vma_hpagesize - 1))
1402 				goto out_unlock;
1403 		}
1404 		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1405 			goto out_unlock;
1406 
1407 		/*
1408 		 * Check that this vma isn't already owned by a
1409 		 * different userfaultfd. We can't allow more than one
1410 		 * userfaultfd to own a single vma simultaneously or we
1411 		 * wouldn't know which one to deliver the userfaults to.
1412 		 */
1413 		ret = -EBUSY;
1414 		if (cur->vm_userfaultfd_ctx.ctx &&
1415 		    cur->vm_userfaultfd_ctx.ctx != ctx)
1416 			goto out_unlock;
1417 
1418 		/*
1419 		 * Note vmas containing huge pages
1420 		 */
1421 		if (is_vm_hugetlb_page(cur))
1422 			basic_ioctls = true;
1423 
1424 		found = true;
1425 	} for_each_vma_range(vmi, cur, end);
1426 	BUG_ON(!found);
1427 
1428 	vma_iter_set(&vmi, start);
1429 	prev = vma_prev(&vmi);
1430 	if (vma->vm_start < start)
1431 		prev = vma;
1432 
1433 	ret = 0;
1434 	for_each_vma_range(vmi, vma, end) {
1435 		cond_resched();
1436 
1437 		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1438 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1439 		       vma->vm_userfaultfd_ctx.ctx != ctx);
1440 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1441 
1442 		/*
1443 		 * Nothing to do: this vma is already registered into this
1444 		 * userfaultfd and with the right tracking mode too.
1445 		 */
1446 		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1447 		    (vma->vm_flags & vm_flags) == vm_flags)
1448 			goto skip;
1449 
1450 		if (vma->vm_start > start)
1451 			start = vma->vm_start;
1452 		vma_end = min(end, vma->vm_end);
1453 
1454 		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1455 		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1456 					    new_flags,
1457 					    (struct vm_userfaultfd_ctx){ctx});
1458 		if (IS_ERR(vma)) {
1459 			ret = PTR_ERR(vma);
1460 			break;
1461 		}
1462 
1463 		/*
1464 		 * In the vma_merge() successful mprotect-like case 8:
1465 		 * the next vma was merged into the current one and
1466 		 * the current one has not been updated yet.
1467 		 */
1468 		vma_start_write(vma);
1469 		userfaultfd_set_vm_flags(vma, new_flags);
1470 		vma->vm_userfaultfd_ctx.ctx = ctx;
1471 
1472 		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1473 			hugetlb_unshare_all_pmds(vma);
1474 
1475 	skip:
1476 		prev = vma;
1477 		start = vma->vm_end;
1478 	}
1479 
1480 out_unlock:
1481 	mmap_write_unlock(mm);
1482 	mmput(mm);
1483 	if (!ret) {
1484 		__u64 ioctls_out;
1485 
1486 		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1487 		    UFFD_API_RANGE_IOCTLS;
1488 
1489 		/*
1490 		 * Declare the WP ioctl only if the WP mode is
1491 		 * specified and all checks passed with the range
1492 		 */
1493 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1494 			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1495 
1496 		/* CONTINUE ioctl is only supported for MINOR ranges. */
1497 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1498 			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1499 
1500 		/*
1501 		 * Now that we scanned all vmas we can already tell
1502 		 * userland which ioctls methods are guaranteed to
1503 		 * succeed on this range.
1504 		 */
1505 		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1506 			ret = -EFAULT;
1507 	}
1508 out:
1509 	return ret;
1510 }
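
/*
 * Illustrative userspace sketch, not part of the kernel source: the
 * matching userspace call, including a check of the ioctls bitmask
 * written back above.  "addr" and "len" are assumed to be suitably
 * aligned values chosen by the caller:
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
 *		perror("UFFDIO_REGISTER");
 *	else if (!(reg.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)))
 *		// e.g. hugetlb ranges only advertise the basic ioctl set,
 *		// so fall back to UFFDIO_COPY of a zeroed buffer
 *		;
 */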
1511 
1512 static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1513 				  unsigned long arg)
1514 {
1515 	struct mm_struct *mm = ctx->mm;
1516 	struct vm_area_struct *vma, *prev, *cur;
1517 	int ret;
1518 	struct uffdio_range uffdio_unregister;
1519 	unsigned long new_flags;
1520 	bool found;
1521 	unsigned long start, end, vma_end;
1522 	const void __user *buf = (void __user *)arg;
1523 	struct vma_iterator vmi;
1524 	bool wp_async = userfaultfd_wp_async_ctx(ctx);
1525 
1526 	ret = -EFAULT;
1527 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1528 		goto out;
1529 
1530 	ret = validate_range(mm, uffdio_unregister.start,
1531 			     uffdio_unregister.len);
1532 	if (ret)
1533 		goto out;
1534 
1535 	start = uffdio_unregister.start;
1536 	end = start + uffdio_unregister.len;
1537 
1538 	ret = -ENOMEM;
1539 	if (!mmget_not_zero(mm))
1540 		goto out;
1541 
1542 	mmap_write_lock(mm);
1543 	ret = -EINVAL;
1544 	vma_iter_init(&vmi, mm, start);
1545 	vma = vma_find(&vmi, end);
1546 	if (!vma)
1547 		goto out_unlock;
1548 
1549 	/*
1550 	 * If the first vma contains huge pages, make sure start address
1551 	 * is aligned to huge page size.
1552 	 */
1553 	if (is_vm_hugetlb_page(vma)) {
1554 		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1555 
1556 		if (start & (vma_hpagesize - 1))
1557 			goto out_unlock;
1558 	}
1559 
1560 	/*
1561 	 * Search for incompatible vmas.
1562 	 */
1563 	found = false;
1564 	cur = vma;
1565 	do {
1566 		cond_resched();
1567 
1568 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1569 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
1570 
1571 		/*
1572 		 * Check for incompatible vmas. Not strictly required
1573 		 * here, as incompatible vmas cannot have a
1574 		 * userfaultfd_ctx registered on them, but this
1575 		 * provides stricter behavior so unregistration
1576 		 * errors are noticed.
1577 		 */
1578 		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
1579 			goto out_unlock;
1580 
1581 		found = true;
1582 	} for_each_vma_range(vmi, cur, end);
1583 	BUG_ON(!found);
1584 
1585 	vma_iter_set(&vmi, start);
1586 	prev = vma_prev(&vmi);
1587 	if (vma->vm_start < start)
1588 		prev = vma;
1589 
1590 	ret = 0;
1591 	for_each_vma_range(vmi, vma, end) {
1592 		cond_resched();
1593 
1594 		BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1595 
1596 		/*
1597 		 * Nothing to do: this vma is not registered with any
1598 		 * userfaultfd, so there is nothing to unregister here.
1599 		 */
1600 		if (!vma->vm_userfaultfd_ctx.ctx)
1601 			goto skip;
1602 
1603 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1604 
1605 		if (vma->vm_start > start)
1606 			start = vma->vm_start;
1607 		vma_end = min(end, vma->vm_end);
1608 
1609 		if (userfaultfd_missing(vma)) {
1610 			/*
1611 			 * Wake any concurrent pending userfault while
1612 			 * we unregister, so they will not hang
1613 			 * permanently; it also avoids userland having to
1614 			 * call UFFDIO_WAKE explicitly.
1615 			 */
1616 			struct userfaultfd_wake_range range;
1617 			range.start = start;
1618 			range.len = vma_end - start;
1619 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1620 		}
1621 
1622 		/* Reset ptes for the whole vma range if wr-protected */
1623 		if (userfaultfd_wp(vma))
1624 			uffd_wp_range(vma, start, vma_end - start, false);
1625 
1626 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1627 		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
1628 					    new_flags, NULL_VM_UFFD_CTX);
1629 		if (IS_ERR(vma)) {
1630 			ret = PTR_ERR(vma);
1631 			break;
1632 		}
1633 
1634 		/*
1635 		 * In the vma_merge() successful mprotect-like case 8:
1636 		 * the next vma was merged into the current one and
1637 		 * the current one has not been updated yet.
1638 		 */
1639 		vma_start_write(vma);
1640 		userfaultfd_set_vm_flags(vma, new_flags);
1641 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1642 
1643 	skip:
1644 		prev = vma;
1645 		start = vma->vm_end;
1646 	}
1647 
1648 out_unlock:
1649 	mmap_write_unlock(mm);
1650 	mmput(mm);
1651 out:
1652 	return ret;
1653 }
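
/*
 * Illustrative userspace sketch, not part of the kernel source: unlike
 * registration, unregistering takes a plain uffdio_range ("addr" and
 * "len" assumed as above):
 *
 *	struct uffdio_range range = {
 *		.start = (unsigned long)addr,
 *		.len   = len,
 *	};
 *	if (ioctl(uffd, UFFDIO_UNREGISTER, &range) == -1)
 *		perror("UFFDIO_UNREGISTER");
 */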
1654 
1655 /*
1656  * userfaultfd_wake may be used in combination with the
1657  * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1658  */
1659 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1660 			    unsigned long arg)
1661 {
1662 	int ret;
1663 	struct uffdio_range uffdio_wake;
1664 	struct userfaultfd_wake_range range;
1665 	const void __user *buf = (void __user *)arg;
1666 
1667 	ret = -EFAULT;
1668 	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1669 		goto out;
1670 
1671 	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1672 	if (ret)
1673 		goto out;
1674 
1675 	range.start = uffdio_wake.start;
1676 	range.len = uffdio_wake.len;
1677 
1678 	/*
1679 	 * len == 0 means wake all and we don't want to wake all here,
1680 	 * so check it again to be sure.
1681 	 */
1682 	VM_BUG_ON(!range.len);
1683 
1684 	wake_userfault(ctx, &range);
1685 	ret = 0;
1686 
1687 out:
1688 	return ret;
1689 }
1690 
1691 static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1692 			    unsigned long arg)
1693 {
1694 	__s64 ret;
1695 	struct uffdio_copy uffdio_copy;
1696 	struct uffdio_copy __user *user_uffdio_copy;
1697 	struct userfaultfd_wake_range range;
1698 	uffd_flags_t flags = 0;
1699 
1700 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
1701 
1702 	ret = -EAGAIN;
1703 	if (atomic_read(&ctx->mmap_changing))
1704 		goto out;
1705 
1706 	ret = -EFAULT;
1707 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1708 			   /* don't copy "copy" last field */
1709 			   sizeof(uffdio_copy)-sizeof(__s64)))
1710 		goto out;
1711 
1712 	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1713 				       uffdio_copy.len);
1714 	if (ret)
1715 		goto out;
1716 	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1717 	if (ret)
1718 		goto out;
1719 
1720 	ret = -EINVAL;
1721 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1722 		goto out;
1723 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1724 		flags |= MFILL_ATOMIC_WP;
1725 	if (mmget_not_zero(ctx->mm)) {
1726 		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
1727 					uffdio_copy.len, flags);
1728 		mmput(ctx->mm);
1729 	} else {
1730 		return -ESRCH;
1731 	}
1732 	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1733 		return -EFAULT;
1734 	if (ret < 0)
1735 		goto out;
1736 	BUG_ON(!ret);
1737 	/* len == 0 would wake all */
1738 	range.len = ret;
1739 	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1740 		range.start = uffdio_copy.dst;
1741 		wake_userfault(ctx, &range);
1742 	}
1743 	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1744 out:
1745 	return ret;
1746 }
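
/*
 * Illustrative userspace sketch, not part of the kernel source: since the
 * handler above reports progress in uffdio_copy.copy and returns EAGAIN on
 * a partial fill, userspace typically retries the remainder ("dst", "src"
 * and "len" assumed page-aligned, includes omitted):
 *
 *	struct uffdio_copy copy = {
 *		.dst = dst, .src = src, .len = len, .mode = 0,
 *	};
 *	while (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
 *		if (errno != EAGAIN || copy.copy <= 0)
 *			break;			// hard error
 *		copy.dst += copy.copy;		// skip what was already filled
 *		copy.src += copy.copy;
 *		copy.len -= copy.copy;
 *	}
 */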
1747 
1748 static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1749 				unsigned long arg)
1750 {
1751 	__s64 ret;
1752 	struct uffdio_zeropage uffdio_zeropage;
1753 	struct uffdio_zeropage __user *user_uffdio_zeropage;
1754 	struct userfaultfd_wake_range range;
1755 
1756 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1757 
1758 	ret = -EAGAIN;
1759 	if (atomic_read(&ctx->mmap_changing))
1760 		goto out;
1761 
1762 	ret = -EFAULT;
1763 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1764 			   /* don't copy "zeropage" last field */
1765 			   sizeof(uffdio_zeropage)-sizeof(__s64)))
1766 		goto out;
1767 
1768 	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1769 			     uffdio_zeropage.range.len);
1770 	if (ret)
1771 		goto out;
1772 	ret = -EINVAL;
1773 	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1774 		goto out;
1775 
1776 	if (mmget_not_zero(ctx->mm)) {
1777 		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
1778 					   uffdio_zeropage.range.len);
1779 		mmput(ctx->mm);
1780 	} else {
1781 		return -ESRCH;
1782 	}
1783 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1784 		return -EFAULT;
1785 	if (ret < 0)
1786 		goto out;
1787 	/* len == 0 would wake all */
1788 	BUG_ON(!ret);
1789 	range.len = ret;
1790 	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1791 		range.start = uffdio_zeropage.range.start;
1792 		wake_userfault(ctx, &range);
1793 	}
1794 	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1795 out:
1796 	return ret;
1797 }
1798 
1799 static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1800 				    unsigned long arg)
1801 {
1802 	int ret;
1803 	struct uffdio_writeprotect uffdio_wp;
1804 	struct uffdio_writeprotect __user *user_uffdio_wp;
1805 	struct userfaultfd_wake_range range;
1806 	bool mode_wp, mode_dontwake;
1807 
1808 	if (atomic_read(&ctx->mmap_changing))
1809 		return -EAGAIN;
1810 
1811 	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1812 
1813 	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1814 			   sizeof(struct uffdio_writeprotect)))
1815 		return -EFAULT;
1816 
1817 	ret = validate_range(ctx->mm, uffdio_wp.range.start,
1818 			     uffdio_wp.range.len);
1819 	if (ret)
1820 		return ret;
1821 
1822 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1823 			       UFFDIO_WRITEPROTECT_MODE_WP))
1824 		return -EINVAL;
1825 
1826 	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1827 	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1828 
1829 	if (mode_wp && mode_dontwake)
1830 		return -EINVAL;
1831 
1832 	if (mmget_not_zero(ctx->mm)) {
1833 		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
1834 					  uffdio_wp.range.len, mode_wp);
1835 		mmput(ctx->mm);
1836 	} else {
1837 		return -ESRCH;
1838 	}
1839 
1840 	if (ret)
1841 		return ret;
1842 
1843 	if (!mode_wp && !mode_dontwake) {
1844 		range.start = uffdio_wp.range.start;
1845 		range.len = uffdio_wp.range.len;
1846 		wake_userfault(ctx, &range);
1847 	}
1848 	return ret;
1849 }
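
/*
 * Illustrative userspace sketch (not part of this file): write-protect
 * a range registered with UFFDIO_REGISTER_MODE_WP, then later drop the
 * protection again.  "uffd", "start" and "len" are hypothetical values.
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = start, .len = len },
 *		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *
 *	wp.mode = 0;
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *
 * The second call removes the protection and, because neither WP nor
 * DONTWAKE is set, also wakes blocked faulters via wake_userfault()
 * above.
 */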
1850 
1851 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1852 {
1853 	__s64 ret;
1854 	struct uffdio_continue uffdio_continue;
1855 	struct uffdio_continue __user *user_uffdio_continue;
1856 	struct userfaultfd_wake_range range;
1857 	uffd_flags_t flags = 0;
1858 
1859 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
1860 
1861 	ret = -EAGAIN;
1862 	if (atomic_read(&ctx->mmap_changing))
1863 		goto out;
1864 
1865 	ret = -EFAULT;
1866 	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1867 			   /* don't copy the output fields */
1868 			   sizeof(uffdio_continue) - (sizeof(__s64))))
1869 		goto out;
1870 
1871 	ret = validate_range(ctx->mm, uffdio_continue.range.start,
1872 			     uffdio_continue.range.len);
1873 	if (ret)
1874 		goto out;
1875 
1876 	ret = -EINVAL;
1877 	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1878 				     UFFDIO_CONTINUE_MODE_WP))
1879 		goto out;
1880 	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1881 		flags |= MFILL_ATOMIC_WP;
1882 
1883 	if (mmget_not_zero(ctx->mm)) {
1884 		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
1885 					    uffdio_continue.range.len, flags);
1886 		mmput(ctx->mm);
1887 	} else {
1888 		return -ESRCH;
1889 	}
1890 
1891 	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1892 		return -EFAULT;
1893 	if (ret < 0)
1894 		goto out;
1895 
1896 	/* len == 0 would wake all */
1897 	BUG_ON(!ret);
1898 	range.len = ret;
1899 	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1900 		range.start = uffdio_continue.range.start;
1901 		wake_userfault(ctx, &range);
1902 	}
1903 	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1904 
1905 out:
1906 	return ret;
1907 }
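
/*
 * Illustrative userspace sketch (not part of this file): resolving a
 * minor fault with UFFDIO_CONTINUE once the page cache already holds
 * the desired contents for the range.  "uffd", "start" and "len" are
 * hypothetical values.
 *
 *	struct uffdio_continue cont = {
 *		.range = { .start = start, .len = len },
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_CONTINUE, &cont);
 *
 * cont.mapped reports the bytes mapped or a negated error, matching the
 * put_user() above; UFFDIO_CONTINUE_MODE_WP additionally maps the range
 * write-protected, as handled via the MFILL_ATOMIC_WP flag above.
 */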
1908 
1909 static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1910 {
1911 	__s64 ret;
1912 	struct uffdio_poison uffdio_poison;
1913 	struct uffdio_poison __user *user_uffdio_poison;
1914 	struct userfaultfd_wake_range range;
1915 
1916 	user_uffdio_poison = (struct uffdio_poison __user *)arg;
1917 
1918 	ret = -EAGAIN;
1919 	if (atomic_read(&ctx->mmap_changing))
1920 		goto out;
1921 
1922 	ret = -EFAULT;
1923 	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1924 			   /* don't copy the output fields */
1925 			   sizeof(uffdio_poison) - (sizeof(__s64))))
1926 		goto out;
1927 
1928 	ret = validate_range(ctx->mm, uffdio_poison.range.start,
1929 			     uffdio_poison.range.len);
1930 	if (ret)
1931 		goto out;
1932 
1933 	ret = -EINVAL;
1934 	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1935 		goto out;
1936 
1937 	if (mmget_not_zero(ctx->mm)) {
1938 		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
1939 					  uffdio_poison.range.len, 0);
1940 		mmput(ctx->mm);
1941 	} else {
1942 		return -ESRCH;
1943 	}
1944 
1945 	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1946 		return -EFAULT;
1947 	if (ret < 0)
1948 		goto out;
1949 
1950 	/* len == 0 would wake all */
1951 	BUG_ON(!ret);
1952 	range.len = ret;
1953 	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
1954 		range.start = uffdio_poison.range.start;
1955 		wake_userfault(ctx, &range);
1956 	}
1957 	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
1958 
1959 out:
1960 	return ret;
1961 }
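
/*
 * Illustrative userspace sketch (not part of this file): marking a
 * range as poisoned so that later accesses deliver SIGBUS instead of
 * raising new userfault events, e.g. for pages lost during a failed
 * post-copy migration.  "uffd", "start" and "len" are hypothetical.
 *
 *	struct uffdio_poison poison = {
 *		.range = { .start = start, .len = len },
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_POISON, &poison);
 *
 * poison.updated mirrors the put_user() above.
 */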
1962 
1963 bool userfaultfd_wp_async(struct vm_area_struct *vma)
1964 {
1965 	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
1966 }
1967 
1968 static inline unsigned int uffd_ctx_features(__u64 user_features)
1969 {
1970 	/*
1971 	 * For the current set of features the bits just coincide. Set
1972 	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
1973 	 */
1974 	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
1975 }
1976 
1977 static int userfaultfd_move(struct userfaultfd_ctx *ctx,
1978 			    unsigned long arg)
1979 {
1980 	__s64 ret;
1981 	struct uffdio_move uffdio_move;
1982 	struct uffdio_move __user *user_uffdio_move;
1983 	struct userfaultfd_wake_range range;
1984 	struct mm_struct *mm = ctx->mm;
1985 
1986 	user_uffdio_move = (struct uffdio_move __user *) arg;
1987 
1988 	if (atomic_read(&ctx->mmap_changing))
1989 		return -EAGAIN;
1990 
1991 	if (copy_from_user(&uffdio_move, user_uffdio_move,
1992 			   /* don't copy "move" last field */
1993 			   /* don't copy the last field, "move" */
1994 		return -EFAULT;
1995 
1996 	/* Do not allow cross-mm moves. */
1997 	if (mm != current->mm)
1998 		return -EINVAL;
1999 
2000 	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
2001 	if (ret)
2002 		return ret;
2003 
2004 	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
2005 	if (ret)
2006 		return ret;
2007 
2008 	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
2009 				  UFFDIO_MOVE_MODE_DONTWAKE))
2010 		return -EINVAL;
2011 
2012 	if (mmget_not_zero(mm)) {
2013 		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
2014 				 uffdio_move.len, uffdio_move.mode);
2015 		mmput(mm);
2016 	} else {
2017 		return -ESRCH;
2018 	}
2019 
2020 	if (unlikely(put_user(ret, &user_uffdio_move->move)))
2021 		return -EFAULT;
2022 	if (ret < 0)
2023 		goto out;
2024 
2025 	/* len == 0 would wake all */
2026 	VM_WARN_ON(!ret);
2027 	range.len = ret;
2028 	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
2029 		range.start = uffdio_move.dst;
2030 		wake_userfault(ctx, &range);
2031 	}
2032 	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
2033 
2034 out:
2035 	return ret;
2036 }
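
/*
 * Illustrative userspace sketch (not part of this file): moving
 * already-populated pages within the current mm instead of copying
 * them.  "uffd", "dst", "src" and "len" are hypothetical values that
 * satisfy the validation done above.
 *
 *	struct uffdio_move mv = {
 *		.dst = dst,
 *		.src = src,
 *		.len = len,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_MOVE, &mv);
 *
 * mv.move reports the bytes moved or a negated error;
 * UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES and UFFDIO_MOVE_MODE_DONTWAKE may be
 * ORed into .mode, as accepted by the mode check above.
 */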
2037 
2038 /*
2039  * Userland asks for a certain API version, and we return which feature
2040  * bits and ioctl commands this kernel implements for that API version,
2041  * or -EINVAL if the version is unknown.
2042  */
2043 static int userfaultfd_api(struct userfaultfd_ctx *ctx,
2044 			   unsigned long arg)
2045 {
2046 	struct uffdio_api uffdio_api;
2047 	void __user *buf = (void __user *)arg;
2048 	unsigned int ctx_features;
2049 	int ret;
2050 	__u64 features;
2051 
2052 	ret = -EFAULT;
2053 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
2054 		goto out;
2055 	features = uffdio_api.features;
2056 	ret = -EINVAL;
2057 	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
2058 		goto err_out;
2059 	ret = -EPERM;
2060 	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
2061 		goto err_out;
2062 
2063 	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
2064 	if (features & UFFD_FEATURE_WP_ASYNC)
2065 		features |= UFFD_FEATURE_WP_UNPOPULATED;
2066 
2067 	/* report all available features and ioctls to userland */
2068 	uffdio_api.features = UFFD_API_FEATURES;
2069 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
2070 	uffdio_api.features &=
2071 		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
2072 #endif
2073 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
2074 	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2075 #endif
2076 #ifndef CONFIG_PTE_MARKER_UFFD_WP
2077 	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2078 	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
2079 	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
2080 #endif
2081 	uffdio_api.ioctls = UFFD_API_IOCTLS;
2082 	ret = -EFAULT;
2083 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2084 		goto out;
2085 
2086 	/* only enable the requested features for this uffd context */
2087 	ctx_features = uffd_ctx_features(features);
2088 	ret = -EINVAL;
2089 	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2090 		goto err_out;
2091 
2092 	ret = 0;
2093 out:
2094 	return ret;
2095 err_out:
2096 	memset(&uffdio_api, 0, sizeof(uffdio_api));
2097 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2098 		ret = -EFAULT;
2099 	goto out;
2100 }
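
/*
 * Illustrative userspace sketch (not part of this file): the UFFDIO_API
 * handshake that must precede every other ioctl on the fd (enforced via
 * userfaultfd_is_initialized() in userfaultfd_ioctl() below).  The
 * requested feature is an example only.
 *
 *	struct uffdio_api api = {
 *		.api = UFFD_API,
 *		.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
 *	};
 *	ioctl(uffd, UFFDIO_API, &api);
 *
 * On return api.features holds every feature this kernel supports and
 * api.ioctls the UFFD_API_IOCTLS mask, while only the requested
 * features are enabled on the context via the cmpxchg() above.
 */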
2101 
2102 static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2103 			      unsigned long arg)
2104 {
2105 	int ret = -EINVAL;
2106 	struct userfaultfd_ctx *ctx = file->private_data;
2107 
2108 	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2109 		return -EINVAL;
2110 
2111 	switch(cmd) {
2112 	case UFFDIO_API:
2113 		ret = userfaultfd_api(ctx, arg);
2114 		break;
2115 	case UFFDIO_REGISTER:
2116 		ret = userfaultfd_register(ctx, arg);
2117 		break;
2118 	case UFFDIO_UNREGISTER:
2119 		ret = userfaultfd_unregister(ctx, arg);
2120 		break;
2121 	case UFFDIO_WAKE:
2122 		ret = userfaultfd_wake(ctx, arg);
2123 		break;
2124 	case UFFDIO_COPY:
2125 		ret = userfaultfd_copy(ctx, arg);
2126 		break;
2127 	case UFFDIO_ZEROPAGE:
2128 		ret = userfaultfd_zeropage(ctx, arg);
2129 		break;
2130 	case UFFDIO_MOVE:
2131 		ret = userfaultfd_move(ctx, arg);
2132 		break;
2133 	case UFFDIO_WRITEPROTECT:
2134 		ret = userfaultfd_writeprotect(ctx, arg);
2135 		break;
2136 	case UFFDIO_CONTINUE:
2137 		ret = userfaultfd_continue(ctx, arg);
2138 		break;
2139 	case UFFDIO_POISON:
2140 		ret = userfaultfd_poison(ctx, arg);
2141 		break;
2142 	}
2143 	return ret;
2144 }
2145 
2146 #ifdef CONFIG_PROC_FS
2147 static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
2148 {
2149 	struct userfaultfd_ctx *ctx = f->private_data;
2150 	wait_queue_entry_t *wq;
2151 	unsigned long pending = 0, total = 0;
2152 
2153 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
2154 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
2155 		pending++;
2156 		total++;
2157 	}
2158 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
2159 		total++;
2160 	}
2161 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
2162 
2163 	/*
2164 	 * If more protocols are added in the future, they will all be
2165 	 * shown here separated by a space, like this:
2166 	 *	protocols: aa:... bb:...
2167 	 */
2168 	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
2169 		   pending, total, UFFD_API, ctx->features,
2170 		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
2171 }
2172 #endif
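
/*
 * With the handler above, /proc/<pid>/fdinfo/<uffd fd> shows (the
 * values below are placeholders):
 *
 *	pending:	<faults not yet read>
 *	total:	<pending plus faults already read>
 *	API:	<UFFD_API>:<enabled context features>:<supported ioctls>
 *
 * using the decimal and hexadecimal formats of the seq_printf() above.
 */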
2173 
2174 static const struct file_operations userfaultfd_fops = {
2175 #ifdef CONFIG_PROC_FS
2176 	.show_fdinfo	= userfaultfd_show_fdinfo,
2177 #endif
2178 	.release	= userfaultfd_release,
2179 	.poll		= userfaultfd_poll,
2180 	.read_iter	= userfaultfd_read_iter,
2181 	.unlocked_ioctl = userfaultfd_ioctl,
2182 	.compat_ioctl	= compat_ptr_ioctl,
2183 	.llseek		= noop_llseek,
2184 };
2185 
2186 static void init_once_userfaultfd_ctx(void *mem)
2187 {
2188 	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2189 
2190 	init_waitqueue_head(&ctx->fault_pending_wqh);
2191 	init_waitqueue_head(&ctx->fault_wqh);
2192 	init_waitqueue_head(&ctx->event_wqh);
2193 	init_waitqueue_head(&ctx->fd_wqh);
2194 	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2195 }
2196 
2197 static int new_userfaultfd(int flags)
2198 {
2199 	struct userfaultfd_ctx *ctx;
2200 	struct file *file;
2201 	int fd;
2202 
2203 	BUG_ON(!current->mm);
2204 
2205 	/* Check the UFFD_* constants for consistency.  */
2206 	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2207 	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
2208 	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
2209 
2210 	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
2211 		return -EINVAL;
2212 
2213 	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
2214 	if (!ctx)
2215 		return -ENOMEM;
2216 
2217 	refcount_set(&ctx->refcount, 1);
2218 	ctx->flags = flags;
2219 	ctx->features = 0;
2220 	ctx->released = false;
2221 	init_rwsem(&ctx->map_changing_lock);
2222 	atomic_set(&ctx->mmap_changing, 0);
2223 	ctx->mm = current->mm;
2224 
2225 	fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
2226 	if (fd < 0)
2227 		goto err_out;
2228 
2229 	/* Create a new inode so that the LSM can block the creation.  */
2230 	file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
2231 			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
2232 	if (IS_ERR(file)) {
2233 		put_unused_fd(fd);
2234 		fd = PTR_ERR(file);
2235 		goto err_out;
2236 	}
2237 	/* prevent the mm struct to be freed */
2238 	mmgrab(ctx->mm);
2239 	file->f_mode |= FMODE_NOWAIT;
2240 	fd_install(fd, file);
2241 	return fd;
2242 err_out:
2243 	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
2244 	return fd;
2245 }
2246 
2247 static inline bool userfaultfd_syscall_allowed(int flags)
2248 {
2249 	/* Userspace-only page faults are always allowed */
2250 	if (flags & UFFD_USER_MODE_ONLY)
2251 		return true;
2252 
2253 	/*
2254 	 * The user is requesting a userfaultfd which can handle kernel faults.
2255 	 * Privileged users are always allowed to do this.
2256 	 */
2257 	if (capable(CAP_SYS_PTRACE))
2258 		return true;
2259 
2260 	/* Otherwise, access to kernel fault handling is sysctl controlled. */
2261 	return sysctl_unprivileged_userfaultfd;
2262 }
2263 
2264 SYSCALL_DEFINE1(userfaultfd, int, flags)
2265 {
2266 	if (!userfaultfd_syscall_allowed(flags))
2267 		return -EPERM;
2268 
2269 	return new_userfaultfd(flags);
2270 }
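
/*
 * Illustrative userspace sketch (not part of this file): creating a
 * userfaultfd via the syscall.  Without CAP_SYS_PTRACE this needs
 * either UFFD_USER_MODE_ONLY or vm.unprivileged_userfaultfd=1, per
 * userfaultfd_syscall_allowed() above.
 *
 *	int uffd = syscall(__NR_userfaultfd,
 *			   O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 */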
2271 
2272 static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2273 {
2274 	if (cmd != USERFAULTFD_IOC_NEW)
2275 		return -EINVAL;
2276 
2277 	return new_userfaultfd(flags);
2278 }
2279 
2280 static const struct file_operations userfaultfd_dev_fops = {
2281 	.unlocked_ioctl = userfaultfd_dev_ioctl,
2282 	.compat_ioctl = userfaultfd_dev_ioctl,
2283 	.owner = THIS_MODULE,
2284 	.llseek = noop_llseek,
2285 };
2286 
2287 static struct miscdevice userfaultfd_misc = {
2288 	.minor = MISC_DYNAMIC_MINOR,
2289 	.name = "userfaultfd",
2290 	.fops = &userfaultfd_dev_fops
2291 };
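
/*
 * Illustrative userspace sketch (not part of this file): the
 * /dev/userfaultfd route, which is gated by permissions on the device
 * node rather than by the syscall's sysctl/capability checks.
 *
 *	int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *	int uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
 *	close(dev);
 */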
2292 
2293 static int __init userfaultfd_init(void)
2294 {
2295 	int ret;
2296 
2297 	ret = misc_register(&userfaultfd_misc);
2298 	if (ret)
2299 		return ret;
2300 
2301 	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2302 						sizeof(struct userfaultfd_ctx),
2303 						0,
2304 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2305 						init_once_userfaultfd_ctx);
2306 #ifdef CONFIG_SYSCTL
2307 	register_sysctl_init("vm", vm_userfaultfd_table);
2308 #endif
2309 	return 0;
2310 }
2311 __initcall(userfaultfd_init);
2312