eventpoll.c - OpenGrok cross reference for /linux/fs/eventpoll.c

Lines Matching +full:closed +full:- +full:loop
1 // SPDX-License-Identifier: GPL-2.0-or-later
45  * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation.
49  * --------
52  * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs
54  * eventpoll via the watched file's f_op->poll() wait queue(s). When
57  * ep->mtx, re-queueing items in level-triggered mode.
64  * -------
69  *                     loop / path checks)
70  *     > ep->mtx     (per-eventpoll; sleepable; serializes most ops)
71  *       > ep->lock  (per-eventpoll; IRQ-safe spinlock)
73  *   file->f_lock    (per-file; NOT IRQ-safe; guards f_ep hlist ops;
74  *                    nested inside ep->mtx, outside ep->lock)
77  *   - ep->lock is a spinlock because ep_poll_callback() is called from
78  *     wake_up() which may run in hard-IRQ context. All ep->lock
80  *   - ep->mtx is a sleepable mutex because the event delivery loop
82  *     kmem_cache_alloc() and f_op->poll().
83  *   - epnested_mutex is global because cycle detection needs a global
84  *     view of the epoll topology; a per-object scheme would let two
87  *   - Per-ep ep->mtx is preferred for scalability elsewhere. Events
91  * ep->mtx on both: outer first, target second. Since cycles are
92  * forbidden the set of live ep->mtx holds is always a strict chain,
98  * ----------------
101  *   mtx              - self
102  *   rbr              - ep->mtx
103  *   ovflist, rdllist - ep->lock (IRQ-safe)
104  *   wq               - ep->lock for queue mutation
105  *   poll_wait        - internal waitqueue spinlock
106  *   refs             - file->f_lock for adds; ep->mtx for removes;
108  *   ws               - ep->mtx
109  *   gen, loop_check_depth - epnested_mutex
110  *   file, user       - immutable after setup
111  *   refcount         - atomic (refcount_t)
112  *   napi_*           - READ_ONCE / WRITE_ONCE
115  *   rbn / rcu union  - rbn: ep->mtx (while epi is linked in ep->rbr).
118  *   rdllink, next    - ep->lock
119  *   ffd, ep          - immutable after ep_insert()
120  *   pwqlist          - ep->mtx for writes; POLLFREE clears pwq->whead
122  *   fllink           - file->f_lock for mutation; hlist_del_rcu +
124  *   ws               - RCU (rcu_assign_pointer /
126  *   event            - ep->mtx for writes; lockless read in
131  * Ready-list state machine
132  * ------------------------
134  * Readiness is tracked in two lists under ep->lock:
136  *   rdllist   - doubly-linked FIFO; the "current" ready list.
137  *   ovflist   - singly-linked LIFO; used during a scan to catch
139  *               without ep->lock.
141  * Encoded in ep->ovflist:
142  *   EP_UNACTIVE_PTR - no scan active; callback appends to rdllist.
143  *   NULL            - scan active, no spill yet.
144  *   pointer to epi  - scan active with spilled items (LIFO).
146  * Encoded in epi->ovflist_next:
147  *   EP_UNACTIVE_PTR - epi is not on ovflist.
148  *   otherwise       - next epi on ovflist (NULL at tail).
151  * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist
152  * back to rdllist (list_add head-insert reverses LIFO to FIFO),
153  * flips back to "not scanning", and re-splices any items the caller
154  * left in scan_batch (e.g., level-triggered re-queues).
158  * -------------
162  *   A. ep_remove()              - EPOLL_CTL_DEL and ep_insert()
163  *                                 rollback. Caller holds ep->mtx.
164  *   B. ep_clear_and_put()       - close of the epoll fd itself
166  *   C. eventpoll_release_file() - close of a watched file, invoked
171  *   A pins the file with epi_fget() before touching file->f_ep or
172  *   file->f_lock; if the pin fails, __fput() is in flight and C
174  *   A and B both hold ep->mtx serially. B walks the rbtree with
176  *   B and C both take ep->mtx; the loser sees fewer entries or an
177  *   empty file->f_ep.
180  *   ep_unregister_pollwait()  - drain pwqlist; synchronizes with any
181  *                                in-flight ep_poll_callback via the
182  *                                watched wait-queue head's lock.
183  *   ep_remove_file()          - hlist_del_rcu of epi->fllink and,
184  *                                if last watcher, clear file->f_ep,
185  *                                under file->f_lock.
186  *   ep_remove_epi()           - rb_erase, rdllist unlink (ep->lock),
196  * ------------------
198  * When a subsystem tears down a wait-queue head that an epitem is
200  * POLLFREE and must RCU-defer the head's free. The store/load pair:
203  *     smp_store_release(&pwq->whead, NULL)
206  *     smp_load_acquire(&pwq->whead)
224 #define EP_UNACTIVE_PTR ((void *) -1L)
260 	/* Link on the owning eventpoll's ready list (ep->rdllist). */
264 	 * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
266 	 * epi_clear_ovflist() and the "Ready-list state machine" section
267 	 * in the top-of-file banner.
299 	 * collection loop, the file cleanup path, the epoll file exit
307 	/* Wait queue used by file->poll() */
325 	 * holding ->lock.
337 	/* used to optimize loop detection check */
377  * Cycle and path-length checks at EPOLL_CTL_ADD
378  * ---------------------------------------------
383  *   1. no cycle is being formed -- ep_loop_check() walks downward
387  *      of a given length -- reverse_path_check().
391  * and carries its scratch state on a stack-allocated struct
392  * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts
393  * skip this machinery entirely and take only ep->mtx.
397  *                      a check and again at the end. ep->gen caches
400  *                      ep_get_upwards_depth_proc(); the post-check
403  *                      "ep->gen == loop_check_gen" trigger in
412  * semi-unbounded recursion") hardened the walk; any refactor must
413  * preserve both bail-outs.
421  * Per-do_epoll_ctl() scratch for the loop / path checks. Allocated on
434 	 * Singly-linked list of epitems_head objects collected during
436 	 * Terminated by EP_UNACTIVE_PTR, not NULL: epitems_head->next
437 	 * doubles as a membership flag (a NULL ->next means "not on this
438 	 * list", see ep_remove_file()), so the list uses a non-NULL
445 	 * Per-depth wakeup-path tally used by reverse_path_check_proc();
459  * Wrapper anchor for file->f_ep when the watched file is not itself an
460  * eventpoll; for the epoll-watches-epoll case, file->f_ep points at
461  * &watched_ep->refs directly. The ->next field threads
462  * ctx->tfile_check_list during one EPOLL_CTL_ADD path check.
481 	head = container_of(file->f_ep, struct epitems_head, epitems);
482 	if (!head->next) {
483 		head->next = ctx->tfile_check_list;
484 		ctx->tfile_check_list = head;
491 	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
494 		spin_lock(&epi->ffd.file->f_lock);
495 		if (!hlist_empty(&head->epitems))
497 		head->next = NULL;
498 		spin_unlock(&epi->ffd.file->f_lock);
534 	return f->f_op == &eventpoll_fops;
540 	return (p1->file > p2->file ? +1:
541 	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
547 	return !list_empty(&epi->rdllink);
558 	return container_of(p, struct eppoll_entry, wait)->base;
562  * Ready-list / ovflist state (see "Ready-list state machine" in the
563  * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is
571 	return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
577 	WRITE_ONCE(ep->ovflist, NULL);
583 	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
589 	return epi->ovflist_next != EP_UNACTIVE_PTR;
592 /* Mark @epi as not on any ovflist (init and post-drain). */
595 	epi->ovflist_next = EP_UNACTIVE_PTR;
601 	unsigned int seq = read_seqcount_begin(&ep->seq);
603 	return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) ||
604 		read_seqcount_retry(&ep->seq, seq);
609  * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
611  * the system-wide global via busy_loop_timeout.
621 	unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
635 	return !!READ_ONCE(ep->busy_poll_usecs) ||
636 	       READ_ONCE(ep->prefer_busy_poll) ||
649  * busy loop will return if need_resched or ep_events_available.
655 	unsigned int napi_id = READ_ONCE(ep->napi_id);
656 	u16 budget = READ_ONCE(ep->busy_poll_budget);
657 	bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
674 		ep->napi_id = 0;
685 	struct eventpoll *ep = epi->ep;
693 	sock = sock_from_file(epi->ffd.file);
697 	sk = sock->sk;
701 	napi_id = READ_ONCE(sk->sk_napi_id);
703 	/* Non-NAPI IDs can be rejected
707 	if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
711 	ep->napi_id = napi_id;
717 	struct eventpoll *ep = file->private_data;
724 			return -EFAULT;
728 			return -EINVAL;
731 			return -EINVAL;
734 			return -EINVAL;
738 			return -EPERM;
740 		WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
741 		WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
742 		WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
746 		epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
747 		epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
748 		epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
750 			return -EFAULT;
753 		return -ENOIOCTLCMD;
759 	unsigned int napi_id = READ_ONCE(ep->napi_id);
761 	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
767 	unsigned int napi_id = READ_ONCE(ep->napi_id);
769 	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
787 	return -EOPNOTSUPP;
836 	 * it might be natural to create a per-cpu nest count. However, since
837 	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
838 	 * schedule() in the -rt kernel, the per-cpu variables are no longer
850 		if ((is_file_epoll(epi->ffd.file))) {
851 			ep_src = epi->ffd.file->private_data;
852 			nests = ep_src->nests;
857 	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
858 	ep->nests = nests + 1;
859 	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
860 	ep->nests = 0;
861 	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
869 	wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
885 	 * complete and we must not touch whead again. On a non-NULL load
887 	 * firers RCU-defer the free) and whead->lock inside
890 	whead = smp_load_acquire(&pwq->whead);
892 		remove_wait_queue(whead, &pwq->wait);
902 	struct eppoll_entry **p = &epi->pwqlist;
906 		*p = pwq->next;
912 /* call only when ep->mtx is held */
915 	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
918 /* call only when ep->mtx is held */
929 	return rcu_access_pointer(epi->ws) ? true : false;
932 /* call when ep->mtx cannot be held (ep_poll_callback) */
938 	ws = rcu_dereference(epi->ws);
946  * ep->mtx needs to be held because we could be hit by
952 	 * Steal the ready list, and re-init the original one to the
953 	 * empty list. Also, set ep->ovflist to NULL so that events
955 	 * have the poll callback to queue directly on ep->rdllist,
960 	spin_lock_irq(&ep->lock);
961 	write_seqcount_begin(&ep->seq);
963 	list_splice_init(&ep->rdllist, scan_batch);
966 	write_seqcount_end(&ep->seq);
967 	spin_unlock_irq(&ep->lock);
975 	spin_lock_irq(&ep->lock);
979 	 * We re-insert them inside the main ready-list here.
981 	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) {
982 		nepi = epi->ovflist_next;
986 		 * -- the list_splice() below takes care of those.
990 			 * ovflist is LIFO; list_add() head-insert here
993 			list_add(&epi->rdllink, &ep->rdllist);
998 	write_seqcount_begin(&ep->seq);
1000 	/* Back out of scan mode; callbacks target ep->rdllist again. */
1004 	 * Quickly re-inject items left on "scan_batch".
1006 	list_splice(scan_batch, &ep->rdllist);
1008 	write_seqcount_end(&ep->seq);
1010 	__pm_relax(ep->ws);
1012 	if (!list_empty(&ep->rdllist)) {
1013 		if (waitqueue_active(&ep->wq))
1014 			wake_up(&ep->wq);
1017 	spin_unlock_irq(&ep->lock);
1022 	refcount_inc(&ep->refcount);
1031 	if (!refcount_dec_and_test(&ep->refcount))
1034 	WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
1041 	mutex_destroy(&ep->mtx);
1042 	free_uid(ep->user);
1043 	wakeup_source_unregister(ep->ws);
1044 	/* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */
1049  * Pin @epi->ffd.file for operations that require both safe dereference
1053  * reassigned at any time. The bare load of epi->ffd.file is safe here
1054  * because the caller holds ep->mtx and eventpoll_release_file() blocks
1061  * come back. ep_remove() relies on that to touch file->f_lock and
1062  * file->f_ep without racing eventpoll_release_file() (see commit
1071 	file = epi->ffd.file;
1072 	if (!file_ref_get(&file->f_ref))
1078  * Takes &file->f_lock; returns with it released.
1086 	lockdep_assert_held(&ep->mtx);
1088 	spin_lock(&file->f_lock);
1089 	head = file->f_ep;
1090 	if (hlist_is_singular_node(&epi->fllink, head)) {
1096 		 * itself -- see the comment in eventpoll_release().
1098 		WRITE_ONCE(file->f_ep, NULL);
1102 			if (!smp_load_acquire(&v->next))
1106 	hlist_del_rcu(&epi->fllink);
1107 	spin_unlock(&file->f_lock);
1113 	lockdep_assert_held(&ep->mtx);
1115 	rb_erase_cached(&epi->rbn, &ep->rbr);
1117 	spin_lock_irq(&ep->lock);
1119 		list_del_init(&epi->rdllink);
1120 	spin_unlock_irq(&ep->lock);
1125 	 * field epi->rcu, since we are trying to minimize the size of
1127 	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
1132 	percpu_counter_dec(&ep->user->epoll_watches);
1143 	lockdep_assert_held(&ep->mtx);
1163  * ep_unregister_pollwait() takes each watched wait-queue head's lock,
1164  * which synchronizes with any in-flight ep_poll_callback(); after
1166  * on this ep. Must strictly precede ep_drain_tree() -- fusing the
1175 	lockdep_assert_held(&ep->mtx);
1177 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1186  * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi
1193  * file serializes with us via ep->mtx; ep_remove() transparently
1202 	lockdep_assert_held(&ep->mtx);
1204 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
1213  * Removal path B (see "Removal paths" in the top-of-file banner):
1216  * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces
1217  * in-flight callbacks, then ep_drain_tree() frees the epis. The
1218  * ep->refcount is kept > 0 across the walk by the ep file's own
1224 	/* Release any threads blocked in poll-on-ep. */
1225 	if (waitqueue_active(&ep->poll_wait))
1228 	mutex_lock(&ep->mtx);
1231 	mutex_unlock(&ep->mtx);
1243 		return -EINVAL;
1251 		ret = -EINVAL;
1260 	struct eventpoll *ep = file->private_data;
1272 	struct eventpoll *ep = file->private_data;
1281 	poll_wait(file, &ep->poll_wait, wait);
1287 	mutex_lock_nested(&ep->mtx, depth);
1300 			list_del_init(&epi->rdllink);
1304 	mutex_unlock(&ep->mtx);
1310  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
1326 	pt->_key = epi->event.events;
1332 	return res & epi->event.events;
1343 	struct eventpoll *ep = f->private_data;
1346 	mutex_lock(&ep->mtx);
1347 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1349 		struct inode *inode = file_inode(epi->ffd.file);
1353 			   epi->ffd.fd, epi->event.events,
1354 			   (long long)epi->event.data,
1355 			   (long long)epi->ffd.file->f_pos,
1356 			   inode->i_ino, inode->i_sb->s_dev);
1360 	mutex_unlock(&ep->mtx);
1379  * closed without being removed from the eventpoll interface.
1389 	 * to zero -- the path we're on. So any racing ep_remove() bails
1393 	spin_lock(&file->f_lock);
1394 	if (file->f_ep && file->f_ep->first) {
1395 		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
1396 		spin_unlock(&file->f_lock);
1402 		ep = epi->ep;
1403 		mutex_lock(&ep->mtx);
1410 		mutex_unlock(&ep->mtx);
1416 	spin_unlock(&file->f_lock);
1425 		return -ENOMEM;
1427 	mutex_init(&ep->mtx);
1428 	spin_lock_init(&ep->lock);
1429 	seqcount_spinlock_init(&ep->seq, &ep->lock);
1430 	init_waitqueue_head(&ep->wq);
1431 	init_waitqueue_head(&ep->poll_wait);
1432 	INIT_LIST_HEAD(&ep->rdllist);
1433 	ep->rbr = RB_ROOT_CACHED;
1434 	ep->ovflist = EP_UNACTIVE_PTR;	/* not scanning */
1435 	ep->user = get_current_user();
1436 	refcount_set(&ep->refcount, 1);
1454 	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1456 		kcmp = ep_cmp_ffd(tf, &epi->ffd);
1458 			rbp = rbp->rb_right;
1460 			rbp = rbp->rb_left;
1479 	struct eventpoll *ep = epi->ep;
1484 	spin_lock_irqsave(&ep->lock, flags);
1494 	if (!(epi->event.events & ~EP_PRIVATE_BITS))
1503 	if (pollflags && !(pollflags & epi->event.events))
1508 	 * (because we're accessing user memory, and because of Linux f_op->poll()
1510 	 * chained in ep->ovflist and requeued later on.
1514 			epi->ovflist_next = READ_ONCE(ep->ovflist);
1515 			WRITE_ONCE(ep->ovflist, epi);
1520 		list_add_tail(&epi->rdllink, &ep->rdllist);
1525 	 * Wake up (if active) both the eventpoll wait list and the ->poll()
1528 	if (waitqueue_active(&ep->wq)) {
1529 		if ((epi->event.events & EPOLLEXCLUSIVE) &&
1533 				if (epi->event.events & EPOLLIN)
1537 				if (epi->event.events & EPOLLOUT)
1546 			wake_up_sync(&ep->wq);
1548 			wake_up(&ep->wq);
1550 	if (waitqueue_active(&ep->poll_wait))
1554 	spin_unlock_irqrestore(&ep->lock, flags);
1560 	if (!(epi->event.events & EPOLLEXCLUSIVE))
1570 		 * that already loaded a non-NULL whead may still call
1578 		 * whead->lock -- the same lock held by our caller, so it
1582 		list_del_init(&wait->entry);
1583 		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1597 	struct epitem *epi = epq->epi;
1605 		epq->epi = NULL;
1609 	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1610 	pwq->whead = whead;
1611 	pwq->base = epi;
1612 	if (epi->event.events & EPOLLEXCLUSIVE)
1613 		add_wait_queue_exclusive(whead, &pwq->wait);
1615 		add_wait_queue(whead, &pwq->wait);
1616 	pwq->next = epi->pwqlist;
1617 	epi->pwqlist = pwq;
1623 	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1630 		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1632 			p = &parent->rb_right;
1635 			p = &parent->rb_left;
1637 	rb_link_node(&epi->rbn, parent, p);
1638 	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1647  * the wakeup amplification that can be built from epoll-watches-
1651  * The running tallies live in ctx->path_count[] and are protected by
1662 	if (++ctx->path_count[nests] > path_limits[nests])
1663 		return -1;
1672 		ctx->path_count[i] = 0;
1682 		return -1;
1686 		struct hlist_head *refs = &epi->ep->refs;
1698  * reverse_path_check - ctx->tfile_check_list is a list of epitems_head
1700  *                      sure those links don't push any path-length bucket
1702  * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks.
1705  *	    %-1 otherwise.
1711 	for (p = ctx->tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
1715 		error = reverse_path_check_proc(ctx, &p->epitems, 0);
1728 	if (!epi->ep->ws) {
1729 		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
1730 		if (!epi->ep->ws)
1731 			return -ENOMEM;
1734 	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
1739 		return -ENOMEM;
1740 	rcu_assign_pointer(epi->ws, ws);
1750 	RCU_INIT_POINTER(epi->ws, NULL);
1768 		ep = file->private_data;
1771 		head = &ep->refs;
1772 	} else if (!READ_ONCE(file->f_ep)) {
1776 			return -ENOMEM;
1777 		head = &to_free->epitems;
1779 	spin_lock(&file->f_lock);
1780 	if (!file->f_ep) {
1782 			spin_unlock(&file->f_lock);
1786 		WRITE_ONCE(file->f_ep, head);
1789 	hlist_add_head_rcu(&epi->fllink, file->f_ep);
1790 	spin_unlock(&file->f_lock);
1802  * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
1811 	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
1813 		return ERR_PTR(-ENOSPC);
1814 	percpu_counter_inc(&ep->user->epoll_watches);
1818 		percpu_counter_dec(&ep->user->epoll_watches);
1819 		return ERR_PTR(-ENOMEM);
1822 	INIT_LIST_HEAD(&epi->rdllink);
1823 	epi->ep = ep;
1824 	epi->ffd = *tf;
1825 	epi->event = *event;
1835  * If @tep is non-NULL, the target file is itself an eventpoll; we hold
1836  * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
1837  * with the target side. RB tree ops are protected by @ep->mtx, which
1850 	struct file *tfile = epi->ffd.file;
1854 		mutex_lock_nested(&tep->mtx, 1);
1859 			mutex_unlock(&tep->mtx);
1861 		percpu_counter_dec(&ep->user->epoll_watches);
1871 		mutex_unlock(&tep->mtx);
1890 	if (is_file_epoll(tf->file))
1891 		tep = tf->file->private_data;
1903 	/* Reject the insert if the new link would create too many back-paths. */
1906 		return -EINVAL;
1909 	if (epi->event.events & EPOLLWAKEUP) {
1933 		return -ENOMEM;
1937 	spin_lock_irq(&ep->lock);
1942 		list_add_tail(&epi->rdllink, &ep->rdllist);
1945 		if (waitqueue_active(&ep->wq))
1946 			wake_up(&ep->wq);
1947 		if (waitqueue_active(&ep->poll_wait))
1951 	spin_unlock_irq(&ep->lock);
1975 	 * Set the new event interest mask before calling f_op->poll();
1977 	 * f_op->poll() call and the new event set registering.
1979 	epi->event.events = event->events; /* need barrier below */
1980 	epi->event.data = event->data; /* protected by mtx */
1981 	if (epi->event.events & EPOLLWAKEUP) {
1993 	 *    event occurs immediately after we call f_op->poll().
1994 	 *    We need this because we did not take ep->lock while
1996 	 *    ep->lock).
1999 	 *    when calling f_op->poll().  This barrier also
2003 	 * This barrier will now guarantee ep_poll_callback or f_op->poll
2015 		spin_lock_irq(&ep->lock);
2017 			list_add_tail(&epi->rdllink, &ep->rdllist);
2021 			if (waitqueue_active(&ep->wq))
2022 				wake_up(&ep->wq);
2023 			if (waitqueue_active(&ep->poll_wait))
2026 		spin_unlock_irq(&ep->lock);
2040  * next slot), 0 if the re-poll reported no caller-requested events
2041  * (@epi drops out of the ready list; a future callback will re-add
2042  * it), or -EFAULT if copy_to_user() faulted (in which case @epi is
2043  * re-inserted at the head of @scan_batch so ep_done_scan() merges it
2046  * PM bookkeeping and level-triggered re-queue are handled here.
2047  * Caller holds ep->mtx and the scan is active.
2059 	 * Activate ep->ws before deactivating epi->ws to prevent
2060 	 * triggering auto-suspend here (in case we reactivate epi->ws
2062 	 * epi->ws drift out of sync with ep_is_linked().
2066 		if (ws->active)
2067 			__pm_stay_awake(ep->ws);
2071 	list_del_init(&epi->rdllink);
2074 	 * Re-poll under ep->mtx so userspace cannot change the item
2075 	 * out from under us. If no caller-requested events remain,
2077 	 * re-queue it when events next appear.
2083 	next = epoll_put_uevent(revents, epi->event.data, *uevents);
2090 		list_add(&epi->rdllink, scan_batch);
2092 		return -EFAULT;
2096 	if (epi->event.events & EPOLLONESHOT) {
2097 		epi->event.events &= EP_PRIVATE_BITS;
2098 	} else if (!(epi->event.events & EPOLLET)) {
2100 		 * Level-triggered: re-queue so the next epoll_wait()
2102 		 * rdllist here -- epoll_ctl() callers are locked out
2103 		 * by ep->mtx, and the poll callback queues to ovflist
2106 		list_add_tail(&epi->rdllink, &ep->rdllist);
2121 	 * Always short-circuit for fatal signals to allow threads to make a
2126 		return -EINTR;
2130 	mutex_lock(&ep->mtx);
2134 	 * We can loop without lock because we are passed a task-private
2135 	 * scan_batch; items cannot vanish while we hold ep->mtx.
2153 	mutex_unlock(&ep->mtx);
2166 		to->tv_sec = 0;
2167 		to->tv_nsec = 0;
2171 	to->tv_sec = ms / MSEC_PER_SEC;
2172 	to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
2182  * woken, and in that case the ep_poll loop will remove the entry anyway, not
2191 	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
2194 	list_del_init_careful(&wq_entry->entry);
2223  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
2250 	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
2256 		 * Avoid the unnecessary trip to the wait queue loop, if the
2265 	 * with a non-zero timeout, this thread will check the ready list under
2287 			return -EINTR;
2295 		 * lost. This is also good performance-wise, because on
2297 		 * explicitly, thus ep->lock is not taken, which halts the
2302 		 * entry between loop iterations. This lets us also avoid the
2309 		spin_lock_irq(&ep->lock);
2319 		 * plays with two lists (->rdllist and ->ovflist) and there
2326 			__add_wait_queue_exclusive(&ep->wq, &wait);
2328 		spin_unlock_irq(&ep->lock);
2344 			spin_lock_irq(&ep->lock);
2354 			__remove_wait_queue(&ep->wq, &wait);
2355 			spin_unlock_irq(&ep->lock);
2361  * ep_loop_check_proc - verify that adding an epoll file @ep inside another
2362  *                      epoll file does not create closed loops, and
2365  * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks.
2370  * a loop or went too deep.
2379 	if (ep->gen == loop_check_gen)
2380 		return ep->loop_check_depth;
2382 	mutex_lock_nested(&ep->mtx, depth + 1);
2383 	ep->gen = loop_check_gen;
2384 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
2386 		if (unlikely(is_file_epoll(epi->ffd.file))) {
2388 			ep_tovisit = epi->ffd.file->private_data;
2389 			if (ep_tovisit == ctx->inserting_into ||
2400 			 * A non-epoll leaf. Queue it for the companion
2405 			list_file(epi->ffd.file, ctx);
2408 	ep->loop_check_depth = result;
2409 	mutex_unlock(&ep->mtx);
2414 /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
2420 	if (ep->gen == loop_check_gen)
2421 		return ep->loop_check_depth;
2422 	hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
2423 		result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
2424 	ep->gen = loop_check_gen;
2425 	ep->loop_check_depth = result;
2430  * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
2432  *                 closed loops or too deep chains.
2434  * @ctx: Per-CTL_ADD scratch context.
2439  * does not violate the constraints, or %-1 otherwise.
2446 	ctx->inserting_into = ep;
2449 	 * to loop up to @ep.
2453 		return -1;
2459 	return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
2465 	while (ctx->tfile_check_list != EP_UNACTIVE_PTR) {
2466 		struct epitems_head *head = ctx->tfile_check_list;
2467 		ctx->tfile_check_list = head->next;
2485 		return -EINVAL;
2503 	ep->file = fd_prepare_file(fdf);
2515 		return -EINVAL;
2523 	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
2524 		epev->events &= ~EPOLLWAKEUP;
2529 	epev->events &= ~EPOLLWAKEUP;
2539 	return mutex_trylock(mutex) ? 0 : -EAGAIN;
2545  * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
2546  * loop / path check under epnested_mutex when the topology can
2547  * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
2548  * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
2553  *   0        success; ep->mtx held.
2554  *   1        success; ep->mtx held AND the full check ran under
2557  *   -errno   failure; no locks held.
2571 	error = epoll_mutex_lock(&ep->mtx, nonblock);
2577 	if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
2581 	/* Full check needed: drop ep->mtx so we can take epnested_mutex. */
2582 	mutex_unlock(&ep->mtx);
2590 		tep = tfile->private_data;
2592 			error = -ELOOP;
2597 	error = epoll_mutex_lock(&ep->mtx, nonblock);
2613 	mutex_unlock(&ep->mtx);
2633 	if (!file_can_poll(tf->file))
2634 		return -EPERM;
2644 	if (f == tf->file || !is_file_epoll(f))
2645 		return -EINVAL;
2652 	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
2654 			return -EINVAL;
2655 		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
2656 				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
2657 			return -EINVAL;
2660 	ep = f->private_data;
2662 	full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
2667 	 * Look the target up in ep's RB tree. We hold ep->mtx, so the
2672 	error = -EINVAL;
2676 			epds->events |= EPOLLERR | EPOLLHUP;
2679 			error = -EEXIST;
2690 			error = -ENOENT;
2695 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2696 				epds->events |= EPOLLERR | EPOLLHUP;
2700 			error = -ENOENT;
2715 		return -EBADF;
2720 		return -EBADF;
2739 		return -EFAULT;
2749 		return -EINVAL;
2753 		return -EFAULT;
2760 		return -EINVAL;
2775 	ep = file->private_data;
2777 	 * Racy call, but that's ok - it should get retried based on
2798 		return -EBADF;
2808 	ep = fd_file(f)->private_data;
2843 	restore_saved_sigmask_unless(error == -EINTR);
2867 			return -EFAULT;
2870 			return -EINVAL;
2883 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
2885 		if (epi->ffd.fd == tfd) {
2889 				toff--;
2905 		return ERR_PTR(-EINVAL);
2907 	ep = file->private_data;
2909 	mutex_lock(&ep->mtx);
2912 		file_raw = epi->ffd.file;
2914 		file_raw = ERR_PTR(-ENOENT);
2915 	mutex_unlock(&ep->mtx);
2939 	restore_saved_sigmask_unless(err == -EINTR);
2968 			return -EFAULT;
2971 			return -EINVAL;
2988 	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2994 	 * using an extra cache line on 64-bit (and smaller) CPUs