Lines Matching +full:closed +full:- +full:loop

1 // SPDX-License-Identifier: GPL-2.0-or-later
45 * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation.
49 * --------
52 * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs
54 * eventpoll via the watched file's f_op->poll() wait queue(s). When
57 * ep->mtx, re-queueing items in level-triggered mode.
64 * -------
69 * loop / path checks)
70 * > ep->mtx (per-eventpoll; sleepable; serializes most ops)
71 * > ep->lock (per-eventpoll; IRQ-safe spinlock)
73 * file->f_lock (per-file; NOT IRQ-safe; guards f_ep hlist ops;
74 * nested inside ep->mtx, outside ep->lock)
77 * - ep->lock is a spinlock because ep_poll_callback() is called from
78 * wake_up() which may run in hard-IRQ context. All ep->lock
80 * - ep->mtx is a sleepable mutex because the event delivery loop
82 * kmem_cache_alloc() and f_op->poll().
83 * - epnested_mutex is global because cycle detection needs a global
84 * view of the epoll topology; a per-object scheme would let two
87 * - Per-ep ep->mtx is preferred for scalability elsewhere. Events
91 * ep->mtx on both: outer first, target second. Since cycles are
92 * forbidden the set of live ep->mtx holds is always a strict chain,
98 * ----------------
101 * mtx - self
102 * rbr - ep->mtx
103 * ovflist, rdllist - ep->lock (IRQ-safe)
104 * wq - ep->lock for queue mutation
105 * poll_wait - internal waitqueue spinlock
106 * refs - file->f_lock for adds; ep->mtx for removes;
108 * ws - ep->mtx
109 * gen, loop_check_depth - epnested_mutex
110 * file, user - immutable after setup
111 * refcount - atomic (refcount_t)
112 * napi_* - READ_ONCE / WRITE_ONCE
115 * rbn / rcu union - rbn: ep->mtx (while epi is linked in ep->rbr).
118 * rdllink, next - ep->lock
119 * ffd, ep - immutable after ep_insert()
120 * pwqlist - ep->mtx for writes; POLLFREE clears pwq->whead
122 * fllink - file->f_lock for mutation; hlist_del_rcu +
124 * ws - RCU (rcu_assign_pointer /
126 * event - ep->mtx for writes; lockless read in
131 * Ready-list state machine
132 * ------------------------
134 * Readiness is tracked in two lists under ep->lock:
136 * rdllist - doubly-linked FIFO; the "current" ready list.
137 * ovflist - singly-linked LIFO; used during a scan to catch
139 * without ep->lock.
141 * Encoded in ep->ovflist:
142 * EP_UNACTIVE_PTR - no scan active; callback appends to rdllist.
143 * NULL - scan active, no spill yet.
144 * pointer to epi - scan active with spilled items (LIFO).
146 * Encoded in epi->ovflist_next:
147 * EP_UNACTIVE_PTR - epi is not on ovflist.
148 * otherwise - next epi on ovflist (NULL at tail).
151 * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist
152 * back to rdllist (list_add head-insert reverses LIFO to FIFO),
153 * flips back to "not scanning", and re-splices any items the caller
154 * left in scan_batch (e.g., level-triggered re-queues).
158 * -------------
162 * A. ep_remove() - EPOLL_CTL_DEL and ep_insert()
163 * rollback. Caller holds ep->mtx.
164 * B. ep_clear_and_put() - close of the epoll fd itself
166 * C. eventpoll_release_file() - close of a watched file, invoked
171 * A pins the file with epi_fget() before touching file->f_ep or
172 * file->f_lock; if the pin fails, __fput() is in flight and C
174 * A and B both hold ep->mtx serially. B walks the rbtree with
176 * B and C both take ep->mtx; the loser sees fewer entries or an
177 * empty file->f_ep.
180 * ep_unregister_pollwait() - drain pwqlist; synchronizes with any
181 * in-flight ep_poll_callback via the
182 * watched wait-queue head's lock.
183 * ep_remove_file() - hlist_del_rcu of epi->fllink and,
184 * if last watcher, clear file->f_ep,
185 * under file->f_lock.
186 * ep_remove_epi() - rb_erase, rdllist unlink (ep->lock),
196 * ------------------
198 * When a subsystem tears down a wait-queue head that an epitem is
200 * POLLFREE and must RCU-defer the head's free. The store/load pair:
203 * smp_store_release(&pwq->whead, NULL)
206 * smp_load_acquire(&pwq->whead)
224 #define EP_UNACTIVE_PTR ((void *) -1L)
260 /* Link on the owning eventpoll's ready list (ep->rdllist). */
264 * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
266 * epi_clear_ovflist() and the "Ready-list state machine" section
267 * in the top-of-file banner.
299 * collection loop, the file cleanup path, the epoll file exit
307 /* Wait queue used by file->poll() */
325 * holding ->lock.
337 /* used to optimize loop detection check */
377 * Cycle and path-length checks at EPOLL_CTL_ADD
378 * ---------------------------------------------
383 * 1. no cycle is being formed -- ep_loop_check() walks downward
387 * of a given length -- reverse_path_check().
391 * and carries its scratch state on a stack-allocated struct
392 * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts
393 * skip this machinery entirely and take only ep->mtx.
397 * a check and again at the end. ep->gen caches
400 * ep_get_upwards_depth_proc(); the post-check
403 * "ep->gen == loop_check_gen" trigger in
412 * semi-unbounded recursion") hardened the walk; any refactor must
413 * preserve both bail-outs.
421 * Per-do_epoll_ctl() scratch for the loop / path checks. Allocated on
434 * Singly-linked list of epitems_head objects collected during
436 * Terminated by EP_UNACTIVE_PTR, not NULL: epitems_head->next
437 * doubles as a membership flag (a NULL ->next means "not on this
438 * list", see ep_remove_file()), so the list uses a non-NULL
445 * Per-depth wakeup-path tally used by reverse_path_check_proc();
459 * Wrapper anchor for file->f_ep when the watched file is not itself an
460 * eventpoll; for the epoll-watches-epoll case, file->f_ep points at
461 * &watched_ep->refs directly. The ->next field threads
462 * ctx->tfile_check_list during one EPOLL_CTL_ADD path check.
481 head = container_of(file->f_ep, struct epitems_head, epitems);
482 if (!head->next) {
483 head->next = ctx->tfile_check_list;
484 ctx->tfile_check_list = head;
491 struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
494 spin_lock(&epi->ffd.file->f_lock);
495 if (!hlist_empty(&head->epitems))
497 head->next = NULL;
498 spin_unlock(&epi->ffd.file->f_lock);
534 return f->f_op == &eventpoll_fops;
540 return (p1->file > p2->file ? +1:
541 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
547 return !list_empty(&epi->rdllink);
558 return container_of(p, struct eppoll_entry, wait)->base;
562 * Ready-list / ovflist state (see "Ready-list state machine" in the
563 * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is
571 return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
577 WRITE_ONCE(ep->ovflist, NULL);
583 WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
589 return epi->ovflist_next != EP_UNACTIVE_PTR;
592 /* Mark @epi as not on any ovflist (init and post-drain). */
595 epi->ovflist_next = EP_UNACTIVE_PTR;
601 unsigned int seq = read_seqcount_begin(&ep->seq);
603 return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) ||
604 read_seqcount_retry(&ep->seq, seq);
609 * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
611 * the system-wide global via busy_loop_timeout.
621 unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
635 return !!READ_ONCE(ep->busy_poll_usecs) ||
636 READ_ONCE(ep->prefer_busy_poll) ||
649 * busy loop will return if need_resched or ep_events_available.
655 unsigned int napi_id = READ_ONCE(ep->napi_id);
656 u16 budget = READ_ONCE(ep->busy_poll_budget);
657 bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
674 ep->napi_id = 0;
685 struct eventpoll *ep = epi->ep;
693 sock = sock_from_file(epi->ffd.file);
697 sk = sock->sk;
701 napi_id = READ_ONCE(sk->sk_napi_id);
703 /* Non-NAPI IDs can be rejected
707 if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
711 ep->napi_id = napi_id;
717 struct eventpoll *ep = file->private_data;
724 return -EFAULT;
728 return -EINVAL;
731 return -EINVAL;
734 return -EINVAL;
738 return -EPERM;
740 WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
741 WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
742 WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
746 epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
747 epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
748 epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
750 return -EFAULT;
753 return -ENOIOCTLCMD;
759 unsigned int napi_id = READ_ONCE(ep->napi_id);
761 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
767 unsigned int napi_id = READ_ONCE(ep->napi_id);
769 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
787 return -EOPNOTSUPP;
836 * it might be natural to create a per-cpu nest count. However, since
837 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
838 * schedule() in the -rt kernel, the per-cpu variables are no longer
850 if ((is_file_epoll(epi->ffd.file))) {
851 ep_src = epi->ffd.file->private_data;
852 nests = ep_src->nests;
857 spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
858 ep->nests = nests + 1;
859 wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
860 ep->nests = 0;
861 spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
869 wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
885 * complete and we must not touch whead again. On a non-NULL load
887 * firers RCU-defer the free) and whead->lock inside
890 whead = smp_load_acquire(&pwq->whead);
892 remove_wait_queue(whead, &pwq->wait);
902 struct eppoll_entry **p = &epi->pwqlist;
906 *p = pwq->next;
912 /* call only when ep->mtx is held */
915 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
918 /* call only when ep->mtx is held */
929 return rcu_access_pointer(epi->ws) ? true : false;
932 /* call when ep->mtx cannot be held (ep_poll_callback) */
938 ws = rcu_dereference(epi->ws);
946 * ep->mtx needs to be held because we could be hit by
952 * Steal the ready list, and re-init the original one to the
953 * empty list. Also, set ep->ovflist to NULL so that events
955 * have the poll callback to queue directly on ep->rdllist,
960 spin_lock_irq(&ep->lock);
961 write_seqcount_begin(&ep->seq);
963 list_splice_init(&ep->rdllist, scan_batch);
966 write_seqcount_end(&ep->seq);
967 spin_unlock_irq(&ep->lock);
975 spin_lock_irq(&ep->lock);
979 * We re-insert them inside the main ready-list here.
981 for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) {
982 nepi = epi->ovflist_next;
986 * -- the list_splice() below takes care of those.
990 * ovflist is LIFO; list_add() head-insert here
993 list_add(&epi->rdllink, &ep->rdllist);
998 write_seqcount_begin(&ep->seq);
1000 /* Back out of scan mode; callbacks target ep->rdllist again. */
1004 * Quickly re-inject items left on "scan_batch".
1006 list_splice(scan_batch, &ep->rdllist);
1008 write_seqcount_end(&ep->seq);
1010 __pm_relax(ep->ws);
1012 if (!list_empty(&ep->rdllist)) {
1013 if (waitqueue_active(&ep->wq))
1014 wake_up(&ep->wq);
1017 spin_unlock_irq(&ep->lock);
1022 refcount_inc(&ep->refcount);
1031 if (!refcount_dec_and_test(&ep->refcount))
1034 WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
1041 mutex_destroy(&ep->mtx);
1042 free_uid(ep->user);
1043 wakeup_source_unregister(ep->ws);
1044 /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */
1049 * Pin @epi->ffd.file for operations that require both safe dereference
1053 * reassigned at any time. The bare load of epi->ffd.file is safe here
1054 * because the caller holds ep->mtx and eventpoll_release_file() blocks
1061 * come back. ep_remove() relies on that to touch file->f_lock and
1062 * file->f_ep without racing eventpoll_release_file() (see commit
1071 file = epi->ffd.file;
1072 if (!file_ref_get(&file->f_ref))
1078 * Takes &file->f_lock; returns with it released.
1086 lockdep_assert_held(&ep->mtx);
1088 spin_lock(&file->f_lock);
1089 head = file->f_ep;
1090 if (hlist_is_singular_node(&epi->fllink, head)) {
1096 * itself -- see the comment in eventpoll_release().
1098 WRITE_ONCE(file->f_ep, NULL);
1102 if (!smp_load_acquire(&v->next))
1106 hlist_del_rcu(&epi->fllink);
1107 spin_unlock(&file->f_lock);
1113 lockdep_assert_held(&ep->mtx);
1115 rb_erase_cached(&epi->rbn, &ep->rbr);
1117 spin_lock_irq(&ep->lock);
1119 list_del_init(&epi->rdllink);
1120 spin_unlock_irq(&ep->lock);
1125 * field epi->rcu, since we are trying to minimize the size of
1127 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
1132 percpu_counter_dec(&ep->user->epoll_watches);
1143 lockdep_assert_held(&ep->mtx);
1163 * ep_unregister_pollwait() takes each watched wait-queue head's lock,
1164 * which synchronizes with any in-flight ep_poll_callback(); after
1166 * on this ep. Must strictly precede ep_drain_tree() -- fusing the
1175 lockdep_assert_held(&ep->mtx);
1177 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1186 * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi
1193 * file serializes with us via ep->mtx; ep_remove() transparently
1202 lockdep_assert_held(&ep->mtx);
1204 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
1213 * Removal path B (see "Removal paths" in the top-of-file banner):
1216 * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces
1217 * in-flight callbacks, then ep_drain_tree() frees the epis. The
1218 * ep->refcount is kept > 0 across the walk by the ep file's own
1224 /* Release any threads blocked in poll-on-ep. */
1225 if (waitqueue_active(&ep->poll_wait))
1228 mutex_lock(&ep->mtx);
1231 mutex_unlock(&ep->mtx);
1243 return -EINVAL;
1251 ret = -EINVAL;
1260 struct eventpoll *ep = file->private_data;
1272 struct eventpoll *ep = file->private_data;
1281 poll_wait(file, &ep->poll_wait, wait);
1287 mutex_lock_nested(&ep->mtx, depth);
1300 list_del_init(&epi->rdllink);
1304 mutex_unlock(&ep->mtx);
1310 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
1326 pt->_key = epi->event.events;
1332 return res & epi->event.events;
1343 struct eventpoll *ep = f->private_data;
1346 mutex_lock(&ep->mtx);
1347 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1349 struct inode *inode = file_inode(epi->ffd.file);
1353 epi->ffd.fd, epi->event.events,
1354 (long long)epi->event.data,
1355 (long long)epi->ffd.file->f_pos,
1356 inode->i_ino, inode->i_sb->s_dev);
1360 mutex_unlock(&ep->mtx);
1379 * closed without being removed from the eventpoll interface.
1389 * to zero -- the path we're on. So any racing ep_remove() bails
1393 spin_lock(&file->f_lock);
1394 if (file->f_ep && file->f_ep->first) {
1395 epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
1396 spin_unlock(&file->f_lock);
1402 ep = epi->ep;
1403 mutex_lock(&ep->mtx);
1410 mutex_unlock(&ep->mtx);
1416 spin_unlock(&file->f_lock);
1425 return -ENOMEM;
1427 mutex_init(&ep->mtx);
1428 spin_lock_init(&ep->lock);
1429 seqcount_spinlock_init(&ep->seq, &ep->lock);
1430 init_waitqueue_head(&ep->wq);
1431 init_waitqueue_head(&ep->poll_wait);
1432 INIT_LIST_HEAD(&ep->rdllist);
1433 ep->rbr = RB_ROOT_CACHED;
1434 ep->ovflist = EP_UNACTIVE_PTR; /* not scanning */
1435 ep->user = get_current_user();
1436 refcount_set(&ep->refcount, 1);
1454 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1456 kcmp = ep_cmp_ffd(tf, &epi->ffd);
1458 rbp = rbp->rb_right;
1460 rbp = rbp->rb_left;
1479 struct eventpoll *ep = epi->ep;
1484 spin_lock_irqsave(&ep->lock, flags);
1494 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1503 if (pollflags && !(pollflags & epi->event.events))
1508 * (because we're accessing user memory, and because of Linux f_op->poll()
1510 * chained in ep->ovflist and requeued later on.
1514 epi->ovflist_next = READ_ONCE(ep->ovflist);
1515 WRITE_ONCE(ep->ovflist, epi);
1520 list_add_tail(&epi->rdllink, &ep->rdllist);
1525 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1528 if (waitqueue_active(&ep->wq)) {
1529 if ((epi->event.events & EPOLLEXCLUSIVE) &&
1533 if (epi->event.events & EPOLLIN)
1537 if (epi->event.events & EPOLLOUT)
1546 wake_up_sync(&ep->wq);
1548 wake_up(&ep->wq);
1550 if (waitqueue_active(&ep->poll_wait))
1554 spin_unlock_irqrestore(&ep->lock, flags);
1560 if (!(epi->event.events & EPOLLEXCLUSIVE))
1570 * that already loaded a non-NULL whead may still call
1578 * whead->lock -- the same lock held by our caller, so it
1582 list_del_init(&wait->entry);
1583 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1597 struct epitem *epi = epq->epi;
1605 epq->epi = NULL;
1609 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1610 pwq->whead = whead;
1611 pwq->base = epi;
1612 if (epi->event.events & EPOLLEXCLUSIVE)
1613 add_wait_queue_exclusive(whead, &pwq->wait);
1615 add_wait_queue(whead, &pwq->wait);
1616 pwq->next = epi->pwqlist;
1617 epi->pwqlist = pwq;
1623 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1630 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1632 p = &parent->rb_right;
1635 p = &parent->rb_left;
1637 rb_link_node(&epi->rbn, parent, p);
1638 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1647 * the wakeup amplification that can be built from epoll-watches-
1651 * The running tallies live in ctx->path_count[] and are protected by
1662 if (++ctx->path_count[nests] > path_limits[nests])
1663 return -1;
1672 ctx->path_count[i] = 0;
1682 return -1;
1686 struct hlist_head *refs = &epi->ep->refs;
1698 * reverse_path_check - ctx->tfile_check_list is a list of epitems_head
1700 * sure those links don't push any path-length bucket
1702 * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks.
1705 * %-1 otherwise.
1711 for (p = ctx->tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
1715 error = reverse_path_check_proc(ctx, &p->epitems, 0);
1728 if (!epi->ep->ws) {
1729 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
1730 if (!epi->ep->ws)
1731 return -ENOMEM;
1734 take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
1739 return -ENOMEM;
1740 rcu_assign_pointer(epi->ws, ws);
1750 RCU_INIT_POINTER(epi->ws, NULL);
1768 ep = file->private_data;
1771 head = &ep->refs;
1772 } else if (!READ_ONCE(file->f_ep)) {
1776 return -ENOMEM;
1777 head = &to_free->epitems;
1779 spin_lock(&file->f_lock);
1780 if (!file->f_ep) {
1782 spin_unlock(&file->f_lock);
1786 WRITE_ONCE(file->f_ep, head);
1789 hlist_add_head_rcu(&epi->fllink, file->f_ep);
1790 spin_unlock(&file->f_lock);
1802 * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
1811 if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
1813 return ERR_PTR(-ENOSPC);
1814 percpu_counter_inc(&ep->user->epoll_watches);
1818 percpu_counter_dec(&ep->user->epoll_watches);
1819 return ERR_PTR(-ENOMEM);
1822 INIT_LIST_HEAD(&epi->rdllink);
1823 epi->ep = ep;
1824 epi->ffd = *tf;
1825 epi->event = *event;
1835 * If @tep is non-NULL, the target file is itself an eventpoll; we hold
1836 * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
1837 * with the target side. RB tree ops are protected by @ep->mtx, which
1850 struct file *tfile = epi->ffd.file;
1854 mutex_lock_nested(&tep->mtx, 1);
1859 mutex_unlock(&tep->mtx);
1861 percpu_counter_dec(&ep->user->epoll_watches);
1871 mutex_unlock(&tep->mtx);
1890 if (is_file_epoll(tf->file))
1891 tep = tf->file->private_data;
1903 /* Reject the insert if the new link would create too many back-paths. */
1906 return -EINVAL;
1909 if (epi->event.events & EPOLLWAKEUP) {
1933 return -ENOMEM;
1937 spin_lock_irq(&ep->lock);
1942 list_add_tail(&epi->rdllink, &ep->rdllist);
1945 if (waitqueue_active(&ep->wq))
1946 wake_up(&ep->wq);
1947 if (waitqueue_active(&ep->poll_wait))
1951 spin_unlock_irq(&ep->lock);
1975 * Set the new event interest mask before calling f_op->poll();
1977 * f_op->poll() call and the new event set registering.
1979 epi->event.events = event->events; /* need barrier below */
1980 epi->event.data = event->data; /* protected by mtx */
1981 if (epi->event.events & EPOLLWAKEUP) {
1993 * event occurs immediately after we call f_op->poll().
1994 * We need this because we did not take ep->lock while
1996 * ep->lock).
1999 * when calling f_op->poll(). This barrier also
2003 * This barrier will now guarantee ep_poll_callback or f_op->poll
2015 spin_lock_irq(&ep->lock);
2017 list_add_tail(&epi->rdllink, &ep->rdllist);
2021 if (waitqueue_active(&ep->wq))
2022 wake_up(&ep->wq);
2023 if (waitqueue_active(&ep->poll_wait))
2026 spin_unlock_irq(&ep->lock);
2040 * next slot), 0 if the re-poll reported no caller-requested events
2041 * (@epi drops out of the ready list; a future callback will re-add
2042 * it), or -EFAULT if copy_to_user() faulted (in which case @epi is
2043 * re-inserted at the head of @scan_batch so ep_done_scan() merges it
2046 * PM bookkeeping and level-triggered re-queue are handled here.
2047 * Caller holds ep->mtx and the scan is active.
2059 * Activate ep->ws before deactivating epi->ws to prevent
2060 * triggering auto-suspend here (in case we reactivate epi->ws
2062 * epi->ws drift out of sync with ep_is_linked().
2066 if (ws->active)
2067 __pm_stay_awake(ep->ws);
2071 list_del_init(&epi->rdllink);
2074 * Re-poll under ep->mtx so userspace cannot change the item
2075 * out from under us. If no caller-requested events remain,
2077 * re-queue it when events next appear.
2083 next = epoll_put_uevent(revents, epi->event.data, *uevents);
2090 list_add(&epi->rdllink, scan_batch);
2092 return -EFAULT;
2096 if (epi->event.events & EPOLLONESHOT) {
2097 epi->event.events &= EP_PRIVATE_BITS;
2098 } else if (!(epi->event.events & EPOLLET)) {
2100 * Level-triggered: re-queue so the next epoll_wait()
2102 * rdllist here -- epoll_ctl() callers are locked out
2103 * by ep->mtx, and the poll callback queues to ovflist
2106 list_add_tail(&epi->rdllink, &ep->rdllist);
2121 * Always short-circuit for fatal signals to allow threads to make a
2126 return -EINTR;
2130 mutex_lock(&ep->mtx);
2134 * We can loop without lock because we are passed a task-private
2135 * scan_batch; items cannot vanish while we hold ep->mtx.
2153 mutex_unlock(&ep->mtx);
2166 to->tv_sec = 0;
2167 to->tv_nsec = 0;
2171 to->tv_sec = ms / MSEC_PER_SEC;
2172 to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
2182 * woken, and in that case the ep_poll loop will remove the entry anyway, not
2191 * Pairs with list_empty_careful in ep_poll, and ensures future loop
2194 list_del_init_careful(&wq_entry->entry);
2223 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
2250 if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
2256 * Avoid the unnecessary trip to the wait queue loop, if the
2265 * with a non-zero timeout, this thread will check the ready list under
2287 return -EINTR;
2295 * lost. This is also good performance-wise, because on
2297 * explicitly, thus ep->lock is not taken, which halts the
2302 * entry between loop iterations. This lets us also avoid the
2309 spin_lock_irq(&ep->lock);
2319 * plays with two lists (->rdllist and ->ovflist) and there
2326 __add_wait_queue_exclusive(&ep->wq, &wait);
2328 spin_unlock_irq(&ep->lock);
2344 spin_lock_irq(&ep->lock);
2354 __remove_wait_queue(&ep->wq, &wait);
2355 spin_unlock_irq(&ep->lock);
2361 * ep_loop_check_proc - verify that adding an epoll file @ep inside another
2362 * epoll file does not create closed loops, and
2365 * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks.
2370 * a loop or went too deep.
2379 if (ep->gen == loop_check_gen)
2380 return ep->loop_check_depth;
2382 mutex_lock_nested(&ep->mtx, depth + 1);
2383 ep->gen = loop_check_gen;
2384 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
2386 if (unlikely(is_file_epoll(epi->ffd.file))) {
2388 ep_tovisit = epi->ffd.file->private_data;
2389 if (ep_tovisit == ctx->inserting_into ||
2400 * A non-epoll leaf. Queue it for the companion
2405 list_file(epi->ffd.file, ctx);
2408 ep->loop_check_depth = result;
2409 mutex_unlock(&ep->mtx);
2414 /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
2420 if (ep->gen == loop_check_gen)
2421 return ep->loop_check_depth;
2422 hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
2423 result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
2424 ep->gen = loop_check_gen;
2425 ep->loop_check_depth = result;
2430 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
2432 * closed loops or too deep chains.
2434 * @ctx: Per-CTL_ADD scratch context.
2439 * does not violate the constraints, or %-1 otherwise.
2446 ctx->inserting_into = ep;
2449 * to loop up to @ep.
2453 return -1;
2459 return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
2465 while (ctx->tfile_check_list != EP_UNACTIVE_PTR) {
2466 struct epitems_head *head = ctx->tfile_check_list;
2467 ctx->tfile_check_list = head->next;
2485 return -EINVAL;
2503 ep->file = fd_prepare_file(fdf);
2515 return -EINVAL;
2523 if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
2524 epev->events &= ~EPOLLWAKEUP;
2529 epev->events &= ~EPOLLWAKEUP;
2539 return mutex_trylock(mutex) ? 0 : -EAGAIN;
2545 * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
2546 * loop / path check under epnested_mutex when the topology can
2547 * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
2548 * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
2553 * 0 success; ep->mtx held.
2554 * 1 success; ep->mtx held AND the full check ran under
2557 * -errno failure; no locks held.
2571 error = epoll_mutex_lock(&ep->mtx, nonblock);
2577 if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
2581 /* Full check needed: drop ep->mtx so we can take epnested_mutex. */
2582 mutex_unlock(&ep->mtx);
2590 tep = tfile->private_data;
2592 error = -ELOOP;
2597 error = epoll_mutex_lock(&ep->mtx, nonblock);
2613 mutex_unlock(&ep->mtx);
2633 if (!file_can_poll(tf->file))
2634 return -EPERM;
2644 if (f == tf->file || !is_file_epoll(f))
2645 return -EINVAL;
2652 if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
2654 return -EINVAL;
2655 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
2656 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
2657 return -EINVAL;
2660 ep = f->private_data;
2662 full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
2667 * Look the target up in ep's RB tree. We hold ep->mtx, so the
2672 error = -EINVAL;
2676 epds->events |= EPOLLERR | EPOLLHUP;
2679 error = -EEXIST;
2690 error = -ENOENT;
2695 if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2696 epds->events |= EPOLLERR | EPOLLHUP;
2700 error = -ENOENT;
2715 return -EBADF;
2720 return -EBADF;
2739 return -EFAULT;
2749 return -EINVAL;
2753 return -EFAULT;
2760 return -EINVAL;
2775 ep = file->private_data;
2777 * Racy call, but that's ok - it should get retried based on
2798 return -EBADF;
2808 ep = fd_file(f)->private_data;
2843 restore_saved_sigmask_unless(error == -EINTR);
2867 return -EFAULT;
2870 return -EINVAL;
2883 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
2885 if (epi->ffd.fd == tfd) {
2889 toff--;
2905 return ERR_PTR(-EINVAL);
2907 ep = file->private_data;
2909 mutex_lock(&ep->mtx);
2912 file_raw = epi->ffd.file;
2914 file_raw = ERR_PTR(-ENOENT);
2915 mutex_unlock(&ep->mtx);
2939 restore_saved_sigmask_unless(err == -EINTR);
2968 return -EFAULT;
2971 return -EINVAL;
2988 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2994 * using an extra cache line on 64-bit (and smaller) CPUs