1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * fs/eventpoll.c (Efficient event retrieval implementation) 4 * Copyright (C) 2001,...,2009 Davide Libenzi 5 * 6 * Davide Libenzi <davidel@xmailserver.org> 7 */ 8 9 #include <linux/init.h> 10 #include <linux/kernel.h> 11 #include <linux/sched/signal.h> 12 #include <linux/fs.h> 13 #include <linux/file.h> 14 #include <linux/signal.h> 15 #include <linux/errno.h> 16 #include <linux/mm.h> 17 #include <linux/slab.h> 18 #include <linux/poll.h> 19 #include <linux/string.h> 20 #include <linux/list.h> 21 #include <linux/hash.h> 22 #include <linux/spinlock.h> 23 #include <linux/syscalls.h> 24 #include <linux/rbtree.h> 25 #include <linux/wait.h> 26 #include <linux/eventpoll.h> 27 #include <linux/mount.h> 28 #include <linux/bitops.h> 29 #include <linux/mutex.h> 30 #include <linux/anon_inodes.h> 31 #include <linux/device.h> 32 #include <linux/uaccess.h> 33 #include <asm/io.h> 34 #include <asm/mman.h> 35 #include <linux/atomic.h> 36 #include <linux/proc_fs.h> 37 #include <linux/seq_file.h> 38 #include <linux/compat.h> 39 #include <linux/rculist.h> 40 #include <linux/capability.h> 41 #include <linux/seqlock.h> 42 #include <net/busy_poll.h> 43 44 /* 45 * fs/eventpoll.c - Efficient event polling ("epoll") kernel implementation. 46 * 47 * 48 * Overview 49 * -------- 50 * 51 * Each epoll_create(2) returns an anonymous [eventpoll] file whose 52 * ->private_data is a struct eventpoll. Each EPOLL_CTL_ADD installs 53 * a struct epitem linking one (watched file, fd) pair back to that 54 * eventpoll via the watched file's f_op->poll() wait queue(s). When 55 * the watched file signals readiness, ep_poll_callback() fires and 56 * marks the epitem ready. epoll_wait(2) drains the ready list under 57 * ep->mtx, re-queueing items in level-triggered mode. 58 * 59 * epoll instances can watch other epoll instances up to EP_MAX_NESTS 60 * deep; cycles are forbidden and detected at EPOLL_CTL_ADD time. 
 *
 *
 * Locking
 * -------
 *
 * Three levels, acquired from outer to inner:
 *
 *   epnested_mutex       (global; rare; taken only for EPOLL_CTL_ADD
 *                         loop / path checks)
 *     > ep->mtx          (per-eventpoll; sleepable; serializes most ops)
 *       > ep->lock       (per-eventpoll; IRQ-safe spinlock)
 *
 *   file->f_lock         (per-file; NOT IRQ-safe; guards f_ep hlist ops;
 *                         nested inside ep->mtx, outside ep->lock)
 *
 * Rationale:
 *   - ep->lock is a spinlock because ep_poll_callback() is called from
 *     wake_up() which may run in hard-IRQ context.  Every ep->lock
 *     critical section therefore runs with interrupts disabled:
 *     spin_lock_irq() where the caller is known to have IRQs enabled
 *     (ep_start_scan(), ep_done_scan(), ep_remove_epi()), and an
 *     irqsave variant where the caller's IRQ state is not known.
 *   - ep->mtx is a sleepable mutex because the event delivery loop
 *     calls copy_to_user(), and ep_insert() may sleep in
 *     kmem_cache_alloc() and f_op->poll().
 *   - epnested_mutex is global because cycle detection needs a global
 *     view of the epoll topology; a per-object scheme would let two
 *     concurrent inserts (A into B, B into A) construct a cycle
 *     without either observer seeing it.
 *   - Per-ep ep->mtx is preferred for scalability elsewhere.  Events
 *     that require epnested_mutex are rare.
 *
 * When EPOLL_CTL_ADD nests one eventpoll inside another we acquire
 * ep->mtx on both: outer first, target second.  Since cycles are
 * forbidden the set of live ep->mtx holds is always a strict chain,
 * communicated to lockdep via mutex_lock_nested() subclasses derived
 * from the current recursion depth.
 *
 *
 * Field protection
 * ----------------
 *
 * struct eventpoll:
 *   mtx                    - self
 *   rbr                    - ep->mtx
 *   ovflist, rdllist       - ep->lock (IRQ-safe)
 *   seq                    - written under ep->lock around the
 *                            rdllist/ovflist switch; read locklessly by
 *                            ep_events_available()
 *   wq                     - ep->lock for queue mutation
 *   poll_wait              - internal waitqueue spinlock
 *   refs                   - file->f_lock for adds; ep->mtx for removes;
 *                            RCU for readers (hlist_del_rcu + kfree_rcu(ep))
 *   ws                     - ep->mtx
 *   gen, loop_check_depth  - epnested_mutex
 *   file, user             - immutable after setup
 *   refcount               - atomic (refcount_t)
 *   napi_*                 - READ_ONCE / WRITE_ONCE
 *
 * struct epitem:
 *   rbn / rcu union        - rbn: ep->mtx (while epi is linked in ep->rbr).
 *                            rcu: written only by kfree_rcu(epi) on the free
 *                            path; otherwise untouched by epoll code.
 *   rdllink, ovflist_next  - ep->lock
 *   ffd, ep                - immutable after ep_insert()
 *   pwqlist                - ep->mtx for writes; POLLFREE clears pwq->whead
 *                            via smp_store_release(), see below
 *   fllink                 - file->f_lock for mutation; hlist_del_rcu +
 *                            kfree_rcu(epi) for safe RCU readers
 *   ws                     - RCU (rcu_assign_pointer /
 *                            rcu_dereference_check(mtx))
 *   event                  - ep->mtx for writes; lockless read in
 *                            ep_poll_callback pairs with smp_mb() in
 *                            ep_modify()
 *
 *
 * Ready-list state machine
 * ------------------------
 *
 * Readiness is tracked in two lists under ep->lock:
 *
 *   rdllist  - doubly-linked FIFO; the "current" ready list.
 *   ovflist  - singly-linked LIFO; used during a scan to catch
 *              events that arrive while rdllist is being iterated
 *              without ep->lock.
 *
 * Encoded in ep->ovflist:
 *   EP_UNACTIVE_PTR  - no scan active; callback appends to rdllist.
 *   NULL             - scan active, no spill yet.
 *   pointer to epi   - scan active with spilled items (LIFO).
 *
 * Encoded in epi->ovflist_next:
 *   EP_UNACTIVE_PTR  - epi is not on ovflist.
 *   otherwise        - next epi on ovflist (NULL at tail).
149 * 150 * ep_start_scan() flips "not scanning" to "scanning" and splices 151 * rdllist into a caller-local scan_batch. ep_done_scan() drains ovflist 152 * back to rdllist (list_add head-insert reverses LIFO to FIFO), 153 * flips back to "not scanning", and re-splices any items the caller 154 * left in scan_batch (e.g., level-triggered re-queues). 155 * 156 * 157 * Removal paths 158 * ------------- 159 * 160 * Three paths dispose of epitems and/or eventpolls: 161 * 162 * A. ep_remove() - EPOLL_CTL_DEL and ep_insert() 163 * rollback. Caller holds ep->mtx. 164 * B. ep_clear_and_put() - close of the epoll fd itself 165 * (ep_eventpoll_release). 166 * C. eventpoll_release_file() - close of a watched file, invoked 167 * from __fput(). 168 * 169 * Coordination: 170 * A and C exclude each other via the watched file's refcount. 171 * A pins the file with epi_fget() before touching file->f_ep or 172 * file->f_lock; if the pin fails, __fput() is in flight and C 173 * will clean this epi up. See the epi_fget() block comment. 174 * A and B both hold ep->mtx serially. B walks the rbtree with 175 * rb_next() captured before ep_remove() erases the current node. 176 * B and C both take ep->mtx; the loser sees fewer entries or an 177 * empty file->f_ep. 178 * 179 * Within every path the internal order is strict: 180 * ep_unregister_pollwait() - drain pwqlist; synchronizes with any 181 * in-flight ep_poll_callback via the 182 * watched wait-queue head's lock. 183 * ep_remove_file() - hlist_del_rcu of epi->fllink and, 184 * if last watcher, clear file->f_ep, 185 * under file->f_lock. 186 * ep_remove_epi() - rb_erase, rdllist unlink (ep->lock), 187 * wakeup_source_unregister, 188 * kfree_rcu(epi). 189 * 190 * kfree_rcu(epi) defers the free past RCU readers in 191 * reverse_path_check_proc(); kfree_rcu(ep) defers past readers in 192 * ep_get_upwards_depth_proc(). 
193 * 194 * 195 * POLLFREE handshake 196 * ------------------ 197 * 198 * When a subsystem tears down a wait-queue head that an epitem is 199 * registered on (binder, signalfd, ...), it wakes the callback with 200 * POLLFREE and must RCU-defer the head's free. The store/load pair: 201 * 202 * ep_poll_callback() POLLFREE branch: 203 * smp_store_release(&pwq->whead, NULL) 204 * 205 * ep_remove_wait_queue(): 206 * smp_load_acquire(&pwq->whead) 207 * 208 * See those sites for the full argument. 209 */ 210 211 /* Epoll private bits inside the event mask */ 212 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) 213 214 #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) 215 216 #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ 217 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) 218 219 /* Maximum number of nesting allowed inside epoll sets */ 220 #define EP_MAX_NESTS 4 221 222 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 223 224 #define EP_UNACTIVE_PTR ((void *) -1L) 225 226 #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) 227 228 /* Wait structure used by the poll hooks */ 229 struct eppoll_entry { 230 /* List header used to link this structure to the "struct epitem" */ 231 struct eppoll_entry *next; 232 233 /* The "base" pointer is set to the container "struct epitem" */ 234 struct epitem *base; 235 236 /* 237 * Wait queue item that will be linked to the target file wait 238 * queue head. 239 */ 240 wait_queue_entry_t wait; 241 242 /* The wait queue head that linked the "wait" wait queue item */ 243 wait_queue_head_t *whead; 244 }; 245 246 /* 247 * Each file descriptor added to the eventpoll interface will 248 * have an entry of this type linked to the "rbr" RB tree. 249 * Avoid increasing the size of this struct, there can be many thousands 250 * of these on a server and we do not want this to take another cache line. 
251 */ 252 struct epitem { 253 union { 254 /* RB tree node links this structure to the eventpoll RB tree */ 255 struct rb_node rbn; 256 /* Used to free the struct epitem */ 257 struct rcu_head rcu; 258 }; 259 260 /* Link on the owning eventpoll's ready list (ep->rdllist). */ 261 struct list_head rdllink; 262 263 /* 264 * Link on the owning eventpoll's scan-overflow list (ep->ovflist), 265 * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() / 266 * epi_clear_ovflist() and the "Ready-list state machine" section 267 * in the top-of-file banner. 268 */ 269 struct epitem *ovflist_next; 270 271 /* The file descriptor information this item refers to */ 272 struct epoll_key ffd; 273 274 /* List containing poll wait queues */ 275 struct eppoll_entry *pwqlist; 276 277 /* The "container" of this item */ 278 struct eventpoll *ep; 279 280 /* List header used to link this item to the "struct file" items list */ 281 struct hlist_node fllink; 282 283 /* wakeup_source used when EPOLLWAKEUP is set */ 284 struct wakeup_source __rcu *ws; 285 286 /* The structure that describe the interested events and the source fd */ 287 struct epoll_event event; 288 }; 289 290 /* 291 * This structure is stored inside the "private_data" member of the file 292 * structure and represents the main data structure for the eventpoll 293 * interface. 294 */ 295 struct eventpoll { 296 /* 297 * This mutex is used to ensure that files are not removed 298 * while epoll is using them. This is held during the event 299 * collection loop, the file cleanup path, the epoll file exit 300 * code and the ctl operations. 
301 */ 302 struct mutex mtx; 303 304 /* Wait queue used by sys_epoll_wait() */ 305 wait_queue_head_t wq; 306 307 /* Wait queue used by file->poll() */ 308 wait_queue_head_t poll_wait; 309 310 /* List of ready file descriptors */ 311 struct list_head rdllist; 312 313 /* Lock which protects rdllist and ovflist */ 314 spinlock_t lock; 315 316 /* Protect switching between rdllist and ovflist */ 317 seqcount_spinlock_t seq; 318 319 /* RB tree root used to store monitored fd structs */ 320 struct rb_root_cached rbr; 321 322 /* 323 * This is a single linked list that chains all the "struct epitem" that 324 * happened while transferring ready events to userspace w/out 325 * holding ->lock. 326 */ 327 struct epitem *ovflist; 328 329 /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ 330 struct wakeup_source *ws; 331 332 /* The user that created the eventpoll descriptor */ 333 struct user_struct *user; 334 335 struct file *file; 336 337 /* used to optimize loop detection check */ 338 u64 gen; 339 struct hlist_head refs; 340 u8 loop_check_depth; 341 342 /* usage count, orchestrates "struct eventpoll" disposal */ 343 refcount_t refcount; 344 345 /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */ 346 struct rcu_head rcu; 347 348 #ifdef CONFIG_NET_RX_BUSY_POLL 349 /* used to track busy poll napi_id */ 350 unsigned int napi_id; 351 /* busy poll timeout */ 352 u32 busy_poll_usecs; 353 /* busy poll packet budget */ 354 u16 busy_poll_budget; 355 bool prefer_busy_poll; 356 #endif 357 358 #ifdef CONFIG_DEBUG_LOCK_ALLOC 359 /* tracks wakeup nests for lockdep validation */ 360 u8 nests; 361 #endif 362 }; 363 364 /* Wrapper struct used by poll queueing */ 365 struct ep_pqueue { 366 poll_table pt; 367 struct epitem *epi; 368 }; 369 370 /* 371 * Configuration options available inside /proc/sys/fs/epoll/ 372 */ 373 /* Maximum number of epoll watched descriptors, per user */ 374 static long max_user_watches __read_mostly; 375 376 /* 377 * Cycle 
and path-length checks at EPOLL_CTL_ADD 378 * --------------------------------------------- 379 * 380 * When EPOLL_CTL_ADD creates a link that either targets an eventpoll 381 * file or extends an existing chain of eventpolls, two checks run: 382 * 383 * 1. no cycle is being formed -- ep_loop_check() walks downward 384 * from the candidate target, and ep_get_upwards_depth_proc() 385 * walks upward from the outer ep, both bounded by EP_MAX_NESTS. 386 * 2. no file accumulates more than path_limits[depth] wakeup paths 387 * of a given length -- reverse_path_check(). 388 * 389 * Both need a global view of the epoll topology and must be atomic 390 * with the insertion, so the check is serialized by epnested_mutex 391 * and carries its scratch state on a stack-allocated struct 392 * ep_ctl_ctx scoped to one do_epoll_ctl() call. Non-nested inserts 393 * skip this machinery entirely and take only ep->mtx. 394 * 395 * epnested_mutex Serializes the whole check. 396 * loop_check_gen Global monotonic stamp, bumped at the start of 397 * a check and again at the end. ep->gen caches 398 * the value under which ep was last visited by 399 * ep_loop_check_proc() or 400 * ep_get_upwards_depth_proc(); the post-check 401 * bump ensures those cached stamps can no longer 402 * equal loop_check_gen, so the 403 * "ep->gen == loop_check_gen" trigger in 404 * ep_ctl_lock() only fires while another check 405 * is in flight. 406 * 407 * struct ep_ctl_ctx carries the rest (inserting_into, tfile_check_list, 408 * path_count[]) through the walk; see its declaration below. 409 * 410 * Commits fdcfce93073d ("eventpoll: Fix integer overflow in 411 * ep_loop_check_proc()") and f2e467a48287 ("eventpoll: Fix 412 * semi-unbounded recursion") hardened the walk; any refactor must 413 * preserve both bail-outs. 414 */ 415 static DEFINE_MUTEX(epnested_mutex); 416 static u64 loop_check_gen = 0; 417 418 #define PATH_ARR_SIZE 5 419 420 /* 421 * Per-do_epoll_ctl() scratch for the loop / path checks. 
Allocated on 422 * the caller's stack; populated by ep_ctl_lock() and the downward 423 * walk; consumed by reverse_path_check(); released by ep_ctl_unlock(). 424 * Only valid while the caller holds epnested_mutex. 425 */ 426 struct ep_ctl_ctx { 427 /* 428 * Outer eventpoll for one ep_loop_check(); if the downward walk 429 * reaches it the insert would form a cycle. 430 */ 431 struct eventpoll *inserting_into; 432 433 /* 434 * Singly-linked list of epitems_head objects collected during 435 * ep_loop_check_proc(), then walked by reverse_path_check(). 436 * Terminated by EP_UNACTIVE_PTR, not NULL: epitems_head->next 437 * doubles as a membership flag (a NULL ->next means "not on this 438 * list", see ep_remove_file()), so the list uses a non-NULL 439 * sentinel to keep the tail head distinguishable from an unlisted 440 * one. 441 */ 442 struct epitems_head *tfile_check_list; 443 444 /* 445 * Per-depth wakeup-path tally used by reverse_path_check_proc(); 446 * reinitialized to zero at the start of each reverse_path_check() 447 * iteration. 448 */ 449 int path_count[PATH_ARR_SIZE]; 450 }; 451 452 /* Slab cache used to allocate "struct epitem" */ 453 static struct kmem_cache *epi_cache __ro_after_init; 454 455 /* Slab cache used to allocate "struct eppoll_entry" */ 456 static struct kmem_cache *pwq_cache __ro_after_init; 457 458 /* 459 * Wrapper anchor for file->f_ep when the watched file is not itself an 460 * eventpoll; for the epoll-watches-epoll case, file->f_ep points at 461 * &watched_ep->refs directly. The ->next field threads 462 * ctx->tfile_check_list during one EPOLL_CTL_ADD path check. 
463 */ 464 struct epitems_head { 465 struct hlist_head epitems; 466 struct epitems_head *next; 467 }; 468 469 static struct kmem_cache *ephead_cache __ro_after_init; 470 471 static inline void free_ephead(struct epitems_head *head) 472 { 473 if (head) 474 kmem_cache_free(ephead_cache, head); 475 } 476 477 static void list_file(struct file *file, struct ep_ctl_ctx *ctx) 478 { 479 struct epitems_head *head; 480 481 head = container_of(file->f_ep, struct epitems_head, epitems); 482 if (!head->next) { 483 head->next = ctx->tfile_check_list; 484 ctx->tfile_check_list = head; 485 } 486 } 487 488 static void unlist_file(struct epitems_head *head) 489 { 490 struct epitems_head *to_free = head; 491 struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); 492 if (p) { 493 struct epitem *epi= container_of(p, struct epitem, fllink); 494 spin_lock(&epi->ffd.file->f_lock); 495 if (!hlist_empty(&head->epitems)) 496 to_free = NULL; 497 head->next = NULL; 498 spin_unlock(&epi->ffd.file->f_lock); 499 } 500 free_ephead(to_free); 501 } 502 503 #ifdef CONFIG_SYSCTL 504 505 #include <linux/sysctl.h> 506 507 static long long_zero; 508 static long long_max = LONG_MAX; 509 510 static const struct ctl_table epoll_table[] = { 511 { 512 .procname = "max_user_watches", 513 .data = &max_user_watches, 514 .maxlen = sizeof(max_user_watches), 515 .mode = 0644, 516 .proc_handler = proc_doulongvec_minmax, 517 .extra1 = &long_zero, 518 .extra2 = &long_max, 519 }, 520 }; 521 522 static void __init epoll_sysctls_init(void) 523 { 524 register_sysctl("fs/epoll", epoll_table); 525 } 526 #else 527 #define epoll_sysctls_init() do { } while (0) 528 #endif /* CONFIG_SYSCTL */ 529 530 static const struct file_operations eventpoll_fops; 531 532 bool is_file_epoll(struct file *f) 533 { 534 return f->f_op == &eventpoll_fops; 535 } 536 537 /* Compare RB tree keys */ 538 static inline int ep_cmp_ffd(struct epoll_key *p1, struct epoll_key *p2) 539 { 540 return (p1->file > p2->file ? 
+1: 541 (p1->file < p2->file ? -1 : p1->fd - p2->fd)); 542 } 543 544 /* True iff @epi is on its owning ep's ready list. */ 545 static inline bool ep_is_linked(struct epitem *epi) 546 { 547 return !list_empty(&epi->rdllink); 548 } 549 550 static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) 551 { 552 return container_of(p, struct eppoll_entry, wait); 553 } 554 555 /* Get the "struct epitem" from a wait queue pointer */ 556 static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) 557 { 558 return container_of(p, struct eppoll_entry, wait)->base; 559 } 560 561 /* 562 * Ready-list / ovflist state (see "Ready-list state machine" in the 563 * top-of-file banner for the full state machine). EP_UNACTIVE_PTR is 564 * the sentinel; these wrappers name each transition and each test so 565 * call sites do not need to know the sentinel's value. 566 */ 567 568 /* True iff @ep is between ep_enter_scan() and ep_exit_scan(). */ 569 static inline bool ep_is_scanning(struct eventpoll *ep) 570 { 571 return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; 572 } 573 574 /* Called by ep_start_scan(): divert ep_poll_callback() to ovflist. */ 575 static inline void ep_enter_scan(struct eventpoll *ep) 576 { 577 WRITE_ONCE(ep->ovflist, NULL); 578 } 579 580 /* Called by ep_done_scan(): redirect ep_poll_callback() back to rdllist. */ 581 static inline void ep_exit_scan(struct eventpoll *ep) 582 { 583 WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); 584 } 585 586 /* True iff @epi is currently linked on its ep's ovflist. */ 587 static inline bool epi_on_ovflist(const struct epitem *epi) 588 { 589 return epi->ovflist_next != EP_UNACTIVE_PTR; 590 } 591 592 /* Mark @epi as not on any ovflist (init and post-drain). */ 593 static inline void epi_clear_ovflist(struct epitem *epi) 594 { 595 epi->ovflist_next = EP_UNACTIVE_PTR; 596 } 597 598 /* True iff @ep has ready events that epoll_wait() might harvest. 
*/ 599 static inline bool ep_events_available(struct eventpoll *ep) 600 { 601 unsigned int seq = read_seqcount_begin(&ep->seq); 602 603 return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) || 604 read_seqcount_retry(&ep->seq, seq); 605 } 606 607 #ifdef CONFIG_NET_RX_BUSY_POLL 608 /** 609 * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value 610 * from the epoll instance ep is preferred, but if it is not set fallback to 611 * the system-wide global via busy_loop_timeout. 612 * 613 * @start_time: The start time used to compute the remaining time until timeout. 614 * @ep: Pointer to the eventpoll context. 615 * 616 * Return: true if the timeout has expired, false otherwise. 617 */ 618 static bool busy_loop_ep_timeout(unsigned long start_time, 619 struct eventpoll *ep) 620 { 621 unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); 622 623 if (bp_usec) { 624 unsigned long end_time = start_time + bp_usec; 625 unsigned long now = busy_loop_current_time(); 626 627 return time_after(now, end_time); 628 } else { 629 return busy_loop_timeout(start_time); 630 } 631 } 632 633 static bool ep_busy_loop_on(struct eventpoll *ep) 634 { 635 return !!READ_ONCE(ep->busy_poll_usecs) || 636 READ_ONCE(ep->prefer_busy_poll) || 637 net_busy_loop_on(); 638 } 639 640 static bool ep_busy_loop_end(void *p, unsigned long start_time) 641 { 642 struct eventpoll *ep = p; 643 644 return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); 645 } 646 647 /* 648 * Busy poll if globally on and supporting sockets found && no events, 649 * busy loop will return if need_resched or ep_events_available. 
650 * 651 * we must do our busy polling with irqs enabled 652 */ 653 static bool ep_busy_loop(struct eventpoll *ep) 654 { 655 unsigned int napi_id = READ_ONCE(ep->napi_id); 656 u16 budget = READ_ONCE(ep->busy_poll_budget); 657 bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); 658 659 if (!budget) 660 budget = BUSY_POLL_BUDGET; 661 662 if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { 663 napi_busy_loop(napi_id, ep_busy_loop_end, 664 ep, prefer_busy_poll, budget); 665 if (ep_events_available(ep)) 666 return true; 667 /* 668 * Busy poll timed out. Drop NAPI ID for now, we can add 669 * it back in when we have moved a socket with a valid NAPI 670 * ID onto the ready list. 671 */ 672 if (prefer_busy_poll) 673 napi_resume_irqs(napi_id); 674 ep->napi_id = 0; 675 return false; 676 } 677 return false; 678 } 679 680 /* 681 * Set epoll busy poll NAPI ID from sk. 682 */ 683 static inline void ep_set_busy_poll_napi_id(struct epitem *epi) 684 { 685 struct eventpoll *ep = epi->ep; 686 unsigned int napi_id; 687 struct socket *sock; 688 struct sock *sk; 689 690 if (!ep_busy_loop_on(ep)) 691 return; 692 693 sock = sock_from_file(epi->ffd.file); 694 if (!sock) 695 return; 696 697 sk = sock->sk; 698 if (!sk) 699 return; 700 701 napi_id = READ_ONCE(sk->sk_napi_id); 702 703 /* Non-NAPI IDs can be rejected 704 * or 705 * Nothing to do if we already have this ID 706 */ 707 if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) 708 return; 709 710 /* record NAPI ID for use in next busy poll */ 711 ep->napi_id = napi_id; 712 } 713 714 static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, 715 unsigned long arg) 716 { 717 struct eventpoll *ep = file->private_data; 718 void __user *uarg = (void __user *)arg; 719 struct epoll_params epoll_params; 720 721 switch (cmd) { 722 case EPIOCSPARAMS: 723 if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) 724 return -EFAULT; 725 726 /* pad byte must be zero */ 727 if (epoll_params.__pad) 728 return -EINVAL; 729 730 
if (epoll_params.busy_poll_usecs > S32_MAX) 731 return -EINVAL; 732 733 if (epoll_params.prefer_busy_poll > 1) 734 return -EINVAL; 735 736 if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && 737 !capable(CAP_NET_ADMIN)) 738 return -EPERM; 739 740 WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); 741 WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); 742 WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); 743 return 0; 744 case EPIOCGPARAMS: 745 memset(&epoll_params, 0, sizeof(epoll_params)); 746 epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); 747 epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); 748 epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); 749 if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) 750 return -EFAULT; 751 return 0; 752 default: 753 return -ENOIOCTLCMD; 754 } 755 } 756 757 static void ep_suspend_napi_irqs(struct eventpoll *ep) 758 { 759 unsigned int napi_id = READ_ONCE(ep->napi_id); 760 761 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) 762 napi_suspend_irqs(napi_id); 763 } 764 765 static void ep_resume_napi_irqs(struct eventpoll *ep) 766 { 767 unsigned int napi_id = READ_ONCE(ep->napi_id); 768 769 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) 770 napi_resume_irqs(napi_id); 771 } 772 773 #else 774 775 static inline bool ep_busy_loop(struct eventpoll *ep) 776 { 777 return false; 778 } 779 780 static inline void ep_set_busy_poll_napi_id(struct epitem *epi) 781 { 782 } 783 784 static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, 785 unsigned long arg) 786 { 787 return -EOPNOTSUPP; 788 } 789 790 static void ep_suspend_napi_irqs(struct eventpoll *ep) 791 { 792 } 793 794 static void ep_resume_napi_irqs(struct eventpoll *ep) 795 { 796 } 797 798 #endif /* CONFIG_NET_RX_BUSY_POLL */ 799 800 /* 801 * As described in commit 0ccf831cb lockdep: annotate epoll 802 * the use of wait queues used by epoll is done 
in a very controlled 803 * manner. Wake ups can nest inside each other, but are never done 804 * with the same locking. For example: 805 * 806 * dfd = socket(...); 807 * efd1 = epoll_create(); 808 * efd2 = epoll_create(); 809 * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); 810 * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); 811 * 812 * When a packet arrives to the device underneath "dfd", the net code will 813 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a 814 * callback wakeup entry on that queue, and the wake_up() performed by the 815 * "dfd" net code will end up in ep_poll_callback(). At this point epoll 816 * (efd1) notices that it may have some event ready, so it needs to wake up 817 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() 818 * that ends up in another wake_up(), after having checked about the 819 * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid 820 * stack blasting. 821 * 822 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle 823 * this special case of epoll. 824 */ 825 #ifdef CONFIG_DEBUG_LOCK_ALLOC 826 827 static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, 828 unsigned pollflags) 829 { 830 struct eventpoll *ep_src; 831 unsigned long flags; 832 u8 nests = 0; 833 834 /* 835 * To set the subclass or nesting level for spin_lock_irqsave_nested() 836 * it might be natural to create a per-cpu nest count. However, since 837 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can 838 * schedule() in the -rt kernel, the per-cpu variable are no longer 839 * protected. Thus, we are introducing a per eventpoll nest field. 840 * If we are not being call from ep_poll_callback(), epi is NULL and 841 * we are at the first level of nesting, 0. Otherwise, we are being 842 * called from ep_poll_callback() and if a previous wakeup source is 843 * not an epoll file itself, we are at depth 1 since the wakeup source 844 * is depth 0. 
If the wakeup source is a previous epoll file in the 845 * wakeup chain then we use its nests value and record ours as 846 * nests + 1. The previous epoll file nests value is stable since its 847 * already holding its own poll_wait.lock. 848 */ 849 if (epi) { 850 if ((is_file_epoll(epi->ffd.file))) { 851 ep_src = epi->ffd.file->private_data; 852 nests = ep_src->nests; 853 } else { 854 nests = 1; 855 } 856 } 857 spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); 858 ep->nests = nests + 1; 859 wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); 860 ep->nests = 0; 861 spin_unlock_irqrestore(&ep->poll_wait.lock, flags); 862 } 863 864 #else 865 866 static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, 867 __poll_t pollflags) 868 { 869 wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); 870 } 871 872 #endif 873 874 static void ep_remove_wait_queue(struct eppoll_entry *pwq) 875 { 876 wait_queue_head_t *whead; 877 878 rcu_read_lock(); 879 /* 880 * POLLFREE handshake, acquire side; see "POLLFREE handshake" 881 * at the top of this file. 882 * 883 * A NULL load is paired with the smp_store_release(&whead, NULL) 884 * in ep_poll_callback()'s POLLFREE branch: the teardown is 885 * complete and we must not touch whead again. On a non-NULL load 886 * rcu_read_lock() keeps the waitqueue memory alive (POLLFREE 887 * firers RCU-defer the free) and whead->lock inside 888 * remove_wait_queue() serializes us against the store side. 889 */ 890 whead = smp_load_acquire(&pwq->whead); 891 if (whead) 892 remove_wait_queue(whead, &pwq->wait); 893 rcu_read_unlock(); 894 } 895 896 /* 897 * This function unregisters poll callbacks from the associated file 898 * descriptor. Must be called with "mtx" held. 
899 */ 900 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 901 { 902 struct eppoll_entry **p = &epi->pwqlist; 903 struct eppoll_entry *pwq; 904 905 while ((pwq = *p) != NULL) { 906 *p = pwq->next; 907 ep_remove_wait_queue(pwq); 908 kmem_cache_free(pwq_cache, pwq); 909 } 910 } 911 912 /* call only when ep->mtx is held */ 913 static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) 914 { 915 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); 916 } 917 918 /* call only when ep->mtx is held */ 919 static inline void ep_pm_stay_awake(struct epitem *epi) 920 { 921 struct wakeup_source *ws = ep_wakeup_source(epi); 922 923 if (ws) 924 __pm_stay_awake(ws); 925 } 926 927 static inline bool ep_has_wakeup_source(struct epitem *epi) 928 { 929 return rcu_access_pointer(epi->ws) ? true : false; 930 } 931 932 /* call when ep->mtx cannot be held (ep_poll_callback) */ 933 static inline void ep_pm_stay_awake_rcu(struct epitem *epi) 934 { 935 struct wakeup_source *ws; 936 937 rcu_read_lock(); 938 ws = rcu_dereference(epi->ws); 939 if (ws) 940 __pm_stay_awake(ws); 941 rcu_read_unlock(); 942 } 943 944 945 /* 946 * ep->mutex needs to be held because we could be hit by 947 * eventpoll_release_file() and epoll_ctl(). 948 */ 949 static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch) 950 { 951 /* 952 * Steal the ready list, and re-init the original one to the 953 * empty list. Also, set ep->ovflist to NULL so that events 954 * happening while looping w/out locks, are not lost. We cannot 955 * have the poll callback to queue directly on ep->rdllist, 956 * because we want the "sproc" callback to be able to do it 957 * in a lockless way. 
958 */ 959 lockdep_assert_irqs_enabled(); 960 spin_lock_irq(&ep->lock); 961 write_seqcount_begin(&ep->seq); 962 963 list_splice_init(&ep->rdllist, scan_batch); 964 ep_enter_scan(ep); 965 966 write_seqcount_end(&ep->seq); 967 spin_unlock_irq(&ep->lock); 968 } 969 970 static void ep_done_scan(struct eventpoll *ep, 971 struct list_head *scan_batch) 972 { 973 struct epitem *epi, *nepi; 974 975 spin_lock_irq(&ep->lock); 976 /* 977 * During the time we spent inside the "sproc" callback, some 978 * other events might have been queued by the poll callback. 979 * We re-insert them inside the main ready-list here. 980 */ 981 for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; ) { 982 nepi = epi->ovflist_next; 983 epi_clear_ovflist(epi); 984 /* 985 * Skip items that the caller already returned via @scan_batch 986 * -- the list_splice() below takes care of those. 987 */ 988 if (!ep_is_linked(epi)) { 989 /* 990 * ovflist is LIFO; list_add() head-insert here 991 * reverses the iteration order into FIFO. 992 */ 993 list_add(&epi->rdllink, &ep->rdllist); 994 ep_pm_stay_awake(epi); 995 } 996 } 997 998 write_seqcount_begin(&ep->seq); 999 1000 /* Back out of scan mode; callbacks target ep->rdllist again. */ 1001 ep_exit_scan(ep); 1002 1003 /* 1004 * Quickly re-inject items left on "scan_batch". 1005 */ 1006 list_splice(scan_batch, &ep->rdllist); 1007 1008 write_seqcount_end(&ep->seq); 1009 1010 __pm_relax(ep->ws); 1011 1012 if (!list_empty(&ep->rdllist)) { 1013 if (waitqueue_active(&ep->wq)) 1014 wake_up(&ep->wq); 1015 } 1016 1017 spin_unlock_irq(&ep->lock); 1018 } 1019 1020 static void ep_get(struct eventpoll *ep) 1021 { 1022 refcount_inc(&ep->refcount); 1023 } 1024 1025 /* 1026 * Drop a reference to @ep; returns true iff it was the last, in which 1027 * case the caller is responsible for ep_free(). 
1028 */ 1029 static bool ep_put(struct eventpoll *ep) 1030 { 1031 if (!refcount_dec_and_test(&ep->refcount)) 1032 return false; 1033 1034 WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); 1035 return true; 1036 } 1037 1038 static void ep_free(struct eventpoll *ep) 1039 { 1040 ep_resume_napi_irqs(ep); 1041 mutex_destroy(&ep->mtx); 1042 free_uid(ep->user); 1043 wakeup_source_unregister(ep->ws); 1044 /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */ 1045 kfree_rcu(ep, rcu); 1046 } 1047 1048 /* 1049 * Pin @epi->ffd.file for operations that require both safe dereference 1050 * and exclusion from __fput(). 1051 * 1052 * struct file uses SLAB_TYPESAFE_BY_RCU, so a freed slot can be 1053 * reassigned at any time. The bare load of epi->ffd.file is safe here 1054 * because the caller holds ep->mtx and eventpoll_release_file() blocks 1055 * on that mutex while tearing down the epi, so the backing file 1056 * allocation cannot be freed and reused under us. An rcu_read_lock() 1057 * is therefore unnecessary for the load. 1058 * 1059 * A successful file_ref_get() additionally blocks __fput() from 1060 * starting on this file: once the refcount has reached zero it cannot 1061 * come back. ep_remove() relies on that to touch file->f_lock and 1062 * file->f_ep without racing eventpoll_release_file() (see commit 1063 * a6dc643c6931). A NULL return means __fput() is already in flight; 1064 * the caller must bail without touching the file, and 1065 * eventpoll_release_file() will clean the epi up from its side. 1066 */ 1067 static struct file *epi_fget(const struct epitem *epi) 1068 { 1069 struct file *file; 1070 1071 file = epi->ffd.file; 1072 if (!file_ref_get(&file->f_ref)) 1073 file = NULL; 1074 return file; 1075 } 1076 1077 /* 1078 * Takes &file->f_lock; returns with it released. 
*/
static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
			   struct file *file)
{
	struct epitems_head *to_free = NULL;
	struct hlist_head *head;

	lockdep_assert_held(&ep->mtx);

	spin_lock(&file->f_lock);
	head = file->f_ep;
	if (hlist_is_singular_node(&epi->fllink, head)) {
		/*
		 * Last watcher: publish NULL so the eventpoll_release()
		 * fastpath in include/linux/eventpoll.h can skip the slow
		 * path on a future __fput(). Safe because every f_ep writer
		 * either holds a pin on @file via epi_fget() or is __fput()
		 * itself -- see the comment in eventpoll_release().
		 */
		WRITE_ONCE(file->f_ep, NULL);
		if (!is_file_epoll(file)) {
			/*
			 * For a non-epoll file the hlist head is embedded
			 * in a struct epitems_head. It is released below,
			 * outside f_lock, but only when its ->next is NULL.
			 */
			struct epitems_head *v;
			v = container_of(head, struct epitems_head, epitems);
			if (!smp_load_acquire(&v->next))
				to_free = v;
		}
	}
	hlist_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);
}

/*
 * Detach @epi from @ep itself: erase it from the rbtree, unlink it from
 * the ready list, unregister its wakeup source, queue the epitem for
 * RCU-deferred freeing and give back the user's epoll_watches charge.
 * Caller holds ep->mtx.
 */
static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi)
{
	lockdep_assert_held(&ep->mtx);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	spin_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	spin_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	kfree_rcu(epi, rcu);

	percpu_counter_dec(&ep->user->epoll_watches);
}

/*
 * ep_remove variant for callers owning an additional reference to the ep
 *
 * Because of that extra reference, the ep_put() at the end is never
 * expected to be the final one; it is wrapped in WARN_ON_ONCE().
 */
static void ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	/* Dropped via fput() on scope exit (cleanup attribute). */
	struct file *file __free(fput) = NULL;

	lockdep_assert_irqs_enabled();
	lockdep_assert_held(&ep->mtx);

	ep_unregister_pollwait(ep, epi);

	/*
	 * If we manage to grab a reference it means we're not in
	 * eventpoll_release_file() and aren't going to be: once @file's
	 * refcount has reached zero, file_ref_get() cannot bring it back.
	 */
	file = epi_fget(epi);
	if (!file)
		return;

	ep_remove_file(ep, epi, file);
	ep_remove_epi(ep, epi);
	WARN_ON_ONCE(ep_put(ep));
}

/*
 * Pass 1 of ep_clear_and_put(): drain every epi's pwqlist.
 * ep_unregister_pollwait() takes each watched wait-queue head's lock,
 * which synchronizes with any in-flight ep_poll_callback(); after
 * this returns no callback can still be about to dereference an epi
 * on this ep. Must strictly precede ep_drain_tree() -- fusing the
 * two walks would let a callback queued on epi_i still fire after
 * epi_{i+k} had already been freed.
 */
static void ep_drain_pollwaits(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	lockdep_assert_held(&ep->mtx);

	/* Nothing is erased in this pass, so a plain rb_next() walk is fine. */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}
}

/*
 * Pass 2 of ep_clear_and_put(): ep_remove() every epi. The per-epi
 * pwqlist is already empty (ep_drain_pollwaits ran), but the rest of
 * ep_remove() still runs: epi_fget() pin, f_ep clear under f_lock,
 * rbtree erase, rdllist unlink, kfree_rcu(epi).
rb_next() is captured
 * before each erase so the iteration is stable.
 *
 * A concurrent eventpoll_release_file() (removal path C) on a watched
 * file serializes with us via ep->mtx; ep_remove() transparently
 * hands off any epi whose file is in __fput() by bailing when
 * epi_fget() returns NULL, and path C will clean that epi up.
 */
static void ep_drain_tree(struct eventpoll *ep)
{
	struct rb_node *rbp, *next;
	struct epitem *epi;

	lockdep_assert_held(&ep->mtx);

	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
		/* Must be read before ep_remove() may erase @rbp. */
		next = rb_next(rbp);
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
}

/*
 * Removal path B (see "Removal paths" in the top-of-file banner):
 * close of the epoll fd itself, reached via ep_eventpoll_release().
 *
 * Two passes under ep->mtx: first ep_drain_pollwaits() quiesces
 * in-flight callbacks, then ep_drain_tree() frees the epis. The
 * ep->refcount is kept > 0 across the walk by the ep file's own
 * share, which we drop below; ep_free() runs iff we were the last
 * holder after the tree drained.
 */
static void ep_clear_and_put(struct eventpoll *ep)
{
	/* Release any threads blocked in poll-on-ep. */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(ep, NULL, 0);

	mutex_lock(&ep->mtx);
	ep_drain_pollwaits(ep);
	ep_drain_tree(ep);
	mutex_unlock(&ep->mtx);

	if (ep_put(ep))
		ep_free(ep);
}

/*
 * ioctl handler for epoll files. EPIOCSPARAMS and EPIOCGPARAMS are
 * forwarded to ep_eventpoll_bp_ioctl(); every other command, and any
 * file that is not an epoll file, yields -EINVAL.
 */
static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
{
	int ret;

	if (!is_file_epoll(file))
		return -EINVAL;

	switch (cmd) {
	case EPIOCSPARAMS:
	case EPIOCGPARAMS:
		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * ->release() of the epoll file: tear down the eventpoll stored in
 * file->private_data, if there is one. Always returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_clear_and_put(ep);

	return 0;
}

/* Forward declaration: ep_item_poll() and __ep_eventpoll_poll() recurse. */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

/*
 * Poll an epoll file. Registers @wait on ep->poll_wait, then re-polls
 * every item currently on the ready list under ep->mtx (taken with
 * lockdep subclass @depth). Returns EPOLLIN | EPOLLRDNORM as soon as one
 * item reports caller-requested events, 0 if none does. Items found not
 * ready are dropped from the ready list along the way.
 */
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
	struct eventpoll *ep = file->private_data;
	LIST_HEAD(scan_batch);
	struct epitem *epi, *tmp;
	poll_table pt;
	__poll_t res = 0;

	/* NULL queue proc: the re-polls below must not register waiters. */
	init_poll_funcptr(&pt, NULL);

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	mutex_lock_nested(&ep->mtx, depth);
	ep_start_scan(ep, &scan_batch);
	list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
		if (ep_item_poll(epi, &pt, depth + 1)) {
			res = EPOLLIN | EPOLLRDNORM;
			break;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * caller requested events goes. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}
	/* Whatever is still on scan_batch goes back onto ep->rdllist. */
	ep_done_scan(ep, &scan_batch);
	mutex_unlock(&ep->mtx);
	return res;
}

/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 *
 * Returns the polled events masked by @epi->event.events, or 0 when the
 * watched file could not be pinned.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct file *file = epi_fget(epi);
	__poll_t res;

	/*
	 * We could return EPOLLERR | EPOLLHUP or something, but let's
	 * treat this more as "file doesn't exist, poll didn't happen".
	 */
	if (!file)
		return 0;

	pt->_key = epi->event.events;
	/* A nested epoll file is polled directly so @depth is carried along. */
	if (!is_file_epoll(file))
		res = vfs_poll(file, pt);
	else
		res = __ep_eventpoll_poll(file, pt, depth);
	fput(file);
	return res & epi->event.events;
}

/* ->poll() entry point of the epoll file: top-level call, depth 0. */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() of the epoll file: print one line per watched item
 * (fd, event mask, user data, file position, inode number, device),
 * walking the rbtree under ep->mtx and stopping early once the seq_file
 * buffer has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%llx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
	.unlocked_ioctl	= ep_eventpoll_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * A concurrent ep_remove() cannot outrace us: it pins @file via
	 * epi_fget(), which fails once __fput() has dropped the refcount
	 * to zero -- the path we're on. So any racing ep_remove() bails
	 * and leaves the epi for us to clean up here.
	 */
again:
	/*
	 * Remove one epi per iteration: f_lock is a spinlock and has to be
	 * dropped before the sleeping ep->mtx can be taken, so the list
	 * head is re-read from scratch every time around.
	 */
	spin_lock(&file->f_lock);
	if (file->f_ep && file->f_ep->first) {
		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
		spin_unlock(&file->f_lock);

		/*
		 * ep access is safe as we still own a reference to the ep
		 * struct
		 */
		ep = epi->ep;
		mutex_lock(&ep->mtx);

		ep_unregister_pollwait(ep, epi);

		ep_remove_file(ep, epi, file);
		ep_remove_epi(ep, epi);

		mutex_unlock(&ep->mtx);

		/* Drop the reference the epi held; free @ep if it was the last. */
		if (ep_put(ep))
			ep_free(ep);
		goto again;
	}
	spin_unlock(&file->f_lock);
}

/*
 * Allocate and initialize a new eventpoll with a reference count of 1,
 * charged to the current user. On success stores it in @pep and returns
 * 0; returns -ENOMEM if the allocation fails.
 */
static int ep_alloc(struct eventpoll **pep)
{
	struct eventpoll *ep;

	ep = kzalloc_obj(*ep);
	if (unlikely(!ep))
		return -ENOMEM;

	mutex_init(&ep->mtx);
	spin_lock_init(&ep->lock);
	seqcount_spinlock_init(&ep->seq, &ep->lock);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED;
	ep->ovflist = EP_UNACTIVE_PTR; /* not scanning */
	ep->user = get_current_user();
	refcount_set(&ep->refcount, 1);

	*pep = ep;

	return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 *
 * Returns the matching epitem, or NULL if @tf is not in the tree.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct epoll_key *tf)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;

	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(tf, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * The return value (ewake) is 1 for every non-EPOLLEXCLUSIVE item; for
 * an EPOLLEXCLUSIVE item it is 1 only when the switch below matched
 * while a waiter was present on ep->wq.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	spin_lock_irqsave(&ep->lock, flags);

	ep_set_busy_poll_napi_id(epi);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (pollflags && !(pollflags & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (ep_is_scanning(ep)) {
		if (!epi_on_ovflist(epi)) {
			/* Push onto the head of the singly linked ovflist. */
			epi->ovflist_next = READ_ONCE(ep->ovflist);
			WRITE_ONCE(ep->ovflist, epi);
			ep_pm_stay_awake_rcu(epi);
		}
	} else if (!ep_is_linked(epi)) {
		/* In the usual case, add event to ready list. */
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			/*
			 * There is deliberately no default case: any other
			 * combination of the EPOLLINOUT_BITS leaves ewake
			 * at 0.
			 */
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		if (sync)
			wake_up_sync(&ep->wq);
		else
			wake_up(&ep->wq);
	}
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

	if (!(epi->event.events & EPOLLEXCLUSIVE))
		ewake = 1;

	if (pollflags & POLLFREE) {
		/*
		 * POLLFREE handshake, release side; see "POLLFREE handshake"
		 * at the top of this file.
		 *
		 * Unlink our wait entry with list_del_init rather than
		 * __remove_wait_queue: a concurrent ep_remove_wait_queue()
		 * that already loaded a non-NULL whead may still call
		 * remove_wait_queue() after us, and list_del_init() tolerates
		 * the second delete.
		 *
		 * smp_store_release(&whead, NULL) publishes the teardown to
		 * ep_remove_wait_queue()'s smp_load_acquire(). Before this
		 * store, a racing ep_clear_and_put() / ep_remove() reaches
		 * ep_remove_wait_queue() which sees whead != NULL and takes
		 * whead->lock -- the same lock held by our caller, so it
		 * serializes behind us. Once whead is zeroed, nothing else
		 * protects ep / epi / wait.
		 */
		list_del_init(&wait->entry);
		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * Allocation failure is reported to the caller by clearing epq->epi;
 * once that has happened, further invocations are no-ops.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
	struct epitem *epi = epq->epi;
	struct eppoll_entry *pwq;

	if (unlikely(!epi))	// an earlier allocation has failed
		return;

	pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
	if (unlikely(!pwq)) {
		epq->epi = NULL;
		return;
	}

	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
	pwq->whead = whead;
	pwq->base = epi;
	if (epi->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &pwq->wait);
	else
		add_wait_queue(whead, &pwq->wait);
	/* Chain the new entry at the head of the item's pwqlist. */
	pwq->next = epi->pwqlist;
	epi->pwqlist = pwq;
}

/*
 * Link @epi into @ep's cached rbtree, ordered by ep_cmp_ffd(). A key
 * that does not compare greater than the node it is tested against
 * descends to the left. 'leftmost' stays true only if the descent never
 * went right, which is what the cached-leftmost insert needs to know.
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
	struct epitem *epic;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0) {
			p = &parent->rb_right;
			leftmost = false;
		} else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



/*
 * Upper bound on wakeup paths emanating from any one watched file,
 * indexed by path depth (1..PATH_ARR_SIZE). For example, we allow
 * 1000 paths of length 1 from each watched file. These caps limit
 * the wakeup amplification that can be built from epoll-watches-
 * epoll topologies without rejecting reasonable usage.
 *
 * Enforced at EPOLL_CTL_ADD; CTL_MOD and CTL_DEL cannot add paths.
 * The running tallies live in ctx->path_count[] and are protected by
 * epnested_mutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };

/*
 * Account one more path at nesting level @nests. Returns 0 while the
 * tally stays within path_limits[@nests], -1 once it exceeds it.
 * Level 0 is neither counted nor limited.
 */
static int path_count_inc(struct ep_ctl_ctx *ctx, int nests)
{
	/* Allow an arbitrary number of depth 1 paths */
	if (nests == 0)
		return 0;

	if (++ctx->path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

/* Reset every per-depth path tally in @ctx to zero. */
static void path_count_init(struct ep_ctl_ctx *ctx)
{
	int i;

	for (i = 0; i < PATH_ARR_SIZE; i++)
		ctx->path_count[i] = 0;
}

/*
 * Walk upwards from the epitems on @refs. An eventpoll whose own refs
 * list is empty ends a path, which is then counted at the current
 * @depth; otherwise the walk recurses into that eventpoll's refs one
 * level deeper. Returns 0 on success, -1 if the nesting exceeds
 * EP_MAX_NESTS or a path limit is hit. The only caller in view,
 * reverse_path_check(), invokes this inside rcu_read_lock().
 */
static int reverse_path_check_proc(struct ep_ctl_ctx *ctx,
				   struct hlist_head *refs, int depth)
{
	int error = 0;
	struct epitem *epi;

	if (depth > EP_MAX_NESTS) /* too deep nesting */
		return -1;

	/* CTL_DEL can remove links here, but that can't increase our count */
	hlist_for_each_entry_rcu(epi, refs, fllink) {
		/* Note: intentionally shadows the @refs parameter. */
		struct hlist_head *refs = &epi->ep->refs;
		if (hlist_empty(refs))
			error = path_count_inc(ctx, depth);
		else
			error = reverse_path_check_proc(ctx, refs, depth + 1);
		if (error != 0)
			break;
	}
	return error;
}

/**
 * reverse_path_check - ctx->tfile_check_list is a list of epitems_head
 *                      anchoring files with newly proposed links; make
 *                      sure those links don't push any path-length bucket
 *                      over its limit in path_limits[].
 * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *	    %-1 otherwise.
 */
static int reverse_path_check(struct ep_ctl_ctx *ctx)
{
	struct epitems_head *p;

	for (p = ctx->tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
		int error;
		/* The tallies are per anchoring file: start each from zero. */
		path_count_init(ctx);
		rcu_read_lock();
		error = reverse_path_check_proc(ctx, &p->epitems, 0);
		rcu_read_unlock();
		if (error)
			return error;
	}
	return 0;
}

/*
 * Register a wakeup source for @epi, named after the watched file's
 * dentry, and publish it in epi->ws. The eventpoll-wide source ep->ws
 * (named "eventpoll") is registered first if it does not exist yet.
 * Returns 0 on success or -ENOMEM if either registration fails; when
 * only the per-item registration fails, a freshly created ep->ws is
 * left in place.
 */
static int ep_create_wakeup_source(struct epitem *epi)
{
	struct name_snapshot n;
	struct wakeup_source *ws;

	if (!epi->ep->ws) {
		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
		if (!epi->ep->ws)
			return -ENOMEM;
	}

	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
	ws = wakeup_source_register(NULL, n.name.name);
	release_dentry_name_snapshot(&n);

	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}

/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	/* Unpublish first so that no new reader can pick @ws up. */
	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
	 * used internally by wakeup_source_remove, too (called by
	 * wakeup_source_unregister), so we cannot use call_rcu
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}

/*
 * Add @epi to the list of epitems watching @file (file->f_ep),
 * installing a list head first if the file has none yet. For an epoll
 * file the head is the target eventpoll's ->refs; for any other file a
 * struct epitems_head is allocated from ephead_cache. Returns 0 on
 * success or -ENOMEM if that allocation fails.
 */
static int ep_attach_file(struct file *file, struct epitem *epi)
{
	struct epitems_head *to_free = NULL;
	struct hlist_head *head = NULL;
	struct eventpoll *ep = NULL;

	if (is_file_epoll(file))
		ep = file->private_data;

	if (ep) {
		head = &ep->refs;
	} else if (!READ_ONCE(file->f_ep)) {
		/* Allocate before taking f_lock: GFP_KERNEL may sleep. */
allocate:
		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
		if (!to_free)
			return -ENOMEM;
		head = &to_free->epitems;
	}
	spin_lock(&file->f_lock);
	if (!file->f_ep) {
		if (unlikely(!head)) {
			/*
			 * f_ep was set at the unlocked check above but has
			 * been cleared since: drop the lock and go allocate
			 * a head after all.
			 */
			spin_unlock(&file->f_lock);
			goto allocate;
		}
		/* See eventpoll_release() for details. */
		WRITE_ONCE(file->f_ep, head);
		/* The head is now owned by @file; do not free it below. */
		to_free = NULL;
	}
	hlist_add_head_rcu(&epi->fllink, file->f_ep);
	spin_unlock(&file->f_lock);
	/* Frees a head that was allocated but turned out not to be needed. */
	free_ephead(to_free);
	return 0;
}

/*
 * Charge the user's epoll_watches quota, allocate a fresh epitem for
 * @tf, and initialize its fields. The returned item is not yet linked
 * into any data structure; the caller must install it via
 * ep_register_epitem() (which takes over on success) or kmem_cache_free()
 * it and decrement epoll_watches on its own.
 *
 * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
 * if the slab allocation fails.
1804 */ 1805 static struct epitem *ep_alloc_epitem(struct eventpoll *ep, 1806 const struct epoll_event *event, 1807 struct epoll_key *tf) 1808 { 1809 struct epitem *epi; 1810 1811 if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, 1812 max_user_watches) >= 0)) 1813 return ERR_PTR(-ENOSPC); 1814 percpu_counter_inc(&ep->user->epoll_watches); 1815 1816 epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL); 1817 if (unlikely(!epi)) { 1818 percpu_counter_dec(&ep->user->epoll_watches); 1819 return ERR_PTR(-ENOMEM); 1820 } 1821 1822 INIT_LIST_HEAD(&epi->rdllink); 1823 epi->ep = ep; 1824 epi->ffd = *tf; 1825 epi->event = *event; 1826 epi_clear_ovflist(epi); 1827 1828 return epi; 1829 } 1830 1831 /* 1832 * Install @epi into its target file's f_ep hlist and into @ep's rbtree, 1833 * taking one additional reference on @ep for the lifetime of the item. 1834 * 1835 * If @tep is non-NULL, the target file is itself an eventpoll; we hold 1836 * tep->mtx at subclass 1 across the attach + rbtree insert to serialize 1837 * with the target side. RB tree ops are protected by @ep->mtx, which 1838 * the caller already holds. 1839 * 1840 * On failure the epi is freed and the epoll_watches counter decremented, 1841 * matching ep_alloc_epitem()'s allocation. After this returns 1842 * successfully, ep_insert()'s later error paths use ep_remove() for 1843 * unwind; that cannot drop @ep's refcount to zero because the ep file 1844 * itself still holds the original reference. 
*/
static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
			      struct epitem *epi, struct eventpoll *tep,
			      int full_check)
{
	struct file *tfile = epi->ffd.file;
	int error;

	if (tep)
		mutex_lock_nested(&tep->mtx, 1);

	error = ep_attach_file(tfile, epi);
	if (unlikely(error)) {
		/* Undo ep_alloc_epitem(): free the item, return the quota. */
		if (tep)
			mutex_unlock(&tep->mtx);
		kmem_cache_free(epi_cache, epi);
		percpu_counter_dec(&ep->user->epoll_watches);
		return error;
	}

	/* Only non-epoll targets are queued on @ctx for the path check. */
	if (full_check && !tep)
		list_file(tfile, ctx);

	ep_rbtree_insert(ep, epi);

	if (tep)
		mutex_unlock(&tep->mtx);

	/* The installed item now holds its own reference on @ep. */
	ep_get(ep);
	return 0;
}

/*
 * Add a new watch for @tf to @ep with the interest set in @event.
 * Returns 0 on success or a negative errno; on failure the item has
 * already been freed or removed again.
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
		     const struct epoll_event *event, struct epoll_key *tf,
		     int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	struct epitem *epi;
	struct ep_pqueue epq;
	struct eventpoll *tep = NULL;

	if (is_file_epoll(tf->file))
		tep = tf->file->private_data;

	lockdep_assert_irqs_enabled();

	epi = ep_alloc_epitem(ep, event, tf);
	if (IS_ERR(epi))
		return PTR_ERR(epi);

	/* On failure ep_register_epitem() has already freed @epi. */
	error = ep_register_epitem(ctx, ep, epi, tep, full_check);
	if (error)
		return error;

	/* Reject the insert if the new link would create too many back-paths. */
	if (unlikely(full_check && reverse_path_check(ctx))) {
		ep_remove(ep, epi);
		return -EINVAL;
	}

	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error) {
			ep_remove(ep, epi);
			return error;
		}
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/* ep_ptable_queue_proc() signals allocation failure by clearing epq.epi. */
	if (unlikely(!epq.epi)) {
		ep_remove(ep, epi);
		return -ENOMEM;
	}

	/* Drop the new item onto the ready list if it is already ready. */
	spin_lock_irq(&ep->lock);

	ep_set_busy_poll_napi_id(epi);

	/* The callback may already have queued the item; do not add it twice. */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irq(&ep->lock);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     const struct epoll_event *event)
{
	int pwake = 0;
	poll_table pt;

	lockdep_assert_irqs_enabled();

	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events; /* need barrier below */
	epi->event.data = event->data; /* protected by mtx */
	if (epi->event.events & EPOLLWAKEUP) {
		/*
		 * NOTE(review): the return value of
		 * ep_create_wakeup_source() is ignored here, so a failed
		 * registration leaves EPOLLWAKEUP set in the mask without
		 * a wakeup source behind it. Confirm this is intended.
		 */
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs.  This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll().  This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * (or both) will notice the readiness of an item.
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (ep_item_poll(epi, &pt, 1)) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}

/*
 * Attempt to deliver one event for @epi into @*uevents.
*
 * Returns 1 if an event was delivered (with *uevents advanced to the
 * next slot), 0 if the re-poll reported no caller-requested events
 * (@epi drops out of the ready list; a future callback will re-add
 * it), or -EFAULT if copy_to_user() faulted (in which case @epi is
 * re-inserted at the head of @scan_batch so ep_done_scan() merges it
 * back to rdllist for the next attempt).
 *
 * PM bookkeeping and level-triggered re-queue are handled here.
 * Caller holds ep->mtx and the scan is active.
 */
static int ep_deliver_event(struct eventpoll *ep, struct epitem *epi,
			    poll_table *pt,
			    struct epoll_event __user **uevents,
			    struct list_head *scan_batch)
{
	struct epoll_event __user *next;
	struct wakeup_source *ws;
	__poll_t revents;

	/*
	 * Activate ep->ws before deactivating epi->ws to prevent
	 * triggering auto-suspend here (in case we reactivate epi->ws
	 * below). Rearranging to delay the deactivation would let
	 * epi->ws drift out of sync with ep_is_linked().
	 */
	ws = ep_wakeup_source(epi);
	if (ws) {
		if (ws->active)
			__pm_stay_awake(ep->ws);
		__pm_relax(ws);
	}

	/* Off the batch for now; every exit path below decides its fate. */
	list_del_init(&epi->rdllink);

	/*
	 * Re-poll under ep->mtx so userspace cannot change the item
	 * out from under us. If no caller-requested events remain,
	 * @epi stays off the ready list; the poll callback will
	 * re-queue it when events next appear.
	 */
	revents = ep_item_poll(epi, pt, 1);
	if (!revents)
		return 0;

	next = epoll_put_uevent(revents, epi->event.data, *uevents);
	if (!next) {
		/*
		 * copy_to_user() faulted: put the item back so
		 * ep_done_scan() splices it onto rdllist for the next
		 * attempt.
		 */
		list_add(&epi->rdllink, scan_batch);
		ep_pm_stay_awake(epi);
		return -EFAULT;
	}
	*uevents = next;

	if (epi->event.events & EPOLLONESHOT) {
		/* One-shot: keep only the private bits, disabling the item. */
		epi->event.events &= EP_PRIVATE_BITS;
	} else if (!(epi->event.events & EPOLLET)) {
		/*
		 * Level-triggered: re-queue so the next epoll_wait()
		 * rechecks availability. We are the sole writer to
		 * rdllist here -- epoll_ctl() callers are locked out
		 * by ep->mtx, and the poll callback queues to ovflist
		 * during scans.
		 */
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);
	}
	return 1;
}

/*
 * Copy up to @maxevents ready events from @ep to the user buffer
 * @events. Returns the number of events delivered. A copy fault is
 * reported as -EFAULT only when nothing had been delivered before it;
 * otherwise the count so far is returned. Returns -EINTR without
 * scanning if a fatal signal is pending.
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct epitem *epi, *tmp;
	LIST_HEAD(scan_batch);
	poll_table pt;
	int res = 0;

	/*
	 * Always short-circuit for fatal signals to allow threads to make a
	 * timely exit without the chance of finding more events available and
	 * fetching repeatedly.
	 */
	if (fatal_signal_pending(current))
		return -EINTR;

	init_poll_funcptr(&pt, NULL);

	mutex_lock(&ep->mtx);
	ep_start_scan(ep, &scan_batch);

	/*
	 * We can loop without lock because we are passed a task-private
	 * scan_batch; items cannot vanish while we hold ep->mtx.
	 */
	list_for_each_entry_safe(epi, tmp, &scan_batch, rdllink) {
		int delivered;

		if (res >= maxevents)
			break;

		delivered = ep_deliver_event(ep, epi, &pt, &events, &scan_batch);
		if (delivered < 0) {
			/* Report the error only if nothing was delivered yet. */
			if (!res)
				res = delivered;
			break;
		}
		res += delivered;
	}

	ep_done_scan(ep, &scan_batch);
	mutex_unlock(&ep->mtx);

	return res;
}

/*
 * Convert a relative timeout of @ms milliseconds into an absolute
 * timespec stored in @to. Returns NULL for a negative @ms, @to set to
 * zero for @ms == 0, and otherwise @to set to the current
 * ktime_get_ts64() time plus the timeout, added with
 * timespec64_add_safe().
 */
static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
	struct timespec64 now;

	if (ms < 0)
		return NULL;

	if (!ms) {
		to->tv_sec = 0;
		to->tv_nsec = 0;
		return to;
	}

	to->tv_sec = ms / MSEC_PER_SEC;
	to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);

	ktime_get_ts64(&now);
	*to = timespec64_add_safe(now, *to);
	return to;
}

/*
 * autoremove_wake_function, but remove even on failure to wake up, because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/*
	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
	 * iterations see the cause of this wakeup.
	 */
	list_del_init_careful(&wq_entry->entry);
	return ret;
}

static int ep_try_send_events(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int res;

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
2207 */ 2208 res = ep_send_events(ep, events, maxevents); 2209 if (res > 0) 2210 ep_suspend_napi_irqs(ep); 2211 return res; 2212 } 2213 2214 static int ep_schedule_timeout(ktime_t *to) 2215 { 2216 if (to) 2217 return ktime_after(*to, ktime_get()); 2218 else 2219 return 1; 2220 } 2221 2222 /** 2223 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied 2224 * event buffer. 2225 * 2226 * @ep: Pointer to the eventpoll context. 2227 * @events: Pointer to the userspace buffer where the ready events should be 2228 * stored. 2229 * @maxevents: Size (in terms of number of events) of the caller event buffer. 2230 * @timeout: Maximum timeout for the ready events fetch operation, in 2231 * timespec. If the timeout is zero, the function will not block, 2232 * while if the @timeout ptr is NULL, the function will block 2233 * until at least one event has been retrieved (or an error 2234 * occurred). 2235 * 2236 * Return: the number of ready events which have been fetched, or an 2237 * error code, in case of error. 2238 */ 2239 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 2240 int maxevents, struct timespec64 *timeout) 2241 { 2242 int res, timed_out = 0; 2243 bool eavail; 2244 u64 slack = 0; 2245 wait_queue_entry_t wait; 2246 ktime_t expires, *to = NULL; 2247 2248 lockdep_assert_irqs_enabled(); 2249 2250 if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { 2251 slack = select_estimate_accuracy(timeout); 2252 to = &expires; 2253 *to = timespec64_to_ktime(*timeout); 2254 } else if (timeout) { 2255 /* 2256 * Avoid the unnecessary trip to the wait queue loop, if the 2257 * caller specified a non blocking operation. 2258 */ 2259 timed_out = 1; 2260 } 2261 2262 /* 2263 * This call is racy: We may or may not see events that are being added 2264 * to the ready list under the lock (e.g., in IRQ callbacks). For cases 2265 * with a non-zero timeout, this thread will check the ready list under 2266 * lock and will add to the wait queue. 
For cases with a zero 2267 * timeout, the user by definition should not care and will have to 2268 * recheck again. 2269 */ 2270 eavail = ep_events_available(ep); 2271 2272 while (1) { 2273 if (eavail) { 2274 res = ep_try_send_events(ep, events, maxevents); 2275 if (res) 2276 return res; 2277 } 2278 2279 if (timed_out) 2280 return 0; 2281 2282 eavail = ep_busy_loop(ep); 2283 if (eavail) 2284 continue; 2285 2286 if (signal_pending(current)) 2287 return -EINTR; 2288 2289 /* 2290 * Internally init_wait() uses autoremove_wake_function(), 2291 * thus wait entry is removed from the wait queue on each 2292 * wakeup. Why it is important? In case of several waiters 2293 * each new wakeup will hit the next waiter, giving it the 2294 * chance to harvest new event. Otherwise wakeup can be 2295 * lost. This is also good performance-wise, because on 2296 * normal wakeup path no need to call __remove_wait_queue() 2297 * explicitly, thus ep->lock is not taken, which halts the 2298 * event delivery. 2299 * 2300 * In fact, we now use an even more aggressive function that 2301 * unconditionally removes, because we don't reuse the wait 2302 * entry between loop iterations. This lets us also avoid the 2303 * performance issue if a process is killed, causing all of its 2304 * threads to wake up without being removed normally. 2305 */ 2306 init_wait(&wait); 2307 wait.func = ep_autoremove_wake_function; 2308 2309 spin_lock_irq(&ep->lock); 2310 /* 2311 * Barrierless variant, waitqueue_active() is called under 2312 * the same lock on wakeup ep_poll_callback() side, so it 2313 * is safe to avoid an explicit barrier. 2314 */ 2315 __set_current_state(TASK_INTERRUPTIBLE); 2316 2317 /* 2318 * Do the final check under the lock. ep_start/done_scan() 2319 * plays with two lists (->rdllist and ->ovflist) and there 2320 * is always a race when both lists are empty for short 2321 * period of time although events are pending, so lock is 2322 * important. 
2323 */ 2324 eavail = ep_events_available(ep); 2325 if (!eavail) 2326 __add_wait_queue_exclusive(&ep->wq, &wait); 2327 2328 spin_unlock_irq(&ep->lock); 2329 2330 if (!eavail) 2331 timed_out = !ep_schedule_timeout(to) || 2332 !schedule_hrtimeout_range(to, slack, 2333 HRTIMER_MODE_ABS); 2334 __set_current_state(TASK_RUNNING); 2335 2336 /* 2337 * We were woken up, thus go and try to harvest some events. 2338 * If timed out and still on the wait queue, recheck eavail 2339 * carefully under lock, below. 2340 */ 2341 eavail = true; 2342 2343 if (!list_empty_careful(&wait.entry)) { 2344 spin_lock_irq(&ep->lock); 2345 /* 2346 * If the thread timed out and is not on the wait queue, 2347 * it means that the thread was woken up after its 2348 * timeout expired before it could reacquire the lock. 2349 * Thus, when wait.entry is empty, it needs to harvest 2350 * events. 2351 */ 2352 if (timed_out) 2353 eavail = list_empty(&wait.entry); 2354 __remove_wait_queue(&ep->wq, &wait); 2355 spin_unlock_irq(&ep->lock); 2356 } 2357 } 2358 } 2359 2360 /** 2361 * ep_loop_check_proc - verify that adding an epoll file @ep inside another 2362 * epoll file does not create closed loops, and 2363 * determine the depth of the subtree starting at @ep 2364 * 2365 * @ctx: Per-do_epoll_ctl() scratch for the loop / path checks. 2366 * @ep: the &struct eventpoll to be currently checked. 2367 * @depth: Current depth of the path being checked. 2368 * 2369 * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found 2370 * a loop or went too deep. 
2371 */ 2372 static int ep_loop_check_proc(struct ep_ctl_ctx *ctx, 2373 struct eventpoll *ep, int depth) 2374 { 2375 int result = 0; 2376 struct rb_node *rbp; 2377 struct epitem *epi; 2378 2379 if (ep->gen == loop_check_gen) 2380 return ep->loop_check_depth; 2381 2382 mutex_lock_nested(&ep->mtx, depth + 1); 2383 ep->gen = loop_check_gen; 2384 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { 2385 epi = rb_entry(rbp, struct epitem, rbn); 2386 if (unlikely(is_file_epoll(epi->ffd.file))) { 2387 struct eventpoll *ep_tovisit; 2388 ep_tovisit = epi->ffd.file->private_data; 2389 if (ep_tovisit == ctx->inserting_into || 2390 depth > EP_MAX_NESTS) 2391 result = EP_MAX_NESTS+1; 2392 else 2393 result = max(result, 2394 ep_loop_check_proc(ctx, ep_tovisit, 2395 depth + 1) + 1); 2396 if (result > EP_MAX_NESTS) 2397 break; 2398 } else { 2399 /* 2400 * A non-epoll leaf. Queue it for the companion 2401 * reverse_path_check() that runs after this walk so 2402 * any new links we propose don't add too many wakeup 2403 * paths. 2404 */ 2405 list_file(epi->ffd.file, ctx); 2406 } 2407 } 2408 ep->loop_check_depth = result; 2409 mutex_unlock(&ep->mtx); 2410 2411 return result; 2412 } 2413 2414 /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ 2415 static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) 2416 { 2417 int result = 0; 2418 struct epitem *epi; 2419 2420 if (ep->gen == loop_check_gen) 2421 return ep->loop_check_depth; 2422 hlist_for_each_entry_rcu(epi, &ep->refs, fllink) 2423 result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); 2424 ep->gen = loop_check_gen; 2425 ep->loop_check_depth = result; 2426 return result; 2427 } 2428 2429 /** 2430 * ep_loop_check - Performs a check to verify that adding an epoll file (@to) 2431 * into another epoll file (represented by @ep) does not create 2432 * closed loops or too deep chains. 2433 * 2434 * @ctx: Per-CTL_ADD scratch context. 
2435 * @ep: Pointer to the epoll we are inserting into. 2436 * @to: Pointer to the epoll to be inserted. 2437 * 2438 * Return: %zero if adding the epoll @to inside the epoll @from 2439 * does not violate the constraints, or %-1 otherwise. 2440 */ 2441 static int ep_loop_check(struct ep_ctl_ctx *ctx, struct eventpoll *ep, 2442 struct eventpoll *to) 2443 { 2444 int depth, upwards_depth; 2445 2446 ctx->inserting_into = ep; 2447 /* 2448 * Check how deep down we can get from @to, and whether it is possible 2449 * to loop up to @ep. 2450 */ 2451 depth = ep_loop_check_proc(ctx, to, 0); 2452 if (depth > EP_MAX_NESTS) 2453 return -1; 2454 /* Check how far up we can go from @ep. */ 2455 rcu_read_lock(); 2456 upwards_depth = ep_get_upwards_depth_proc(ep, 0); 2457 rcu_read_unlock(); 2458 2459 return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; 2460 } 2461 2462 static void clear_tfile_check_list(struct ep_ctl_ctx *ctx) 2463 { 2464 rcu_read_lock(); 2465 while (ctx->tfile_check_list != EP_UNACTIVE_PTR) { 2466 struct epitems_head *head = ctx->tfile_check_list; 2467 ctx->tfile_check_list = head->next; 2468 unlist_file(head); 2469 } 2470 rcu_read_unlock(); 2471 } 2472 2473 /* 2474 * Open an eventpoll file descriptor. 2475 */ 2476 static int do_epoll_create(int flags) 2477 { 2478 int error; 2479 struct eventpoll *ep; 2480 2481 /* Check the EPOLL_* constant for consistency. */ 2482 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 2483 2484 if (flags & ~EPOLL_CLOEXEC) 2485 return -EINVAL; 2486 /* 2487 * Create the internal data structure ("struct eventpoll"). 2488 */ 2489 error = ep_alloc(&ep); 2490 if (error < 0) 2491 return error; 2492 /* 2493 * Creates all the items needed to setup an eventpoll file. That is, 2494 * a file structure and a free file descriptor. 
2495 */ 2496 FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC), 2497 anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, 2498 O_RDWR | (flags & O_CLOEXEC))); 2499 if (fdf.err) { 2500 ep_clear_and_put(ep); 2501 return fdf.err; 2502 } 2503 ep->file = fd_prepare_file(fdf); 2504 return fd_publish(fdf); 2505 } 2506 2507 SYSCALL_DEFINE1(epoll_create1, int, flags) 2508 { 2509 return do_epoll_create(flags); 2510 } 2511 2512 SYSCALL_DEFINE1(epoll_create, int, size) 2513 { 2514 if (size <= 0) 2515 return -EINVAL; 2516 2517 return do_epoll_create(0); 2518 } 2519 2520 #ifdef CONFIG_PM_SLEEP 2521 static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) 2522 { 2523 if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) 2524 epev->events &= ~EPOLLWAKEUP; 2525 } 2526 #else 2527 static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) 2528 { 2529 epev->events &= ~EPOLLWAKEUP; 2530 } 2531 #endif 2532 2533 static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock) 2534 { 2535 if (!nonblock) { 2536 mutex_lock(mutex); 2537 return 0; 2538 } 2539 return mutex_trylock(mutex) ? 0 : -EAGAIN; 2540 } 2541 2542 /* 2543 * Acquire the locks required for do_epoll_ctl() on @ep for @op. 2544 * 2545 * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the 2546 * loop / path check under epnested_mutex when the topology can 2547 * change: @ep is already watched (epfile->f_ep non-NULL), @ep was 2548 * recently loop-checked (ep->gen == loop_check_gen), or @tfile is 2549 * itself an eventpoll. 2550 * 2551 * Return value encodes both outcome and lock state: 2552 * 2553 * 0 success; ep->mtx held. 2554 * 1 success; ep->mtx held AND the full check ran under 2555 * epnested_mutex (which is also still held). The value 2556 * doubles as the @full_check argument to ep_insert(). 2557 * -errno failure; no locks held. 2558 * 2559 * The caller releases what was taken with ep_ctl_unlock(ep, ret). 
2560 * 2561 * Holding epnested_mutex on add is what prevents two racing 2562 * EPOLL_CTL_ADDs on different eps from building a cycle without 2563 * either walker observing it. 2564 */ 2565 static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op, 2566 struct file *epfile, struct file *tfile, bool nonblock) 2567 { 2568 struct eventpoll *tep; 2569 int error; 2570 2571 error = epoll_mutex_lock(&ep->mtx, nonblock); 2572 if (error) 2573 return error; 2574 2575 if (op != EPOLL_CTL_ADD) 2576 return 0; 2577 if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen && 2578 !is_file_epoll(tfile)) 2579 return 0; 2580 2581 /* Full check needed: drop ep->mtx so we can take epnested_mutex. */ 2582 mutex_unlock(&ep->mtx); 2583 error = epoll_mutex_lock(&epnested_mutex, nonblock); 2584 if (error) 2585 return error; 2586 2587 loop_check_gen++; 2588 2589 if (is_file_epoll(tfile)) { 2590 tep = tfile->private_data; 2591 if (ep_loop_check(ctx, ep, tep) != 0) { 2592 error = -ELOOP; 2593 goto err_unlock_nested; 2594 } 2595 } 2596 2597 error = epoll_mutex_lock(&ep->mtx, nonblock); 2598 if (error) 2599 goto err_unlock_nested; 2600 2601 return 1; 2602 2603 err_unlock_nested: 2604 clear_tfile_check_list(ctx); 2605 loop_check_gen++; 2606 mutex_unlock(&epnested_mutex); 2607 return error; 2608 } 2609 2610 static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, 2611 int full_check) 2612 { 2613 mutex_unlock(&ep->mtx); 2614 if (full_check) { 2615 clear_tfile_check_list(ctx); 2616 loop_check_gen++; 2617 mutex_unlock(&epnested_mutex); 2618 } 2619 } 2620 2621 int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf, 2622 struct epoll_event *epds, bool nonblock) 2623 { 2624 int error; 2625 int full_check; 2626 struct eventpoll *ep; 2627 struct epitem *epi; 2628 struct ep_ctl_ctx ctx = { 2629 .tfile_check_list = EP_UNACTIVE_PTR, 2630 }; 2631 2632 /* The target file descriptor must support poll */ 2633 if (!file_can_poll(tf->file)) 2634 return -EPERM; 2635 
2636 /* Check if EPOLLWAKEUP is allowed */ 2637 if (ep_op_has_event(op)) 2638 ep_take_care_of_epollwakeup(epds); 2639 2640 /* 2641 * The @f file must itself be an eventpoll, and we do not permit 2642 * adding an epoll file descriptor inside itself. 2643 */ 2644 if (f == tf->file || !is_file_epoll(f)) 2645 return -EINVAL; 2646 2647 /* 2648 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, 2649 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. 2650 * Also, nested exclusive wakeups are not supported. 2651 */ 2652 if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { 2653 if (op == EPOLL_CTL_MOD) 2654 return -EINVAL; 2655 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) || 2656 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) 2657 return -EINVAL; 2658 } 2659 2660 ep = f->private_data; 2661 2662 full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock); 2663 if (full_check < 0) 2664 return full_check; 2665 2666 /* 2667 * Look the target up in ep's RB tree. We hold ep->mtx, so the 2668 * item stays valid until we release. 2669 */ 2670 epi = ep_find(ep, tf); 2671 2672 error = -EINVAL; 2673 switch (op) { 2674 case EPOLL_CTL_ADD: 2675 if (!epi) { 2676 epds->events |= EPOLLERR | EPOLLHUP; 2677 error = ep_insert(&ctx, ep, epds, tf, full_check); 2678 } else 2679 error = -EEXIST; 2680 break; 2681 case EPOLL_CTL_DEL: 2682 if (epi) { 2683 /* 2684 * The eventpoll itself is still alive: the refcount 2685 * can't go to zero here. 
2686 */ 2687 ep_remove(ep, epi); 2688 error = 0; 2689 } else { 2690 error = -ENOENT; 2691 } 2692 break; 2693 case EPOLL_CTL_MOD: 2694 if (epi) { 2695 if (!(epi->event.events & EPOLLEXCLUSIVE)) { 2696 epds->events |= EPOLLERR | EPOLLHUP; 2697 error = ep_modify(ep, epi, epds); 2698 } 2699 } else 2700 error = -ENOENT; 2701 break; 2702 } 2703 2704 ep_ctl_unlock(&ctx, ep, full_check); 2705 return error; 2706 } 2707 2708 int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, 2709 bool nonblock) 2710 { 2711 struct epoll_key efd; 2712 2713 CLASS(fd, f)(epfd); 2714 if (fd_empty(f)) 2715 return -EBADF; 2716 2717 /* Get the "struct file *" for the target file */ 2718 CLASS(fd, tf)(fd); 2719 if (fd_empty(tf)) 2720 return -EBADF; 2721 2722 efd.file = fd_file(tf); 2723 efd.fd = fd; 2724 return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock); 2725 } 2726 2727 /* 2728 * The following function implements the controller interface for 2729 * the eventpoll file that enables the insertion/removal/change of 2730 * file descriptors inside the interest set. 2731 */ 2732 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, 2733 struct epoll_event __user *, event) 2734 { 2735 struct epoll_event epds; 2736 2737 if (ep_op_has_event(op) && 2738 copy_from_user(&epds, event, sizeof(struct epoll_event))) 2739 return -EFAULT; 2740 2741 return do_epoll_ctl(epfd, op, fd, &epds, false); 2742 } 2743 2744 static int ep_check_params(struct file *file, struct epoll_event __user *evs, 2745 int maxevents) 2746 { 2747 /* The maximum number of event must be greater than zero */ 2748 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 2749 return -EINVAL; 2750 2751 /* Verify that the area passed by the user is writeable */ 2752 if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) 2753 return -EFAULT; 2754 2755 /* 2756 * We have to check that the file structure underneath the fd 2757 * the user passed to us _is_ an eventpoll file. 
2758 */ 2759 if (!is_file_epoll(file)) 2760 return -EINVAL; 2761 2762 return 0; 2763 } 2764 2765 int epoll_sendevents(struct file *file, struct epoll_event __user *events, 2766 int maxevents) 2767 { 2768 struct eventpoll *ep; 2769 int ret; 2770 2771 ret = ep_check_params(file, events, maxevents); 2772 if (unlikely(ret)) 2773 return ret; 2774 2775 ep = file->private_data; 2776 /* 2777 * Racy call, but that's ok - it should get retried based on 2778 * poll readiness anyway. 2779 */ 2780 if (ep_events_available(ep)) 2781 return ep_try_send_events(ep, events, maxevents); 2782 return 0; 2783 } 2784 2785 /* 2786 * Implement the event wait interface for the eventpoll file. It is the kernel 2787 * part of the user space epoll_wait(2). 2788 */ 2789 static int do_epoll_wait(int epfd, struct epoll_event __user *events, 2790 int maxevents, struct timespec64 *to) 2791 { 2792 struct eventpoll *ep; 2793 int ret; 2794 2795 /* Get the "struct file *" for the eventpoll file */ 2796 CLASS(fd, f)(epfd); 2797 if (fd_empty(f)) 2798 return -EBADF; 2799 2800 ret = ep_check_params(fd_file(f), events, maxevents); 2801 if (unlikely(ret)) 2802 return ret; 2803 2804 /* 2805 * At this point it is safe to assume that the "private_data" contains 2806 * our own data structure. 2807 */ 2808 ep = fd_file(f)->private_data; 2809 2810 /* Time to fish for events ... */ 2811 return ep_poll(ep, events, maxevents, to); 2812 } 2813 2814 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, 2815 int, maxevents, int, timeout) 2816 { 2817 struct timespec64 to; 2818 2819 return do_epoll_wait(epfd, events, maxevents, 2820 ep_timeout_to_timespec(&to, timeout)); 2821 } 2822 2823 /* 2824 * Implement the event wait interface for the eventpoll file. It is the kernel 2825 * part of the user space epoll_pwait(2). 
2826 */ 2827 static int do_epoll_pwait(int epfd, struct epoll_event __user *events, 2828 int maxevents, struct timespec64 *to, 2829 const sigset_t __user *sigmask, size_t sigsetsize) 2830 { 2831 int error; 2832 2833 /* 2834 * If the caller wants a certain signal mask to be set during the wait, 2835 * we apply it here. 2836 */ 2837 error = set_user_sigmask(sigmask, sigsetsize); 2838 if (error) 2839 return error; 2840 2841 error = do_epoll_wait(epfd, events, maxevents, to); 2842 2843 restore_saved_sigmask_unless(error == -EINTR); 2844 2845 return error; 2846 } 2847 2848 SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, 2849 int, maxevents, int, timeout, const sigset_t __user *, sigmask, 2850 size_t, sigsetsize) 2851 { 2852 struct timespec64 to; 2853 2854 return do_epoll_pwait(epfd, events, maxevents, 2855 ep_timeout_to_timespec(&to, timeout), 2856 sigmask, sigsetsize); 2857 } 2858 2859 SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, 2860 int, maxevents, const struct __kernel_timespec __user *, timeout, 2861 const sigset_t __user *, sigmask, size_t, sigsetsize) 2862 { 2863 struct timespec64 ts, *to = NULL; 2864 2865 if (timeout) { 2866 if (get_timespec64(&ts, timeout)) 2867 return -EFAULT; 2868 to = &ts; 2869 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) 2870 return -EINVAL; 2871 } 2872 2873 return do_epoll_pwait(epfd, events, maxevents, to, 2874 sigmask, sigsetsize); 2875 } 2876 2877 #ifdef CONFIG_KCMP 2878 static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) 2879 { 2880 struct rb_node *rbp; 2881 struct epitem *epi; 2882 2883 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { 2884 epi = rb_entry(rbp, struct epitem, rbn); 2885 if (epi->ffd.fd == tfd) { 2886 if (toff == 0) 2887 return epi; 2888 else 2889 toff--; 2890 } 2891 cond_resched(); 2892 } 2893 2894 return NULL; 2895 } 2896 2897 struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, 2898 
unsigned long toff) 2899 { 2900 struct file *file_raw; 2901 struct eventpoll *ep; 2902 struct epitem *epi; 2903 2904 if (!is_file_epoll(file)) 2905 return ERR_PTR(-EINVAL); 2906 2907 ep = file->private_data; 2908 2909 mutex_lock(&ep->mtx); 2910 epi = ep_find_tfd(ep, tfd, toff); 2911 if (epi) 2912 file_raw = epi->ffd.file; 2913 else 2914 file_raw = ERR_PTR(-ENOENT); 2915 mutex_unlock(&ep->mtx); 2916 2917 return file_raw; 2918 } 2919 #endif /* CONFIG_KCMP */ 2920 2921 #ifdef CONFIG_COMPAT 2922 static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, 2923 int maxevents, struct timespec64 *timeout, 2924 const compat_sigset_t __user *sigmask, 2925 compat_size_t sigsetsize) 2926 { 2927 long err; 2928 2929 /* 2930 * If the caller wants a certain signal mask to be set during the wait, 2931 * we apply it here. 2932 */ 2933 err = set_compat_user_sigmask(sigmask, sigsetsize); 2934 if (err) 2935 return err; 2936 2937 err = do_epoll_wait(epfd, events, maxevents, timeout); 2938 2939 restore_saved_sigmask_unless(err == -EINTR); 2940 2941 return err; 2942 } 2943 2944 COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, 2945 struct epoll_event __user *, events, 2946 int, maxevents, int, timeout, 2947 const compat_sigset_t __user *, sigmask, 2948 compat_size_t, sigsetsize) 2949 { 2950 struct timespec64 to; 2951 2952 return do_compat_epoll_pwait(epfd, events, maxevents, 2953 ep_timeout_to_timespec(&to, timeout), 2954 sigmask, sigsetsize); 2955 } 2956 2957 COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, 2958 struct epoll_event __user *, events, 2959 int, maxevents, 2960 const struct __kernel_timespec __user *, timeout, 2961 const compat_sigset_t __user *, sigmask, 2962 compat_size_t, sigsetsize) 2963 { 2964 struct timespec64 ts, *to = NULL; 2965 2966 if (timeout) { 2967 if (get_timespec64(&ts, timeout)) 2968 return -EFAULT; 2969 to = &ts; 2970 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) 2971 return -EINVAL; 2972 } 2973 2974 return do_compat_epoll_pwait(epfd, 
events, maxevents, to, 2975 sigmask, sigsetsize); 2976 } 2977 2978 #endif 2979 2980 static int __init eventpoll_init(void) 2981 { 2982 struct sysinfo si; 2983 2984 si_meminfo(&si); 2985 /* 2986 * Allows top 4% of lomem to be allocated for epoll watches (per user). 2987 */ 2988 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / 2989 EP_ITEM_COST; 2990 BUG_ON(max_user_watches < 0); 2991 2992 /* 2993 * We can have many thousands of epitems, so prevent this from 2994 * using an extra cache line on 64-bit (and smaller) CPUs 2995 */ 2996 BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); 2997 2998 /* Allocates slab cache used to allocate "struct epitem" items */ 2999 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 3000 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); 3001 3002 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 3003 pwq_cache = kmem_cache_create("eventpoll_pwq", 3004 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); 3005 epoll_sysctls_init(); 3006 3007 ephead_cache = kmem_cache_create("ep_head", 3008 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); 3009 3010 return 0; 3011 } 3012 fs_initcall(eventpoll_init); 3013