1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 5 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org> 6 * Copyright (c) 2009 Apple, Inc. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_ktrace.h" 35 #include "opt_kqueue.h" 36 37 #ifdef COMPAT_FREEBSD11 38 #define _WANT_FREEBSD11_KEVENT 39 #endif 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/capsicum.h> 44 #include <sys/kernel.h> 45 #include <sys/limits.h> 46 #include <sys/lock.h> 47 #include <sys/mutex.h> 48 #include <sys/proc.h> 49 #include <sys/malloc.h> 50 #include <sys/unistd.h> 51 #include <sys/file.h> 52 #include <sys/filedesc.h> 53 #include <sys/filio.h> 54 #include <sys/fcntl.h> 55 #include <sys/kthread.h> 56 #include <sys/selinfo.h> 57 #include <sys/queue.h> 58 #include <sys/event.h> 59 #include <sys/eventvar.h> 60 #include <sys/poll.h> 61 #include <sys/protosw.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sigio.h> 64 #include <sys/signalvar.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/stat.h> 68 #include <sys/sysctl.h> 69 #include <sys/sysproto.h> 70 #include <sys/syscallsubr.h> 71 #include <sys/taskqueue.h> 72 #include <sys/uio.h> 73 #include <sys/user.h> 74 #ifdef KTRACE 75 #include <sys/ktrace.h> 76 #endif 77 #include <machine/atomic.h> 78 79 #include <vm/uma.h> 80 81 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); 82 83 /* 84 * This lock is used if multiple kq locks are required. This possibly 85 * should be made into a per proc lock. 
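 * (In this file, that situation arises when one kqueue watches another:
 * kqueue_register() and kqueue_scan() take kq_global through
 * KQ_GLOBAL_LOCK() before working on both kqueue locks.)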
86 */ 87 static struct mtx kq_global; 88 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); 89 #define KQ_GLOBAL_LOCK(lck, haslck) do { \ 90 if (!haslck) \ 91 mtx_lock(lck); \ 92 haslck = 1; \ 93 } while (0) 94 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ 95 if (haslck) \ 96 mtx_unlock(lck); \ 97 haslck = 0; \ 98 } while (0) 99 100 TASKQUEUE_DEFINE_THREAD(kqueue_ctx); 101 102 static int kevent_copyout(void *arg, struct kevent *kevp, int count); 103 static int kevent_copyin(void *arg, struct kevent *kevp, int count); 104 static int kqueue_register(struct kqueue *kq, struct kevent *kev, 105 struct thread *td, int mflag); 106 static int kqueue_acquire(struct file *fp, struct kqueue **kqp); 107 static void kqueue_release(struct kqueue *kq, int locked); 108 static void kqueue_destroy(struct kqueue *kq); 109 static void kqueue_drain(struct kqueue *kq, struct thread *td); 110 static int kqueue_expand(struct kqueue *kq, struct filterops *fops, 111 uintptr_t ident, int mflag); 112 static void kqueue_task(void *arg, int pending); 113 static int kqueue_scan(struct kqueue *kq, int maxevents, 114 struct kevent_copyops *k_ops, 115 const struct timespec *timeout, 116 struct kevent *keva, struct thread *td); 117 static void kqueue_wakeup(struct kqueue *kq); 118 static struct filterops *kqueue_fo_find(int filt); 119 static void kqueue_fo_release(int filt); 120 struct g_kevent_args; 121 static int kern_kevent_generic(struct thread *td, 122 struct g_kevent_args *uap, 123 struct kevent_copyops *k_ops, const char *struct_name); 124 125 static fo_ioctl_t kqueue_ioctl; 126 static fo_poll_t kqueue_poll; 127 static fo_kqfilter_t kqueue_kqfilter; 128 static fo_stat_t kqueue_stat; 129 static fo_close_t kqueue_close; 130 static fo_fill_kinfo_t kqueue_fill_kinfo; 131 132 static struct fileops kqueueops = { 133 .fo_read = invfo_rdwr, 134 .fo_write = invfo_rdwr, 135 .fo_truncate = invfo_truncate, 136 .fo_ioctl = kqueue_ioctl, 137 .fo_poll = kqueue_poll, 138 .fo_kqfilter = kqueue_kqfilter, 139 .fo_stat = kqueue_stat, 140 .fo_close = kqueue_close, 141 .fo_chmod = invfo_chmod, 142 .fo_chown = invfo_chown, 143 .fo_sendfile = invfo_sendfile, 144 .fo_fill_kinfo = kqueue_fill_kinfo, 145 }; 146 147 static int knote_attach(struct knote *kn, struct kqueue *kq); 148 static void knote_drop(struct knote *kn, struct thread *td); 149 static void knote_drop_detached(struct knote *kn, struct thread *td); 150 static void knote_enqueue(struct knote *kn); 151 static void knote_dequeue(struct knote *kn); 152 static void knote_init(void); 153 static struct knote *knote_alloc(int mflag); 154 static void knote_free(struct knote *kn); 155 156 static void filt_kqdetach(struct knote *kn); 157 static int filt_kqueue(struct knote *kn, long hint); 158 static int filt_procattach(struct knote *kn); 159 static void filt_procdetach(struct knote *kn); 160 static int filt_proc(struct knote *kn, long hint); 161 static int filt_fileattach(struct knote *kn); 162 static void filt_timerexpire(void *knx); 163 static void filt_timerexpire_l(struct knote *kn, bool proc_locked); 164 static int filt_timerattach(struct knote *kn); 165 static void filt_timerdetach(struct knote *kn); 166 static void filt_timerstart(struct knote *kn, sbintime_t to); 167 static void filt_timertouch(struct knote *kn, struct kevent *kev, 168 u_long type); 169 static int filt_timervalidate(struct knote *kn, sbintime_t *to); 170 static int filt_timer(struct knote *kn, long hint); 171 static int filt_userattach(struct knote *kn); 172 static void filt_userdetach(struct knote *kn); 
173 static int filt_user(struct knote *kn, long hint); 174 static void filt_usertouch(struct knote *kn, struct kevent *kev, 175 u_long type); 176 177 static struct filterops file_filtops = { 178 .f_isfd = 1, 179 .f_attach = filt_fileattach, 180 }; 181 static struct filterops kqread_filtops = { 182 .f_isfd = 1, 183 .f_detach = filt_kqdetach, 184 .f_event = filt_kqueue, 185 }; 186 /* XXX - move to kern_proc.c? */ 187 static struct filterops proc_filtops = { 188 .f_isfd = 0, 189 .f_attach = filt_procattach, 190 .f_detach = filt_procdetach, 191 .f_event = filt_proc, 192 }; 193 static struct filterops timer_filtops = { 194 .f_isfd = 0, 195 .f_attach = filt_timerattach, 196 .f_detach = filt_timerdetach, 197 .f_event = filt_timer, 198 .f_touch = filt_timertouch, 199 }; 200 static struct filterops user_filtops = { 201 .f_attach = filt_userattach, 202 .f_detach = filt_userdetach, 203 .f_event = filt_user, 204 .f_touch = filt_usertouch, 205 }; 206 207 static uma_zone_t knote_zone; 208 static unsigned int __exclusive_cache_line kq_ncallouts; 209 static unsigned int kq_calloutmax = 4 * 1024; 210 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, 211 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); 212 213 /* XXX - ensure not influx ? */ 214 #define KNOTE_ACTIVATE(kn, islock) do { \ 215 if ((islock)) \ 216 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ 217 else \ 218 KQ_LOCK((kn)->kn_kq); \ 219 (kn)->kn_status |= KN_ACTIVE; \ 220 if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ 221 knote_enqueue((kn)); \ 222 if (!(islock)) \ 223 KQ_UNLOCK((kn)->kn_kq); \ 224 } while (0) 225 #define KQ_LOCK(kq) do { \ 226 mtx_lock(&(kq)->kq_lock); \ 227 } while (0) 228 #define KQ_FLUX_WAKEUP(kq) do { \ 229 if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ 230 (kq)->kq_state &= ~KQ_FLUXWAIT; \ 231 wakeup((kq)); \ 232 } \ 233 } while (0) 234 #define KQ_UNLOCK_FLUX(kq) do { \ 235 KQ_FLUX_WAKEUP(kq); \ 236 mtx_unlock(&(kq)->kq_lock); \ 237 } while (0) 238 #define KQ_UNLOCK(kq) do { \ 239 mtx_unlock(&(kq)->kq_lock); \ 240 } while (0) 241 #define KQ_OWNED(kq) do { \ 242 mtx_assert(&(kq)->kq_lock, MA_OWNED); \ 243 } while (0) 244 #define KQ_NOTOWNED(kq) do { \ 245 mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ 246 } while (0) 247 248 static struct knlist * 249 kn_list_lock(struct knote *kn) 250 { 251 struct knlist *knl; 252 253 knl = kn->kn_knlist; 254 if (knl != NULL) 255 knl->kl_lock(knl->kl_lockarg); 256 return (knl); 257 } 258 259 static void 260 kn_list_unlock(struct knlist *knl) 261 { 262 bool do_free; 263 264 if (knl == NULL) 265 return; 266 do_free = knl->kl_autodestroy && knlist_empty(knl); 267 knl->kl_unlock(knl->kl_lockarg); 268 if (do_free) { 269 knlist_destroy(knl); 270 free(knl, M_KQUEUE); 271 } 272 } 273 274 static bool 275 kn_in_flux(struct knote *kn) 276 { 277 278 return (kn->kn_influx > 0); 279 } 280 281 static void 282 kn_enter_flux(struct knote *kn) 283 { 284 285 KQ_OWNED(kn->kn_kq); 286 MPASS(kn->kn_influx < INT_MAX); 287 kn->kn_influx++; 288 } 289 290 static bool 291 kn_leave_flux(struct knote *kn) 292 { 293 294 KQ_OWNED(kn->kn_kq); 295 MPASS(kn->kn_influx > 0); 296 kn->kn_influx--; 297 return (kn->kn_influx == 0); 298 } 299 300 #define KNL_ASSERT_LOCK(knl, islocked) do { \ 301 if (islocked) \ 302 KNL_ASSERT_LOCKED(knl); \ 303 else \ 304 KNL_ASSERT_UNLOCKED(knl); \ 305 } while (0) 306 #ifdef INVARIANTS 307 #define KNL_ASSERT_LOCKED(knl) do { \ 308 knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \ 309 } while (0) 310 #define KNL_ASSERT_UNLOCKED(knl) do { \ 311 
knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \ 312 } while (0) 313 #else /* !INVARIANTS */ 314 #define KNL_ASSERT_LOCKED(knl) do {} while (0) 315 #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) 316 #endif /* INVARIANTS */ 317 318 #ifndef KN_HASHSIZE 319 #define KN_HASHSIZE 64 /* XXX should be tunable */ 320 #endif 321 322 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) 323 324 static int 325 filt_nullattach(struct knote *kn) 326 { 327 328 return (ENXIO); 329 }; 330 331 struct filterops null_filtops = { 332 .f_isfd = 0, 333 .f_attach = filt_nullattach, 334 }; 335 336 /* XXX - make SYSINIT to add these, and move into respective modules. */ 337 extern struct filterops sig_filtops; 338 extern struct filterops fs_filtops; 339 340 /* 341 * Table for all system-defined filters. 342 */ 343 static struct mtx filterops_lock; 344 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", 345 MTX_DEF); 346 static struct { 347 struct filterops *for_fop; 348 int for_nolock; 349 int for_refcnt; 350 } sysfilt_ops[EVFILT_SYSCOUNT] = { 351 { &file_filtops, 1 }, /* EVFILT_READ */ 352 { &file_filtops, 1 }, /* EVFILT_WRITE */ 353 { &null_filtops }, /* EVFILT_AIO */ 354 { &file_filtops, 1 }, /* EVFILT_VNODE */ 355 { &proc_filtops, 1 }, /* EVFILT_PROC */ 356 { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ 357 { &timer_filtops, 1 }, /* EVFILT_TIMER */ 358 { &file_filtops, 1 }, /* EVFILT_PROCDESC */ 359 { &fs_filtops, 1 }, /* EVFILT_FS */ 360 { &null_filtops }, /* EVFILT_LIO */ 361 { &user_filtops, 1 }, /* EVFILT_USER */ 362 { &null_filtops }, /* EVFILT_SENDFILE */ 363 { &file_filtops, 1 }, /* EVFILT_EMPTY */ 364 }; 365 366 /* 367 * Simple redirection for all cdevsw style objects to call their fo_kqfilter 368 * method. 369 */ 370 static int 371 filt_fileattach(struct knote *kn) 372 { 373 374 return (fo_kqfilter(kn->kn_fp, kn)); 375 } 376 377 /*ARGSUSED*/ 378 static int 379 kqueue_kqfilter(struct file *fp, struct knote *kn) 380 { 381 struct kqueue *kq = kn->kn_fp->f_data; 382 383 if (kn->kn_filter != EVFILT_READ) 384 return (EINVAL); 385 386 kn->kn_status |= KN_KQUEUE; 387 kn->kn_fop = &kqread_filtops; 388 knlist_add(&kq->kq_sel.si_note, kn, 0); 389 390 return (0); 391 } 392 393 static void 394 filt_kqdetach(struct knote *kn) 395 { 396 struct kqueue *kq = kn->kn_fp->f_data; 397 398 knlist_remove(&kq->kq_sel.si_note, kn, 0); 399 } 400 401 /*ARGSUSED*/ 402 static int 403 filt_kqueue(struct knote *kn, long hint) 404 { 405 struct kqueue *kq = kn->kn_fp->f_data; 406 407 kn->kn_data = kq->kq_count; 408 return (kn->kn_data > 0); 409 } 410 411 /* XXX - move to kern_proc.c? */ 412 static int 413 filt_procattach(struct knote *kn) 414 { 415 struct proc *p; 416 int error; 417 bool exiting, immediate; 418 419 exiting = immediate = false; 420 if (kn->kn_sfflags & NOTE_EXIT) 421 p = pfind_any(kn->kn_id); 422 else 423 p = pfind(kn->kn_id); 424 if (p == NULL) 425 return (ESRCH); 426 if (p->p_flag & P_WEXIT) 427 exiting = true; 428 429 if ((error = p_cansee(curthread, p))) { 430 PROC_UNLOCK(p); 431 return (error); 432 } 433 434 kn->kn_ptr.p_proc = p; 435 kn->kn_flags |= EV_CLEAR; /* automatically set */ 436 437 /* 438 * Internal flag indicating registration done by kernel for the 439 * purposes of getting a NOTE_CHILD notification. 440 */ 441 if (kn->kn_flags & EV_FLAG2) { 442 kn->kn_flags &= ~EV_FLAG2; 443 kn->kn_data = kn->kn_sdata; /* ppid */ 444 kn->kn_fflags = NOTE_CHILD; 445 kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); 446 immediate = true; /* Force immediate activation of child note. 
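		 * This is the note that knote_fork() registers on the new
		 * child with EV_FLAG2; kn_sdata carries the parent pid that
		 * knote_fork() stashed in kev.data, and the NOTE_CHILD event
		 * is reported without waiting for the child to do anything.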
		 */
	}
	/*
	 * Internal flag indicating registration done by kernel (for other than
	 * NOTE_CHILD).
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_flags &= ~EV_FLAG1;
	}

	knlist_add(p->p_klist, kn, 1);

	/*
	 * Immediately activate any child notes or, in the case of a zombie
	 * target process, exit notes.  The latter is necessary to handle the
	 * case where the target process, e.g. a child, dies before the kevent
	 * is registered.
	 */
	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
		KNOTE_ACTIVATE(kn, 0);

	PROC_UNLOCK(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process no longer exists.
 */
/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{

	knlist_remove(kn->kn_knlist, kn, 0);
	kn->kn_ptr.p_proc = NULL;
}

/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
	struct proc *p;
	u_int event;

	p = kn->kn_ptr.p_proc;
	if (p == NULL) /* already activated, from attach filter */
		return (0);

	/* Mask off extra data. */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/* Process is gone, so flag the event as finished. */
	if (event == NOTE_EXIT) {
		kn->kn_flags |= EV_EOF | EV_ONESHOT;
		kn->kn_ptr.p_proc = NULL;
		if (kn->kn_fflags & NOTE_EXIT)
			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
		if (kn->kn_fflags == 0)
			kn->kn_flags |= EV_DROP;
		return (1);
	}

	return (kn->kn_fflags != 0);
}

/*
 * Called when a process forks.  This does much the same as knote():
 * it activates all knotes registered to fire on fork.  Additionally,
 * for each knote attached to the parent, it checks whether the user
 * wants to track the new process.  If so, it attaches a new knote to
 * the child and immediately reports an event with the child's pid.
 */
void
knote_fork(struct knlist *list, int pid)
{
	struct kqueue *kq;
	struct knote *kn;
	struct kevent kev;
	int error;

	MPASS(list != NULL);
	KNL_ASSERT_LOCKED(list);
	if (SLIST_EMPTY(&list->kl_list))
		return;

	memset(&kev, 0, sizeof(kev));
	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * As in knote(), activate the event.
		 */
		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
			if (kn->kn_fop->f_event(kn, NOTE_FORK))
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * The NOTE_TRACK case.  In addition to the activation
		 * of the event, we need to register new events to
		 * track the child.  Drop the locks in preparation for
		 * the call to kqueue_register().
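		 *
		 * (For reference, a userspace consumer arms this path roughly
		 *  as follows; an illustrative sketch, where pid is the
		 *  process being watched, not something defined in this file:
		 *
		 *	struct kevent kev;
		 *	int kq = kqueue();
		 *
		 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
		 *	    NOTE_TRACK | NOTE_EXIT, 0, NULL);
		 *	kevent(kq, &kev, 1, NULL, 0, NULL);
		 *
		 *  The fork is then reported as a NOTE_CHILD event on the
		 *  child's pid with the parent pid in the data field, which
		 *  is what the EV_FLAG2 registration below arranges.)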
566 */ 567 kn_enter_flux(kn); 568 KQ_UNLOCK(kq); 569 list->kl_unlock(list->kl_lockarg); 570 571 /* 572 * Activate existing knote and register tracking knotes with 573 * new process. 574 * 575 * First register a knote to get just the child notice. This 576 * must be a separate note from a potential NOTE_EXIT 577 * notification since both NOTE_CHILD and NOTE_EXIT are defined 578 * to use the data field (in conflicting ways). 579 */ 580 kev.ident = pid; 581 kev.filter = kn->kn_filter; 582 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | 583 EV_FLAG2; 584 kev.fflags = kn->kn_sfflags; 585 kev.data = kn->kn_id; /* parent */ 586 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 587 error = kqueue_register(kq, &kev, NULL, M_NOWAIT); 588 if (error) 589 kn->kn_fflags |= NOTE_TRACKERR; 590 591 /* 592 * Then register another knote to track other potential events 593 * from the new process. 594 */ 595 kev.ident = pid; 596 kev.filter = kn->kn_filter; 597 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 598 kev.fflags = kn->kn_sfflags; 599 kev.data = kn->kn_id; /* parent */ 600 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 601 error = kqueue_register(kq, &kev, NULL, M_NOWAIT); 602 if (error) 603 kn->kn_fflags |= NOTE_TRACKERR; 604 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 605 KNOTE_ACTIVATE(kn, 0); 606 list->kl_lock(list->kl_lockarg); 607 KQ_LOCK(kq); 608 kn_leave_flux(kn); 609 KQ_UNLOCK_FLUX(kq); 610 } 611 } 612 613 /* 614 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the 615 * interval timer support code. 616 */ 617 618 #define NOTE_TIMER_PRECMASK \ 619 (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) 620 621 static sbintime_t 622 timer2sbintime(int64_t data, int flags) 623 { 624 int64_t secs; 625 626 /* 627 * Macros for converting to the fractional second portion of an 628 * sbintime_t using 64bit multiplication to improve precision. 
629 */ 630 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) 631 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) 632 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) 633 switch (flags & NOTE_TIMER_PRECMASK) { 634 case NOTE_SECONDS: 635 #ifdef __LP64__ 636 if (data > (SBT_MAX / SBT_1S)) 637 return (SBT_MAX); 638 #endif 639 return ((sbintime_t)data << 32); 640 case NOTE_MSECONDS: /* FALLTHROUGH */ 641 case 0: 642 if (data >= 1000) { 643 secs = data / 1000; 644 #ifdef __LP64__ 645 if (secs > (SBT_MAX / SBT_1S)) 646 return (SBT_MAX); 647 #endif 648 return (secs << 32 | MS_TO_SBT(data % 1000)); 649 } 650 return (MS_TO_SBT(data)); 651 case NOTE_USECONDS: 652 if (data >= 1000000) { 653 secs = data / 1000000; 654 #ifdef __LP64__ 655 if (secs > (SBT_MAX / SBT_1S)) 656 return (SBT_MAX); 657 #endif 658 return (secs << 32 | US_TO_SBT(data % 1000000)); 659 } 660 return (US_TO_SBT(data)); 661 case NOTE_NSECONDS: 662 if (data >= 1000000000) { 663 secs = data / 1000000000; 664 #ifdef __LP64__ 665 if (secs > (SBT_MAX / SBT_1S)) 666 return (SBT_MAX); 667 #endif 668 return (secs << 32 | NS_TO_SBT(data % 1000000000)); 669 } 670 return (NS_TO_SBT(data)); 671 default: 672 break; 673 } 674 return (-1); 675 } 676 677 struct kq_timer_cb_data { 678 struct callout c; 679 struct proc *p; 680 struct knote *kn; 681 int cpuid; 682 int flags; 683 TAILQ_ENTRY(kq_timer_cb_data) link; 684 sbintime_t next; /* next timer event fires at */ 685 sbintime_t to; /* precalculated timer period, 0 for abs */ 686 }; 687 688 #define KQ_TIMER_CB_ENQUEUED 0x01 689 690 static void 691 kqtimer_sched_callout(struct kq_timer_cb_data *kc) 692 { 693 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn, 694 kc->cpuid, C_ABSOLUTE); 695 } 696 697 void 698 kqtimer_proc_continue(struct proc *p) 699 { 700 struct kq_timer_cb_data *kc, *kc1; 701 struct bintime bt; 702 sbintime_t now; 703 704 PROC_LOCK_ASSERT(p, MA_OWNED); 705 706 getboottimebin(&bt); 707 now = bttosbt(bt); 708 709 TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) { 710 TAILQ_REMOVE(&p->p_kqtim_stop, kc, link); 711 kc->flags &= ~KQ_TIMER_CB_ENQUEUED; 712 if (kc->next <= now) 713 filt_timerexpire_l(kc->kn, true); 714 else 715 kqtimer_sched_callout(kc); 716 } 717 } 718 719 static void 720 filt_timerexpire_l(struct knote *kn, bool proc_locked) 721 { 722 struct kq_timer_cb_data *kc; 723 struct proc *p; 724 uint64_t delta; 725 sbintime_t now; 726 727 kc = kn->kn_ptr.p_v; 728 729 if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) { 730 kn->kn_data++; 731 KNOTE_ACTIVATE(kn, 0); 732 return; 733 } 734 735 now = sbinuptime(); 736 if (now >= kc->next) { 737 delta = (now - kc->next) / kc->to; 738 if (delta == 0) 739 delta = 1; 740 kn->kn_data += delta; 741 kc->next += delta * kc->to; 742 if (now >= kc->next) /* overflow */ 743 kc->next = now + kc->to; 744 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ 745 } 746 747 /* 748 * Initial check for stopped kc->p is racy. It is fine to 749 * miss the set of the stop flags, at worst we would schedule 750 * one more callout. On the other hand, it is not fine to not 751 * schedule when we we missed clearing of the flags, we 752 * recheck them under the lock and observe consistent state. 
753 */ 754 p = kc->p; 755 if (P_SHOULDSTOP(p) || P_KILLED(p)) { 756 if (!proc_locked) 757 PROC_LOCK(p); 758 if (P_SHOULDSTOP(p) || P_KILLED(p)) { 759 if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) { 760 kc->flags |= KQ_TIMER_CB_ENQUEUED; 761 TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link); 762 } 763 if (!proc_locked) 764 PROC_UNLOCK(p); 765 return; 766 } 767 if (!proc_locked) 768 PROC_UNLOCK(p); 769 } 770 kqtimer_sched_callout(kc); 771 } 772 773 static void 774 filt_timerexpire(void *knx) 775 { 776 filt_timerexpire_l(knx, false); 777 } 778 779 /* 780 * data contains amount of time to sleep 781 */ 782 static int 783 filt_timervalidate(struct knote *kn, sbintime_t *to) 784 { 785 struct bintime bt; 786 sbintime_t sbt; 787 788 if (kn->kn_sdata < 0) 789 return (EINVAL); 790 if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) 791 kn->kn_sdata = 1; 792 /* 793 * The only fflags values supported are the timer unit 794 * (precision) and the absolute time indicator. 795 */ 796 if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) 797 return (EINVAL); 798 799 *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); 800 if (*to < 0) 801 return (EINVAL); 802 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 803 getboottimebin(&bt); 804 sbt = bttosbt(bt); 805 *to = MAX(0, *to - sbt); 806 } 807 return (0); 808 } 809 810 static int 811 filt_timerattach(struct knote *kn) 812 { 813 struct kq_timer_cb_data *kc; 814 sbintime_t to; 815 int error; 816 817 to = -1; 818 error = filt_timervalidate(kn, &to); 819 if (error != 0) 820 return (error); 821 KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 || 822 (kn->kn_sfflags & NOTE_ABSTIME) != 0, 823 ("%s: periodic timer has a calculated zero timeout", __func__)); 824 KASSERT(to >= 0, 825 ("%s: timer has a calculated negative timeout", __func__)); 826 827 if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { 828 atomic_subtract_int(&kq_ncallouts, 1); 829 return (ENOMEM); 830 } 831 832 if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) 833 kn->kn_flags |= EV_CLEAR; /* automatically set */ 834 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ 835 kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); 836 kc->kn = kn; 837 kc->p = curproc; 838 kc->cpuid = PCPU_GET(cpuid); 839 kc->flags = 0; 840 callout_init(&kc->c, 1); 841 filt_timerstart(kn, to); 842 843 return (0); 844 } 845 846 static void 847 filt_timerstart(struct knote *kn, sbintime_t to) 848 { 849 struct kq_timer_cb_data *kc; 850 851 kc = kn->kn_ptr.p_v; 852 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 853 kc->next = to; 854 kc->to = 0; 855 } else { 856 kc->next = to + sbinuptime(); 857 kc->to = to; 858 } 859 kqtimer_sched_callout(kc); 860 } 861 862 static void 863 filt_timerdetach(struct knote *kn) 864 { 865 struct kq_timer_cb_data *kc; 866 unsigned int old __unused; 867 bool pending; 868 869 kc = kn->kn_ptr.p_v; 870 do { 871 callout_drain(&kc->c); 872 873 /* 874 * kqtimer_proc_continue() might have rescheduled this callout. 875 * Double-check, using the process mutex as an interlock. 
876 */ 877 PROC_LOCK(kc->p); 878 if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) { 879 kc->flags &= ~KQ_TIMER_CB_ENQUEUED; 880 TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link); 881 } 882 pending = callout_pending(&kc->c); 883 PROC_UNLOCK(kc->p); 884 } while (pending); 885 free(kc, M_KQUEUE); 886 old = atomic_fetchadd_int(&kq_ncallouts, -1); 887 KASSERT(old > 0, ("Number of callouts cannot become negative")); 888 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ 889 } 890 891 static void 892 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) 893 { 894 struct kq_timer_cb_data *kc; 895 struct kqueue *kq; 896 sbintime_t to; 897 int error; 898 899 switch (type) { 900 case EVENT_REGISTER: 901 /* Handle re-added timers that update data/fflags */ 902 if (kev->flags & EV_ADD) { 903 kc = kn->kn_ptr.p_v; 904 905 /* Drain any existing callout. */ 906 callout_drain(&kc->c); 907 908 /* Throw away any existing undelivered record 909 * of the timer expiration. This is done under 910 * the presumption that if a process is 911 * re-adding this timer with new parameters, 912 * it is no longer interested in what may have 913 * happened under the old parameters. If it is 914 * interested, it can wait for the expiration, 915 * delete the old timer definition, and then 916 * add the new one. 917 * 918 * This has to be done while the kq is locked: 919 * - if enqueued, dequeue 920 * - make it no longer active 921 * - clear the count of expiration events 922 */ 923 kq = kn->kn_kq; 924 KQ_LOCK(kq); 925 if (kn->kn_status & KN_QUEUED) 926 knote_dequeue(kn); 927 928 kn->kn_status &= ~KN_ACTIVE; 929 kn->kn_data = 0; 930 KQ_UNLOCK(kq); 931 932 /* Reschedule timer based on new data/fflags */ 933 kn->kn_sfflags = kev->fflags; 934 kn->kn_sdata = kev->data; 935 error = filt_timervalidate(kn, &to); 936 if (error != 0) { 937 kn->kn_flags |= EV_ERROR; 938 kn->kn_data = error; 939 } else 940 filt_timerstart(kn, to); 941 } 942 break; 943 944 case EVENT_PROCESS: 945 *kev = kn->kn_kevent; 946 if (kn->kn_flags & EV_CLEAR) { 947 kn->kn_data = 0; 948 kn->kn_fflags = 0; 949 } 950 break; 951 952 default: 953 panic("filt_timertouch() - invalid type (%ld)", type); 954 break; 955 } 956 } 957 958 static int 959 filt_timer(struct knote *kn, long hint) 960 { 961 962 return (kn->kn_data != 0); 963 } 964 965 static int 966 filt_userattach(struct knote *kn) 967 { 968 969 /* 970 * EVFILT_USER knotes are not attached to anything in the kernel. 971 */ 972 kn->kn_hook = NULL; 973 if (kn->kn_fflags & NOTE_TRIGGER) 974 kn->kn_hookid = 1; 975 else 976 kn->kn_hookid = 0; 977 return (0); 978 } 979 980 static void 981 filt_userdetach(__unused struct knote *kn) 982 { 983 984 /* 985 * EVFILT_USER knotes are not attached to anything in the kernel. 986 */ 987 } 988 989 static int 990 filt_user(struct knote *kn, __unused long hint) 991 { 992 993 return (kn->kn_hookid); 994 } 995 996 static void 997 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 998 { 999 u_int ffctrl; 1000 1001 switch (type) { 1002 case EVENT_REGISTER: 1003 if (kev->fflags & NOTE_TRIGGER) 1004 kn->kn_hookid = 1; 1005 1006 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 1007 kev->fflags &= NOTE_FFLAGSMASK; 1008 switch (ffctrl) { 1009 case NOTE_FFNOP: 1010 break; 1011 1012 case NOTE_FFAND: 1013 kn->kn_sfflags &= kev->fflags; 1014 break; 1015 1016 case NOTE_FFOR: 1017 kn->kn_sfflags |= kev->fflags; 1018 break; 1019 1020 case NOTE_FFCOPY: 1021 kn->kn_sfflags = kev->fflags; 1022 break; 1023 1024 default: 1025 /* XXX Return error? 
*/ 1026 break; 1027 } 1028 kn->kn_sdata = kev->data; 1029 if (kev->flags & EV_CLEAR) { 1030 kn->kn_hookid = 0; 1031 kn->kn_data = 0; 1032 kn->kn_fflags = 0; 1033 } 1034 break; 1035 1036 case EVENT_PROCESS: 1037 *kev = kn->kn_kevent; 1038 kev->fflags = kn->kn_sfflags; 1039 kev->data = kn->kn_sdata; 1040 if (kn->kn_flags & EV_CLEAR) { 1041 kn->kn_hookid = 0; 1042 kn->kn_data = 0; 1043 kn->kn_fflags = 0; 1044 } 1045 break; 1046 1047 default: 1048 panic("filt_usertouch() - invalid type (%ld)", type); 1049 break; 1050 } 1051 } 1052 1053 int 1054 sys_kqueue(struct thread *td, struct kqueue_args *uap) 1055 { 1056 1057 return (kern_kqueue(td, 0, NULL)); 1058 } 1059 1060 int 1061 sys_kqueuex(struct thread *td, struct kqueuex_args *uap) 1062 { 1063 int flags; 1064 1065 if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0) 1066 return (EINVAL); 1067 flags = 0; 1068 if ((uap->flags & KQUEUE_CLOEXEC) != 0) 1069 flags |= O_CLOEXEC; 1070 return (kern_kqueue(td, flags, NULL)); 1071 } 1072 1073 static void 1074 kqueue_init(struct kqueue *kq) 1075 { 1076 1077 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); 1078 TAILQ_INIT(&kq->kq_head); 1079 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); 1080 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); 1081 } 1082 1083 int 1084 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) 1085 { 1086 struct filedesc *fdp; 1087 struct kqueue *kq; 1088 struct file *fp; 1089 struct ucred *cred; 1090 int fd, error; 1091 1092 fdp = td->td_proc->p_fd; 1093 cred = td->td_ucred; 1094 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) 1095 return (ENOMEM); 1096 1097 error = falloc_caps(td, &fp, &fd, flags, fcaps); 1098 if (error != 0) { 1099 chgkqcnt(cred->cr_ruidinfo, -1, 0); 1100 return (error); 1101 } 1102 1103 /* An extra reference on `fp' has been held for us by falloc(). 
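	 * The new descriptor already references fp as well, so the fdrop()
	 * after finit() below only gives back that extra reference.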
*/ 1104 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); 1105 kqueue_init(kq); 1106 kq->kq_fdp = fdp; 1107 kq->kq_cred = crhold(cred); 1108 1109 FILEDESC_XLOCK(fdp); 1110 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); 1111 FILEDESC_XUNLOCK(fdp); 1112 1113 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); 1114 fdrop(fp, td); 1115 1116 td->td_retval[0] = fd; 1117 return (0); 1118 } 1119 1120 struct g_kevent_args { 1121 int fd; 1122 const void *changelist; 1123 int nchanges; 1124 void *eventlist; 1125 int nevents; 1126 const struct timespec *timeout; 1127 }; 1128 1129 int 1130 sys_kevent(struct thread *td, struct kevent_args *uap) 1131 { 1132 struct kevent_copyops k_ops = { 1133 .arg = uap, 1134 .k_copyout = kevent_copyout, 1135 .k_copyin = kevent_copyin, 1136 .kevent_size = sizeof(struct kevent), 1137 }; 1138 struct g_kevent_args gk_args = { 1139 .fd = uap->fd, 1140 .changelist = uap->changelist, 1141 .nchanges = uap->nchanges, 1142 .eventlist = uap->eventlist, 1143 .nevents = uap->nevents, 1144 .timeout = uap->timeout, 1145 }; 1146 1147 return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); 1148 } 1149 1150 static int 1151 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, 1152 struct kevent_copyops *k_ops, const char *struct_name) 1153 { 1154 struct timespec ts, *tsp; 1155 #ifdef KTRACE 1156 struct kevent *eventlist = uap->eventlist; 1157 #endif 1158 int error; 1159 1160 if (uap->timeout != NULL) { 1161 error = copyin(uap->timeout, &ts, sizeof(ts)); 1162 if (error) 1163 return (error); 1164 tsp = &ts; 1165 } else 1166 tsp = NULL; 1167 1168 #ifdef KTRACE 1169 if (KTRPOINT(td, KTR_STRUCT_ARRAY)) 1170 ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, 1171 uap->nchanges, k_ops->kevent_size); 1172 #endif 1173 1174 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, 1175 k_ops, tsp); 1176 1177 #ifdef KTRACE 1178 if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) 1179 ktrstructarray(struct_name, UIO_USERSPACE, eventlist, 1180 td->td_retval[0], k_ops->kevent_size); 1181 #endif 1182 1183 return (error); 1184 } 1185 1186 /* 1187 * Copy 'count' items into the destination list pointed to by uap->eventlist. 1188 */ 1189 static int 1190 kevent_copyout(void *arg, struct kevent *kevp, int count) 1191 { 1192 struct kevent_args *uap; 1193 int error; 1194 1195 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1196 uap = (struct kevent_args *)arg; 1197 1198 error = copyout(kevp, uap->eventlist, count * sizeof *kevp); 1199 if (error == 0) 1200 uap->eventlist += count; 1201 return (error); 1202 } 1203 1204 /* 1205 * Copy 'count' items from the list pointed to by uap->changelist. 
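 * kqueue_kevent() drives this in batches of at most KQ_NEVENTS entries, so
 * it may run several times for a single kevent(2) call; uap->changelist is
 * advanced past each batch that is copied in successfully.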
1206 */ 1207 static int 1208 kevent_copyin(void *arg, struct kevent *kevp, int count) 1209 { 1210 struct kevent_args *uap; 1211 int error; 1212 1213 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1214 uap = (struct kevent_args *)arg; 1215 1216 error = copyin(uap->changelist, kevp, count * sizeof *kevp); 1217 if (error == 0) 1218 uap->changelist += count; 1219 return (error); 1220 } 1221 1222 #ifdef COMPAT_FREEBSD11 1223 static int 1224 kevent11_copyout(void *arg, struct kevent *kevp, int count) 1225 { 1226 struct freebsd11_kevent_args *uap; 1227 struct freebsd11_kevent kev11; 1228 int error, i; 1229 1230 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1231 uap = (struct freebsd11_kevent_args *)arg; 1232 1233 for (i = 0; i < count; i++) { 1234 kev11.ident = kevp->ident; 1235 kev11.filter = kevp->filter; 1236 kev11.flags = kevp->flags; 1237 kev11.fflags = kevp->fflags; 1238 kev11.data = kevp->data; 1239 kev11.udata = kevp->udata; 1240 error = copyout(&kev11, uap->eventlist, sizeof(kev11)); 1241 if (error != 0) 1242 break; 1243 uap->eventlist++; 1244 kevp++; 1245 } 1246 return (error); 1247 } 1248 1249 /* 1250 * Copy 'count' items from the list pointed to by uap->changelist. 1251 */ 1252 static int 1253 kevent11_copyin(void *arg, struct kevent *kevp, int count) 1254 { 1255 struct freebsd11_kevent_args *uap; 1256 struct freebsd11_kevent kev11; 1257 int error, i; 1258 1259 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1260 uap = (struct freebsd11_kevent_args *)arg; 1261 1262 for (i = 0; i < count; i++) { 1263 error = copyin(uap->changelist, &kev11, sizeof(kev11)); 1264 if (error != 0) 1265 break; 1266 kevp->ident = kev11.ident; 1267 kevp->filter = kev11.filter; 1268 kevp->flags = kev11.flags; 1269 kevp->fflags = kev11.fflags; 1270 kevp->data = (uintptr_t)kev11.data; 1271 kevp->udata = kev11.udata; 1272 bzero(&kevp->ext, sizeof(kevp->ext)); 1273 uap->changelist++; 1274 kevp++; 1275 } 1276 return (error); 1277 } 1278 1279 int 1280 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) 1281 { 1282 struct kevent_copyops k_ops = { 1283 .arg = uap, 1284 .k_copyout = kevent11_copyout, 1285 .k_copyin = kevent11_copyin, 1286 .kevent_size = sizeof(struct freebsd11_kevent), 1287 }; 1288 struct g_kevent_args gk_args = { 1289 .fd = uap->fd, 1290 .changelist = uap->changelist, 1291 .nchanges = uap->nchanges, 1292 .eventlist = uap->eventlist, 1293 .nevents = uap->nevents, 1294 .timeout = uap->timeout, 1295 }; 1296 1297 return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent")); 1298 } 1299 #endif 1300 1301 int 1302 kern_kevent(struct thread *td, int fd, int nchanges, int nevents, 1303 struct kevent_copyops *k_ops, const struct timespec *timeout) 1304 { 1305 cap_rights_t rights; 1306 struct file *fp; 1307 int error; 1308 1309 cap_rights_init_zero(&rights); 1310 if (nchanges > 0) 1311 cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE); 1312 if (nevents > 0) 1313 cap_rights_set_one(&rights, CAP_KQUEUE_EVENT); 1314 error = fget(td, fd, &rights, &fp); 1315 if (error != 0) 1316 return (error); 1317 1318 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); 1319 fdrop(fp, td); 1320 1321 return (error); 1322 } 1323 1324 static int 1325 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, 1326 struct kevent_copyops *k_ops, const struct timespec *timeout) 1327 { 1328 struct kevent keva[KQ_NEVENTS]; 1329 struct kevent *kevp, *changes; 1330 int i, n, nerrors, error; 1331 1332 if (nchanges < 0) 1333 
return (EINVAL); 1334 1335 nerrors = 0; 1336 while (nchanges > 0) { 1337 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; 1338 error = k_ops->k_copyin(k_ops->arg, keva, n); 1339 if (error) 1340 return (error); 1341 changes = keva; 1342 for (i = 0; i < n; i++) { 1343 kevp = &changes[i]; 1344 if (!kevp->filter) 1345 continue; 1346 kevp->flags &= ~EV_SYSFLAGS; 1347 error = kqueue_register(kq, kevp, td, M_WAITOK); 1348 if (error || (kevp->flags & EV_RECEIPT)) { 1349 if (nevents == 0) 1350 return (error); 1351 kevp->flags = EV_ERROR; 1352 kevp->data = error; 1353 (void)k_ops->k_copyout(k_ops->arg, kevp, 1); 1354 nevents--; 1355 nerrors++; 1356 } 1357 } 1358 nchanges -= n; 1359 } 1360 if (nerrors) { 1361 td->td_retval[0] = nerrors; 1362 return (0); 1363 } 1364 1365 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); 1366 } 1367 1368 int 1369 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, 1370 struct kevent_copyops *k_ops, const struct timespec *timeout) 1371 { 1372 struct kqueue *kq; 1373 int error; 1374 1375 error = kqueue_acquire(fp, &kq); 1376 if (error != 0) 1377 return (error); 1378 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); 1379 kqueue_release(kq, 0); 1380 return (error); 1381 } 1382 1383 /* 1384 * Performs a kevent() call on a temporarily created kqueue. This can be 1385 * used to perform one-shot polling, similar to poll() and select(). 1386 */ 1387 int 1388 kern_kevent_anonymous(struct thread *td, int nevents, 1389 struct kevent_copyops *k_ops) 1390 { 1391 struct kqueue kq = {}; 1392 int error; 1393 1394 kqueue_init(&kq); 1395 kq.kq_refcnt = 1; 1396 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); 1397 kqueue_drain(&kq, td); 1398 kqueue_destroy(&kq); 1399 return (error); 1400 } 1401 1402 int 1403 kqueue_add_filteropts(int filt, struct filterops *filtops) 1404 { 1405 int error; 1406 1407 error = 0; 1408 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { 1409 printf( 1410 "trying to add a filterop that is out of range: %d is beyond %d\n", 1411 ~filt, EVFILT_SYSCOUNT); 1412 return EINVAL; 1413 } 1414 mtx_lock(&filterops_lock); 1415 if (sysfilt_ops[~filt].for_fop != &null_filtops && 1416 sysfilt_ops[~filt].for_fop != NULL) 1417 error = EEXIST; 1418 else { 1419 sysfilt_ops[~filt].for_fop = filtops; 1420 sysfilt_ops[~filt].for_refcnt = 0; 1421 } 1422 mtx_unlock(&filterops_lock); 1423 1424 return (error); 1425 } 1426 1427 int 1428 kqueue_del_filteropts(int filt) 1429 { 1430 int error; 1431 1432 error = 0; 1433 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1434 return EINVAL; 1435 1436 mtx_lock(&filterops_lock); 1437 if (sysfilt_ops[~filt].for_fop == &null_filtops || 1438 sysfilt_ops[~filt].for_fop == NULL) 1439 error = EINVAL; 1440 else if (sysfilt_ops[~filt].for_refcnt != 0) 1441 error = EBUSY; 1442 else { 1443 sysfilt_ops[~filt].for_fop = &null_filtops; 1444 sysfilt_ops[~filt].for_refcnt = 0; 1445 } 1446 mtx_unlock(&filterops_lock); 1447 1448 return error; 1449 } 1450 1451 static struct filterops * 1452 kqueue_fo_find(int filt) 1453 { 1454 1455 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1456 return NULL; 1457 1458 if (sysfilt_ops[~filt].for_nolock) 1459 return sysfilt_ops[~filt].for_fop; 1460 1461 mtx_lock(&filterops_lock); 1462 sysfilt_ops[~filt].for_refcnt++; 1463 if (sysfilt_ops[~filt].for_fop == NULL) 1464 sysfilt_ops[~filt].for_fop = &null_filtops; 1465 mtx_unlock(&filterops_lock); 1466 1467 return sysfilt_ops[~filt].for_fop; 1468 } 1469 1470 static void 1471 kqueue_fo_release(int filt) 1472 { 1473 1474 if (filt > 
0 || filt + EVFILT_SYSCOUNT < 0) 1475 return; 1476 1477 if (sysfilt_ops[~filt].for_nolock) 1478 return; 1479 1480 mtx_lock(&filterops_lock); 1481 KASSERT(sysfilt_ops[~filt].for_refcnt > 0, 1482 ("filter object refcount not valid on release")); 1483 sysfilt_ops[~filt].for_refcnt--; 1484 mtx_unlock(&filterops_lock); 1485 } 1486 1487 /* 1488 * A ref to kq (obtained via kqueue_acquire) must be held. 1489 */ 1490 static int 1491 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, 1492 int mflag) 1493 { 1494 struct filterops *fops; 1495 struct file *fp; 1496 struct knote *kn, *tkn; 1497 struct knlist *knl; 1498 int error, filt, event; 1499 int haskqglobal, filedesc_unlock; 1500 1501 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) 1502 return (EINVAL); 1503 1504 fp = NULL; 1505 kn = NULL; 1506 knl = NULL; 1507 error = 0; 1508 haskqglobal = 0; 1509 filedesc_unlock = 0; 1510 1511 filt = kev->filter; 1512 fops = kqueue_fo_find(filt); 1513 if (fops == NULL) 1514 return EINVAL; 1515 1516 if (kev->flags & EV_ADD) { 1517 /* Reject an invalid flag pair early */ 1518 if (kev->flags & EV_KEEPUDATA) { 1519 tkn = NULL; 1520 error = EINVAL; 1521 goto done; 1522 } 1523 1524 /* 1525 * Prevent waiting with locks. Non-sleepable 1526 * allocation failures are handled in the loop, only 1527 * if the spare knote appears to be actually required. 1528 */ 1529 tkn = knote_alloc(mflag); 1530 } else { 1531 tkn = NULL; 1532 } 1533 1534 findkn: 1535 if (fops->f_isfd) { 1536 KASSERT(td != NULL, ("td is NULL")); 1537 if (kev->ident > INT_MAX) 1538 error = EBADF; 1539 else 1540 error = fget(td, kev->ident, &cap_event_rights, &fp); 1541 if (error) 1542 goto done; 1543 1544 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, 1545 kev->ident, M_NOWAIT) != 0) { 1546 /* try again */ 1547 fdrop(fp, td); 1548 fp = NULL; 1549 error = kqueue_expand(kq, fops, kev->ident, mflag); 1550 if (error) 1551 goto done; 1552 goto findkn; 1553 } 1554 1555 if (fp->f_type == DTYPE_KQUEUE) { 1556 /* 1557 * If we add some intelligence about what we are doing, 1558 * we should be able to support events on ourselves. 1559 * We need to know when we are doing this to prevent 1560 * getting both the knlist lock and the kq lock since 1561 * they are the same thing. 1562 */ 1563 if (fp->f_data == kq) { 1564 error = EINVAL; 1565 goto done; 1566 } 1567 1568 /* 1569 * Pre-lock the filedesc before the global 1570 * lock mutex, see the comment in 1571 * kqueue_close(). 1572 */ 1573 FILEDESC_XLOCK(td->td_proc->p_fd); 1574 filedesc_unlock = 1; 1575 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1576 } 1577 1578 KQ_LOCK(kq); 1579 if (kev->ident < kq->kq_knlistsize) { 1580 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) 1581 if (kev->filter == kn->kn_filter) 1582 break; 1583 } 1584 } else { 1585 if ((kev->flags & EV_ADD) == EV_ADD) { 1586 error = kqueue_expand(kq, fops, kev->ident, mflag); 1587 if (error != 0) 1588 goto done; 1589 } 1590 1591 KQ_LOCK(kq); 1592 1593 /* 1594 * If possible, find an existing knote to use for this kevent. 1595 */ 1596 if (kev->filter == EVFILT_PROC && 1597 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { 1598 /* This is an internal creation of a process tracking 1599 * note. Don't attempt to coalesce this with an 1600 * existing note. 
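			 * (These are the EV_FLAG1/EV_FLAG2 registrations
			 * issued by knote_fork(); they must stay separate
			 * because NOTE_CHILD and NOTE_EXIT both report
			 * through the data field.)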
1601 */ 1602 ; 1603 } else if (kq->kq_knhashmask != 0) { 1604 struct klist *list; 1605 1606 list = &kq->kq_knhash[ 1607 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1608 SLIST_FOREACH(kn, list, kn_link) 1609 if (kev->ident == kn->kn_id && 1610 kev->filter == kn->kn_filter) 1611 break; 1612 } 1613 } 1614 1615 /* knote is in the process of changing, wait for it to stabilize. */ 1616 if (kn != NULL && kn_in_flux(kn)) { 1617 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1618 if (filedesc_unlock) { 1619 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1620 filedesc_unlock = 0; 1621 } 1622 kq->kq_state |= KQ_FLUXWAIT; 1623 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); 1624 if (fp != NULL) { 1625 fdrop(fp, td); 1626 fp = NULL; 1627 } 1628 goto findkn; 1629 } 1630 1631 /* 1632 * kn now contains the matching knote, or NULL if no match 1633 */ 1634 if (kn == NULL) { 1635 if (kev->flags & EV_ADD) { 1636 kn = tkn; 1637 tkn = NULL; 1638 if (kn == NULL) { 1639 KQ_UNLOCK(kq); 1640 error = ENOMEM; 1641 goto done; 1642 } 1643 kn->kn_fp = fp; 1644 kn->kn_kq = kq; 1645 kn->kn_fop = fops; 1646 /* 1647 * apply reference counts to knote structure, and 1648 * do not release it at the end of this routine. 1649 */ 1650 fops = NULL; 1651 fp = NULL; 1652 1653 kn->kn_sfflags = kev->fflags; 1654 kn->kn_sdata = kev->data; 1655 kev->fflags = 0; 1656 kev->data = 0; 1657 kn->kn_kevent = *kev; 1658 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | 1659 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); 1660 kn->kn_status = KN_DETACHED; 1661 if ((kev->flags & EV_DISABLE) != 0) 1662 kn->kn_status |= KN_DISABLED; 1663 kn_enter_flux(kn); 1664 1665 error = knote_attach(kn, kq); 1666 KQ_UNLOCK(kq); 1667 if (error != 0) { 1668 tkn = kn; 1669 goto done; 1670 } 1671 1672 if ((error = kn->kn_fop->f_attach(kn)) != 0) { 1673 knote_drop_detached(kn, td); 1674 goto done; 1675 } 1676 knl = kn_list_lock(kn); 1677 goto done_ev_add; 1678 } else { 1679 /* No matching knote and the EV_ADD flag is not set. */ 1680 KQ_UNLOCK(kq); 1681 error = ENOENT; 1682 goto done; 1683 } 1684 } 1685 1686 if (kev->flags & EV_DELETE) { 1687 kn_enter_flux(kn); 1688 KQ_UNLOCK(kq); 1689 knote_drop(kn, td); 1690 goto done; 1691 } 1692 1693 if (kev->flags & EV_FORCEONESHOT) { 1694 kn->kn_flags |= EV_ONESHOT; 1695 KNOTE_ACTIVATE(kn, 1); 1696 } 1697 1698 if ((kev->flags & EV_ENABLE) != 0) 1699 kn->kn_status &= ~KN_DISABLED; 1700 else if ((kev->flags & EV_DISABLE) != 0) 1701 kn->kn_status |= KN_DISABLED; 1702 1703 /* 1704 * The user may change some filter values after the initial EV_ADD, 1705 * but doing so will not reset any filter which has already been 1706 * triggered. 1707 */ 1708 kn->kn_status |= KN_SCAN; 1709 kn_enter_flux(kn); 1710 KQ_UNLOCK(kq); 1711 knl = kn_list_lock(kn); 1712 if ((kev->flags & EV_KEEPUDATA) == 0) 1713 kn->kn_kevent.udata = kev->udata; 1714 if (!fops->f_isfd && fops->f_touch != NULL) { 1715 fops->f_touch(kn, kev, EVENT_REGISTER); 1716 } else { 1717 kn->kn_sfflags = kev->fflags; 1718 kn->kn_sdata = kev->data; 1719 } 1720 1721 done_ev_add: 1722 /* 1723 * We can get here with kn->kn_knlist == NULL. This can happen when 1724 * the initial attach event decides that the event is "completed" 1725 * already, e.g., filt_procattach() is called on a zombie process. It 1726 * will call filt_proc() which will remove it from the list, and NULL 1727 * kn_knlist. 1728 * 1729 * KN_DISABLED will be stable while the knote is in flux, so the 1730 * unlocked read will not race with an update. 
1731 */ 1732 if ((kn->kn_status & KN_DISABLED) == 0) 1733 event = kn->kn_fop->f_event(kn, 0); 1734 else 1735 event = 0; 1736 1737 KQ_LOCK(kq); 1738 if (event) 1739 kn->kn_status |= KN_ACTIVE; 1740 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == 1741 KN_ACTIVE) 1742 knote_enqueue(kn); 1743 kn->kn_status &= ~KN_SCAN; 1744 kn_leave_flux(kn); 1745 kn_list_unlock(knl); 1746 KQ_UNLOCK_FLUX(kq); 1747 1748 done: 1749 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1750 if (filedesc_unlock) 1751 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1752 if (fp != NULL) 1753 fdrop(fp, td); 1754 knote_free(tkn); 1755 if (fops != NULL) 1756 kqueue_fo_release(filt); 1757 return (error); 1758 } 1759 1760 static int 1761 kqueue_acquire(struct file *fp, struct kqueue **kqp) 1762 { 1763 int error; 1764 struct kqueue *kq; 1765 1766 error = 0; 1767 1768 kq = fp->f_data; 1769 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) 1770 return (EBADF); 1771 *kqp = kq; 1772 KQ_LOCK(kq); 1773 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { 1774 KQ_UNLOCK(kq); 1775 return (EBADF); 1776 } 1777 kq->kq_refcnt++; 1778 KQ_UNLOCK(kq); 1779 1780 return error; 1781 } 1782 1783 static void 1784 kqueue_release(struct kqueue *kq, int locked) 1785 { 1786 if (locked) 1787 KQ_OWNED(kq); 1788 else 1789 KQ_LOCK(kq); 1790 kq->kq_refcnt--; 1791 if (kq->kq_refcnt == 1) 1792 wakeup(&kq->kq_refcnt); 1793 if (!locked) 1794 KQ_UNLOCK(kq); 1795 } 1796 1797 static void 1798 ast_kqueue(struct thread *td, int tda __unused) 1799 { 1800 taskqueue_quiesce(taskqueue_kqueue_ctx); 1801 } 1802 1803 static void 1804 kqueue_schedtask(struct kqueue *kq) 1805 { 1806 KQ_OWNED(kq); 1807 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), 1808 ("scheduling kqueue task while draining")); 1809 1810 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { 1811 taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); 1812 kq->kq_state |= KQ_TASKSCHED; 1813 ast_sched(curthread, TDA_KQUEUE); 1814 } 1815 } 1816 1817 /* 1818 * Expand the kq to make sure we have storage for fops/ident pair. 1819 * 1820 * Return 0 on success (or no work necessary), return errno on failure. 1821 */ 1822 static int 1823 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, 1824 int mflag) 1825 { 1826 struct klist *list, *tmp_knhash, *to_free; 1827 u_long tmp_knhashmask; 1828 int error, fd, size; 1829 1830 KQ_NOTOWNED(kq); 1831 1832 error = 0; 1833 to_free = NULL; 1834 if (fops->f_isfd) { 1835 fd = ident; 1836 if (kq->kq_knlistsize <= fd) { 1837 size = kq->kq_knlistsize; 1838 while (size <= fd) 1839 size += KQEXTENT; 1840 list = malloc(size * sizeof(*list), M_KQUEUE, mflag); 1841 if (list == NULL) 1842 return ENOMEM; 1843 KQ_LOCK(kq); 1844 if ((kq->kq_state & KQ_CLOSING) != 0) { 1845 to_free = list; 1846 error = EBADF; 1847 } else if (kq->kq_knlistsize > fd) { 1848 to_free = list; 1849 } else { 1850 if (kq->kq_knlist != NULL) { 1851 bcopy(kq->kq_knlist, list, 1852 kq->kq_knlistsize * sizeof(*list)); 1853 to_free = kq->kq_knlist; 1854 kq->kq_knlist = NULL; 1855 } 1856 bzero((caddr_t)list + 1857 kq->kq_knlistsize * sizeof(*list), 1858 (size - kq->kq_knlistsize) * sizeof(*list)); 1859 kq->kq_knlistsize = size; 1860 kq->kq_knlist = list; 1861 } 1862 KQ_UNLOCK(kq); 1863 } 1864 } else { 1865 if (kq->kq_knhashmask == 0) { 1866 tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, 1867 &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? 
1868 HASH_WAITOK : HASH_NOWAIT); 1869 if (tmp_knhash == NULL) 1870 return (ENOMEM); 1871 KQ_LOCK(kq); 1872 if ((kq->kq_state & KQ_CLOSING) != 0) { 1873 to_free = tmp_knhash; 1874 error = EBADF; 1875 } else if (kq->kq_knhashmask == 0) { 1876 kq->kq_knhash = tmp_knhash; 1877 kq->kq_knhashmask = tmp_knhashmask; 1878 } else { 1879 to_free = tmp_knhash; 1880 } 1881 KQ_UNLOCK(kq); 1882 } 1883 } 1884 free(to_free, M_KQUEUE); 1885 1886 KQ_NOTOWNED(kq); 1887 return (error); 1888 } 1889 1890 static void 1891 kqueue_task(void *arg, int pending) 1892 { 1893 struct kqueue *kq; 1894 int haskqglobal; 1895 1896 haskqglobal = 0; 1897 kq = arg; 1898 1899 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1900 KQ_LOCK(kq); 1901 1902 KNOTE_LOCKED(&kq->kq_sel.si_note, 0); 1903 1904 kq->kq_state &= ~KQ_TASKSCHED; 1905 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { 1906 wakeup(&kq->kq_state); 1907 } 1908 KQ_UNLOCK(kq); 1909 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1910 } 1911 1912 /* 1913 * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 1914 * We treat KN_MARKER knotes as if they are in flux. 1915 */ 1916 static int 1917 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, 1918 const struct timespec *tsp, struct kevent *keva, struct thread *td) 1919 { 1920 struct kevent *kevp; 1921 struct knote *kn, *marker; 1922 struct knlist *knl; 1923 sbintime_t asbt, rsbt; 1924 int count, error, haskqglobal, influx, nkev, touch; 1925 1926 count = maxevents; 1927 nkev = 0; 1928 error = 0; 1929 haskqglobal = 0; 1930 1931 if (maxevents == 0) 1932 goto done_nl; 1933 if (maxevents < 0) { 1934 error = EINVAL; 1935 goto done_nl; 1936 } 1937 1938 rsbt = 0; 1939 if (tsp != NULL) { 1940 if (!timespecvalid_interval(tsp)) { 1941 error = EINVAL; 1942 goto done_nl; 1943 } 1944 if (timespecisset(tsp)) { 1945 if (tsp->tv_sec <= INT32_MAX) { 1946 rsbt = tstosbt(*tsp); 1947 if (TIMESEL(&asbt, rsbt)) 1948 asbt += tc_tick_sbt; 1949 if (asbt <= SBT_MAX - rsbt) 1950 asbt += rsbt; 1951 else 1952 asbt = 0; 1953 rsbt >>= tc_precexp; 1954 } else 1955 asbt = 0; 1956 } else 1957 asbt = -1; 1958 } else 1959 asbt = 0; 1960 marker = knote_alloc(M_WAITOK); 1961 marker->kn_status = KN_MARKER; 1962 KQ_LOCK(kq); 1963 1964 retry: 1965 kevp = keva; 1966 if (kq->kq_count == 0) { 1967 if (asbt == -1) { 1968 error = EWOULDBLOCK; 1969 } else { 1970 kq->kq_state |= KQ_SLEEP; 1971 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, 1972 "kqread", asbt, rsbt, C_ABSOLUTE); 1973 } 1974 if (error == 0) 1975 goto retry; 1976 /* don't restart after signals... 
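		 * a timeout surfaces here as EWOULDBLOCK from msleep_sbt()
		 * and is reported as a normal return with no events, while
		 * ERESTART is mapped to EINTR so the call is not
		 * transparently restarted.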
*/ 1977 if (error == ERESTART) 1978 error = EINTR; 1979 else if (error == EWOULDBLOCK) 1980 error = 0; 1981 goto done; 1982 } 1983 1984 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); 1985 influx = 0; 1986 while (count) { 1987 KQ_OWNED(kq); 1988 kn = TAILQ_FIRST(&kq->kq_head); 1989 1990 if ((kn->kn_status == KN_MARKER && kn != marker) || 1991 kn_in_flux(kn)) { 1992 if (influx) { 1993 influx = 0; 1994 KQ_FLUX_WAKEUP(kq); 1995 } 1996 kq->kq_state |= KQ_FLUXWAIT; 1997 error = msleep(kq, &kq->kq_lock, PSOCK, 1998 "kqflxwt", 0); 1999 continue; 2000 } 2001 2002 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 2003 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { 2004 kn->kn_status &= ~KN_QUEUED; 2005 kq->kq_count--; 2006 continue; 2007 } 2008 if (kn == marker) { 2009 KQ_FLUX_WAKEUP(kq); 2010 if (count == maxevents) 2011 goto retry; 2012 goto done; 2013 } 2014 KASSERT(!kn_in_flux(kn), 2015 ("knote %p is unexpectedly in flux", kn)); 2016 2017 if ((kn->kn_flags & EV_DROP) == EV_DROP) { 2018 kn->kn_status &= ~KN_QUEUED; 2019 kn_enter_flux(kn); 2020 kq->kq_count--; 2021 KQ_UNLOCK(kq); 2022 /* 2023 * We don't need to lock the list since we've 2024 * marked it as in flux. 2025 */ 2026 knote_drop(kn, td); 2027 KQ_LOCK(kq); 2028 continue; 2029 } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { 2030 kn->kn_status &= ~KN_QUEUED; 2031 kn_enter_flux(kn); 2032 kq->kq_count--; 2033 KQ_UNLOCK(kq); 2034 /* 2035 * We don't need to lock the list since we've 2036 * marked the knote as being in flux. 2037 */ 2038 *kevp = kn->kn_kevent; 2039 knote_drop(kn, td); 2040 KQ_LOCK(kq); 2041 kn = NULL; 2042 } else { 2043 kn->kn_status |= KN_SCAN; 2044 kn_enter_flux(kn); 2045 KQ_UNLOCK(kq); 2046 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) 2047 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 2048 knl = kn_list_lock(kn); 2049 if (kn->kn_fop->f_event(kn, 0) == 0) { 2050 KQ_LOCK(kq); 2051 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 2052 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | 2053 KN_SCAN); 2054 kn_leave_flux(kn); 2055 kq->kq_count--; 2056 kn_list_unlock(knl); 2057 influx = 1; 2058 continue; 2059 } 2060 touch = (!kn->kn_fop->f_isfd && 2061 kn->kn_fop->f_touch != NULL); 2062 if (touch) 2063 kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); 2064 else 2065 *kevp = kn->kn_kevent; 2066 KQ_LOCK(kq); 2067 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 2068 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { 2069 /* 2070 * Manually clear knotes who weren't 2071 * 'touch'ed. 
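				 * (i.e. filters without an f_touch method;
				 * their kn_kevent was copied out above, so
				 * EV_CLEAR is applied here instead of in an
				 * EVENT_PROCESS touch callback.)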
2072 */ 2073 if (touch == 0 && kn->kn_flags & EV_CLEAR) { 2074 kn->kn_data = 0; 2075 kn->kn_fflags = 0; 2076 } 2077 if (kn->kn_flags & EV_DISPATCH) 2078 kn->kn_status |= KN_DISABLED; 2079 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); 2080 kq->kq_count--; 2081 } else 2082 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 2083 2084 kn->kn_status &= ~KN_SCAN; 2085 kn_leave_flux(kn); 2086 kn_list_unlock(knl); 2087 influx = 1; 2088 } 2089 2090 /* we are returning a copy to the user */ 2091 kevp++; 2092 nkev++; 2093 count--; 2094 2095 if (nkev == KQ_NEVENTS) { 2096 influx = 0; 2097 KQ_UNLOCK_FLUX(kq); 2098 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 2099 nkev = 0; 2100 kevp = keva; 2101 KQ_LOCK(kq); 2102 if (error) 2103 break; 2104 } 2105 } 2106 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); 2107 done: 2108 KQ_OWNED(kq); 2109 KQ_UNLOCK_FLUX(kq); 2110 knote_free(marker); 2111 done_nl: 2112 KQ_NOTOWNED(kq); 2113 if (nkev != 0) 2114 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 2115 td->td_retval[0] = maxevents - count; 2116 return (error); 2117 } 2118 2119 /*ARGSUSED*/ 2120 static int 2121 kqueue_ioctl(struct file *fp, u_long cmd, void *data, 2122 struct ucred *active_cred, struct thread *td) 2123 { 2124 /* 2125 * Enabling sigio causes two major problems: 2126 * 1) infinite recursion: 2127 * Synopsys: kevent is being used to track signals and have FIOASYNC 2128 * set. On receipt of a signal this will cause a kqueue to recurse 2129 * into itself over and over. Sending the sigio causes the kqueue 2130 * to become ready, which in turn posts sigio again, forever. 2131 * Solution: this can be solved by setting a flag in the kqueue that 2132 * we have a SIGIO in progress. 2133 * 2) locking problems: 2134 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts 2135 * us above the proc and pgrp locks. 2136 * Solution: Post a signal using an async mechanism, being sure to 2137 * record a generation count in the delivery so that we do not deliver 2138 * a signal to the wrong process. 2139 * 2140 * Note, these two mechanisms are somewhat mutually exclusive! 2141 */ 2142 #if 0 2143 struct kqueue *kq; 2144 2145 kq = fp->f_data; 2146 switch (cmd) { 2147 case FIOASYNC: 2148 if (*(int *)data) { 2149 kq->kq_state |= KQ_ASYNC; 2150 } else { 2151 kq->kq_state &= ~KQ_ASYNC; 2152 } 2153 return (0); 2154 2155 case FIOSETOWN: 2156 return (fsetown(*(int *)data, &kq->kq_sigio)); 2157 2158 case FIOGETOWN: 2159 *(int *)data = fgetown(&kq->kq_sigio); 2160 return (0); 2161 } 2162 #endif 2163 2164 return (ENOTTY); 2165 } 2166 2167 /*ARGSUSED*/ 2168 static int 2169 kqueue_poll(struct file *fp, int events, struct ucred *active_cred, 2170 struct thread *td) 2171 { 2172 struct kqueue *kq; 2173 int revents = 0; 2174 int error; 2175 2176 if ((error = kqueue_acquire(fp, &kq))) 2177 return POLLERR; 2178 2179 KQ_LOCK(kq); 2180 if (events & (POLLIN | POLLRDNORM)) { 2181 if (kq->kq_count) { 2182 revents |= events & (POLLIN | POLLRDNORM); 2183 } else { 2184 selrecord(td, &kq->kq_sel); 2185 if (SEL_WAITING(&kq->kq_sel)) 2186 kq->kq_state |= KQ_SEL; 2187 } 2188 } 2189 kqueue_release(kq, 1); 2190 KQ_UNLOCK(kq); 2191 return (revents); 2192 } 2193 2194 /*ARGSUSED*/ 2195 static int 2196 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred) 2197 { 2198 2199 bzero((void *)st, sizeof *st); 2200 /* 2201 * We no longer return kq_count because the unlocked value is useless. 2202 * If you spent all this time getting the count, why not spend your 2203 * syscall better by calling kevent? 
2204 * 2205 * XXX - This is needed for libc_r. 2206 */ 2207 st->st_mode = S_IFIFO; 2208 return (0); 2209 } 2210 2211 static void 2212 kqueue_drain(struct kqueue *kq, struct thread *td) 2213 { 2214 struct knote *kn; 2215 int i; 2216 2217 KQ_LOCK(kq); 2218 2219 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, 2220 ("kqueue already closing")); 2221 kq->kq_state |= KQ_CLOSING; 2222 if (kq->kq_refcnt > 1) 2223 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); 2224 2225 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); 2226 2227 KASSERT(knlist_empty(&kq->kq_sel.si_note), 2228 ("kqueue's knlist not empty")); 2229 2230 for (i = 0; i < kq->kq_knlistsize; i++) { 2231 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { 2232 if (kn_in_flux(kn)) { 2233 kq->kq_state |= KQ_FLUXWAIT; 2234 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); 2235 continue; 2236 } 2237 kn_enter_flux(kn); 2238 KQ_UNLOCK(kq); 2239 knote_drop(kn, td); 2240 KQ_LOCK(kq); 2241 } 2242 } 2243 if (kq->kq_knhashmask != 0) { 2244 for (i = 0; i <= kq->kq_knhashmask; i++) { 2245 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { 2246 if (kn_in_flux(kn)) { 2247 kq->kq_state |= KQ_FLUXWAIT; 2248 msleep(kq, &kq->kq_lock, PSOCK, 2249 "kqclo2", 0); 2250 continue; 2251 } 2252 kn_enter_flux(kn); 2253 KQ_UNLOCK(kq); 2254 knote_drop(kn, td); 2255 KQ_LOCK(kq); 2256 } 2257 } 2258 } 2259 2260 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { 2261 kq->kq_state |= KQ_TASKDRAIN; 2262 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); 2263 } 2264 2265 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2266 selwakeuppri(&kq->kq_sel, PSOCK); 2267 if (!SEL_WAITING(&kq->kq_sel)) 2268 kq->kq_state &= ~KQ_SEL; 2269 } 2270 2271 KQ_UNLOCK(kq); 2272 } 2273 2274 static void 2275 kqueue_destroy(struct kqueue *kq) 2276 { 2277 2278 KASSERT(kq->kq_fdp == NULL, 2279 ("kqueue still attached to a file descriptor")); 2280 seldrain(&kq->kq_sel); 2281 knlist_destroy(&kq->kq_sel.si_note); 2282 mtx_destroy(&kq->kq_lock); 2283 2284 if (kq->kq_knhash != NULL) 2285 free(kq->kq_knhash, M_KQUEUE); 2286 if (kq->kq_knlist != NULL) 2287 free(kq->kq_knlist, M_KQUEUE); 2288 2289 funsetown(&kq->kq_sigio); 2290 } 2291 2292 /*ARGSUSED*/ 2293 static int 2294 kqueue_close(struct file *fp, struct thread *td) 2295 { 2296 struct kqueue *kq = fp->f_data; 2297 struct filedesc *fdp; 2298 int error; 2299 int filedesc_unlock; 2300 2301 if ((error = kqueue_acquire(fp, &kq))) 2302 return error; 2303 kqueue_drain(kq, td); 2304 2305 /* 2306 * We could be called due to the knote_drop() doing fdrop(), 2307 * called from kqueue_register(). In this case the global 2308 * lock is owned, and filedesc sx is locked before, to not 2309 * take the sleepable lock after non-sleepable. 
2310 */ 2311 fdp = kq->kq_fdp; 2312 kq->kq_fdp = NULL; 2313 if (!sx_xlocked(FILEDESC_LOCK(fdp))) { 2314 FILEDESC_XLOCK(fdp); 2315 filedesc_unlock = 1; 2316 } else 2317 filedesc_unlock = 0; 2318 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); 2319 if (filedesc_unlock) 2320 FILEDESC_XUNLOCK(fdp); 2321 2322 kqueue_destroy(kq); 2323 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); 2324 crfree(kq->kq_cred); 2325 free(kq, M_KQUEUE); 2326 fp->f_data = NULL; 2327 2328 return (0); 2329 } 2330 2331 static int 2332 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2333 { 2334 struct kqueue *kq = fp->f_data; 2335 2336 kif->kf_type = KF_TYPE_KQUEUE; 2337 kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq; 2338 kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count; 2339 kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state; 2340 return (0); 2341 } 2342 2343 static void 2344 kqueue_wakeup(struct kqueue *kq) 2345 { 2346 KQ_OWNED(kq); 2347 2348 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { 2349 kq->kq_state &= ~KQ_SLEEP; 2350 wakeup(kq); 2351 } 2352 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2353 selwakeuppri(&kq->kq_sel, PSOCK); 2354 if (!SEL_WAITING(&kq->kq_sel)) 2355 kq->kq_state &= ~KQ_SEL; 2356 } 2357 if (!knlist_empty(&kq->kq_sel.si_note)) 2358 kqueue_schedtask(kq); 2359 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { 2360 pgsigio(&kq->kq_sigio, SIGIO, 0); 2361 } 2362 } 2363 2364 /* 2365 * Walk down a list of knotes, activating them if their event has triggered. 2366 * 2367 * There is a possibility to optimize in the case of one kq watching another. 2368 * Instead of scheduling a task to wake it up, you could pass enough state 2369 * down the chain to make up the parent kqueue. Make this code functional 2370 * first. 2371 */ 2372 void 2373 knote(struct knlist *list, long hint, int lockflags) 2374 { 2375 struct kqueue *kq; 2376 struct knote *kn, *tkn; 2377 int error; 2378 2379 if (list == NULL) 2380 return; 2381 2382 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); 2383 2384 if ((lockflags & KNF_LISTLOCKED) == 0) 2385 list->kl_lock(list->kl_lockarg); 2386 2387 /* 2388 * If we unlock the list lock (and enter influx), we can 2389 * eliminate the kqueue scheduling, but this will introduce 2390 * four lock/unlock's for each knote to test. Also, marker 2391 * would be needed to keep iteration position, since filters 2392 * or other threads could remove events. 2393 */ 2394 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { 2395 kq = kn->kn_kq; 2396 KQ_LOCK(kq); 2397 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { 2398 /* 2399 * Do not process the influx notes, except for 2400 * the influx coming from the kq unlock in the 2401 * kqueue_scan(). In the later case, we do 2402 * not interfere with the scan, since the code 2403 * fragment in kqueue_scan() locks the knlist, 2404 * and cannot proceed until we finished. 
2405 */ 2406 KQ_UNLOCK(kq); 2407 } else if ((lockflags & KNF_NOKQLOCK) != 0) { 2408 kn_enter_flux(kn); 2409 KQ_UNLOCK(kq); 2410 error = kn->kn_fop->f_event(kn, hint); 2411 KQ_LOCK(kq); 2412 kn_leave_flux(kn); 2413 if (error) 2414 KNOTE_ACTIVATE(kn, 1); 2415 KQ_UNLOCK_FLUX(kq); 2416 } else { 2417 if (kn->kn_fop->f_event(kn, hint)) 2418 KNOTE_ACTIVATE(kn, 1); 2419 KQ_UNLOCK(kq); 2420 } 2421 } 2422 if ((lockflags & KNF_LISTLOCKED) == 0) 2423 list->kl_unlock(list->kl_lockarg); 2424 } 2425 2426 /* 2427 * add a knote to a knlist 2428 */ 2429 void 2430 knlist_add(struct knlist *knl, struct knote *kn, int islocked) 2431 { 2432 2433 KNL_ASSERT_LOCK(knl, islocked); 2434 KQ_NOTOWNED(kn->kn_kq); 2435 KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); 2436 KASSERT((kn->kn_status & KN_DETACHED) != 0, 2437 ("knote %p was not detached", kn)); 2438 if (!islocked) 2439 knl->kl_lock(knl->kl_lockarg); 2440 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); 2441 if (!islocked) 2442 knl->kl_unlock(knl->kl_lockarg); 2443 KQ_LOCK(kn->kn_kq); 2444 kn->kn_knlist = knl; 2445 kn->kn_status &= ~KN_DETACHED; 2446 KQ_UNLOCK(kn->kn_kq); 2447 } 2448 2449 static void 2450 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, 2451 int kqislocked) 2452 { 2453 2454 KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); 2455 KNL_ASSERT_LOCK(knl, knlislocked); 2456 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); 2457 KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); 2458 KASSERT((kn->kn_status & KN_DETACHED) == 0, 2459 ("knote %p was already detached", kn)); 2460 if (!knlislocked) 2461 knl->kl_lock(knl->kl_lockarg); 2462 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); 2463 kn->kn_knlist = NULL; 2464 if (!knlislocked) 2465 kn_list_unlock(knl); 2466 if (!kqislocked) 2467 KQ_LOCK(kn->kn_kq); 2468 kn->kn_status |= KN_DETACHED; 2469 if (!kqislocked) 2470 KQ_UNLOCK(kn->kn_kq); 2471 } 2472 2473 /* 2474 * remove knote from the specified knlist 2475 */ 2476 void 2477 knlist_remove(struct knlist *knl, struct knote *kn, int islocked) 2478 { 2479 2480 knlist_remove_kq(knl, kn, islocked, 0); 2481 } 2482 2483 int 2484 knlist_empty(struct knlist *knl) 2485 { 2486 2487 KNL_ASSERT_LOCKED(knl); 2488 return (SLIST_EMPTY(&knl->kl_list)); 2489 } 2490 2491 static struct mtx knlist_lock; 2492 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", 2493 MTX_DEF); 2494 static void knlist_mtx_lock(void *arg); 2495 static void knlist_mtx_unlock(void *arg); 2496 2497 static void 2498 knlist_mtx_lock(void *arg) 2499 { 2500 2501 mtx_lock((struct mtx *)arg); 2502 } 2503 2504 static void 2505 knlist_mtx_unlock(void *arg) 2506 { 2507 2508 mtx_unlock((struct mtx *)arg); 2509 } 2510 2511 static void 2512 knlist_mtx_assert_lock(void *arg, int what) 2513 { 2514 2515 if (what == LA_LOCKED) 2516 mtx_assert((struct mtx *)arg, MA_OWNED); 2517 else 2518 mtx_assert((struct mtx *)arg, MA_NOTOWNED); 2519 } 2520 2521 void 2522 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), 2523 void (*kl_unlock)(void *), 2524 void (*kl_assert_lock)(void *, int)) 2525 { 2526 2527 if (lock == NULL) 2528 knl->kl_lockarg = &knlist_lock; 2529 else 2530 knl->kl_lockarg = lock; 2531 2532 if (kl_lock == NULL) 2533 knl->kl_lock = knlist_mtx_lock; 2534 else 2535 knl->kl_lock = kl_lock; 2536 if (kl_unlock == NULL) 2537 knl->kl_unlock = knlist_mtx_unlock; 2538 else 2539 knl->kl_unlock = kl_unlock; 2540 if (kl_assert_lock == NULL) 2541 knl->kl_assert_lock = knlist_mtx_assert_lock; 2542 

struct knlist *
knlist_alloc(struct mtx *lock)
{
	struct knlist *knl;

	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
	knlist_init_mtx(knl, lock);
	return (knl);
}

void
knlist_destroy(struct knlist *knl)
{

	KASSERT(KNLIST_EMPTY(knl),
	    ("destroying knlist %p with knotes on it", knl));
}

void
knlist_detach(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	knl->kl_autodestroy = 1;
	if (knlist_empty(knl)) {
		knlist_destroy(knl);
		free(knl, M_KQUEUE);
	}
}

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			knote_drop_detached(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still in-flux knotes remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}
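
/*
 * knlist_cleardel() is normally reached through the knlist_clear() and
 * knlist_delete() wrappers from <sys/event.h>.  A teardown sketch for the
 * assumed "foo" object used in the earlier sketches (illustrative only):
 */
#if 0
static void
foo_detach_events(struct foo_softc *sc)
{

	/* Survivors get EV_EOF | EV_ONESHOT, as in the !killkn case above. */
	knlist_clear(&sc->sc_rsel.si_note, 0);
	seldrain(&sc->sc_rsel);
	knlist_destroy(&sc->sc_rsel.si_note);
	mtx_destroy(&sc->sc_mtx);
}
#endif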
2664 */ 2665 TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { 2666 KQ_LOCK(kq); 2667 2668 again: 2669 influx = 0; 2670 while (kq->kq_knlistsize > fd && 2671 (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { 2672 if (kn_in_flux(kn)) { 2673 /* someone else might be waiting on our knote */ 2674 if (influx) 2675 wakeup(kq); 2676 kq->kq_state |= KQ_FLUXWAIT; 2677 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); 2678 goto again; 2679 } 2680 kn_enter_flux(kn); 2681 KQ_UNLOCK(kq); 2682 influx = 1; 2683 knote_drop(kn, td); 2684 KQ_LOCK(kq); 2685 } 2686 KQ_UNLOCK_FLUX(kq); 2687 } 2688 } 2689 2690 static int 2691 knote_attach(struct knote *kn, struct kqueue *kq) 2692 { 2693 struct klist *list; 2694 2695 KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); 2696 KQ_OWNED(kq); 2697 2698 if ((kq->kq_state & KQ_CLOSING) != 0) 2699 return (EBADF); 2700 if (kn->kn_fop->f_isfd) { 2701 if (kn->kn_id >= kq->kq_knlistsize) 2702 return (ENOMEM); 2703 list = &kq->kq_knlist[kn->kn_id]; 2704 } else { 2705 if (kq->kq_knhash == NULL) 2706 return (ENOMEM); 2707 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2708 } 2709 SLIST_INSERT_HEAD(list, kn, kn_link); 2710 return (0); 2711 } 2712 2713 static void 2714 knote_drop(struct knote *kn, struct thread *td) 2715 { 2716 2717 if ((kn->kn_status & KN_DETACHED) == 0) 2718 kn->kn_fop->f_detach(kn); 2719 knote_drop_detached(kn, td); 2720 } 2721 2722 static void 2723 knote_drop_detached(struct knote *kn, struct thread *td) 2724 { 2725 struct kqueue *kq; 2726 struct klist *list; 2727 2728 kq = kn->kn_kq; 2729 2730 KASSERT((kn->kn_status & KN_DETACHED) != 0, 2731 ("knote %p still attached", kn)); 2732 KQ_NOTOWNED(kq); 2733 2734 KQ_LOCK(kq); 2735 KASSERT(kn->kn_influx == 1, 2736 ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); 2737 2738 if (kn->kn_fop->f_isfd) 2739 list = &kq->kq_knlist[kn->kn_id]; 2740 else 2741 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2742 2743 if (!SLIST_EMPTY(list)) 2744 SLIST_REMOVE(list, kn, knote, kn_link); 2745 if (kn->kn_status & KN_QUEUED) 2746 knote_dequeue(kn); 2747 KQ_UNLOCK_FLUX(kq); 2748 2749 if (kn->kn_fop->f_isfd) { 2750 fdrop(kn->kn_fp, td); 2751 kn->kn_fp = NULL; 2752 } 2753 kqueue_fo_release(kn->kn_kevent.filter); 2754 kn->kn_fop = NULL; 2755 knote_free(kn); 2756 } 2757 2758 static void 2759 knote_enqueue(struct knote *kn) 2760 { 2761 struct kqueue *kq = kn->kn_kq; 2762 2763 KQ_OWNED(kn->kn_kq); 2764 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); 2765 2766 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 2767 kn->kn_status |= KN_QUEUED; 2768 kq->kq_count++; 2769 kqueue_wakeup(kq); 2770 } 2771 2772 static void 2773 knote_dequeue(struct knote *kn) 2774 { 2775 struct kqueue *kq = kn->kn_kq; 2776 2777 KQ_OWNED(kn->kn_kq); 2778 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); 2779 2780 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 2781 kn->kn_status &= ~KN_QUEUED; 2782 kq->kq_count--; 2783 } 2784 2785 static void 2786 knote_init(void) 2787 { 2788 2789 knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, 2790 NULL, NULL, UMA_ALIGN_PTR, 0); 2791 ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); 2792 } 2793 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); 2794 2795 static struct knote * 2796 knote_alloc(int mflag) 2797 { 2798 2799 return (uma_zalloc(knote_zone, mflag | M_ZERO)); 2800 } 2801 2802 static void 2803 knote_free(struct knote *kn) 2804 { 2805 2806 uma_zfree(knote_zone, kn); 2807 } 2808 2809 /* 2810 * Register the kev w/ the kq 

/*
 * Register the kev with the kq specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
{
	struct kqueue *kq;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
	    &fp);
	if (error != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, mflag);
	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);
	return (error);
}
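
/*
 * Sketch of an in-kernel consumer handing a completion kevent to a
 * user-supplied kqueue descriptor via kqfd_register().  The function name
 * and the EVFILT_USER/NOTE_TRIGGER payload below are assumptions for
 * illustration, not code taken from any existing caller.
 */
#if 0
static int
foo_post_completion(struct thread *td, int kqfd, void *udata)
{
	struct kevent kev;

	EV_SET(&kev, (uintptr_t)udata, EVFILT_USER, EV_ADD | EV_ONESHOT,
	    NOTE_TRIGGER, 0, udata);
	return (kqfd_register(kqfd, &kev, td, M_WAITOK));
}
#endif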