/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2009 Apple, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"
#include "opt_kqueue.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <sys/user.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <machine/atomic.h>

#include <vm/uma.h>

static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

/*
 * This lock is used if multiple kq locks are required.  This possibly
 * should be made into a per proc lock.
 */
static struct mtx	kq_global;
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
	if (!haslck)				\
		mtx_lock(lck);			\
	haslck = 1;				\
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
	if (haslck)				\
		mtx_unlock(lck);		\
	haslck = 0;				\
} while (0)

TASKQUEUE_DEFINE_THREAD(kqueue_ctx);

static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
		    struct thread *td, int waitok);
static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void	kqueue_release(struct kqueue *kq, int locked);
static void	kqueue_destroy(struct kqueue *kq);
static void	kqueue_drain(struct kqueue *kq, struct thread *td);
static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
		    uintptr_t ident, int waitok);
static void	kqueue_task(void *arg, int pending);
static int	kqueue_scan(struct kqueue *kq, int maxevents,
		    struct kevent_copyops *k_ops,
		    const struct timespec *timeout,
		    struct kevent *keva, struct thread *td);
static void	kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void	kqueue_fo_release(int filt);
struct g_kevent_args;
static int	kern_kevent_generic(struct thread *td,
		    struct g_kevent_args *uap,
		    struct kevent_copyops *k_ops);

static fo_ioctl_t	kqueue_ioctl;
static fo_poll_t	kqueue_poll;
static fo_kqfilter_t	kqueue_kqfilter;
static fo_stat_t	kqueue_stat;
static fo_close_t	kqueue_close;
static fo_fill_kinfo_t	kqueue_fill_kinfo;

static struct fileops kqueueops = {
	.fo_read = invfo_rdwr,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = kqueue_ioctl,
	.fo_poll = kqueue_poll,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = kqueue_fill_kinfo,
};

static int	knote_attach(struct knote *kn, struct kqueue *kq);
static void	knote_drop(struct knote *kn, struct thread *td);
static void	knote_drop_detached(struct knote *kn, struct thread *td);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);
static void	knote_init(void);
static struct knote	*knote_alloc(int waitok);
static void	knote_free(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);
static int	filt_userattach(struct knote *kn);
static void	filt_userdetach(struct knote *kn);
static int	filt_user(struct knote *kn, long hint);
static void	filt_usertouch(struct knote *kn, struct kevent *kev,
		    u_long type);

static struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
static struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};
/* XXX - move to kern_proc.c? */
static struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};
static struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};
static struct filterops user_filtops = {
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

static uma_zone_t	knote_zone;
static unsigned int	kq_ncallouts = 0;
static unsigned int	kq_calloutmax = 4 * 1024;
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");

/* XXX - ensure not influx ? */
#define KNOTE_ACTIVATE(kn, islock) do {					\
	if ((islock))							\
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
	else								\
		KQ_LOCK((kn)->kn_kq);					\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue((kn));					\
	if (!(islock))							\
		KQ_UNLOCK((kn)->kn_kq);					\
} while(0)
#define KQ_LOCK(kq) do {						\
	mtx_lock(&(kq)->kq_lock);					\
} while (0)
#define KQ_FLUX_WAKEUP(kq) do {						\
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
		wakeup((kq));						\
	}								\
} while (0)
#define KQ_UNLOCK_FLUX(kq) do {						\
	KQ_FLUX_WAKEUP(kq);						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_UNLOCK(kq) do {						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_OWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
} while (0)
#define KQ_NOTOWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
} while (0)

static struct knlist *
kn_list_lock(struct knote *kn)
{
	struct knlist *knl;

	knl = kn->kn_knlist;
	if (knl != NULL)
		knl->kl_lock(knl->kl_lockarg);
	return (knl);
}

static void
kn_list_unlock(struct knlist *knl)
{
	bool do_free;

	if (knl == NULL)
		return;
	do_free = knl->kl_autodestroy && knlist_empty(knl);
	knl->kl_unlock(knl->kl_lockarg);
	if (do_free) {
		knlist_destroy(knl);
		free(knl, M_KQUEUE);
	}
}

static bool
kn_in_flux(struct knote *kn)
{

	return (kn->kn_influx > 0);
}

static void
kn_enter_flux(struct knote *kn)
{

	KQ_OWNED(kn->kn_kq);
	MPASS(kn->kn_influx < INT_MAX);
	kn->kn_influx++;
}

static bool
kn_leave_flux(struct knote *kn)
{

	KQ_OWNED(kn->kn_kq);
	MPASS(kn->kn_influx > 0);
	kn->kn_influx--;
	return (kn->kn_influx == 0);
}

#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
	if (islocked)							\
		KNL_ASSERT_LOCKED(knl);					\
	else								\
		KNL_ASSERT_UNLOCKED(knl);				\
} while (0)
#ifdef INVARIANTS
#define	KNL_ASSERT_LOCKED(knl) do {					\
	knl->kl_assert_locked((knl)->kl_lockarg);			\
} while (0)
#define	KNL_ASSERT_UNLOCKED(knl) do {					\
	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
} while (0)
#else /* !INVARIANTS */
#define	KNL_ASSERT_LOCKED(knl) do {} while(0)
#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */

#ifndef	KN_HASHSIZE
#define	KN_HASHSIZE	64		/* XXX should be tunable */
#endif

#define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
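
/*
 * Illustrative sketch (added commentary, not part of the original code):
 * the in-flux protocol that the code below follows whenever it must call a
 * filter's f_event() without holding the kqueue lock.  The knote is marked
 * in-flux under the kq lock, the lock is dropped around the call, and the
 * knote is taken out of flux afterwards; KQ_UNLOCK_FLUX() wakes any thread
 * sleeping with KQ_FLUXWAIT set on that kqueue:
 *
 *	KQ_LOCK(kq);
 *	kn_enter_flux(kn);
 *	KQ_UNLOCK(kq);
 *	(void)kn->kn_fop->f_event(kn, hint);
 *	KQ_LOCK(kq);
 *	kn_leave_flux(kn);
 *	KQ_UNLOCK_FLUX(kq);
 *
 * See knote(), kqueue_register() and kqueue_scan() for the real users.
 */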
312 313 static int 314 filt_nullattach(struct knote *kn) 315 { 316 317 return (ENXIO); 318 }; 319 320 struct filterops null_filtops = { 321 .f_isfd = 0, 322 .f_attach = filt_nullattach, 323 }; 324 325 /* XXX - make SYSINIT to add these, and move into respective modules. */ 326 extern struct filterops sig_filtops; 327 extern struct filterops fs_filtops; 328 329 /* 330 * Table for for all system-defined filters. 331 */ 332 static struct mtx filterops_lock; 333 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", 334 MTX_DEF); 335 static struct { 336 struct filterops *for_fop; 337 int for_nolock; 338 int for_refcnt; 339 } sysfilt_ops[EVFILT_SYSCOUNT] = { 340 { &file_filtops, 1 }, /* EVFILT_READ */ 341 { &file_filtops, 1 }, /* EVFILT_WRITE */ 342 { &null_filtops }, /* EVFILT_AIO */ 343 { &file_filtops, 1 }, /* EVFILT_VNODE */ 344 { &proc_filtops, 1 }, /* EVFILT_PROC */ 345 { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ 346 { &timer_filtops, 1 }, /* EVFILT_TIMER */ 347 { &file_filtops, 1 }, /* EVFILT_PROCDESC */ 348 { &fs_filtops, 1 }, /* EVFILT_FS */ 349 { &null_filtops }, /* EVFILT_LIO */ 350 { &user_filtops, 1 }, /* EVFILT_USER */ 351 { &null_filtops }, /* EVFILT_SENDFILE */ 352 { &file_filtops, 1 }, /* EVFILT_EMPTY */ 353 }; 354 355 /* 356 * Simple redirection for all cdevsw style objects to call their fo_kqfilter 357 * method. 358 */ 359 static int 360 filt_fileattach(struct knote *kn) 361 { 362 363 return (fo_kqfilter(kn->kn_fp, kn)); 364 } 365 366 /*ARGSUSED*/ 367 static int 368 kqueue_kqfilter(struct file *fp, struct knote *kn) 369 { 370 struct kqueue *kq = kn->kn_fp->f_data; 371 372 if (kn->kn_filter != EVFILT_READ) 373 return (EINVAL); 374 375 kn->kn_status |= KN_KQUEUE; 376 kn->kn_fop = &kqread_filtops; 377 knlist_add(&kq->kq_sel.si_note, kn, 0); 378 379 return (0); 380 } 381 382 static void 383 filt_kqdetach(struct knote *kn) 384 { 385 struct kqueue *kq = kn->kn_fp->f_data; 386 387 knlist_remove(&kq->kq_sel.si_note, kn, 0); 388 } 389 390 /*ARGSUSED*/ 391 static int 392 filt_kqueue(struct knote *kn, long hint) 393 { 394 struct kqueue *kq = kn->kn_fp->f_data; 395 396 kn->kn_data = kq->kq_count; 397 return (kn->kn_data > 0); 398 } 399 400 /* XXX - move to kern_proc.c? */ 401 static int 402 filt_procattach(struct knote *kn) 403 { 404 struct proc *p; 405 int error; 406 bool exiting, immediate; 407 408 exiting = immediate = false; 409 p = pfind(kn->kn_id); 410 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { 411 p = zpfind(kn->kn_id); 412 exiting = true; 413 } else if (p != NULL && (p->p_flag & P_WEXIT)) { 414 exiting = true; 415 } 416 417 if (p == NULL) 418 return (ESRCH); 419 if ((error = p_cansee(curthread, p))) { 420 PROC_UNLOCK(p); 421 return (error); 422 } 423 424 kn->kn_ptr.p_proc = p; 425 kn->kn_flags |= EV_CLEAR; /* automatically set */ 426 427 /* 428 * Internal flag indicating registration done by kernel for the 429 * purposes of getting a NOTE_CHILD notification. 430 */ 431 if (kn->kn_flags & EV_FLAG2) { 432 kn->kn_flags &= ~EV_FLAG2; 433 kn->kn_data = kn->kn_sdata; /* ppid */ 434 kn->kn_fflags = NOTE_CHILD; 435 kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); 436 immediate = true; /* Force immediate activation of child note. */ 437 } 438 /* 439 * Internal flag indicating registration done by kernel (for other than 440 * NOTE_CHILD). 
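	 * EV_FLAG1 is set by knote_fork() when it registers the knote that
	 * tracks a newly forked child; beyond being cleared it needs no
	 * special handling here.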
441 */ 442 if (kn->kn_flags & EV_FLAG1) { 443 kn->kn_flags &= ~EV_FLAG1; 444 } 445 446 knlist_add(p->p_klist, kn, 1); 447 448 /* 449 * Immediately activate any child notes or, in the case of a zombie 450 * target process, exit notes. The latter is necessary to handle the 451 * case where the target process, e.g. a child, dies before the kevent 452 * is registered. 453 */ 454 if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) 455 KNOTE_ACTIVATE(kn, 0); 456 457 PROC_UNLOCK(p); 458 459 return (0); 460 } 461 462 /* 463 * The knote may be attached to a different process, which may exit, 464 * leaving nothing for the knote to be attached to. So when the process 465 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so 466 * it will be deleted when read out. However, as part of the knote deletion, 467 * this routine is called, so a check is needed to avoid actually performing 468 * a detach, because the original process does not exist any more. 469 */ 470 /* XXX - move to kern_proc.c? */ 471 static void 472 filt_procdetach(struct knote *kn) 473 { 474 475 knlist_remove(kn->kn_knlist, kn, 0); 476 kn->kn_ptr.p_proc = NULL; 477 } 478 479 /* XXX - move to kern_proc.c? */ 480 static int 481 filt_proc(struct knote *kn, long hint) 482 { 483 struct proc *p; 484 u_int event; 485 486 p = kn->kn_ptr.p_proc; 487 if (p == NULL) /* already activated, from attach filter */ 488 return (0); 489 490 /* Mask off extra data. */ 491 event = (u_int)hint & NOTE_PCTRLMASK; 492 493 /* If the user is interested in this event, record it. */ 494 if (kn->kn_sfflags & event) 495 kn->kn_fflags |= event; 496 497 /* Process is gone, so flag the event as finished. */ 498 if (event == NOTE_EXIT) { 499 kn->kn_flags |= EV_EOF | EV_ONESHOT; 500 kn->kn_ptr.p_proc = NULL; 501 if (kn->kn_fflags & NOTE_EXIT) 502 kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); 503 if (kn->kn_fflags == 0) 504 kn->kn_flags |= EV_DROP; 505 return (1); 506 } 507 508 return (kn->kn_fflags != 0); 509 } 510 511 /* 512 * Called when the process forked. It mostly does the same as the 513 * knote(), activating all knotes registered to be activated when the 514 * process forked. Additionally, for each knote attached to the 515 * parent, check whether user wants to track the new process. If so 516 * attach a new knote to it, and immediately report an event with the 517 * child's pid. 518 */ 519 void 520 knote_fork(struct knlist *list, int pid) 521 { 522 struct kqueue *kq; 523 struct knote *kn; 524 struct kevent kev; 525 int error; 526 527 if (list == NULL) 528 return; 529 list->kl_lock(list->kl_lockarg); 530 531 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { 532 kq = kn->kn_kq; 533 KQ_LOCK(kq); 534 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { 535 KQ_UNLOCK(kq); 536 continue; 537 } 538 539 /* 540 * The same as knote(), activate the event. 541 */ 542 if ((kn->kn_sfflags & NOTE_TRACK) == 0) { 543 kn->kn_status |= KN_HASKQLOCK; 544 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 545 KNOTE_ACTIVATE(kn, 1); 546 kn->kn_status &= ~KN_HASKQLOCK; 547 KQ_UNLOCK(kq); 548 continue; 549 } 550 551 /* 552 * The NOTE_TRACK case. In addition to the activation 553 * of the event, we need to register new events to 554 * track the child. Drop the locks in preparation for 555 * the call to kqueue_register(). 556 */ 557 kn_enter_flux(kn); 558 KQ_UNLOCK(kq); 559 list->kl_unlock(list->kl_lockarg); 560 561 /* 562 * Activate existing knote and register tracking knotes with 563 * new process. 564 * 565 * First register a knote to get just the child notice. 
This must be a separate note from a potential NOTE_EXIT
		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
		 * to use the data field (in conflicting ways).
		 */
		kev.ident = pid;
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
		    EV_FLAG2;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;		/* parent */
		kev.udata = kn->kn_kevent.udata;/* preserve udata */
		error = kqueue_register(kq, &kev, NULL, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;

		/*
		 * Then register another knote to track other potential events
		 * from the new process.
		 */
		kev.ident = pid;
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;		/* parent */
		kev.udata = kn->kn_kevent.udata;/* preserve udata */
		error = kqueue_register(kq, &kev, NULL, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
		if (kn->kn_fop->f_event(kn, NOTE_FORK))
			KNOTE_ACTIVATE(kn, 0);
		KQ_LOCK(kq);
		kn_leave_flux(kn);
		KQ_UNLOCK_FLUX(kq);
		list->kl_lock(list->kl_lockarg);
	}
	list->kl_unlock(list->kl_lockarg);
}

/*
 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
 * interval timer support code.
 */

#define NOTE_TIMER_PRECMASK						\
    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)

static sbintime_t
timer2sbintime(intptr_t data, int flags)
{
	int64_t secs;

	/*
	 * Macros for converting to the fractional second portion of an
	 * sbintime_t using 64bit multiplication to improve precision.
	 */
#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
	switch (flags & NOTE_TIMER_PRECMASK) {
	case NOTE_SECONDS:
#ifdef __LP64__
		if (data > (SBT_MAX / SBT_1S))
			return (SBT_MAX);
#endif
		return ((sbintime_t)data << 32);
	case NOTE_MSECONDS: /* FALLTHROUGH */
	case 0:
		if (data >= 1000) {
			secs = data / 1000;
#ifdef __LP64__
			if (secs > (SBT_MAX / SBT_1S))
				return (SBT_MAX);
#endif
			return (secs << 32 | MS_TO_SBT(data % 1000));
		}
		return (MS_TO_SBT(data));
	case NOTE_USECONDS:
		if (data >= 1000000) {
			secs = data / 1000000;
#ifdef __LP64__
			if (secs > (SBT_MAX / SBT_1S))
				return (SBT_MAX);
#endif
			return (secs << 32 | US_TO_SBT(data % 1000000));
		}
		return (US_TO_SBT(data));
	case NOTE_NSECONDS:
		if (data >= 1000000000) {
			secs = data / 1000000000;
#ifdef __LP64__
			if (secs > (SBT_MAX / SBT_1S))
				return (SBT_MAX);
#endif
			/* The sub-second remainder is in nanoseconds. */
			return (secs << 32 | NS_TO_SBT(data % 1000000000));
		}
		return (NS_TO_SBT(data));
	default:
		break;
	}
	return (-1);
}

struct kq_timer_cb_data {
	struct callout c;
	sbintime_t next;	/* next timer event fires at */
	sbintime_t to;		/* precalculated timer period, 0 for abs */
};

static void
filt_timerexpire(void *knx)
{
	struct knote *kn;
	struct kq_timer_cb_data *kc;

	kn = knx;
	kn->kn_data++;
	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */

	if ((kn->kn_flags & EV_ONESHOT) != 0)
		return;
	kc = kn->kn_ptr.p_v;
	if (kc->to == 0)
		return;
	kc->next += kc->to;
	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
PCPU_GET(cpuid), C_ABSOLUTE); 692 } 693 694 /* 695 * data contains amount of time to sleep 696 */ 697 static int 698 filt_timerattach(struct knote *kn) 699 { 700 struct kq_timer_cb_data *kc; 701 struct bintime bt; 702 sbintime_t to, sbt; 703 unsigned int ncallouts; 704 705 if (kn->kn_sdata < 0) 706 return (EINVAL); 707 if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) 708 kn->kn_sdata = 1; 709 /* Only precision unit are supported in flags so far */ 710 if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) 711 return (EINVAL); 712 713 to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); 714 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 715 getboottimebin(&bt); 716 sbt = bttosbt(bt); 717 to -= sbt; 718 } 719 if (to < 0) 720 return (EINVAL); 721 722 do { 723 ncallouts = kq_ncallouts; 724 if (ncallouts >= kq_calloutmax) 725 return (ENOMEM); 726 } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); 727 728 if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) 729 kn->kn_flags |= EV_CLEAR; /* automatically set */ 730 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ 731 kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); 732 callout_init(&kc->c, 1); 733 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 734 kc->next = to; 735 kc->to = 0; 736 } else { 737 kc->next = to + sbinuptime(); 738 kc->to = to; 739 } 740 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, 741 PCPU_GET(cpuid), C_ABSOLUTE); 742 743 return (0); 744 } 745 746 static void 747 filt_timerdetach(struct knote *kn) 748 { 749 struct kq_timer_cb_data *kc; 750 unsigned int old; 751 752 kc = kn->kn_ptr.p_v; 753 callout_drain(&kc->c); 754 free(kc, M_KQUEUE); 755 old = atomic_fetchadd_int(&kq_ncallouts, -1); 756 KASSERT(old > 0, ("Number of callouts cannot become negative")); 757 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ 758 } 759 760 static int 761 filt_timer(struct knote *kn, long hint) 762 { 763 764 return (kn->kn_data != 0); 765 } 766 767 static int 768 filt_userattach(struct knote *kn) 769 { 770 771 /* 772 * EVFILT_USER knotes are not attached to anything in the kernel. 773 */ 774 kn->kn_hook = NULL; 775 if (kn->kn_fflags & NOTE_TRIGGER) 776 kn->kn_hookid = 1; 777 else 778 kn->kn_hookid = 0; 779 return (0); 780 } 781 782 static void 783 filt_userdetach(__unused struct knote *kn) 784 { 785 786 /* 787 * EVFILT_USER knotes are not attached to anything in the kernel. 788 */ 789 } 790 791 static int 792 filt_user(struct knote *kn, __unused long hint) 793 { 794 795 return (kn->kn_hookid); 796 } 797 798 static void 799 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 800 { 801 u_int ffctrl; 802 803 switch (type) { 804 case EVENT_REGISTER: 805 if (kev->fflags & NOTE_TRIGGER) 806 kn->kn_hookid = 1; 807 808 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 809 kev->fflags &= NOTE_FFLAGSMASK; 810 switch (ffctrl) { 811 case NOTE_FFNOP: 812 break; 813 814 case NOTE_FFAND: 815 kn->kn_sfflags &= kev->fflags; 816 break; 817 818 case NOTE_FFOR: 819 kn->kn_sfflags |= kev->fflags; 820 break; 821 822 case NOTE_FFCOPY: 823 kn->kn_sfflags = kev->fflags; 824 break; 825 826 default: 827 /* XXX Return error? 
*/ 828 break; 829 } 830 kn->kn_sdata = kev->data; 831 if (kev->flags & EV_CLEAR) { 832 kn->kn_hookid = 0; 833 kn->kn_data = 0; 834 kn->kn_fflags = 0; 835 } 836 break; 837 838 case EVENT_PROCESS: 839 *kev = kn->kn_kevent; 840 kev->fflags = kn->kn_sfflags; 841 kev->data = kn->kn_sdata; 842 if (kn->kn_flags & EV_CLEAR) { 843 kn->kn_hookid = 0; 844 kn->kn_data = 0; 845 kn->kn_fflags = 0; 846 } 847 break; 848 849 default: 850 panic("filt_usertouch() - invalid type (%ld)", type); 851 break; 852 } 853 } 854 855 int 856 sys_kqueue(struct thread *td, struct kqueue_args *uap) 857 { 858 859 return (kern_kqueue(td, 0, NULL)); 860 } 861 862 static void 863 kqueue_init(struct kqueue *kq) 864 { 865 866 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); 867 TAILQ_INIT(&kq->kq_head); 868 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); 869 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); 870 } 871 872 int 873 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) 874 { 875 struct filedesc *fdp; 876 struct kqueue *kq; 877 struct file *fp; 878 struct ucred *cred; 879 int fd, error; 880 881 fdp = td->td_proc->p_fd; 882 cred = td->td_ucred; 883 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) 884 return (ENOMEM); 885 886 error = falloc_caps(td, &fp, &fd, flags, fcaps); 887 if (error != 0) { 888 chgkqcnt(cred->cr_ruidinfo, -1, 0); 889 return (error); 890 } 891 892 /* An extra reference on `fp' has been held for us by falloc(). */ 893 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); 894 kqueue_init(kq); 895 kq->kq_fdp = fdp; 896 kq->kq_cred = crhold(cred); 897 898 FILEDESC_XLOCK(fdp); 899 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); 900 FILEDESC_XUNLOCK(fdp); 901 902 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); 903 fdrop(fp, td); 904 905 td->td_retval[0] = fd; 906 return (0); 907 } 908 909 #ifdef KTRACE 910 static size_t 911 kev_iovlen(int n, u_int kgio, size_t kevent_size) 912 { 913 914 if (n < 0 || n >= kgio / kevent_size) 915 return (kgio); 916 return (n * kevent_size); 917 } 918 #endif 919 920 struct g_kevent_args { 921 int fd; 922 void *changelist; 923 int nchanges; 924 void *eventlist; 925 int nevents; 926 const struct timespec *timeout; 927 }; 928 929 int 930 sys_kevent(struct thread *td, struct kevent_args *uap) 931 { 932 struct kevent_copyops k_ops = { 933 .arg = uap, 934 .k_copyout = kevent_copyout, 935 .k_copyin = kevent_copyin, 936 .kevent_size = sizeof(struct kevent), 937 }; 938 struct g_kevent_args gk_args = { 939 .fd = uap->fd, 940 .changelist = uap->changelist, 941 .nchanges = uap->nchanges, 942 .eventlist = uap->eventlist, 943 .nevents = uap->nevents, 944 .timeout = uap->timeout, 945 }; 946 947 return (kern_kevent_generic(td, &gk_args, &k_ops)); 948 } 949 950 static int 951 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, 952 struct kevent_copyops *k_ops) 953 { 954 struct timespec ts, *tsp; 955 int error; 956 #ifdef KTRACE 957 struct uio ktruio; 958 struct iovec ktriov; 959 struct uio *ktruioin = NULL; 960 struct uio *ktruioout = NULL; 961 u_int kgio; 962 #endif 963 964 if (uap->timeout != NULL) { 965 error = copyin(uap->timeout, &ts, sizeof(ts)); 966 if (error) 967 return (error); 968 tsp = &ts; 969 } else 970 tsp = NULL; 971 972 #ifdef KTRACE 973 if (KTRPOINT(td, KTR_GENIO)) { 974 kgio = ktr_geniosize; 975 ktriov.iov_base = uap->changelist; 976 ktriov.iov_len = kev_iovlen(uap->nchanges, kgio, 977 k_ops->kevent_size); 978 ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, 979 .uio_segflg = UIO_USERSPACE, .uio_rw 
= UIO_READ,
		    .uio_td = td };
		ktruioin = cloneuio(&ktruio);
		ktriov.iov_base = uap->eventlist;
		ktriov.iov_len = kev_iovlen(uap->nevents, kgio,
		    k_ops->kevent_size);
		ktruioout = cloneuio(&ktruio);
	}
#endif

	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
	    k_ops, tsp);

#ifdef KTRACE
	if (ktruioin != NULL) {
		ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio,
		    k_ops->kevent_size);
		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
		ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio,
		    k_ops->kevent_size);
		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
	}
#endif

	return (error);
}

/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
	if (error == 0)
		uap->eventlist += count;
	return (error);
}

/*
 * Copy 'count' items from the list pointed to by uap->changelist.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
	if (error == 0)
		uap->changelist += count;
	return (error);
}

#ifdef COMPAT_FREEBSD11
struct kevent_freebsd11 {
	__uintptr_t	ident;		/* identifier for this event */
	short		filter;		/* filter for event */
	unsigned short	flags;
	unsigned int	fflags;
	__intptr_t	data;
	void		*udata;		/* opaque user data identifier */
};

static int
kevent11_copyout(void *arg, struct kevent *kevp, int count)
{
	struct freebsd11_kevent_args *uap;
	struct kevent_freebsd11 kev11;
	int error, i;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct freebsd11_kevent_args *)arg;

	for (i = 0; i < count; i++) {
		kev11.ident = kevp->ident;
		kev11.filter = kevp->filter;
		kev11.flags = kevp->flags;
		kev11.fflags = kevp->fflags;
		kev11.data = kevp->data;
		kev11.udata = kevp->udata;
		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
		if (error != 0)
			break;
		uap->eventlist++;
		kevp++;
	}
	return (error);
}

/*
 * Copy 'count' items from the list pointed to by uap->changelist.
1081 */ 1082 static int 1083 kevent11_copyin(void *arg, struct kevent *kevp, int count) 1084 { 1085 struct freebsd11_kevent_args *uap; 1086 struct kevent_freebsd11 kev11; 1087 int error, i; 1088 1089 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1090 uap = (struct freebsd11_kevent_args *)arg; 1091 1092 for (i = 0; i < count; i++) { 1093 error = copyin(uap->changelist, &kev11, sizeof(kev11)); 1094 if (error != 0) 1095 break; 1096 kevp->ident = kev11.ident; 1097 kevp->filter = kev11.filter; 1098 kevp->flags = kev11.flags; 1099 kevp->fflags = kev11.fflags; 1100 kevp->data = (uintptr_t)kev11.data; 1101 kevp->udata = kev11.udata; 1102 bzero(&kevp->ext, sizeof(kevp->ext)); 1103 uap->changelist++; 1104 kevp++; 1105 } 1106 return (error); 1107 } 1108 1109 int 1110 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) 1111 { 1112 struct kevent_copyops k_ops = { 1113 .arg = uap, 1114 .k_copyout = kevent11_copyout, 1115 .k_copyin = kevent11_copyin, 1116 .kevent_size = sizeof(struct kevent_freebsd11), 1117 }; 1118 struct g_kevent_args gk_args = { 1119 .fd = uap->fd, 1120 .changelist = uap->changelist, 1121 .nchanges = uap->nchanges, 1122 .eventlist = uap->eventlist, 1123 .nevents = uap->nevents, 1124 .timeout = uap->timeout, 1125 }; 1126 1127 return (kern_kevent_generic(td, &gk_args, &k_ops)); 1128 } 1129 #endif 1130 1131 int 1132 kern_kevent(struct thread *td, int fd, int nchanges, int nevents, 1133 struct kevent_copyops *k_ops, const struct timespec *timeout) 1134 { 1135 cap_rights_t rights; 1136 struct file *fp; 1137 int error; 1138 1139 cap_rights_init(&rights); 1140 if (nchanges > 0) 1141 cap_rights_set(&rights, CAP_KQUEUE_CHANGE); 1142 if (nevents > 0) 1143 cap_rights_set(&rights, CAP_KQUEUE_EVENT); 1144 error = fget(td, fd, &rights, &fp); 1145 if (error != 0) 1146 return (error); 1147 1148 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); 1149 fdrop(fp, td); 1150 1151 return (error); 1152 } 1153 1154 static int 1155 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, 1156 struct kevent_copyops *k_ops, const struct timespec *timeout) 1157 { 1158 struct kevent keva[KQ_NEVENTS]; 1159 struct kevent *kevp, *changes; 1160 int i, n, nerrors, error; 1161 1162 nerrors = 0; 1163 while (nchanges > 0) { 1164 n = nchanges > KQ_NEVENTS ? 
KQ_NEVENTS : nchanges; 1165 error = k_ops->k_copyin(k_ops->arg, keva, n); 1166 if (error) 1167 return (error); 1168 changes = keva; 1169 for (i = 0; i < n; i++) { 1170 kevp = &changes[i]; 1171 if (!kevp->filter) 1172 continue; 1173 kevp->flags &= ~EV_SYSFLAGS; 1174 error = kqueue_register(kq, kevp, td, 1); 1175 if (error || (kevp->flags & EV_RECEIPT)) { 1176 if (nevents == 0) 1177 return (error); 1178 kevp->flags = EV_ERROR; 1179 kevp->data = error; 1180 (void)k_ops->k_copyout(k_ops->arg, kevp, 1); 1181 nevents--; 1182 nerrors++; 1183 } 1184 } 1185 nchanges -= n; 1186 } 1187 if (nerrors) { 1188 td->td_retval[0] = nerrors; 1189 return (0); 1190 } 1191 1192 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); 1193 } 1194 1195 int 1196 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, 1197 struct kevent_copyops *k_ops, const struct timespec *timeout) 1198 { 1199 struct kqueue *kq; 1200 int error; 1201 1202 error = kqueue_acquire(fp, &kq); 1203 if (error != 0) 1204 return (error); 1205 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); 1206 kqueue_release(kq, 0); 1207 return (error); 1208 } 1209 1210 /* 1211 * Performs a kevent() call on a temporarily created kqueue. This can be 1212 * used to perform one-shot polling, similar to poll() and select(). 1213 */ 1214 int 1215 kern_kevent_anonymous(struct thread *td, int nevents, 1216 struct kevent_copyops *k_ops) 1217 { 1218 struct kqueue kq = {}; 1219 int error; 1220 1221 kqueue_init(&kq); 1222 kq.kq_refcnt = 1; 1223 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); 1224 kqueue_drain(&kq, td); 1225 kqueue_destroy(&kq); 1226 return (error); 1227 } 1228 1229 int 1230 kqueue_add_filteropts(int filt, struct filterops *filtops) 1231 { 1232 int error; 1233 1234 error = 0; 1235 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { 1236 printf( 1237 "trying to add a filterop that is out of range: %d is beyond %d\n", 1238 ~filt, EVFILT_SYSCOUNT); 1239 return EINVAL; 1240 } 1241 mtx_lock(&filterops_lock); 1242 if (sysfilt_ops[~filt].for_fop != &null_filtops && 1243 sysfilt_ops[~filt].for_fop != NULL) 1244 error = EEXIST; 1245 else { 1246 sysfilt_ops[~filt].for_fop = filtops; 1247 sysfilt_ops[~filt].for_refcnt = 0; 1248 } 1249 mtx_unlock(&filterops_lock); 1250 1251 return (error); 1252 } 1253 1254 int 1255 kqueue_del_filteropts(int filt) 1256 { 1257 int error; 1258 1259 error = 0; 1260 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1261 return EINVAL; 1262 1263 mtx_lock(&filterops_lock); 1264 if (sysfilt_ops[~filt].for_fop == &null_filtops || 1265 sysfilt_ops[~filt].for_fop == NULL) 1266 error = EINVAL; 1267 else if (sysfilt_ops[~filt].for_refcnt != 0) 1268 error = EBUSY; 1269 else { 1270 sysfilt_ops[~filt].for_fop = &null_filtops; 1271 sysfilt_ops[~filt].for_refcnt = 0; 1272 } 1273 mtx_unlock(&filterops_lock); 1274 1275 return error; 1276 } 1277 1278 static struct filterops * 1279 kqueue_fo_find(int filt) 1280 { 1281 1282 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1283 return NULL; 1284 1285 if (sysfilt_ops[~filt].for_nolock) 1286 return sysfilt_ops[~filt].for_fop; 1287 1288 mtx_lock(&filterops_lock); 1289 sysfilt_ops[~filt].for_refcnt++; 1290 if (sysfilt_ops[~filt].for_fop == NULL) 1291 sysfilt_ops[~filt].for_fop = &null_filtops; 1292 mtx_unlock(&filterops_lock); 1293 1294 return sysfilt_ops[~filt].for_fop; 1295 } 1296 1297 static void 1298 kqueue_fo_release(int filt) 1299 { 1300 1301 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1302 return; 1303 1304 if (sysfilt_ops[~filt].for_nolock) 1305 return; 1306 
1307 mtx_lock(&filterops_lock); 1308 KASSERT(sysfilt_ops[~filt].for_refcnt > 0, 1309 ("filter object refcount not valid on release")); 1310 sysfilt_ops[~filt].for_refcnt--; 1311 mtx_unlock(&filterops_lock); 1312 } 1313 1314 /* 1315 * A ref to kq (obtained via kqueue_acquire) must be held. waitok will 1316 * influence if memory allocation should wait. Make sure it is 0 if you 1317 * hold any mutexes. 1318 */ 1319 static int 1320 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) 1321 { 1322 struct filterops *fops; 1323 struct file *fp; 1324 struct knote *kn, *tkn; 1325 struct knlist *knl; 1326 cap_rights_t rights; 1327 int error, filt, event; 1328 int haskqglobal, filedesc_unlock; 1329 1330 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) 1331 return (EINVAL); 1332 1333 fp = NULL; 1334 kn = NULL; 1335 knl = NULL; 1336 error = 0; 1337 haskqglobal = 0; 1338 filedesc_unlock = 0; 1339 1340 filt = kev->filter; 1341 fops = kqueue_fo_find(filt); 1342 if (fops == NULL) 1343 return EINVAL; 1344 1345 if (kev->flags & EV_ADD) { 1346 /* 1347 * Prevent waiting with locks. Non-sleepable 1348 * allocation failures are handled in the loop, only 1349 * if the spare knote appears to be actually required. 1350 */ 1351 tkn = knote_alloc(waitok); 1352 } else { 1353 tkn = NULL; 1354 } 1355 1356 findkn: 1357 if (fops->f_isfd) { 1358 KASSERT(td != NULL, ("td is NULL")); 1359 if (kev->ident > INT_MAX) 1360 error = EBADF; 1361 else 1362 error = fget(td, kev->ident, 1363 cap_rights_init(&rights, CAP_EVENT), &fp); 1364 if (error) 1365 goto done; 1366 1367 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, 1368 kev->ident, 0) != 0) { 1369 /* try again */ 1370 fdrop(fp, td); 1371 fp = NULL; 1372 error = kqueue_expand(kq, fops, kev->ident, waitok); 1373 if (error) 1374 goto done; 1375 goto findkn; 1376 } 1377 1378 if (fp->f_type == DTYPE_KQUEUE) { 1379 /* 1380 * If we add some intelligence about what we are doing, 1381 * we should be able to support events on ourselves. 1382 * We need to know when we are doing this to prevent 1383 * getting both the knlist lock and the kq lock since 1384 * they are the same thing. 1385 */ 1386 if (fp->f_data == kq) { 1387 error = EINVAL; 1388 goto done; 1389 } 1390 1391 /* 1392 * Pre-lock the filedesc before the global 1393 * lock mutex, see the comment in 1394 * kqueue_close(). 1395 */ 1396 FILEDESC_XLOCK(td->td_proc->p_fd); 1397 filedesc_unlock = 1; 1398 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1399 } 1400 1401 KQ_LOCK(kq); 1402 if (kev->ident < kq->kq_knlistsize) { 1403 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) 1404 if (kev->filter == kn->kn_filter) 1405 break; 1406 } 1407 } else { 1408 if ((kev->flags & EV_ADD) == EV_ADD) 1409 kqueue_expand(kq, fops, kev->ident, waitok); 1410 1411 KQ_LOCK(kq); 1412 1413 /* 1414 * If possible, find an existing knote to use for this kevent. 1415 */ 1416 if (kev->filter == EVFILT_PROC && 1417 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { 1418 /* This is an internal creation of a process tracking 1419 * note. Don't attempt to coalesce this with an 1420 * existing note. 1421 */ 1422 ; 1423 } else if (kq->kq_knhashmask != 0) { 1424 struct klist *list; 1425 1426 list = &kq->kq_knhash[ 1427 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1428 SLIST_FOREACH(kn, list, kn_link) 1429 if (kev->ident == kn->kn_id && 1430 kev->filter == kn->kn_filter) 1431 break; 1432 } 1433 } 1434 1435 /* knote is in the process of changing, wait for it to stabilize. 
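	 * Setting KQ_FLUXWAIT asks whichever thread takes the knote out of
	 * flux to wake us via KQ_FLUX_WAKEUP() (see KQ_UNLOCK_FLUX()).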
*/ 1436 if (kn != NULL && kn_in_flux(kn)) { 1437 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1438 if (filedesc_unlock) { 1439 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1440 filedesc_unlock = 0; 1441 } 1442 kq->kq_state |= KQ_FLUXWAIT; 1443 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); 1444 if (fp != NULL) { 1445 fdrop(fp, td); 1446 fp = NULL; 1447 } 1448 goto findkn; 1449 } 1450 1451 /* 1452 * kn now contains the matching knote, or NULL if no match 1453 */ 1454 if (kn == NULL) { 1455 if (kev->flags & EV_ADD) { 1456 kn = tkn; 1457 tkn = NULL; 1458 if (kn == NULL) { 1459 KQ_UNLOCK(kq); 1460 error = ENOMEM; 1461 goto done; 1462 } 1463 kn->kn_fp = fp; 1464 kn->kn_kq = kq; 1465 kn->kn_fop = fops; 1466 /* 1467 * apply reference counts to knote structure, and 1468 * do not release it at the end of this routine. 1469 */ 1470 fops = NULL; 1471 fp = NULL; 1472 1473 kn->kn_sfflags = kev->fflags; 1474 kn->kn_sdata = kev->data; 1475 kev->fflags = 0; 1476 kev->data = 0; 1477 kn->kn_kevent = *kev; 1478 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | 1479 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); 1480 kn->kn_status = KN_DETACHED; 1481 kn_enter_flux(kn); 1482 1483 error = knote_attach(kn, kq); 1484 KQ_UNLOCK(kq); 1485 if (error != 0) { 1486 tkn = kn; 1487 goto done; 1488 } 1489 1490 if ((error = kn->kn_fop->f_attach(kn)) != 0) { 1491 knote_drop_detached(kn, td); 1492 goto done; 1493 } 1494 knl = kn_list_lock(kn); 1495 goto done_ev_add; 1496 } else { 1497 /* No matching knote and the EV_ADD flag is not set. */ 1498 KQ_UNLOCK(kq); 1499 error = ENOENT; 1500 goto done; 1501 } 1502 } 1503 1504 if (kev->flags & EV_DELETE) { 1505 kn_enter_flux(kn); 1506 KQ_UNLOCK(kq); 1507 knote_drop(kn, td); 1508 goto done; 1509 } 1510 1511 if (kev->flags & EV_FORCEONESHOT) { 1512 kn->kn_flags |= EV_ONESHOT; 1513 KNOTE_ACTIVATE(kn, 1); 1514 } 1515 1516 /* 1517 * The user may change some filter values after the initial EV_ADD, 1518 * but doing so will not reset any filter which has already been 1519 * triggered. 1520 */ 1521 kn->kn_status |= KN_SCAN; 1522 kn_enter_flux(kn); 1523 KQ_UNLOCK(kq); 1524 knl = kn_list_lock(kn); 1525 kn->kn_kevent.udata = kev->udata; 1526 if (!fops->f_isfd && fops->f_touch != NULL) { 1527 fops->f_touch(kn, kev, EVENT_REGISTER); 1528 } else { 1529 kn->kn_sfflags = kev->fflags; 1530 kn->kn_sdata = kev->data; 1531 } 1532 1533 /* 1534 * We can get here with kn->kn_knlist == NULL. This can happen when 1535 * the initial attach event decides that the event is "completed" 1536 * already. i.e. filt_procattach is called on a zombie process. It 1537 * will call filt_proc which will remove it from the list, and NULL 1538 * kn_knlist. 
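	 * In that case the kn_list_lock() call above returned NULL and the
	 * kn_list_unlock() below is a no-op.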
1539 */ 1540 done_ev_add: 1541 if ((kev->flags & EV_ENABLE) != 0) 1542 kn->kn_status &= ~KN_DISABLED; 1543 else if ((kev->flags & EV_DISABLE) != 0) 1544 kn->kn_status |= KN_DISABLED; 1545 1546 if ((kn->kn_status & KN_DISABLED) == 0) 1547 event = kn->kn_fop->f_event(kn, 0); 1548 else 1549 event = 0; 1550 1551 KQ_LOCK(kq); 1552 if (event) 1553 kn->kn_status |= KN_ACTIVE; 1554 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == 1555 KN_ACTIVE) 1556 knote_enqueue(kn); 1557 kn->kn_status &= ~KN_SCAN; 1558 kn_leave_flux(kn); 1559 kn_list_unlock(knl); 1560 KQ_UNLOCK_FLUX(kq); 1561 1562 done: 1563 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1564 if (filedesc_unlock) 1565 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1566 if (fp != NULL) 1567 fdrop(fp, td); 1568 knote_free(tkn); 1569 if (fops != NULL) 1570 kqueue_fo_release(filt); 1571 return (error); 1572 } 1573 1574 static int 1575 kqueue_acquire(struct file *fp, struct kqueue **kqp) 1576 { 1577 int error; 1578 struct kqueue *kq; 1579 1580 error = 0; 1581 1582 kq = fp->f_data; 1583 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) 1584 return (EBADF); 1585 *kqp = kq; 1586 KQ_LOCK(kq); 1587 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { 1588 KQ_UNLOCK(kq); 1589 return (EBADF); 1590 } 1591 kq->kq_refcnt++; 1592 KQ_UNLOCK(kq); 1593 1594 return error; 1595 } 1596 1597 static void 1598 kqueue_release(struct kqueue *kq, int locked) 1599 { 1600 if (locked) 1601 KQ_OWNED(kq); 1602 else 1603 KQ_LOCK(kq); 1604 kq->kq_refcnt--; 1605 if (kq->kq_refcnt == 1) 1606 wakeup(&kq->kq_refcnt); 1607 if (!locked) 1608 KQ_UNLOCK(kq); 1609 } 1610 1611 static void 1612 kqueue_schedtask(struct kqueue *kq) 1613 { 1614 1615 KQ_OWNED(kq); 1616 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), 1617 ("scheduling kqueue task while draining")); 1618 1619 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { 1620 taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); 1621 kq->kq_state |= KQ_TASKSCHED; 1622 } 1623 } 1624 1625 /* 1626 * Expand the kq to make sure we have storage for fops/ident pair. 1627 * 1628 * Return 0 on success (or no work necessary), return errno on failure. 1629 * 1630 * Not calling hashinit w/ waitok (proper malloc flag) should be safe. 1631 * If kqueue_register is called from a non-fd context, there usually/should 1632 * be no locks held. 1633 */ 1634 static int 1635 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, 1636 int waitok) 1637 { 1638 struct klist *list, *tmp_knhash, *to_free; 1639 u_long tmp_knhashmask; 1640 int size; 1641 int fd; 1642 int mflag = waitok ? 
M_WAITOK : M_NOWAIT; 1643 1644 KQ_NOTOWNED(kq); 1645 1646 to_free = NULL; 1647 if (fops->f_isfd) { 1648 fd = ident; 1649 if (kq->kq_knlistsize <= fd) { 1650 size = kq->kq_knlistsize; 1651 while (size <= fd) 1652 size += KQEXTENT; 1653 list = malloc(size * sizeof(*list), M_KQUEUE, mflag); 1654 if (list == NULL) 1655 return ENOMEM; 1656 KQ_LOCK(kq); 1657 if (kq->kq_knlistsize > fd) { 1658 to_free = list; 1659 list = NULL; 1660 } else { 1661 if (kq->kq_knlist != NULL) { 1662 bcopy(kq->kq_knlist, list, 1663 kq->kq_knlistsize * sizeof(*list)); 1664 to_free = kq->kq_knlist; 1665 kq->kq_knlist = NULL; 1666 } 1667 bzero((caddr_t)list + 1668 kq->kq_knlistsize * sizeof(*list), 1669 (size - kq->kq_knlistsize) * sizeof(*list)); 1670 kq->kq_knlistsize = size; 1671 kq->kq_knlist = list; 1672 } 1673 KQ_UNLOCK(kq); 1674 } 1675 } else { 1676 if (kq->kq_knhashmask == 0) { 1677 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, 1678 &tmp_knhashmask); 1679 if (tmp_knhash == NULL) 1680 return ENOMEM; 1681 KQ_LOCK(kq); 1682 if (kq->kq_knhashmask == 0) { 1683 kq->kq_knhash = tmp_knhash; 1684 kq->kq_knhashmask = tmp_knhashmask; 1685 } else { 1686 to_free = tmp_knhash; 1687 } 1688 KQ_UNLOCK(kq); 1689 } 1690 } 1691 free(to_free, M_KQUEUE); 1692 1693 KQ_NOTOWNED(kq); 1694 return 0; 1695 } 1696 1697 static void 1698 kqueue_task(void *arg, int pending) 1699 { 1700 struct kqueue *kq; 1701 int haskqglobal; 1702 1703 haskqglobal = 0; 1704 kq = arg; 1705 1706 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1707 KQ_LOCK(kq); 1708 1709 KNOTE_LOCKED(&kq->kq_sel.si_note, 0); 1710 1711 kq->kq_state &= ~KQ_TASKSCHED; 1712 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { 1713 wakeup(&kq->kq_state); 1714 } 1715 KQ_UNLOCK(kq); 1716 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1717 } 1718 1719 /* 1720 * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 1721 * We treat KN_MARKER knotes as if they are in flux. 1722 */ 1723 static int 1724 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, 1725 const struct timespec *tsp, struct kevent *keva, struct thread *td) 1726 { 1727 struct kevent *kevp; 1728 struct knote *kn, *marker; 1729 struct knlist *knl; 1730 sbintime_t asbt, rsbt; 1731 int count, error, haskqglobal, influx, nkev, touch; 1732 1733 count = maxevents; 1734 nkev = 0; 1735 error = 0; 1736 haskqglobal = 0; 1737 1738 if (maxevents == 0) 1739 goto done_nl; 1740 1741 rsbt = 0; 1742 if (tsp != NULL) { 1743 if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || 1744 tsp->tv_nsec >= 1000000000) { 1745 error = EINVAL; 1746 goto done_nl; 1747 } 1748 if (timespecisset(tsp)) { 1749 if (tsp->tv_sec <= INT32_MAX) { 1750 rsbt = tstosbt(*tsp); 1751 if (TIMESEL(&asbt, rsbt)) 1752 asbt += tc_tick_sbt; 1753 if (asbt <= SBT_MAX - rsbt) 1754 asbt += rsbt; 1755 else 1756 asbt = 0; 1757 rsbt >>= tc_precexp; 1758 } else 1759 asbt = 0; 1760 } else 1761 asbt = -1; 1762 } else 1763 asbt = 0; 1764 marker = knote_alloc(1); 1765 marker->kn_status = KN_MARKER; 1766 KQ_LOCK(kq); 1767 1768 retry: 1769 kevp = keva; 1770 if (kq->kq_count == 0) { 1771 if (asbt == -1) { 1772 error = EWOULDBLOCK; 1773 } else { 1774 kq->kq_state |= KQ_SLEEP; 1775 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, 1776 "kqread", asbt, rsbt, C_ABSOLUTE); 1777 } 1778 if (error == 0) 1779 goto retry; 1780 /* don't restart after signals... 
*/ 1781 if (error == ERESTART) 1782 error = EINTR; 1783 else if (error == EWOULDBLOCK) 1784 error = 0; 1785 goto done; 1786 } 1787 1788 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); 1789 influx = 0; 1790 while (count) { 1791 KQ_OWNED(kq); 1792 kn = TAILQ_FIRST(&kq->kq_head); 1793 1794 if ((kn->kn_status == KN_MARKER && kn != marker) || 1795 kn_in_flux(kn)) { 1796 if (influx) { 1797 influx = 0; 1798 KQ_FLUX_WAKEUP(kq); 1799 } 1800 kq->kq_state |= KQ_FLUXWAIT; 1801 error = msleep(kq, &kq->kq_lock, PSOCK, 1802 "kqflxwt", 0); 1803 continue; 1804 } 1805 1806 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1807 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { 1808 kn->kn_status &= ~KN_QUEUED; 1809 kq->kq_count--; 1810 continue; 1811 } 1812 if (kn == marker) { 1813 KQ_FLUX_WAKEUP(kq); 1814 if (count == maxevents) 1815 goto retry; 1816 goto done; 1817 } 1818 KASSERT(!kn_in_flux(kn), 1819 ("knote %p is unexpectedly in flux", kn)); 1820 1821 if ((kn->kn_flags & EV_DROP) == EV_DROP) { 1822 kn->kn_status &= ~KN_QUEUED; 1823 kn_enter_flux(kn); 1824 kq->kq_count--; 1825 KQ_UNLOCK(kq); 1826 /* 1827 * We don't need to lock the list since we've 1828 * marked it as in flux. 1829 */ 1830 knote_drop(kn, td); 1831 KQ_LOCK(kq); 1832 continue; 1833 } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { 1834 kn->kn_status &= ~KN_QUEUED; 1835 kn_enter_flux(kn); 1836 kq->kq_count--; 1837 KQ_UNLOCK(kq); 1838 /* 1839 * We don't need to lock the list since we've 1840 * marked the knote as being in flux. 1841 */ 1842 *kevp = kn->kn_kevent; 1843 knote_drop(kn, td); 1844 KQ_LOCK(kq); 1845 kn = NULL; 1846 } else { 1847 kn->kn_status |= KN_SCAN; 1848 kn_enter_flux(kn); 1849 KQ_UNLOCK(kq); 1850 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) 1851 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1852 knl = kn_list_lock(kn); 1853 if (kn->kn_fop->f_event(kn, 0) == 0) { 1854 KQ_LOCK(kq); 1855 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1856 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | 1857 KN_SCAN); 1858 kn_leave_flux(kn); 1859 kq->kq_count--; 1860 kn_list_unlock(knl); 1861 influx = 1; 1862 continue; 1863 } 1864 touch = (!kn->kn_fop->f_isfd && 1865 kn->kn_fop->f_touch != NULL); 1866 if (touch) 1867 kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); 1868 else 1869 *kevp = kn->kn_kevent; 1870 KQ_LOCK(kq); 1871 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1872 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { 1873 /* 1874 * Manually clear knotes who weren't 1875 * 'touch'ed. 
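				 * Filters that provide an f_touch method are
				 * expected to have handled EV_CLEAR in the
				 * EVENT_PROCESS call above (see
				 * filt_usertouch()).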
1876 */ 1877 if (touch == 0 && kn->kn_flags & EV_CLEAR) { 1878 kn->kn_data = 0; 1879 kn->kn_fflags = 0; 1880 } 1881 if (kn->kn_flags & EV_DISPATCH) 1882 kn->kn_status |= KN_DISABLED; 1883 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); 1884 kq->kq_count--; 1885 } else 1886 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1887 1888 kn->kn_status &= ~KN_SCAN; 1889 kn_leave_flux(kn); 1890 kn_list_unlock(knl); 1891 influx = 1; 1892 } 1893 1894 /* we are returning a copy to the user */ 1895 kevp++; 1896 nkev++; 1897 count--; 1898 1899 if (nkev == KQ_NEVENTS) { 1900 influx = 0; 1901 KQ_UNLOCK_FLUX(kq); 1902 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1903 nkev = 0; 1904 kevp = keva; 1905 KQ_LOCK(kq); 1906 if (error) 1907 break; 1908 } 1909 } 1910 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); 1911 done: 1912 KQ_OWNED(kq); 1913 KQ_UNLOCK_FLUX(kq); 1914 knote_free(marker); 1915 done_nl: 1916 KQ_NOTOWNED(kq); 1917 if (nkev != 0) 1918 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1919 td->td_retval[0] = maxevents - count; 1920 return (error); 1921 } 1922 1923 /*ARGSUSED*/ 1924 static int 1925 kqueue_ioctl(struct file *fp, u_long cmd, void *data, 1926 struct ucred *active_cred, struct thread *td) 1927 { 1928 /* 1929 * Enabling sigio causes two major problems: 1930 * 1) infinite recursion: 1931 * Synopsys: kevent is being used to track signals and have FIOASYNC 1932 * set. On receipt of a signal this will cause a kqueue to recurse 1933 * into itself over and over. Sending the sigio causes the kqueue 1934 * to become ready, which in turn posts sigio again, forever. 1935 * Solution: this can be solved by setting a flag in the kqueue that 1936 * we have a SIGIO in progress. 1937 * 2) locking problems: 1938 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts 1939 * us above the proc and pgrp locks. 1940 * Solution: Post a signal using an async mechanism, being sure to 1941 * record a generation count in the delivery so that we do not deliver 1942 * a signal to the wrong process. 1943 * 1944 * Note, these two mechanisms are somewhat mutually exclusive! 1945 */ 1946 #if 0 1947 struct kqueue *kq; 1948 1949 kq = fp->f_data; 1950 switch (cmd) { 1951 case FIOASYNC: 1952 if (*(int *)data) { 1953 kq->kq_state |= KQ_ASYNC; 1954 } else { 1955 kq->kq_state &= ~KQ_ASYNC; 1956 } 1957 return (0); 1958 1959 case FIOSETOWN: 1960 return (fsetown(*(int *)data, &kq->kq_sigio)); 1961 1962 case FIOGETOWN: 1963 *(int *)data = fgetown(&kq->kq_sigio); 1964 return (0); 1965 } 1966 #endif 1967 1968 return (ENOTTY); 1969 } 1970 1971 /*ARGSUSED*/ 1972 static int 1973 kqueue_poll(struct file *fp, int events, struct ucred *active_cred, 1974 struct thread *td) 1975 { 1976 struct kqueue *kq; 1977 int revents = 0; 1978 int error; 1979 1980 if ((error = kqueue_acquire(fp, &kq))) 1981 return POLLERR; 1982 1983 KQ_LOCK(kq); 1984 if (events & (POLLIN | POLLRDNORM)) { 1985 if (kq->kq_count) { 1986 revents |= events & (POLLIN | POLLRDNORM); 1987 } else { 1988 selrecord(td, &kq->kq_sel); 1989 if (SEL_WAITING(&kq->kq_sel)) 1990 kq->kq_state |= KQ_SEL; 1991 } 1992 } 1993 kqueue_release(kq, 1); 1994 KQ_UNLOCK(kq); 1995 return (revents); 1996 } 1997 1998 /*ARGSUSED*/ 1999 static int 2000 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 2001 struct thread *td) 2002 { 2003 2004 bzero((void *)st, sizeof *st); 2005 /* 2006 * We no longer return kq_count because the unlocked value is useless. 2007 * If you spent all this time getting the count, why not spend your 2008 * syscall better by calling kevent? 
2009 * 2010 * XXX - This is needed for libc_r. 2011 */ 2012 st->st_mode = S_IFIFO; 2013 return (0); 2014 } 2015 2016 static void 2017 kqueue_drain(struct kqueue *kq, struct thread *td) 2018 { 2019 struct knote *kn; 2020 int i; 2021 2022 KQ_LOCK(kq); 2023 2024 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, 2025 ("kqueue already closing")); 2026 kq->kq_state |= KQ_CLOSING; 2027 if (kq->kq_refcnt > 1) 2028 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); 2029 2030 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); 2031 2032 KASSERT(knlist_empty(&kq->kq_sel.si_note), 2033 ("kqueue's knlist not empty")); 2034 2035 for (i = 0; i < kq->kq_knlistsize; i++) { 2036 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { 2037 if (kn_in_flux(kn)) { 2038 kq->kq_state |= KQ_FLUXWAIT; 2039 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); 2040 continue; 2041 } 2042 kn_enter_flux(kn); 2043 KQ_UNLOCK(kq); 2044 knote_drop(kn, td); 2045 KQ_LOCK(kq); 2046 } 2047 } 2048 if (kq->kq_knhashmask != 0) { 2049 for (i = 0; i <= kq->kq_knhashmask; i++) { 2050 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { 2051 if (kn_in_flux(kn)) { 2052 kq->kq_state |= KQ_FLUXWAIT; 2053 msleep(kq, &kq->kq_lock, PSOCK, 2054 "kqclo2", 0); 2055 continue; 2056 } 2057 kn_enter_flux(kn); 2058 KQ_UNLOCK(kq); 2059 knote_drop(kn, td); 2060 KQ_LOCK(kq); 2061 } 2062 } 2063 } 2064 2065 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { 2066 kq->kq_state |= KQ_TASKDRAIN; 2067 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); 2068 } 2069 2070 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2071 selwakeuppri(&kq->kq_sel, PSOCK); 2072 if (!SEL_WAITING(&kq->kq_sel)) 2073 kq->kq_state &= ~KQ_SEL; 2074 } 2075 2076 KQ_UNLOCK(kq); 2077 } 2078 2079 static void 2080 kqueue_destroy(struct kqueue *kq) 2081 { 2082 2083 KASSERT(kq->kq_fdp == NULL, 2084 ("kqueue still attached to a file descriptor")); 2085 seldrain(&kq->kq_sel); 2086 knlist_destroy(&kq->kq_sel.si_note); 2087 mtx_destroy(&kq->kq_lock); 2088 2089 if (kq->kq_knhash != NULL) 2090 free(kq->kq_knhash, M_KQUEUE); 2091 if (kq->kq_knlist != NULL) 2092 free(kq->kq_knlist, M_KQUEUE); 2093 2094 funsetown(&kq->kq_sigio); 2095 } 2096 2097 /*ARGSUSED*/ 2098 static int 2099 kqueue_close(struct file *fp, struct thread *td) 2100 { 2101 struct kqueue *kq = fp->f_data; 2102 struct filedesc *fdp; 2103 int error; 2104 int filedesc_unlock; 2105 2106 if ((error = kqueue_acquire(fp, &kq))) 2107 return error; 2108 kqueue_drain(kq, td); 2109 2110 /* 2111 * We could be called due to the knote_drop() doing fdrop(), 2112 * called from kqueue_register(). In this case the global 2113 * lock is owned, and filedesc sx is locked before, to not 2114 * take the sleepable lock after non-sleepable. 
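	 * This matches the ordering used by kqueue_register(), which takes
	 * the filedesc lock before KQ_GLOBAL_LOCK().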
2115 */ 2116 fdp = kq->kq_fdp; 2117 kq->kq_fdp = NULL; 2118 if (!sx_xlocked(FILEDESC_LOCK(fdp))) { 2119 FILEDESC_XLOCK(fdp); 2120 filedesc_unlock = 1; 2121 } else 2122 filedesc_unlock = 0; 2123 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); 2124 if (filedesc_unlock) 2125 FILEDESC_XUNLOCK(fdp); 2126 2127 kqueue_destroy(kq); 2128 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); 2129 crfree(kq->kq_cred); 2130 free(kq, M_KQUEUE); 2131 fp->f_data = NULL; 2132 2133 return (0); 2134 } 2135 2136 static int 2137 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2138 { 2139 2140 kif->kf_type = KF_TYPE_KQUEUE; 2141 return (0); 2142 } 2143 2144 static void 2145 kqueue_wakeup(struct kqueue *kq) 2146 { 2147 KQ_OWNED(kq); 2148 2149 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { 2150 kq->kq_state &= ~KQ_SLEEP; 2151 wakeup(kq); 2152 } 2153 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2154 selwakeuppri(&kq->kq_sel, PSOCK); 2155 if (!SEL_WAITING(&kq->kq_sel)) 2156 kq->kq_state &= ~KQ_SEL; 2157 } 2158 if (!knlist_empty(&kq->kq_sel.si_note)) 2159 kqueue_schedtask(kq); 2160 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { 2161 pgsigio(&kq->kq_sigio, SIGIO, 0); 2162 } 2163 } 2164 2165 /* 2166 * Walk down a list of knotes, activating them if their event has triggered. 2167 * 2168 * There is a possibility to optimize in the case of one kq watching another. 2169 * Instead of scheduling a task to wake it up, you could pass enough state 2170 * down the chain to make up the parent kqueue. Make this code functional 2171 * first. 2172 */ 2173 void 2174 knote(struct knlist *list, long hint, int lockflags) 2175 { 2176 struct kqueue *kq; 2177 struct knote *kn, *tkn; 2178 int error; 2179 2180 if (list == NULL) 2181 return; 2182 2183 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); 2184 2185 if ((lockflags & KNF_LISTLOCKED) == 0) 2186 list->kl_lock(list->kl_lockarg); 2187 2188 /* 2189 * If we unlock the list lock (and enter influx), we can 2190 * eliminate the kqueue scheduling, but this will introduce 2191 * four lock/unlock's for each knote to test. Also, marker 2192 * would be needed to keep iteration position, since filters 2193 * or other threads could remove events. 2194 */ 2195 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { 2196 kq = kn->kn_kq; 2197 KQ_LOCK(kq); 2198 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { 2199 /* 2200 * Do not process the influx notes, except for 2201 * the influx coming from the kq unlock in the 2202 * kqueue_scan(). In the later case, we do 2203 * not interfere with the scan, since the code 2204 * fragment in kqueue_scan() locks the knlist, 2205 * and cannot proceed until we finished. 
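		 * (KN_SCAN is set only by kqueue_register() and
		 * kqueue_scan(), both of which keep the knote in flux
		 * while it is set.)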

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{

	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p was not detached", kn));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}

static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
    int kqislocked)
{

	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) == 0,
	    ("knote %p was already detached", kn));
	if (!knlislocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
	kn->kn_knlist = NULL;
	if (!knlislocked)
		kn_list_unlock(knl);
	if (!kqislocked)
		KQ_LOCK(kn->kn_kq);
	kn->kn_status |= KN_DETACHED;
	if (!kqislocked)
		KQ_UNLOCK(kn->kn_kq);
}

/*
 * remove knote from the specified knlist
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{

	knlist_remove_kq(knl, kn, islocked, 0);
}

int
knlist_empty(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	return (SLIST_EMPTY(&knl->kl_list));
}

static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
    MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);

static void
knlist_mtx_lock(void *arg)
{

	mtx_lock((struct mtx *)arg);
}

static void
knlist_mtx_unlock(void *arg)
{

	mtx_unlock((struct mtx *)arg);
}

static void
knlist_mtx_assert_locked(void *arg)
{

	mtx_assert((struct mtx *)arg, MA_OWNED);
}

static void
knlist_mtx_assert_unlocked(void *arg)
{

	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}

static void
knlist_rw_rlock(void *arg)
{

	rw_rlock((struct rwlock *)arg);
}

static void
knlist_rw_runlock(void *arg)
{

	rw_runlock((struct rwlock *)arg);
}

static void
knlist_rw_assert_locked(void *arg)
{

	rw_assert((struct rwlock *)arg, RA_LOCKED);
}

static void
knlist_rw_assert_unlocked(void *arg)
{

	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
}

void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
    void (*kl_unlock)(void *),
    void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
{

	if (lock == NULL)
		knl->kl_lockarg = &knlist_lock;
	else
		knl->kl_lockarg = lock;

	if (kl_lock == NULL)
		knl->kl_lock = knlist_mtx_lock;
	else
		knl->kl_lock = kl_lock;
	if (kl_unlock == NULL)
		knl->kl_unlock = knlist_mtx_unlock;
	else
		knl->kl_unlock = kl_unlock;
	if (kl_assert_locked == NULL)
		knl->kl_assert_locked = knlist_mtx_assert_locked;
	else
		knl->kl_assert_locked = kl_assert_locked;
	if (kl_assert_unlocked == NULL)
		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
	else
		knl->kl_assert_unlocked = kl_assert_unlocked;

	knl->kl_autodestroy = 0;
	SLIST_INIT(&knl->kl_list);
}

void
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
{

	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}

struct knlist *
knlist_alloc(struct mtx *lock)
{
	struct knlist *knl;

	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
	knlist_init_mtx(knl, lock);
	return (knl);
}

void
knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
{

	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
}

void
knlist_destroy(struct knlist *knl)
{

	KASSERT(KNLIST_EMPTY(knl),
	    ("destroying knlist %p with knotes on it", knl));
}

void
knlist_detach(struct knlist *knl)
{

	KNL_ASSERT_LOCKED(knl);
	knl->kl_autodestroy = 1;
	if (knlist_empty(knl)) {
		knlist_destroy(knl);
		free(knl, M_KQUEUE);
	}
}
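
/*
 * Illustrative sketch (editorial addition): the typical life cycle of a
 * knlist embedded in a driver's selinfo and protected by the driver's own
 * mutex.  The "sc" softc and its members are hypothetical:
 *
 *	// attach
 *	mtx_init(&sc->sc_mtx, "sc_mtx", NULL, MTX_DEF);
 *	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
 *
 *	// event delivery; filters' f_event callbacks run with sc_mtx held
 *	mtx_lock(&sc->sc_mtx);
 *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
 *	mtx_unlock(&sc->sc_mtx);
 *
 *	// detach: flush remaining knotes, then tear the knlist down
 *	knlist_cleardel(&sc->sc_rsel.si_note, NULL, 0, 0);
 *	knlist_destroy(&sc->sc_rsel.si_note);
 *	mtx_destroy(&sc->sc_mtx);
 */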

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			knote_drop_detached(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still in-flux knotes remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}
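
/*
 * Editorial note: most callers reach knlist_cleardel() through the
 * knlist_clear() and knlist_delete() convenience macros (declared in
 * <sys/event.h> in stock FreeBSD; treat the exact expansion as an
 * assumption):
 *
 *	knlist_clear(knl, islocked);		// killkn == 0: knotes only
 *						// marked EV_EOF | EV_ONESHOT
 *	knlist_delete(knl, td, islocked);	// killkn == 1: knotes dropped
 *						// outright
 */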

/*
 * Remove all knotes referencing a specified fd.  This must be called with
 * the FILEDESC lock held, which prevents a race where a new fd comes along
 * and occupies the entry while we are attaching a knote to the fd.
 */
void
knote_fdclose(struct thread *td, int fd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	int influx;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We shouldn't have to worry about new kevents appearing on fd
	 * since filedesc is locked.
	 */
	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
		KQ_LOCK(kq);

again:
		influx = 0;
		while (kq->kq_knlistsize > fd &&
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
			if (kn_in_flux(kn)) {
				/* someone else might be waiting on our knote */
				if (influx)
					wakeup(kq);
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
				goto again;
			}
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			influx = 1;
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
		KQ_UNLOCK_FLUX(kq);
	}
}

static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
	KQ_OWNED(kq);

	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return (ENOMEM);
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return (ENOMEM);
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}
	SLIST_INSERT_HEAD(list, kn, kn_link);
	return (0);
}

static void
knote_drop(struct knote *kn, struct thread *td)
{

	if ((kn->kn_status & KN_DETACHED) == 0)
		kn->kn_fop->f_detach(kn);
	knote_drop_detached(kn, td);
}

static void
knote_drop_detached(struct knote *kn, struct thread *td)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p still attached", kn));
	KQ_NOTOWNED(kq);

	KQ_LOCK(kq);
	KASSERT(kn->kn_influx == 1,
	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));

	if (kn->kn_fop->f_isfd)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	if (!SLIST_EMPTY(list))
		SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	KQ_UNLOCK_FLUX(kq);

	if (kn->kn_fop->f_isfd) {
		fdrop(kn->kn_fp, td);
		kn->kn_fp = NULL;
	}
	kqueue_fo_release(kn->kn_kevent.filter);
	kn->kn_fop = NULL;
	knote_free(kn);
}

static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_wakeup(kq);
}

static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);

static struct knote *
knote_alloc(int waitok)
{

	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
	    M_ZERO));
}

static void
knote_free(struct knote *kn)
{

	uma_zfree(knote_zone, kn);
}

/*
 * Register the kevent with the kqueue specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
	struct kqueue *kq;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
	if (error != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, waitok);
	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);
	return (error);
}
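
/*
 * Illustrative sketch (editorial addition): an in-kernel consumer, such as
 * the AIO code implementing SIGEV_KEVENT, builds a kevent and registers it
 * with a kqueue identified by a user-supplied descriptor.  "kqfd" and
 * "cookie" below are hypothetical:
 *
 *	struct kevent kev;
 *	int error;
 *
 *	EV_SET(&kev, (uintptr_t)cookie, EVFILT_AIO, EV_ADD | EV_ENABLE,
 *	    0, 0, cookie);
 *	error = kqfd_register(kqfd, &kev, td, 1);	// waitok != 0: may sleep
 *	if (error != 0)
 *		return (error);
 */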