1 /*- 2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org> 4 * Copyright (c) 2009 Apple, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include "opt_compat.h" 33 #include "opt_ktrace.h" 34 #include "opt_kqueue.h" 35 36 #include <sys/param.h> 37 #include <sys/systm.h> 38 #include <sys/capsicum.h> 39 #include <sys/kernel.h> 40 #include <sys/lock.h> 41 #include <sys/mutex.h> 42 #include <sys/rwlock.h> 43 #include <sys/proc.h> 44 #include <sys/malloc.h> 45 #include <sys/unistd.h> 46 #include <sys/file.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/fcntl.h> 50 #include <sys/kthread.h> 51 #include <sys/selinfo.h> 52 #include <sys/queue.h> 53 #include <sys/event.h> 54 #include <sys/eventvar.h> 55 #include <sys/poll.h> 56 #include <sys/protosw.h> 57 #include <sys/resourcevar.h> 58 #include <sys/sigio.h> 59 #include <sys/signalvar.h> 60 #include <sys/socket.h> 61 #include <sys/socketvar.h> 62 #include <sys/stat.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/syscallsubr.h> 66 #include <sys/taskqueue.h> 67 #include <sys/uio.h> 68 #include <sys/user.h> 69 #ifdef KTRACE 70 #include <sys/ktrace.h> 71 #endif 72 #include <machine/atomic.h> 73 74 #include <vm/uma.h> 75 76 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); 77 78 /* 79 * This lock is used if multiple kq locks are required. This possibly 80 * should be made into a per proc lock. 81 */ 82 static struct mtx kq_global; 83 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); 84 #define KQ_GLOBAL_LOCK(lck, haslck) do { \ 85 if (!haslck) \ 86 mtx_lock(lck); \ 87 haslck = 1; \ 88 } while (0) 89 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ 90 if (haslck) \ 91 mtx_unlock(lck); \ 92 haslck = 0; \ 93 } while (0) 94 95 TASKQUEUE_DEFINE_THREAD(kqueue_ctx); 96 97 static int kevent_copyout(void *arg, struct kevent *kevp, int count); 98 static int kevent_copyin(void *arg, struct kevent *kevp, int count); 99 static int kqueue_register(struct kqueue *kq, struct kevent *kev, 100 struct thread *td, int waitok); 101 static int kqueue_acquire(struct file *fp, struct kqueue **kqp); 102 static void kqueue_release(struct kqueue *kq, int locked); 103 static void kqueue_destroy(struct kqueue *kq); 104 static void kqueue_drain(struct kqueue *kq, struct thread *td); 105 static int kqueue_expand(struct kqueue *kq, struct filterops *fops, 106 uintptr_t ident, int waitok); 107 static void kqueue_task(void *arg, int pending); 108 static int kqueue_scan(struct kqueue *kq, int maxevents, 109 struct kevent_copyops *k_ops, 110 const struct timespec *timeout, 111 struct kevent *keva, struct thread *td); 112 static void kqueue_wakeup(struct kqueue *kq); 113 static struct filterops *kqueue_fo_find(int filt); 114 static void kqueue_fo_release(int filt); 115 struct g_kevent_args; 116 static int kern_kevent_generic(struct thread *td, 117 struct g_kevent_args *uap, 118 struct kevent_copyops *k_ops); 119 120 static fo_ioctl_t kqueue_ioctl; 121 static fo_poll_t kqueue_poll; 122 static fo_kqfilter_t kqueue_kqfilter; 123 static fo_stat_t kqueue_stat; 124 static fo_close_t kqueue_close; 125 static fo_fill_kinfo_t kqueue_fill_kinfo; 126 127 static struct fileops kqueueops = { 128 .fo_read = invfo_rdwr, 129 .fo_write = invfo_rdwr, 130 .fo_truncate = invfo_truncate, 131 .fo_ioctl = kqueue_ioctl, 132 .fo_poll = kqueue_poll, 133 .fo_kqfilter = kqueue_kqfilter, 134 .fo_stat = kqueue_stat, 135 .fo_close = kqueue_close, 136 .fo_chmod = invfo_chmod, 137 .fo_chown = invfo_chown, 138 .fo_sendfile = invfo_sendfile, 139 .fo_fill_kinfo = kqueue_fill_kinfo, 140 }; 141 142 static int knote_attach(struct knote *kn, struct kqueue *kq); 143 static void knote_drop(struct knote *kn, struct thread *td); 144 static void knote_drop_detached(struct knote *kn, struct thread *td); 145 static void knote_enqueue(struct knote *kn); 146 static void knote_dequeue(struct knote *kn); 147 static void knote_init(void); 148 static struct knote *knote_alloc(int waitok); 149 static void knote_free(struct knote *kn); 150 151 static void filt_kqdetach(struct knote *kn); 152 static int filt_kqueue(struct knote *kn, long hint); 153 static int filt_procattach(struct knote *kn); 154 static void filt_procdetach(struct knote *kn); 155 static int filt_proc(struct knote *kn, long hint); 156 static int filt_fileattach(struct knote *kn); 157 static void filt_timerexpire(void *knx); 158 static int filt_timerattach(struct knote *kn); 159 static void filt_timerdetach(struct knote *kn); 160 static int filt_timer(struct knote *kn, long hint); 161 static int filt_userattach(struct knote *kn); 162 static void filt_userdetach(struct knote *kn); 163 static int filt_user(struct knote *kn, long hint); 164 static void filt_usertouch(struct knote *kn, struct kevent *kev, 165 u_long type); 166 167 static struct filterops file_filtops = { 168 .f_isfd = 1, 169 .f_attach = filt_fileattach, 170 }; 171 static struct filterops kqread_filtops = { 172 .f_isfd = 1, 173 .f_detach = filt_kqdetach, 174 .f_event = filt_kqueue, 175 }; 176 /* XXX - move to kern_proc.c? */ 177 static struct filterops proc_filtops = { 178 .f_isfd = 0, 179 .f_attach = filt_procattach, 180 .f_detach = filt_procdetach, 181 .f_event = filt_proc, 182 }; 183 static struct filterops timer_filtops = { 184 .f_isfd = 0, 185 .f_attach = filt_timerattach, 186 .f_detach = filt_timerdetach, 187 .f_event = filt_timer, 188 }; 189 static struct filterops user_filtops = { 190 .f_attach = filt_userattach, 191 .f_detach = filt_userdetach, 192 .f_event = filt_user, 193 .f_touch = filt_usertouch, 194 }; 195 196 static uma_zone_t knote_zone; 197 static unsigned int kq_ncallouts = 0; 198 static unsigned int kq_calloutmax = 4 * 1024; 199 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, 200 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); 201 202 /* XXX - ensure not influx ? */ 203 #define KNOTE_ACTIVATE(kn, islock) do { \ 204 if ((islock)) \ 205 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ 206 else \ 207 KQ_LOCK((kn)->kn_kq); \ 208 (kn)->kn_status |= KN_ACTIVE; \ 209 if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ 210 knote_enqueue((kn)); \ 211 if (!(islock)) \ 212 KQ_UNLOCK((kn)->kn_kq); \ 213 } while(0) 214 #define KQ_LOCK(kq) do { \ 215 mtx_lock(&(kq)->kq_lock); \ 216 } while (0) 217 #define KQ_FLUX_WAKEUP(kq) do { \ 218 if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ 219 (kq)->kq_state &= ~KQ_FLUXWAIT; \ 220 wakeup((kq)); \ 221 } \ 222 } while (0) 223 #define KQ_UNLOCK_FLUX(kq) do { \ 224 KQ_FLUX_WAKEUP(kq); \ 225 mtx_unlock(&(kq)->kq_lock); \ 226 } while (0) 227 #define KQ_UNLOCK(kq) do { \ 228 mtx_unlock(&(kq)->kq_lock); \ 229 } while (0) 230 #define KQ_OWNED(kq) do { \ 231 mtx_assert(&(kq)->kq_lock, MA_OWNED); \ 232 } while (0) 233 #define KQ_NOTOWNED(kq) do { \ 234 mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ 235 } while (0) 236 237 static struct knlist * 238 kn_list_lock(struct knote *kn) 239 { 240 struct knlist *knl; 241 242 knl = kn->kn_knlist; 243 if (knl != NULL) 244 knl->kl_lock(knl->kl_lockarg); 245 return (knl); 246 } 247 248 static void 249 kn_list_unlock(struct knlist *knl) 250 { 251 bool do_free; 252 253 if (knl == NULL) 254 return; 255 do_free = knl->kl_autodestroy && knlist_empty(knl); 256 knl->kl_unlock(knl->kl_lockarg); 257 if (do_free) { 258 knlist_destroy(knl); 259 free(knl, M_KQUEUE); 260 } 261 } 262 263 static bool 264 kn_in_flux(struct knote *kn) 265 { 266 267 return (kn->kn_influx > 0); 268 } 269 270 static void 271 kn_enter_flux(struct knote *kn) 272 { 273 274 KQ_OWNED(kn->kn_kq); 275 MPASS(kn->kn_influx < INT_MAX); 276 kn->kn_influx++; 277 } 278 279 static bool 280 kn_leave_flux(struct knote *kn) 281 { 282 283 KQ_OWNED(kn->kn_kq); 284 MPASS(kn->kn_influx > 0); 285 kn->kn_influx--; 286 return (kn->kn_influx == 0); 287 } 288 289 #define KNL_ASSERT_LOCK(knl, islocked) do { \ 290 if (islocked) \ 291 KNL_ASSERT_LOCKED(knl); \ 292 else \ 293 KNL_ASSERT_UNLOCKED(knl); \ 294 } while (0) 295 #ifdef INVARIANTS 296 #define KNL_ASSERT_LOCKED(knl) do { \ 297 knl->kl_assert_locked((knl)->kl_lockarg); \ 298 } while (0) 299 #define KNL_ASSERT_UNLOCKED(knl) do { \ 300 knl->kl_assert_unlocked((knl)->kl_lockarg); \ 301 } while (0) 302 #else /* !INVARIANTS */ 303 #define KNL_ASSERT_LOCKED(knl) do {} while(0) 304 #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) 305 #endif /* INVARIANTS */ 306 307 #ifndef KN_HASHSIZE 308 #define KN_HASHSIZE 64 /* XXX should be tunable */ 309 #endif 310 311 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) 312 313 static int 314 filt_nullattach(struct knote *kn) 315 { 316 317 return (ENXIO); 318 }; 319 320 struct filterops null_filtops = { 321 .f_isfd = 0, 322 .f_attach = filt_nullattach, 323 }; 324 325 /* XXX - make SYSINIT to add these, and move into respective modules. */ 326 extern struct filterops sig_filtops; 327 extern struct filterops fs_filtops; 328 329 /* 330 * Table for for all system-defined filters. 331 */ 332 static struct mtx filterops_lock; 333 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", 334 MTX_DEF); 335 static struct { 336 struct filterops *for_fop; 337 int for_nolock; 338 int for_refcnt; 339 } sysfilt_ops[EVFILT_SYSCOUNT] = { 340 { &file_filtops, 1 }, /* EVFILT_READ */ 341 { &file_filtops, 1 }, /* EVFILT_WRITE */ 342 { &null_filtops }, /* EVFILT_AIO */ 343 { &file_filtops, 1 }, /* EVFILT_VNODE */ 344 { &proc_filtops, 1 }, /* EVFILT_PROC */ 345 { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ 346 { &timer_filtops, 1 }, /* EVFILT_TIMER */ 347 { &file_filtops, 1 }, /* EVFILT_PROCDESC */ 348 { &fs_filtops, 1 }, /* EVFILT_FS */ 349 { &null_filtops }, /* EVFILT_LIO */ 350 { &user_filtops, 1 }, /* EVFILT_USER */ 351 { &null_filtops }, /* EVFILT_SENDFILE */ 352 { &file_filtops, 1 }, /* EVFILT_EMPTY */ 353 }; 354 355 /* 356 * Simple redirection for all cdevsw style objects to call their fo_kqfilter 357 * method. 358 */ 359 static int 360 filt_fileattach(struct knote *kn) 361 { 362 363 return (fo_kqfilter(kn->kn_fp, kn)); 364 } 365 366 /*ARGSUSED*/ 367 static int 368 kqueue_kqfilter(struct file *fp, struct knote *kn) 369 { 370 struct kqueue *kq = kn->kn_fp->f_data; 371 372 if (kn->kn_filter != EVFILT_READ) 373 return (EINVAL); 374 375 kn->kn_status |= KN_KQUEUE; 376 kn->kn_fop = &kqread_filtops; 377 knlist_add(&kq->kq_sel.si_note, kn, 0); 378 379 return (0); 380 } 381 382 static void 383 filt_kqdetach(struct knote *kn) 384 { 385 struct kqueue *kq = kn->kn_fp->f_data; 386 387 knlist_remove(&kq->kq_sel.si_note, kn, 0); 388 } 389 390 /*ARGSUSED*/ 391 static int 392 filt_kqueue(struct knote *kn, long hint) 393 { 394 struct kqueue *kq = kn->kn_fp->f_data; 395 396 kn->kn_data = kq->kq_count; 397 return (kn->kn_data > 0); 398 } 399 400 /* XXX - move to kern_proc.c? */ 401 static int 402 filt_procattach(struct knote *kn) 403 { 404 struct proc *p; 405 int error; 406 bool exiting, immediate; 407 408 exiting = immediate = false; 409 p = pfind(kn->kn_id); 410 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { 411 p = zpfind(kn->kn_id); 412 exiting = true; 413 } else if (p != NULL && (p->p_flag & P_WEXIT)) { 414 exiting = true; 415 } 416 417 if (p == NULL) 418 return (ESRCH); 419 if ((error = p_cansee(curthread, p))) { 420 PROC_UNLOCK(p); 421 return (error); 422 } 423 424 kn->kn_ptr.p_proc = p; 425 kn->kn_flags |= EV_CLEAR; /* automatically set */ 426 427 /* 428 * Internal flag indicating registration done by kernel for the 429 * purposes of getting a NOTE_CHILD notification. 430 */ 431 if (kn->kn_flags & EV_FLAG2) { 432 kn->kn_flags &= ~EV_FLAG2; 433 kn->kn_data = kn->kn_sdata; /* ppid */ 434 kn->kn_fflags = NOTE_CHILD; 435 kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); 436 immediate = true; /* Force immediate activation of child note. */ 437 } 438 /* 439 * Internal flag indicating registration done by kernel (for other than 440 * NOTE_CHILD). 441 */ 442 if (kn->kn_flags & EV_FLAG1) { 443 kn->kn_flags &= ~EV_FLAG1; 444 } 445 446 knlist_add(p->p_klist, kn, 1); 447 448 /* 449 * Immediately activate any child notes or, in the case of a zombie 450 * target process, exit notes. The latter is necessary to handle the 451 * case where the target process, e.g. a child, dies before the kevent 452 * is registered. 453 */ 454 if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) 455 KNOTE_ACTIVATE(kn, 0); 456 457 PROC_UNLOCK(p); 458 459 return (0); 460 } 461 462 /* 463 * The knote may be attached to a different process, which may exit, 464 * leaving nothing for the knote to be attached to. So when the process 465 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so 466 * it will be deleted when read out. However, as part of the knote deletion, 467 * this routine is called, so a check is needed to avoid actually performing 468 * a detach, because the original process does not exist any more. 469 */ 470 /* XXX - move to kern_proc.c? */ 471 static void 472 filt_procdetach(struct knote *kn) 473 { 474 475 knlist_remove(kn->kn_knlist, kn, 0); 476 kn->kn_ptr.p_proc = NULL; 477 } 478 479 /* XXX - move to kern_proc.c? */ 480 static int 481 filt_proc(struct knote *kn, long hint) 482 { 483 struct proc *p; 484 u_int event; 485 486 p = kn->kn_ptr.p_proc; 487 if (p == NULL) /* already activated, from attach filter */ 488 return (0); 489 490 /* Mask off extra data. */ 491 event = (u_int)hint & NOTE_PCTRLMASK; 492 493 /* If the user is interested in this event, record it. */ 494 if (kn->kn_sfflags & event) 495 kn->kn_fflags |= event; 496 497 /* Process is gone, so flag the event as finished. */ 498 if (event == NOTE_EXIT) { 499 kn->kn_flags |= EV_EOF | EV_ONESHOT; 500 kn->kn_ptr.p_proc = NULL; 501 if (kn->kn_fflags & NOTE_EXIT) 502 kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); 503 if (kn->kn_fflags == 0) 504 kn->kn_flags |= EV_DROP; 505 return (1); 506 } 507 508 return (kn->kn_fflags != 0); 509 } 510 511 /* 512 * Called when the process forked. It mostly does the same as the 513 * knote(), activating all knotes registered to be activated when the 514 * process forked. Additionally, for each knote attached to the 515 * parent, check whether user wants to track the new process. If so 516 * attach a new knote to it, and immediately report an event with the 517 * child's pid. 518 */ 519 void 520 knote_fork(struct knlist *list, int pid) 521 { 522 struct kqueue *kq; 523 struct knote *kn; 524 struct kevent kev; 525 int error; 526 527 if (list == NULL) 528 return; 529 list->kl_lock(list->kl_lockarg); 530 531 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { 532 kq = kn->kn_kq; 533 KQ_LOCK(kq); 534 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { 535 KQ_UNLOCK(kq); 536 continue; 537 } 538 539 /* 540 * The same as knote(), activate the event. 541 */ 542 if ((kn->kn_sfflags & NOTE_TRACK) == 0) { 543 kn->kn_status |= KN_HASKQLOCK; 544 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 545 KNOTE_ACTIVATE(kn, 1); 546 kn->kn_status &= ~KN_HASKQLOCK; 547 KQ_UNLOCK(kq); 548 continue; 549 } 550 551 /* 552 * The NOTE_TRACK case. In addition to the activation 553 * of the event, we need to register new events to 554 * track the child. Drop the locks in preparation for 555 * the call to kqueue_register(). 556 */ 557 kn_enter_flux(kn); 558 KQ_UNLOCK(kq); 559 list->kl_unlock(list->kl_lockarg); 560 561 /* 562 * Activate existing knote and register tracking knotes with 563 * new process. 564 * 565 * First register a knote to get just the child notice. This 566 * must be a separate note from a potential NOTE_EXIT 567 * notification since both NOTE_CHILD and NOTE_EXIT are defined 568 * to use the data field (in conflicting ways). 569 */ 570 kev.ident = pid; 571 kev.filter = kn->kn_filter; 572 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | 573 EV_FLAG2; 574 kev.fflags = kn->kn_sfflags; 575 kev.data = kn->kn_id; /* parent */ 576 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 577 error = kqueue_register(kq, &kev, NULL, 0); 578 if (error) 579 kn->kn_fflags |= NOTE_TRACKERR; 580 581 /* 582 * Then register another knote to track other potential events 583 * from the new process. 584 */ 585 kev.ident = pid; 586 kev.filter = kn->kn_filter; 587 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 588 kev.fflags = kn->kn_sfflags; 589 kev.data = kn->kn_id; /* parent */ 590 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 591 error = kqueue_register(kq, &kev, NULL, 0); 592 if (error) 593 kn->kn_fflags |= NOTE_TRACKERR; 594 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 595 KNOTE_ACTIVATE(kn, 0); 596 KQ_LOCK(kq); 597 kn_leave_flux(kn); 598 KQ_UNLOCK_FLUX(kq); 599 list->kl_lock(list->kl_lockarg); 600 } 601 list->kl_unlock(list->kl_lockarg); 602 } 603 604 /* 605 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the 606 * interval timer support code. 607 */ 608 609 #define NOTE_TIMER_PRECMASK \ 610 (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) 611 612 static sbintime_t 613 timer2sbintime(intptr_t data, int flags) 614 { 615 int64_t secs; 616 617 /* 618 * Macros for converting to the fractional second portion of an 619 * sbintime_t using 64bit multiplication to improve precision. 620 */ 621 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) 622 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) 623 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) 624 switch (flags & NOTE_TIMER_PRECMASK) { 625 case NOTE_SECONDS: 626 #ifdef __LP64__ 627 if (data > (SBT_MAX / SBT_1S)) 628 return (SBT_MAX); 629 #endif 630 return ((sbintime_t)data << 32); 631 case NOTE_MSECONDS: /* FALLTHROUGH */ 632 case 0: 633 if (data >= 1000) { 634 secs = data / 1000; 635 #ifdef __LP64__ 636 if (secs > (SBT_MAX / SBT_1S)) 637 return (SBT_MAX); 638 #endif 639 return (secs << 32 | MS_TO_SBT(data % 1000)); 640 } 641 return (MS_TO_SBT(data)); 642 case NOTE_USECONDS: 643 if (data >= 1000000) { 644 secs = data / 1000000; 645 #ifdef __LP64__ 646 if (secs > (SBT_MAX / SBT_1S)) 647 return (SBT_MAX); 648 #endif 649 return (secs << 32 | US_TO_SBT(data % 1000000)); 650 } 651 return (US_TO_SBT(data)); 652 case NOTE_NSECONDS: 653 if (data >= 1000000000) { 654 secs = data / 1000000000; 655 #ifdef __LP64__ 656 if (secs > (SBT_MAX / SBT_1S)) 657 return (SBT_MAX); 658 #endif 659 return (secs << 32 | US_TO_SBT(data % 1000000000)); 660 } 661 return (NS_TO_SBT(data)); 662 default: 663 break; 664 } 665 return (-1); 666 } 667 668 struct kq_timer_cb_data { 669 struct callout c; 670 sbintime_t next; /* next timer event fires at */ 671 sbintime_t to; /* precalculated timer period, 0 for abs */ 672 }; 673 674 static void 675 filt_timerexpire(void *knx) 676 { 677 struct knote *kn; 678 struct kq_timer_cb_data *kc; 679 680 kn = knx; 681 kn->kn_data++; 682 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ 683 684 if ((kn->kn_flags & EV_ONESHOT) != 0) 685 return; 686 kc = kn->kn_ptr.p_v; 687 if (kc->to == 0) 688 return; 689 kc->next += kc->to; 690 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, 691 PCPU_GET(cpuid), C_ABSOLUTE); 692 } 693 694 /* 695 * data contains amount of time to sleep 696 */ 697 static int 698 filt_timerattach(struct knote *kn) 699 { 700 struct kq_timer_cb_data *kc; 701 struct bintime bt; 702 sbintime_t to, sbt; 703 unsigned int ncallouts; 704 705 if (kn->kn_sdata < 0) 706 return (EINVAL); 707 if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) 708 kn->kn_sdata = 1; 709 /* Only precision unit are supported in flags so far */ 710 if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) 711 return (EINVAL); 712 713 to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); 714 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 715 getboottimebin(&bt); 716 sbt = bttosbt(bt); 717 to -= sbt; 718 } 719 if (to < 0) 720 return (EINVAL); 721 722 do { 723 ncallouts = kq_ncallouts; 724 if (ncallouts >= kq_calloutmax) 725 return (ENOMEM); 726 } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); 727 728 if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) 729 kn->kn_flags |= EV_CLEAR; /* automatically set */ 730 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ 731 kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); 732 callout_init(&kc->c, 1); 733 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 734 kc->next = to; 735 kc->to = 0; 736 } else { 737 kc->next = to + sbinuptime(); 738 kc->to = to; 739 } 740 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, 741 PCPU_GET(cpuid), C_ABSOLUTE); 742 743 return (0); 744 } 745 746 static void 747 filt_timerdetach(struct knote *kn) 748 { 749 struct kq_timer_cb_data *kc; 750 unsigned int old; 751 752 kc = kn->kn_ptr.p_v; 753 callout_drain(&kc->c); 754 free(kc, M_KQUEUE); 755 old = atomic_fetchadd_int(&kq_ncallouts, -1); 756 KASSERT(old > 0, ("Number of callouts cannot become negative")); 757 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ 758 } 759 760 static int 761 filt_timer(struct knote *kn, long hint) 762 { 763 764 return (kn->kn_data != 0); 765 } 766 767 static int 768 filt_userattach(struct knote *kn) 769 { 770 771 /* 772 * EVFILT_USER knotes are not attached to anything in the kernel. 773 */ 774 kn->kn_hook = NULL; 775 if (kn->kn_fflags & NOTE_TRIGGER) 776 kn->kn_hookid = 1; 777 else 778 kn->kn_hookid = 0; 779 return (0); 780 } 781 782 static void 783 filt_userdetach(__unused struct knote *kn) 784 { 785 786 /* 787 * EVFILT_USER knotes are not attached to anything in the kernel. 788 */ 789 } 790 791 static int 792 filt_user(struct knote *kn, __unused long hint) 793 { 794 795 return (kn->kn_hookid); 796 } 797 798 static void 799 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 800 { 801 u_int ffctrl; 802 803 switch (type) { 804 case EVENT_REGISTER: 805 if (kev->fflags & NOTE_TRIGGER) 806 kn->kn_hookid = 1; 807 808 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 809 kev->fflags &= NOTE_FFLAGSMASK; 810 switch (ffctrl) { 811 case NOTE_FFNOP: 812 break; 813 814 case NOTE_FFAND: 815 kn->kn_sfflags &= kev->fflags; 816 break; 817 818 case NOTE_FFOR: 819 kn->kn_sfflags |= kev->fflags; 820 break; 821 822 case NOTE_FFCOPY: 823 kn->kn_sfflags = kev->fflags; 824 break; 825 826 default: 827 /* XXX Return error? */ 828 break; 829 } 830 kn->kn_sdata = kev->data; 831 if (kev->flags & EV_CLEAR) { 832 kn->kn_hookid = 0; 833 kn->kn_data = 0; 834 kn->kn_fflags = 0; 835 } 836 break; 837 838 case EVENT_PROCESS: 839 *kev = kn->kn_kevent; 840 kev->fflags = kn->kn_sfflags; 841 kev->data = kn->kn_sdata; 842 if (kn->kn_flags & EV_CLEAR) { 843 kn->kn_hookid = 0; 844 kn->kn_data = 0; 845 kn->kn_fflags = 0; 846 } 847 break; 848 849 default: 850 panic("filt_usertouch() - invalid type (%ld)", type); 851 break; 852 } 853 } 854 855 int 856 sys_kqueue(struct thread *td, struct kqueue_args *uap) 857 { 858 859 return (kern_kqueue(td, 0, NULL)); 860 } 861 862 static void 863 kqueue_init(struct kqueue *kq) 864 { 865 866 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); 867 TAILQ_INIT(&kq->kq_head); 868 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); 869 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); 870 } 871 872 int 873 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) 874 { 875 struct filedesc *fdp; 876 struct kqueue *kq; 877 struct file *fp; 878 struct ucred *cred; 879 int fd, error; 880 881 fdp = td->td_proc->p_fd; 882 cred = td->td_ucred; 883 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) 884 return (ENOMEM); 885 886 error = falloc_caps(td, &fp, &fd, flags, fcaps); 887 if (error != 0) { 888 chgkqcnt(cred->cr_ruidinfo, -1, 0); 889 return (error); 890 } 891 892 /* An extra reference on `fp' has been held for us by falloc(). */ 893 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); 894 kqueue_init(kq); 895 kq->kq_fdp = fdp; 896 kq->kq_cred = crhold(cred); 897 898 FILEDESC_XLOCK(fdp); 899 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); 900 FILEDESC_XUNLOCK(fdp); 901 902 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); 903 fdrop(fp, td); 904 905 td->td_retval[0] = fd; 906 return (0); 907 } 908 909 #ifdef KTRACE 910 static size_t 911 kev_iovlen(int n, u_int kgio, size_t kevent_size) 912 { 913 914 if (n < 0 || n >= kgio / kevent_size) 915 return (kgio); 916 return (n * kevent_size); 917 } 918 #endif 919 920 struct g_kevent_args { 921 int fd; 922 void *changelist; 923 int nchanges; 924 void *eventlist; 925 int nevents; 926 const struct timespec *timeout; 927 }; 928 929 int 930 sys_kevent(struct thread *td, struct kevent_args *uap) 931 { 932 struct kevent_copyops k_ops = { 933 .arg = uap, 934 .k_copyout = kevent_copyout, 935 .k_copyin = kevent_copyin, 936 .kevent_size = sizeof(struct kevent), 937 }; 938 939 return (kern_kevent_generic(td, (struct g_kevent_args *)uap, &k_ops)); 940 } 941 942 static int 943 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, 944 struct kevent_copyops *k_ops) 945 { 946 struct timespec ts, *tsp; 947 int error; 948 #ifdef KTRACE 949 struct uio ktruio; 950 struct iovec ktriov; 951 struct uio *ktruioin = NULL; 952 struct uio *ktruioout = NULL; 953 u_int kgio; 954 #endif 955 956 if (uap->timeout != NULL) { 957 error = copyin(uap->timeout, &ts, sizeof(ts)); 958 if (error) 959 return (error); 960 tsp = &ts; 961 } else 962 tsp = NULL; 963 964 #ifdef KTRACE 965 if (KTRPOINT(td, KTR_GENIO)) { 966 kgio = ktr_geniosize; 967 ktriov.iov_base = uap->changelist; 968 ktriov.iov_len = kev_iovlen(uap->nchanges, kgio, 969 k_ops->kevent_size); 970 ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, 971 .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, 972 .uio_td = td }; 973 ktruioin = cloneuio(&ktruio); 974 ktriov.iov_base = uap->eventlist; 975 ktriov.iov_len = kev_iovlen(uap->nevents, kgio, 976 k_ops->kevent_size); 977 ktriov.iov_len = uap->nevents * k_ops->kevent_size; 978 ktruioout = cloneuio(&ktruio); 979 } 980 #endif 981 982 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, 983 k_ops, tsp); 984 985 #ifdef KTRACE 986 if (ktruioin != NULL) { 987 ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio, 988 k_ops->kevent_size); 989 ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); 990 ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio, 991 k_ops->kevent_size); 992 ktrgenio(uap->fd, UIO_READ, ktruioout, error); 993 } 994 #endif 995 996 return (error); 997 } 998 999 /* 1000 * Copy 'count' items into the destination list pointed to by uap->eventlist. 1001 */ 1002 static int 1003 kevent_copyout(void *arg, struct kevent *kevp, int count) 1004 { 1005 struct kevent_args *uap; 1006 int error; 1007 1008 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1009 uap = (struct kevent_args *)arg; 1010 1011 error = copyout(kevp, uap->eventlist, count * sizeof *kevp); 1012 if (error == 0) 1013 uap->eventlist += count; 1014 return (error); 1015 } 1016 1017 /* 1018 * Copy 'count' items from the list pointed to by uap->changelist. 1019 */ 1020 static int 1021 kevent_copyin(void *arg, struct kevent *kevp, int count) 1022 { 1023 struct kevent_args *uap; 1024 int error; 1025 1026 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1027 uap = (struct kevent_args *)arg; 1028 1029 error = copyin(uap->changelist, kevp, count * sizeof *kevp); 1030 if (error == 0) 1031 uap->changelist += count; 1032 return (error); 1033 } 1034 1035 #ifdef COMPAT_FREEBSD11 1036 struct kevent_freebsd11 { 1037 __uintptr_t ident; /* identifier for this event */ 1038 short filter; /* filter for event */ 1039 unsigned short flags; 1040 unsigned int fflags; 1041 __intptr_t data; 1042 void *udata; /* opaque user data identifier */ 1043 }; 1044 1045 static int 1046 kevent11_copyout(void *arg, struct kevent *kevp, int count) 1047 { 1048 struct freebsd11_kevent_args *uap; 1049 struct kevent_freebsd11 kev11; 1050 int error, i; 1051 1052 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1053 uap = (struct freebsd11_kevent_args *)arg; 1054 1055 for (i = 0; i < count; i++) { 1056 kev11.ident = kevp->ident; 1057 kev11.filter = kevp->filter; 1058 kev11.flags = kevp->flags; 1059 kev11.fflags = kevp->fflags; 1060 kev11.data = kevp->data; 1061 kev11.udata = kevp->udata; 1062 error = copyout(&kev11, uap->eventlist, sizeof(kev11)); 1063 if (error != 0) 1064 break; 1065 uap->eventlist++; 1066 kevp++; 1067 } 1068 return (error); 1069 } 1070 1071 /* 1072 * Copy 'count' items from the list pointed to by uap->changelist. 1073 */ 1074 static int 1075 kevent11_copyin(void *arg, struct kevent *kevp, int count) 1076 { 1077 struct freebsd11_kevent_args *uap; 1078 struct kevent_freebsd11 kev11; 1079 int error, i; 1080 1081 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 1082 uap = (struct freebsd11_kevent_args *)arg; 1083 1084 for (i = 0; i < count; i++) { 1085 error = copyin(uap->changelist, &kev11, sizeof(kev11)); 1086 if (error != 0) 1087 break; 1088 kevp->ident = kev11.ident; 1089 kevp->filter = kev11.filter; 1090 kevp->flags = kev11.flags; 1091 kevp->fflags = kev11.fflags; 1092 kevp->data = (uintptr_t)kev11.data; 1093 kevp->udata = kev11.udata; 1094 bzero(&kevp->ext, sizeof(kevp->ext)); 1095 uap->changelist++; 1096 kevp++; 1097 } 1098 return (error); 1099 } 1100 1101 int 1102 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) 1103 { 1104 struct kevent_copyops k_ops = { 1105 .arg = uap, 1106 .k_copyout = kevent11_copyout, 1107 .k_copyin = kevent11_copyin, 1108 .kevent_size = sizeof(struct kevent_freebsd11), 1109 }; 1110 1111 return (kern_kevent_generic(td, (struct g_kevent_args *)uap, &k_ops)); 1112 } 1113 #endif 1114 1115 int 1116 kern_kevent(struct thread *td, int fd, int nchanges, int nevents, 1117 struct kevent_copyops *k_ops, const struct timespec *timeout) 1118 { 1119 cap_rights_t rights; 1120 struct file *fp; 1121 int error; 1122 1123 cap_rights_init(&rights); 1124 if (nchanges > 0) 1125 cap_rights_set(&rights, CAP_KQUEUE_CHANGE); 1126 if (nevents > 0) 1127 cap_rights_set(&rights, CAP_KQUEUE_EVENT); 1128 error = fget(td, fd, &rights, &fp); 1129 if (error != 0) 1130 return (error); 1131 1132 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); 1133 fdrop(fp, td); 1134 1135 return (error); 1136 } 1137 1138 static int 1139 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, 1140 struct kevent_copyops *k_ops, const struct timespec *timeout) 1141 { 1142 struct kevent keva[KQ_NEVENTS]; 1143 struct kevent *kevp, *changes; 1144 int i, n, nerrors, error; 1145 1146 nerrors = 0; 1147 while (nchanges > 0) { 1148 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; 1149 error = k_ops->k_copyin(k_ops->arg, keva, n); 1150 if (error) 1151 return (error); 1152 changes = keva; 1153 for (i = 0; i < n; i++) { 1154 kevp = &changes[i]; 1155 if (!kevp->filter) 1156 continue; 1157 kevp->flags &= ~EV_SYSFLAGS; 1158 error = kqueue_register(kq, kevp, td, 1); 1159 if (error || (kevp->flags & EV_RECEIPT)) { 1160 if (nevents == 0) 1161 return (error); 1162 kevp->flags = EV_ERROR; 1163 kevp->data = error; 1164 (void)k_ops->k_copyout(k_ops->arg, kevp, 1); 1165 nevents--; 1166 nerrors++; 1167 } 1168 } 1169 nchanges -= n; 1170 } 1171 if (nerrors) { 1172 td->td_retval[0] = nerrors; 1173 return (0); 1174 } 1175 1176 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); 1177 } 1178 1179 int 1180 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, 1181 struct kevent_copyops *k_ops, const struct timespec *timeout) 1182 { 1183 struct kqueue *kq; 1184 int error; 1185 1186 error = kqueue_acquire(fp, &kq); 1187 if (error != 0) 1188 return (error); 1189 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); 1190 kqueue_release(kq, 0); 1191 return (error); 1192 } 1193 1194 /* 1195 * Performs a kevent() call on a temporarily created kqueue. This can be 1196 * used to perform one-shot polling, similar to poll() and select(). 1197 */ 1198 int 1199 kern_kevent_anonymous(struct thread *td, int nevents, 1200 struct kevent_copyops *k_ops) 1201 { 1202 struct kqueue kq = {}; 1203 int error; 1204 1205 kqueue_init(&kq); 1206 kq.kq_refcnt = 1; 1207 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); 1208 kqueue_drain(&kq, td); 1209 kqueue_destroy(&kq); 1210 return (error); 1211 } 1212 1213 int 1214 kqueue_add_filteropts(int filt, struct filterops *filtops) 1215 { 1216 int error; 1217 1218 error = 0; 1219 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { 1220 printf( 1221 "trying to add a filterop that is out of range: %d is beyond %d\n", 1222 ~filt, EVFILT_SYSCOUNT); 1223 return EINVAL; 1224 } 1225 mtx_lock(&filterops_lock); 1226 if (sysfilt_ops[~filt].for_fop != &null_filtops && 1227 sysfilt_ops[~filt].for_fop != NULL) 1228 error = EEXIST; 1229 else { 1230 sysfilt_ops[~filt].for_fop = filtops; 1231 sysfilt_ops[~filt].for_refcnt = 0; 1232 } 1233 mtx_unlock(&filterops_lock); 1234 1235 return (error); 1236 } 1237 1238 int 1239 kqueue_del_filteropts(int filt) 1240 { 1241 int error; 1242 1243 error = 0; 1244 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1245 return EINVAL; 1246 1247 mtx_lock(&filterops_lock); 1248 if (sysfilt_ops[~filt].for_fop == &null_filtops || 1249 sysfilt_ops[~filt].for_fop == NULL) 1250 error = EINVAL; 1251 else if (sysfilt_ops[~filt].for_refcnt != 0) 1252 error = EBUSY; 1253 else { 1254 sysfilt_ops[~filt].for_fop = &null_filtops; 1255 sysfilt_ops[~filt].for_refcnt = 0; 1256 } 1257 mtx_unlock(&filterops_lock); 1258 1259 return error; 1260 } 1261 1262 static struct filterops * 1263 kqueue_fo_find(int filt) 1264 { 1265 1266 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1267 return NULL; 1268 1269 if (sysfilt_ops[~filt].for_nolock) 1270 return sysfilt_ops[~filt].for_fop; 1271 1272 mtx_lock(&filterops_lock); 1273 sysfilt_ops[~filt].for_refcnt++; 1274 if (sysfilt_ops[~filt].for_fop == NULL) 1275 sysfilt_ops[~filt].for_fop = &null_filtops; 1276 mtx_unlock(&filterops_lock); 1277 1278 return sysfilt_ops[~filt].for_fop; 1279 } 1280 1281 static void 1282 kqueue_fo_release(int filt) 1283 { 1284 1285 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1286 return; 1287 1288 if (sysfilt_ops[~filt].for_nolock) 1289 return; 1290 1291 mtx_lock(&filterops_lock); 1292 KASSERT(sysfilt_ops[~filt].for_refcnt > 0, 1293 ("filter object refcount not valid on release")); 1294 sysfilt_ops[~filt].for_refcnt--; 1295 mtx_unlock(&filterops_lock); 1296 } 1297 1298 /* 1299 * A ref to kq (obtained via kqueue_acquire) must be held. waitok will 1300 * influence if memory allocation should wait. Make sure it is 0 if you 1301 * hold any mutexes. 1302 */ 1303 static int 1304 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) 1305 { 1306 struct filterops *fops; 1307 struct file *fp; 1308 struct knote *kn, *tkn; 1309 struct knlist *knl; 1310 cap_rights_t rights; 1311 int error, filt, event; 1312 int haskqglobal, filedesc_unlock; 1313 1314 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) 1315 return (EINVAL); 1316 1317 fp = NULL; 1318 kn = NULL; 1319 knl = NULL; 1320 error = 0; 1321 haskqglobal = 0; 1322 filedesc_unlock = 0; 1323 1324 filt = kev->filter; 1325 fops = kqueue_fo_find(filt); 1326 if (fops == NULL) 1327 return EINVAL; 1328 1329 if (kev->flags & EV_ADD) { 1330 /* 1331 * Prevent waiting with locks. Non-sleepable 1332 * allocation failures are handled in the loop, only 1333 * if the spare knote appears to be actually required. 1334 */ 1335 tkn = knote_alloc(waitok); 1336 } else { 1337 tkn = NULL; 1338 } 1339 1340 findkn: 1341 if (fops->f_isfd) { 1342 KASSERT(td != NULL, ("td is NULL")); 1343 if (kev->ident > INT_MAX) 1344 error = EBADF; 1345 else 1346 error = fget(td, kev->ident, 1347 cap_rights_init(&rights, CAP_EVENT), &fp); 1348 if (error) 1349 goto done; 1350 1351 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, 1352 kev->ident, 0) != 0) { 1353 /* try again */ 1354 fdrop(fp, td); 1355 fp = NULL; 1356 error = kqueue_expand(kq, fops, kev->ident, waitok); 1357 if (error) 1358 goto done; 1359 goto findkn; 1360 } 1361 1362 if (fp->f_type == DTYPE_KQUEUE) { 1363 /* 1364 * If we add some intelligence about what we are doing, 1365 * we should be able to support events on ourselves. 1366 * We need to know when we are doing this to prevent 1367 * getting both the knlist lock and the kq lock since 1368 * they are the same thing. 1369 */ 1370 if (fp->f_data == kq) { 1371 error = EINVAL; 1372 goto done; 1373 } 1374 1375 /* 1376 * Pre-lock the filedesc before the global 1377 * lock mutex, see the comment in 1378 * kqueue_close(). 1379 */ 1380 FILEDESC_XLOCK(td->td_proc->p_fd); 1381 filedesc_unlock = 1; 1382 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1383 } 1384 1385 KQ_LOCK(kq); 1386 if (kev->ident < kq->kq_knlistsize) { 1387 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) 1388 if (kev->filter == kn->kn_filter) 1389 break; 1390 } 1391 } else { 1392 if ((kev->flags & EV_ADD) == EV_ADD) 1393 kqueue_expand(kq, fops, kev->ident, waitok); 1394 1395 KQ_LOCK(kq); 1396 1397 /* 1398 * If possible, find an existing knote to use for this kevent. 1399 */ 1400 if (kev->filter == EVFILT_PROC && 1401 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { 1402 /* This is an internal creation of a process tracking 1403 * note. Don't attempt to coalesce this with an 1404 * existing note. 1405 */ 1406 ; 1407 } else if (kq->kq_knhashmask != 0) { 1408 struct klist *list; 1409 1410 list = &kq->kq_knhash[ 1411 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1412 SLIST_FOREACH(kn, list, kn_link) 1413 if (kev->ident == kn->kn_id && 1414 kev->filter == kn->kn_filter) 1415 break; 1416 } 1417 } 1418 1419 /* knote is in the process of changing, wait for it to stabilize. */ 1420 if (kn != NULL && kn_in_flux(kn)) { 1421 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1422 if (filedesc_unlock) { 1423 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1424 filedesc_unlock = 0; 1425 } 1426 kq->kq_state |= KQ_FLUXWAIT; 1427 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); 1428 if (fp != NULL) { 1429 fdrop(fp, td); 1430 fp = NULL; 1431 } 1432 goto findkn; 1433 } 1434 1435 /* 1436 * kn now contains the matching knote, or NULL if no match 1437 */ 1438 if (kn == NULL) { 1439 if (kev->flags & EV_ADD) { 1440 kn = tkn; 1441 tkn = NULL; 1442 if (kn == NULL) { 1443 KQ_UNLOCK(kq); 1444 error = ENOMEM; 1445 goto done; 1446 } 1447 kn->kn_fp = fp; 1448 kn->kn_kq = kq; 1449 kn->kn_fop = fops; 1450 /* 1451 * apply reference counts to knote structure, and 1452 * do not release it at the end of this routine. 1453 */ 1454 fops = NULL; 1455 fp = NULL; 1456 1457 kn->kn_sfflags = kev->fflags; 1458 kn->kn_sdata = kev->data; 1459 kev->fflags = 0; 1460 kev->data = 0; 1461 kn->kn_kevent = *kev; 1462 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | 1463 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); 1464 kn->kn_status = KN_DETACHED; 1465 kn_enter_flux(kn); 1466 1467 error = knote_attach(kn, kq); 1468 KQ_UNLOCK(kq); 1469 if (error != 0) { 1470 tkn = kn; 1471 goto done; 1472 } 1473 1474 if ((error = kn->kn_fop->f_attach(kn)) != 0) { 1475 knote_drop_detached(kn, td); 1476 goto done; 1477 } 1478 knl = kn_list_lock(kn); 1479 goto done_ev_add; 1480 } else { 1481 /* No matching knote and the EV_ADD flag is not set. */ 1482 KQ_UNLOCK(kq); 1483 error = ENOENT; 1484 goto done; 1485 } 1486 } 1487 1488 if (kev->flags & EV_DELETE) { 1489 kn_enter_flux(kn); 1490 KQ_UNLOCK(kq); 1491 knote_drop(kn, td); 1492 goto done; 1493 } 1494 1495 if (kev->flags & EV_FORCEONESHOT) { 1496 kn->kn_flags |= EV_ONESHOT; 1497 KNOTE_ACTIVATE(kn, 1); 1498 } 1499 1500 /* 1501 * The user may change some filter values after the initial EV_ADD, 1502 * but doing so will not reset any filter which has already been 1503 * triggered. 1504 */ 1505 kn->kn_status |= KN_SCAN; 1506 kn_enter_flux(kn); 1507 KQ_UNLOCK(kq); 1508 knl = kn_list_lock(kn); 1509 kn->kn_kevent.udata = kev->udata; 1510 if (!fops->f_isfd && fops->f_touch != NULL) { 1511 fops->f_touch(kn, kev, EVENT_REGISTER); 1512 } else { 1513 kn->kn_sfflags = kev->fflags; 1514 kn->kn_sdata = kev->data; 1515 } 1516 1517 /* 1518 * We can get here with kn->kn_knlist == NULL. This can happen when 1519 * the initial attach event decides that the event is "completed" 1520 * already. i.e. filt_procattach is called on a zombie process. It 1521 * will call filt_proc which will remove it from the list, and NULL 1522 * kn_knlist. 1523 */ 1524 done_ev_add: 1525 if ((kev->flags & EV_ENABLE) != 0) 1526 kn->kn_status &= ~KN_DISABLED; 1527 else if ((kev->flags & EV_DISABLE) != 0) 1528 kn->kn_status |= KN_DISABLED; 1529 1530 if ((kn->kn_status & KN_DISABLED) == 0) 1531 event = kn->kn_fop->f_event(kn, 0); 1532 else 1533 event = 0; 1534 1535 KQ_LOCK(kq); 1536 if (event) 1537 kn->kn_status |= KN_ACTIVE; 1538 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == 1539 KN_ACTIVE) 1540 knote_enqueue(kn); 1541 kn->kn_status &= ~KN_SCAN; 1542 kn_leave_flux(kn); 1543 kn_list_unlock(knl); 1544 KQ_UNLOCK_FLUX(kq); 1545 1546 done: 1547 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1548 if (filedesc_unlock) 1549 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1550 if (fp != NULL) 1551 fdrop(fp, td); 1552 knote_free(tkn); 1553 if (fops != NULL) 1554 kqueue_fo_release(filt); 1555 return (error); 1556 } 1557 1558 static int 1559 kqueue_acquire(struct file *fp, struct kqueue **kqp) 1560 { 1561 int error; 1562 struct kqueue *kq; 1563 1564 error = 0; 1565 1566 kq = fp->f_data; 1567 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) 1568 return (EBADF); 1569 *kqp = kq; 1570 KQ_LOCK(kq); 1571 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { 1572 KQ_UNLOCK(kq); 1573 return (EBADF); 1574 } 1575 kq->kq_refcnt++; 1576 KQ_UNLOCK(kq); 1577 1578 return error; 1579 } 1580 1581 static void 1582 kqueue_release(struct kqueue *kq, int locked) 1583 { 1584 if (locked) 1585 KQ_OWNED(kq); 1586 else 1587 KQ_LOCK(kq); 1588 kq->kq_refcnt--; 1589 if (kq->kq_refcnt == 1) 1590 wakeup(&kq->kq_refcnt); 1591 if (!locked) 1592 KQ_UNLOCK(kq); 1593 } 1594 1595 static void 1596 kqueue_schedtask(struct kqueue *kq) 1597 { 1598 1599 KQ_OWNED(kq); 1600 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), 1601 ("scheduling kqueue task while draining")); 1602 1603 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { 1604 taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); 1605 kq->kq_state |= KQ_TASKSCHED; 1606 } 1607 } 1608 1609 /* 1610 * Expand the kq to make sure we have storage for fops/ident pair. 1611 * 1612 * Return 0 on success (or no work necessary), return errno on failure. 1613 * 1614 * Not calling hashinit w/ waitok (proper malloc flag) should be safe. 1615 * If kqueue_register is called from a non-fd context, there usually/should 1616 * be no locks held. 1617 */ 1618 static int 1619 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, 1620 int waitok) 1621 { 1622 struct klist *list, *tmp_knhash, *to_free; 1623 u_long tmp_knhashmask; 1624 int size; 1625 int fd; 1626 int mflag = waitok ? M_WAITOK : M_NOWAIT; 1627 1628 KQ_NOTOWNED(kq); 1629 1630 to_free = NULL; 1631 if (fops->f_isfd) { 1632 fd = ident; 1633 if (kq->kq_knlistsize <= fd) { 1634 size = kq->kq_knlistsize; 1635 while (size <= fd) 1636 size += KQEXTENT; 1637 list = malloc(size * sizeof(*list), M_KQUEUE, mflag); 1638 if (list == NULL) 1639 return ENOMEM; 1640 KQ_LOCK(kq); 1641 if (kq->kq_knlistsize > fd) { 1642 to_free = list; 1643 list = NULL; 1644 } else { 1645 if (kq->kq_knlist != NULL) { 1646 bcopy(kq->kq_knlist, list, 1647 kq->kq_knlistsize * sizeof(*list)); 1648 to_free = kq->kq_knlist; 1649 kq->kq_knlist = NULL; 1650 } 1651 bzero((caddr_t)list + 1652 kq->kq_knlistsize * sizeof(*list), 1653 (size - kq->kq_knlistsize) * sizeof(*list)); 1654 kq->kq_knlistsize = size; 1655 kq->kq_knlist = list; 1656 } 1657 KQ_UNLOCK(kq); 1658 } 1659 } else { 1660 if (kq->kq_knhashmask == 0) { 1661 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, 1662 &tmp_knhashmask); 1663 if (tmp_knhash == NULL) 1664 return ENOMEM; 1665 KQ_LOCK(kq); 1666 if (kq->kq_knhashmask == 0) { 1667 kq->kq_knhash = tmp_knhash; 1668 kq->kq_knhashmask = tmp_knhashmask; 1669 } else { 1670 to_free = tmp_knhash; 1671 } 1672 KQ_UNLOCK(kq); 1673 } 1674 } 1675 free(to_free, M_KQUEUE); 1676 1677 KQ_NOTOWNED(kq); 1678 return 0; 1679 } 1680 1681 static void 1682 kqueue_task(void *arg, int pending) 1683 { 1684 struct kqueue *kq; 1685 int haskqglobal; 1686 1687 haskqglobal = 0; 1688 kq = arg; 1689 1690 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1691 KQ_LOCK(kq); 1692 1693 KNOTE_LOCKED(&kq->kq_sel.si_note, 0); 1694 1695 kq->kq_state &= ~KQ_TASKSCHED; 1696 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { 1697 wakeup(&kq->kq_state); 1698 } 1699 KQ_UNLOCK(kq); 1700 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1701 } 1702 1703 /* 1704 * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 1705 * We treat KN_MARKER knotes as if they are in flux. 1706 */ 1707 static int 1708 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, 1709 const struct timespec *tsp, struct kevent *keva, struct thread *td) 1710 { 1711 struct kevent *kevp; 1712 struct knote *kn, *marker; 1713 struct knlist *knl; 1714 sbintime_t asbt, rsbt; 1715 int count, error, haskqglobal, influx, nkev, touch; 1716 1717 count = maxevents; 1718 nkev = 0; 1719 error = 0; 1720 haskqglobal = 0; 1721 1722 if (maxevents == 0) 1723 goto done_nl; 1724 1725 rsbt = 0; 1726 if (tsp != NULL) { 1727 if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || 1728 tsp->tv_nsec >= 1000000000) { 1729 error = EINVAL; 1730 goto done_nl; 1731 } 1732 if (timespecisset(tsp)) { 1733 if (tsp->tv_sec <= INT32_MAX) { 1734 rsbt = tstosbt(*tsp); 1735 if (TIMESEL(&asbt, rsbt)) 1736 asbt += tc_tick_sbt; 1737 if (asbt <= SBT_MAX - rsbt) 1738 asbt += rsbt; 1739 else 1740 asbt = 0; 1741 rsbt >>= tc_precexp; 1742 } else 1743 asbt = 0; 1744 } else 1745 asbt = -1; 1746 } else 1747 asbt = 0; 1748 marker = knote_alloc(1); 1749 marker->kn_status = KN_MARKER; 1750 KQ_LOCK(kq); 1751 1752 retry: 1753 kevp = keva; 1754 if (kq->kq_count == 0) { 1755 if (asbt == -1) { 1756 error = EWOULDBLOCK; 1757 } else { 1758 kq->kq_state |= KQ_SLEEP; 1759 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, 1760 "kqread", asbt, rsbt, C_ABSOLUTE); 1761 } 1762 if (error == 0) 1763 goto retry; 1764 /* don't restart after signals... */ 1765 if (error == ERESTART) 1766 error = EINTR; 1767 else if (error == EWOULDBLOCK) 1768 error = 0; 1769 goto done; 1770 } 1771 1772 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); 1773 influx = 0; 1774 while (count) { 1775 KQ_OWNED(kq); 1776 kn = TAILQ_FIRST(&kq->kq_head); 1777 1778 if ((kn->kn_status == KN_MARKER && kn != marker) || 1779 kn_in_flux(kn)) { 1780 if (influx) { 1781 influx = 0; 1782 KQ_FLUX_WAKEUP(kq); 1783 } 1784 kq->kq_state |= KQ_FLUXWAIT; 1785 error = msleep(kq, &kq->kq_lock, PSOCK, 1786 "kqflxwt", 0); 1787 continue; 1788 } 1789 1790 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 1791 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { 1792 kn->kn_status &= ~KN_QUEUED; 1793 kq->kq_count--; 1794 continue; 1795 } 1796 if (kn == marker) { 1797 KQ_FLUX_WAKEUP(kq); 1798 if (count == maxevents) 1799 goto retry; 1800 goto done; 1801 } 1802 KASSERT(!kn_in_flux(kn), 1803 ("knote %p is unexpectedly in flux", kn)); 1804 1805 if ((kn->kn_flags & EV_DROP) == EV_DROP) { 1806 kn->kn_status &= ~KN_QUEUED; 1807 kn_enter_flux(kn); 1808 kq->kq_count--; 1809 KQ_UNLOCK(kq); 1810 /* 1811 * We don't need to lock the list since we've 1812 * marked it as in flux. 1813 */ 1814 knote_drop(kn, td); 1815 KQ_LOCK(kq); 1816 continue; 1817 } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { 1818 kn->kn_status &= ~KN_QUEUED; 1819 kn_enter_flux(kn); 1820 kq->kq_count--; 1821 KQ_UNLOCK(kq); 1822 /* 1823 * We don't need to lock the list since we've 1824 * marked the knote as being in flux. 1825 */ 1826 *kevp = kn->kn_kevent; 1827 knote_drop(kn, td); 1828 KQ_LOCK(kq); 1829 kn = NULL; 1830 } else { 1831 kn->kn_status |= KN_SCAN; 1832 kn_enter_flux(kn); 1833 KQ_UNLOCK(kq); 1834 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) 1835 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1836 knl = kn_list_lock(kn); 1837 if (kn->kn_fop->f_event(kn, 0) == 0) { 1838 KQ_LOCK(kq); 1839 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1840 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | 1841 KN_SCAN); 1842 kn_leave_flux(kn); 1843 kq->kq_count--; 1844 kn_list_unlock(knl); 1845 influx = 1; 1846 continue; 1847 } 1848 touch = (!kn->kn_fop->f_isfd && 1849 kn->kn_fop->f_touch != NULL); 1850 if (touch) 1851 kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); 1852 else 1853 *kevp = kn->kn_kevent; 1854 KQ_LOCK(kq); 1855 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1856 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { 1857 /* 1858 * Manually clear knotes who weren't 1859 * 'touch'ed. 1860 */ 1861 if (touch == 0 && kn->kn_flags & EV_CLEAR) { 1862 kn->kn_data = 0; 1863 kn->kn_fflags = 0; 1864 } 1865 if (kn->kn_flags & EV_DISPATCH) 1866 kn->kn_status |= KN_DISABLED; 1867 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); 1868 kq->kq_count--; 1869 } else 1870 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 1871 1872 kn->kn_status &= ~KN_SCAN; 1873 kn_leave_flux(kn); 1874 kn_list_unlock(knl); 1875 influx = 1; 1876 } 1877 1878 /* we are returning a copy to the user */ 1879 kevp++; 1880 nkev++; 1881 count--; 1882 1883 if (nkev == KQ_NEVENTS) { 1884 influx = 0; 1885 KQ_UNLOCK_FLUX(kq); 1886 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1887 nkev = 0; 1888 kevp = keva; 1889 KQ_LOCK(kq); 1890 if (error) 1891 break; 1892 } 1893 } 1894 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); 1895 done: 1896 KQ_OWNED(kq); 1897 KQ_UNLOCK_FLUX(kq); 1898 knote_free(marker); 1899 done_nl: 1900 KQ_NOTOWNED(kq); 1901 if (nkev != 0) 1902 error = k_ops->k_copyout(k_ops->arg, keva, nkev); 1903 td->td_retval[0] = maxevents - count; 1904 return (error); 1905 } 1906 1907 /*ARGSUSED*/ 1908 static int 1909 kqueue_ioctl(struct file *fp, u_long cmd, void *data, 1910 struct ucred *active_cred, struct thread *td) 1911 { 1912 /* 1913 * Enabling sigio causes two major problems: 1914 * 1) infinite recursion: 1915 * Synopsys: kevent is being used to track signals and have FIOASYNC 1916 * set. On receipt of a signal this will cause a kqueue to recurse 1917 * into itself over and over. Sending the sigio causes the kqueue 1918 * to become ready, which in turn posts sigio again, forever. 1919 * Solution: this can be solved by setting a flag in the kqueue that 1920 * we have a SIGIO in progress. 1921 * 2) locking problems: 1922 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts 1923 * us above the proc and pgrp locks. 1924 * Solution: Post a signal using an async mechanism, being sure to 1925 * record a generation count in the delivery so that we do not deliver 1926 * a signal to the wrong process. 1927 * 1928 * Note, these two mechanisms are somewhat mutually exclusive! 1929 */ 1930 #if 0 1931 struct kqueue *kq; 1932 1933 kq = fp->f_data; 1934 switch (cmd) { 1935 case FIOASYNC: 1936 if (*(int *)data) { 1937 kq->kq_state |= KQ_ASYNC; 1938 } else { 1939 kq->kq_state &= ~KQ_ASYNC; 1940 } 1941 return (0); 1942 1943 case FIOSETOWN: 1944 return (fsetown(*(int *)data, &kq->kq_sigio)); 1945 1946 case FIOGETOWN: 1947 *(int *)data = fgetown(&kq->kq_sigio); 1948 return (0); 1949 } 1950 #endif 1951 1952 return (ENOTTY); 1953 } 1954 1955 /*ARGSUSED*/ 1956 static int 1957 kqueue_poll(struct file *fp, int events, struct ucred *active_cred, 1958 struct thread *td) 1959 { 1960 struct kqueue *kq; 1961 int revents = 0; 1962 int error; 1963 1964 if ((error = kqueue_acquire(fp, &kq))) 1965 return POLLERR; 1966 1967 KQ_LOCK(kq); 1968 if (events & (POLLIN | POLLRDNORM)) { 1969 if (kq->kq_count) { 1970 revents |= events & (POLLIN | POLLRDNORM); 1971 } else { 1972 selrecord(td, &kq->kq_sel); 1973 if (SEL_WAITING(&kq->kq_sel)) 1974 kq->kq_state |= KQ_SEL; 1975 } 1976 } 1977 kqueue_release(kq, 1); 1978 KQ_UNLOCK(kq); 1979 return (revents); 1980 } 1981 1982 /*ARGSUSED*/ 1983 static int 1984 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1985 struct thread *td) 1986 { 1987 1988 bzero((void *)st, sizeof *st); 1989 /* 1990 * We no longer return kq_count because the unlocked value is useless. 1991 * If you spent all this time getting the count, why not spend your 1992 * syscall better by calling kevent? 1993 * 1994 * XXX - This is needed for libc_r. 1995 */ 1996 st->st_mode = S_IFIFO; 1997 return (0); 1998 } 1999 2000 static void 2001 kqueue_drain(struct kqueue *kq, struct thread *td) 2002 { 2003 struct knote *kn; 2004 int i; 2005 2006 KQ_LOCK(kq); 2007 2008 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, 2009 ("kqueue already closing")); 2010 kq->kq_state |= KQ_CLOSING; 2011 if (kq->kq_refcnt > 1) 2012 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); 2013 2014 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); 2015 2016 KASSERT(knlist_empty(&kq->kq_sel.si_note), 2017 ("kqueue's knlist not empty")); 2018 2019 for (i = 0; i < kq->kq_knlistsize; i++) { 2020 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { 2021 if (kn_in_flux(kn)) { 2022 kq->kq_state |= KQ_FLUXWAIT; 2023 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); 2024 continue; 2025 } 2026 kn_enter_flux(kn); 2027 KQ_UNLOCK(kq); 2028 knote_drop(kn, td); 2029 KQ_LOCK(kq); 2030 } 2031 } 2032 if (kq->kq_knhashmask != 0) { 2033 for (i = 0; i <= kq->kq_knhashmask; i++) { 2034 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { 2035 if (kn_in_flux(kn)) { 2036 kq->kq_state |= KQ_FLUXWAIT; 2037 msleep(kq, &kq->kq_lock, PSOCK, 2038 "kqclo2", 0); 2039 continue; 2040 } 2041 kn_enter_flux(kn); 2042 KQ_UNLOCK(kq); 2043 knote_drop(kn, td); 2044 KQ_LOCK(kq); 2045 } 2046 } 2047 } 2048 2049 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { 2050 kq->kq_state |= KQ_TASKDRAIN; 2051 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); 2052 } 2053 2054 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2055 selwakeuppri(&kq->kq_sel, PSOCK); 2056 if (!SEL_WAITING(&kq->kq_sel)) 2057 kq->kq_state &= ~KQ_SEL; 2058 } 2059 2060 KQ_UNLOCK(kq); 2061 } 2062 2063 static void 2064 kqueue_destroy(struct kqueue *kq) 2065 { 2066 2067 KASSERT(kq->kq_fdp == NULL, 2068 ("kqueue still attached to a file descriptor")); 2069 seldrain(&kq->kq_sel); 2070 knlist_destroy(&kq->kq_sel.si_note); 2071 mtx_destroy(&kq->kq_lock); 2072 2073 if (kq->kq_knhash != NULL) 2074 free(kq->kq_knhash, M_KQUEUE); 2075 if (kq->kq_knlist != NULL) 2076 free(kq->kq_knlist, M_KQUEUE); 2077 2078 funsetown(&kq->kq_sigio); 2079 } 2080 2081 /*ARGSUSED*/ 2082 static int 2083 kqueue_close(struct file *fp, struct thread *td) 2084 { 2085 struct kqueue *kq = fp->f_data; 2086 struct filedesc *fdp; 2087 int error; 2088 int filedesc_unlock; 2089 2090 if ((error = kqueue_acquire(fp, &kq))) 2091 return error; 2092 kqueue_drain(kq, td); 2093 2094 /* 2095 * We could be called due to the knote_drop() doing fdrop(), 2096 * called from kqueue_register(). In this case the global 2097 * lock is owned, and filedesc sx is locked before, to not 2098 * take the sleepable lock after non-sleepable. 2099 */ 2100 fdp = kq->kq_fdp; 2101 kq->kq_fdp = NULL; 2102 if (!sx_xlocked(FILEDESC_LOCK(fdp))) { 2103 FILEDESC_XLOCK(fdp); 2104 filedesc_unlock = 1; 2105 } else 2106 filedesc_unlock = 0; 2107 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); 2108 if (filedesc_unlock) 2109 FILEDESC_XUNLOCK(fdp); 2110 2111 kqueue_destroy(kq); 2112 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); 2113 crfree(kq->kq_cred); 2114 free(kq, M_KQUEUE); 2115 fp->f_data = NULL; 2116 2117 return (0); 2118 } 2119 2120 static int 2121 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 2122 { 2123 2124 kif->kf_type = KF_TYPE_KQUEUE; 2125 return (0); 2126 } 2127 2128 static void 2129 kqueue_wakeup(struct kqueue *kq) 2130 { 2131 KQ_OWNED(kq); 2132 2133 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { 2134 kq->kq_state &= ~KQ_SLEEP; 2135 wakeup(kq); 2136 } 2137 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 2138 selwakeuppri(&kq->kq_sel, PSOCK); 2139 if (!SEL_WAITING(&kq->kq_sel)) 2140 kq->kq_state &= ~KQ_SEL; 2141 } 2142 if (!knlist_empty(&kq->kq_sel.si_note)) 2143 kqueue_schedtask(kq); 2144 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { 2145 pgsigio(&kq->kq_sigio, SIGIO, 0); 2146 } 2147 } 2148 2149 /* 2150 * Walk down a list of knotes, activating them if their event has triggered. 2151 * 2152 * There is a possibility to optimize in the case of one kq watching another. 2153 * Instead of scheduling a task to wake it up, you could pass enough state 2154 * down the chain to make up the parent kqueue. Make this code functional 2155 * first. 2156 */ 2157 void 2158 knote(struct knlist *list, long hint, int lockflags) 2159 { 2160 struct kqueue *kq; 2161 struct knote *kn, *tkn; 2162 int error; 2163 2164 if (list == NULL) 2165 return; 2166 2167 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); 2168 2169 if ((lockflags & KNF_LISTLOCKED) == 0) 2170 list->kl_lock(list->kl_lockarg); 2171 2172 /* 2173 * If we unlock the list lock (and enter influx), we can 2174 * eliminate the kqueue scheduling, but this will introduce 2175 * four lock/unlock's for each knote to test. Also, marker 2176 * would be needed to keep iteration position, since filters 2177 * or other threads could remove events. 2178 */ 2179 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { 2180 kq = kn->kn_kq; 2181 KQ_LOCK(kq); 2182 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { 2183 /* 2184 * Do not process the influx notes, except for 2185 * the influx coming from the kq unlock in the 2186 * kqueue_scan(). In the later case, we do 2187 * not interfere with the scan, since the code 2188 * fragment in kqueue_scan() locks the knlist, 2189 * and cannot proceed until we finished. 2190 */ 2191 KQ_UNLOCK(kq); 2192 } else if ((lockflags & KNF_NOKQLOCK) != 0) { 2193 kn_enter_flux(kn); 2194 KQ_UNLOCK(kq); 2195 error = kn->kn_fop->f_event(kn, hint); 2196 KQ_LOCK(kq); 2197 kn_leave_flux(kn); 2198 if (error) 2199 KNOTE_ACTIVATE(kn, 1); 2200 KQ_UNLOCK_FLUX(kq); 2201 } else { 2202 kn->kn_status |= KN_HASKQLOCK; 2203 if (kn->kn_fop->f_event(kn, hint)) 2204 KNOTE_ACTIVATE(kn, 1); 2205 kn->kn_status &= ~KN_HASKQLOCK; 2206 KQ_UNLOCK(kq); 2207 } 2208 } 2209 if ((lockflags & KNF_LISTLOCKED) == 0) 2210 list->kl_unlock(list->kl_lockarg); 2211 } 2212 2213 /* 2214 * add a knote to a knlist 2215 */ 2216 void 2217 knlist_add(struct knlist *knl, struct knote *kn, int islocked) 2218 { 2219 2220 KNL_ASSERT_LOCK(knl, islocked); 2221 KQ_NOTOWNED(kn->kn_kq); 2222 KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); 2223 KASSERT((kn->kn_status & KN_DETACHED) != 0, 2224 ("knote %p was not detached", kn)); 2225 if (!islocked) 2226 knl->kl_lock(knl->kl_lockarg); 2227 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); 2228 if (!islocked) 2229 knl->kl_unlock(knl->kl_lockarg); 2230 KQ_LOCK(kn->kn_kq); 2231 kn->kn_knlist = knl; 2232 kn->kn_status &= ~KN_DETACHED; 2233 KQ_UNLOCK(kn->kn_kq); 2234 } 2235 2236 static void 2237 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, 2238 int kqislocked) 2239 { 2240 2241 KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); 2242 KNL_ASSERT_LOCK(knl, knlislocked); 2243 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); 2244 KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); 2245 KASSERT((kn->kn_status & KN_DETACHED) == 0, 2246 ("knote %p was already detached", kn)); 2247 if (!knlislocked) 2248 knl->kl_lock(knl->kl_lockarg); 2249 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); 2250 kn->kn_knlist = NULL; 2251 if (!knlislocked) 2252 kn_list_unlock(knl); 2253 if (!kqislocked) 2254 KQ_LOCK(kn->kn_kq); 2255 kn->kn_status |= KN_DETACHED; 2256 if (!kqislocked) 2257 KQ_UNLOCK(kn->kn_kq); 2258 } 2259 2260 /* 2261 * remove knote from the specified knlist 2262 */ 2263 void 2264 knlist_remove(struct knlist *knl, struct knote *kn, int islocked) 2265 { 2266 2267 knlist_remove_kq(knl, kn, islocked, 0); 2268 } 2269 2270 int 2271 knlist_empty(struct knlist *knl) 2272 { 2273 2274 KNL_ASSERT_LOCKED(knl); 2275 return (SLIST_EMPTY(&knl->kl_list)); 2276 } 2277 2278 static struct mtx knlist_lock; 2279 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", 2280 MTX_DEF); 2281 static void knlist_mtx_lock(void *arg); 2282 static void knlist_mtx_unlock(void *arg); 2283 2284 static void 2285 knlist_mtx_lock(void *arg) 2286 { 2287 2288 mtx_lock((struct mtx *)arg); 2289 } 2290 2291 static void 2292 knlist_mtx_unlock(void *arg) 2293 { 2294 2295 mtx_unlock((struct mtx *)arg); 2296 } 2297 2298 static void 2299 knlist_mtx_assert_locked(void *arg) 2300 { 2301 2302 mtx_assert((struct mtx *)arg, MA_OWNED); 2303 } 2304 2305 static void 2306 knlist_mtx_assert_unlocked(void *arg) 2307 { 2308 2309 mtx_assert((struct mtx *)arg, MA_NOTOWNED); 2310 } 2311 2312 static void 2313 knlist_rw_rlock(void *arg) 2314 { 2315 2316 rw_rlock((struct rwlock *)arg); 2317 } 2318 2319 static void 2320 knlist_rw_runlock(void *arg) 2321 { 2322 2323 rw_runlock((struct rwlock *)arg); 2324 } 2325 2326 static void 2327 knlist_rw_assert_locked(void *arg) 2328 { 2329 2330 rw_assert((struct rwlock *)arg, RA_LOCKED); 2331 } 2332 2333 static void 2334 knlist_rw_assert_unlocked(void *arg) 2335 { 2336 2337 rw_assert((struct rwlock *)arg, RA_UNLOCKED); 2338 } 2339 2340 void 2341 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), 2342 void (*kl_unlock)(void *), 2343 void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) 2344 { 2345 2346 if (lock == NULL) 2347 knl->kl_lockarg = &knlist_lock; 2348 else 2349 knl->kl_lockarg = lock; 2350 2351 if (kl_lock == NULL) 2352 knl->kl_lock = knlist_mtx_lock; 2353 else 2354 knl->kl_lock = kl_lock; 2355 if (kl_unlock == NULL) 2356 knl->kl_unlock = knlist_mtx_unlock; 2357 else 2358 knl->kl_unlock = kl_unlock; 2359 if (kl_assert_locked == NULL) 2360 knl->kl_assert_locked = knlist_mtx_assert_locked; 2361 else 2362 knl->kl_assert_locked = kl_assert_locked; 2363 if (kl_assert_unlocked == NULL) 2364 knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; 2365 else 2366 knl->kl_assert_unlocked = kl_assert_unlocked; 2367 2368 knl->kl_autodestroy = 0; 2369 SLIST_INIT(&knl->kl_list); 2370 } 2371 2372 void 2373 knlist_init_mtx(struct knlist *knl, struct mtx *lock) 2374 { 2375 2376 knlist_init(knl, lock, NULL, NULL, NULL, NULL); 2377 } 2378 2379 struct knlist * 2380 knlist_alloc(struct mtx *lock) 2381 { 2382 struct knlist *knl; 2383 2384 knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); 2385 knlist_init_mtx(knl, lock); 2386 return (knl); 2387 } 2388 2389 void 2390 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) 2391 { 2392 2393 knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, 2394 knlist_rw_assert_locked, knlist_rw_assert_unlocked); 2395 } 2396 2397 void 2398 knlist_destroy(struct knlist *knl) 2399 { 2400 2401 KASSERT(KNLIST_EMPTY(knl), 2402 ("destroying knlist %p with knotes on it", knl)); 2403 } 2404 2405 void 2406 knlist_detach(struct knlist *knl) 2407 { 2408 2409 KNL_ASSERT_LOCKED(knl); 2410 knl->kl_autodestroy = 1; 2411 if (knlist_empty(knl)) { 2412 knlist_destroy(knl); 2413 free(knl, M_KQUEUE); 2414 } 2415 } 2416 2417 /* 2418 * Even if we are locked, we may need to drop the lock to allow any influx 2419 * knotes time to "settle". 2420 */ 2421 void 2422 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) 2423 { 2424 struct knote *kn, *kn2; 2425 struct kqueue *kq; 2426 2427 KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); 2428 if (islocked) 2429 KNL_ASSERT_LOCKED(knl); 2430 else { 2431 KNL_ASSERT_UNLOCKED(knl); 2432 again: /* need to reacquire lock since we have dropped it */ 2433 knl->kl_lock(knl->kl_lockarg); 2434 } 2435 2436 SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { 2437 kq = kn->kn_kq; 2438 KQ_LOCK(kq); 2439 if (kn_in_flux(kn)) { 2440 KQ_UNLOCK(kq); 2441 continue; 2442 } 2443 knlist_remove_kq(knl, kn, 1, 1); 2444 if (killkn) { 2445 kn_enter_flux(kn); 2446 KQ_UNLOCK(kq); 2447 knote_drop_detached(kn, td); 2448 } else { 2449 /* Make sure cleared knotes disappear soon */ 2450 kn->kn_flags |= EV_EOF | EV_ONESHOT; 2451 KQ_UNLOCK(kq); 2452 } 2453 kq = NULL; 2454 } 2455 2456 if (!SLIST_EMPTY(&knl->kl_list)) { 2457 /* there are still in flux knotes remaining */ 2458 kn = SLIST_FIRST(&knl->kl_list); 2459 kq = kn->kn_kq; 2460 KQ_LOCK(kq); 2461 KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); 2462 knl->kl_unlock(knl->kl_lockarg); 2463 kq->kq_state |= KQ_FLUXWAIT; 2464 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); 2465 kq = NULL; 2466 goto again; 2467 } 2468 2469 if (islocked) 2470 KNL_ASSERT_LOCKED(knl); 2471 else { 2472 knl->kl_unlock(knl->kl_lockarg); 2473 KNL_ASSERT_UNLOCKED(knl); 2474 } 2475 } 2476 2477 /* 2478 * Remove all knotes referencing a specified fd must be called with FILEDESC 2479 * lock. This prevents a race where a new fd comes along and occupies the 2480 * entry and we attach a knote to the fd. 2481 */ 2482 void 2483 knote_fdclose(struct thread *td, int fd) 2484 { 2485 struct filedesc *fdp = td->td_proc->p_fd; 2486 struct kqueue *kq; 2487 struct knote *kn; 2488 int influx; 2489 2490 FILEDESC_XLOCK_ASSERT(fdp); 2491 2492 /* 2493 * We shouldn't have to worry about new kevents appearing on fd 2494 * since filedesc is locked. 2495 */ 2496 TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { 2497 KQ_LOCK(kq); 2498 2499 again: 2500 influx = 0; 2501 while (kq->kq_knlistsize > fd && 2502 (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { 2503 if (kn_in_flux(kn)) { 2504 /* someone else might be waiting on our knote */ 2505 if (influx) 2506 wakeup(kq); 2507 kq->kq_state |= KQ_FLUXWAIT; 2508 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); 2509 goto again; 2510 } 2511 kn_enter_flux(kn); 2512 KQ_UNLOCK(kq); 2513 influx = 1; 2514 knote_drop(kn, td); 2515 KQ_LOCK(kq); 2516 } 2517 KQ_UNLOCK_FLUX(kq); 2518 } 2519 } 2520 2521 static int 2522 knote_attach(struct knote *kn, struct kqueue *kq) 2523 { 2524 struct klist *list; 2525 2526 KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); 2527 KQ_OWNED(kq); 2528 2529 if (kn->kn_fop->f_isfd) { 2530 if (kn->kn_id >= kq->kq_knlistsize) 2531 return (ENOMEM); 2532 list = &kq->kq_knlist[kn->kn_id]; 2533 } else { 2534 if (kq->kq_knhash == NULL) 2535 return (ENOMEM); 2536 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2537 } 2538 SLIST_INSERT_HEAD(list, kn, kn_link); 2539 return (0); 2540 } 2541 2542 static void 2543 knote_drop(struct knote *kn, struct thread *td) 2544 { 2545 2546 if ((kn->kn_status & KN_DETACHED) == 0) 2547 kn->kn_fop->f_detach(kn); 2548 knote_drop_detached(kn, td); 2549 } 2550 2551 static void 2552 knote_drop_detached(struct knote *kn, struct thread *td) 2553 { 2554 struct kqueue *kq; 2555 struct klist *list; 2556 2557 kq = kn->kn_kq; 2558 2559 KASSERT((kn->kn_status & KN_DETACHED) != 0, 2560 ("knote %p still attached", kn)); 2561 KQ_NOTOWNED(kq); 2562 2563 KQ_LOCK(kq); 2564 KASSERT(kn->kn_influx == 1, 2565 ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); 2566 2567 if (kn->kn_fop->f_isfd) 2568 list = &kq->kq_knlist[kn->kn_id]; 2569 else 2570 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; 2571 2572 if (!SLIST_EMPTY(list)) 2573 SLIST_REMOVE(list, kn, knote, kn_link); 2574 if (kn->kn_status & KN_QUEUED) 2575 knote_dequeue(kn); 2576 KQ_UNLOCK_FLUX(kq); 2577 2578 if (kn->kn_fop->f_isfd) { 2579 fdrop(kn->kn_fp, td); 2580 kn->kn_fp = NULL; 2581 } 2582 kqueue_fo_release(kn->kn_kevent.filter); 2583 kn->kn_fop = NULL; 2584 knote_free(kn); 2585 } 2586 2587 static void 2588 knote_enqueue(struct knote *kn) 2589 { 2590 struct kqueue *kq = kn->kn_kq; 2591 2592 KQ_OWNED(kn->kn_kq); 2593 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); 2594 2595 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 2596 kn->kn_status |= KN_QUEUED; 2597 kq->kq_count++; 2598 kqueue_wakeup(kq); 2599 } 2600 2601 static void 2602 knote_dequeue(struct knote *kn) 2603 { 2604 struct kqueue *kq = kn->kn_kq; 2605 2606 KQ_OWNED(kn->kn_kq); 2607 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); 2608 2609 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 2610 kn->kn_status &= ~KN_QUEUED; 2611 kq->kq_count--; 2612 } 2613 2614 static void 2615 knote_init(void) 2616 { 2617 2618 knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, 2619 NULL, NULL, UMA_ALIGN_PTR, 0); 2620 } 2621 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); 2622 2623 static struct knote * 2624 knote_alloc(int waitok) 2625 { 2626 2627 return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) | 2628 M_ZERO)); 2629 } 2630 2631 static void 2632 knote_free(struct knote *kn) 2633 { 2634 2635 uma_zfree(knote_zone, kn); 2636 } 2637 2638 /* 2639 * Register the kev w/ the kq specified by fd. 2640 */ 2641 int 2642 kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) 2643 { 2644 struct kqueue *kq; 2645 struct file *fp; 2646 cap_rights_t rights; 2647 int error; 2648 2649 error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); 2650 if (error != 0) 2651 return (error); 2652 if ((error = kqueue_acquire(fp, &kq)) != 0) 2653 goto noacquire; 2654 2655 error = kqueue_register(kq, kev, td, waitok); 2656 kqueue_release(kq, 0); 2657 2658 noacquire: 2659 fdrop(fp, td); 2660 return (error); 2661 } 2662