1 /*- 2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org> 3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org> 4 * Copyright (c) 2009 Apple, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include "opt_ktrace.h" 33 #include "opt_kqueue.h" 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/capsicum.h> 38 #include <sys/kernel.h> 39 #include <sys/lock.h> 40 #include <sys/mutex.h> 41 #include <sys/rwlock.h> 42 #include <sys/proc.h> 43 #include <sys/malloc.h> 44 #include <sys/unistd.h> 45 #include <sys/file.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/kthread.h> 50 #include <sys/selinfo.h> 51 #include <sys/stdatomic.h> 52 #include <sys/queue.h> 53 #include <sys/event.h> 54 #include <sys/eventvar.h> 55 #include <sys/poll.h> 56 #include <sys/protosw.h> 57 #include <sys/resourcevar.h> 58 #include <sys/sigio.h> 59 #include <sys/signalvar.h> 60 #include <sys/socket.h> 61 #include <sys/socketvar.h> 62 #include <sys/stat.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysproto.h> 65 #include <sys/syscallsubr.h> 66 #include <sys/taskqueue.h> 67 #include <sys/uio.h> 68 #include <sys/user.h> 69 #ifdef KTRACE 70 #include <sys/ktrace.h> 71 #endif 72 73 #include <vm/uma.h> 74 75 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); 76 77 /* 78 * This lock is used if multiple kq locks are required. This possibly 79 * should be made into a per proc lock. 
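 * (The global order lock is only taken when one kqueue watches another
 * kqueue: see the DTYPE_KQUEUE handling in kqueue_register(), the KN_KQUEUE
 * case in kqueue_scan(), and kqueue_task().)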
80 */ 81 static struct mtx kq_global; 82 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); 83 #define KQ_GLOBAL_LOCK(lck, haslck) do { \ 84 if (!haslck) \ 85 mtx_lock(lck); \ 86 haslck = 1; \ 87 } while (0) 88 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ 89 if (haslck) \ 90 mtx_unlock(lck); \ 91 haslck = 0; \ 92 } while (0) 93 94 TASKQUEUE_DEFINE_THREAD(kqueue); 95 96 static int kevent_copyout(void *arg, struct kevent *kevp, int count); 97 static int kevent_copyin(void *arg, struct kevent *kevp, int count); 98 static int kqueue_register(struct kqueue *kq, struct kevent *kev, 99 struct thread *td, int waitok); 100 static int kqueue_acquire(struct file *fp, struct kqueue **kqp); 101 static void kqueue_release(struct kqueue *kq, int locked); 102 static void kqueue_destroy(struct kqueue *kq); 103 static void kqueue_drain(struct kqueue *kq, struct thread *td); 104 static int kqueue_expand(struct kqueue *kq, struct filterops *fops, 105 uintptr_t ident, int waitok); 106 static void kqueue_task(void *arg, int pending); 107 static int kqueue_scan(struct kqueue *kq, int maxevents, 108 struct kevent_copyops *k_ops, 109 const struct timespec *timeout, 110 struct kevent *keva, struct thread *td); 111 static void kqueue_wakeup(struct kqueue *kq); 112 static struct filterops *kqueue_fo_find(int filt); 113 static void kqueue_fo_release(int filt); 114 115 static fo_ioctl_t kqueue_ioctl; 116 static fo_poll_t kqueue_poll; 117 static fo_kqfilter_t kqueue_kqfilter; 118 static fo_stat_t kqueue_stat; 119 static fo_close_t kqueue_close; 120 static fo_fill_kinfo_t kqueue_fill_kinfo; 121 122 static struct fileops kqueueops = { 123 .fo_read = invfo_rdwr, 124 .fo_write = invfo_rdwr, 125 .fo_truncate = invfo_truncate, 126 .fo_ioctl = kqueue_ioctl, 127 .fo_poll = kqueue_poll, 128 .fo_kqfilter = kqueue_kqfilter, 129 .fo_stat = kqueue_stat, 130 .fo_close = kqueue_close, 131 .fo_chmod = invfo_chmod, 132 .fo_chown = invfo_chown, 133 .fo_sendfile = invfo_sendfile, 134 .fo_fill_kinfo = kqueue_fill_kinfo, 135 }; 136 137 static int knote_attach(struct knote *kn, struct kqueue *kq); 138 static void knote_drop(struct knote *kn, struct thread *td); 139 static void knote_enqueue(struct knote *kn); 140 static void knote_dequeue(struct knote *kn); 141 static void knote_init(void); 142 static struct knote *knote_alloc(int waitok); 143 static void knote_free(struct knote *kn); 144 145 static void filt_kqdetach(struct knote *kn); 146 static int filt_kqueue(struct knote *kn, long hint); 147 static int filt_procattach(struct knote *kn); 148 static void filt_procdetach(struct knote *kn); 149 static int filt_proc(struct knote *kn, long hint); 150 static int filt_fileattach(struct knote *kn); 151 static void filt_timerexpire(void *knx); 152 static int filt_timerattach(struct knote *kn); 153 static void filt_timerdetach(struct knote *kn); 154 static int filt_timer(struct knote *kn, long hint); 155 static int filt_userattach(struct knote *kn); 156 static void filt_userdetach(struct knote *kn); 157 static int filt_user(struct knote *kn, long hint); 158 static void filt_usertouch(struct knote *kn, struct kevent *kev, 159 u_long type); 160 161 static struct filterops file_filtops = { 162 .f_isfd = 1, 163 .f_attach = filt_fileattach, 164 }; 165 static struct filterops kqread_filtops = { 166 .f_isfd = 1, 167 .f_detach = filt_kqdetach, 168 .f_event = filt_kqueue, 169 }; 170 /* XXX - move to kern_proc.c? 
 */
static struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};
static struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};
static struct filterops user_filtops = {
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

static uma_zone_t knote_zone;
static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
static unsigned int kq_calloutmax = 4 * 1024;
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");

/* XXX - ensure not KN_INFLUX?? */
#define KNOTE_ACTIVATE(kn, islock) do { \
	if ((islock)) \
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
	else \
		KQ_LOCK((kn)->kn_kq); \
	(kn)->kn_status |= KN_ACTIVE; \
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
		knote_enqueue((kn)); \
	if (!(islock)) \
		KQ_UNLOCK((kn)->kn_kq); \
} while(0)
#define KQ_LOCK(kq) do { \
	mtx_lock(&(kq)->kq_lock); \
} while (0)
#define KQ_FLUX_WAKEUP(kq) do { \
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
		(kq)->kq_state &= ~KQ_FLUXWAIT; \
		wakeup((kq)); \
	} \
} while (0)
#define KQ_UNLOCK_FLUX(kq) do { \
	KQ_FLUX_WAKEUP(kq); \
	mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_UNLOCK(kq) do { \
	mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_OWNED(kq) do { \
	mtx_assert(&(kq)->kq_lock, MA_OWNED); \
} while (0)
#define KQ_NOTOWNED(kq) do { \
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)
#define KN_LIST_LOCK(kn) do { \
	if (kn->kn_knlist != NULL) \
		kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KN_LIST_UNLOCK(kn) do { \
	if (kn->kn_knlist != NULL) \
		kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KNL_ASSERT_LOCK(knl, islocked) do { \
	if (islocked) \
		KNL_ASSERT_LOCKED(knl); \
	else \
		KNL_ASSERT_UNLOCKED(knl); \
} while (0)
#ifdef INVARIANTS
#define KNL_ASSERT_LOCKED(knl) do { \
	knl->kl_assert_locked((knl)->kl_lockarg); \
} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do { \
	knl->kl_assert_unlocked((knl)->kl_lockarg); \
} while (0)
#else /* !INVARIANTS */
#define KNL_ASSERT_LOCKED(knl) do {} while(0)
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */

#ifndef KN_HASHSIZE
#define KN_HASHSIZE 64 /* XXX should be tunable */
#endif

#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))

static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
};

struct filterops null_filtops = {
	.f_isfd = 0,
	.f_attach = filt_nullattach,
};

/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;

/*
 * Table for all system-defined filters.
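 *
 * System filters have negative EVFILT_* identifiers, so the table is
 * indexed with ~filt.  A sketch of the mapping, assuming the usual
 * constant values from <sys/event.h> (EVFILT_READ == -1,
 * EVFILT_TIMER == -7):
 *
 *	~EVFILT_READ  == 0  ->  sysfilt_ops[0] == { &file_filtops, 1 }
 *	~EVFILT_TIMER == 6  ->  sysfilt_ops[6] == { &timer_filtops, 1 }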
280 */ 281 static struct mtx filterops_lock; 282 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", 283 MTX_DEF); 284 static struct { 285 struct filterops *for_fop; 286 int for_nolock; 287 int for_refcnt; 288 } sysfilt_ops[EVFILT_SYSCOUNT] = { 289 { &file_filtops, 1 }, /* EVFILT_READ */ 290 { &file_filtops, 1 }, /* EVFILT_WRITE */ 291 { &null_filtops }, /* EVFILT_AIO */ 292 { &file_filtops, 1 }, /* EVFILT_VNODE */ 293 { &proc_filtops, 1 }, /* EVFILT_PROC */ 294 { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ 295 { &timer_filtops, 1 }, /* EVFILT_TIMER */ 296 { &file_filtops, 1 }, /* EVFILT_PROCDESC */ 297 { &fs_filtops, 1 }, /* EVFILT_FS */ 298 { &null_filtops }, /* EVFILT_LIO */ 299 { &user_filtops, 1 }, /* EVFILT_USER */ 300 { &null_filtops }, /* EVFILT_SENDFILE */ 301 }; 302 303 /* 304 * Simple redirection for all cdevsw style objects to call their fo_kqfilter 305 * method. 306 */ 307 static int 308 filt_fileattach(struct knote *kn) 309 { 310 311 return (fo_kqfilter(kn->kn_fp, kn)); 312 } 313 314 /*ARGSUSED*/ 315 static int 316 kqueue_kqfilter(struct file *fp, struct knote *kn) 317 { 318 struct kqueue *kq = kn->kn_fp->f_data; 319 320 if (kn->kn_filter != EVFILT_READ) 321 return (EINVAL); 322 323 kn->kn_status |= KN_KQUEUE; 324 kn->kn_fop = &kqread_filtops; 325 knlist_add(&kq->kq_sel.si_note, kn, 0); 326 327 return (0); 328 } 329 330 static void 331 filt_kqdetach(struct knote *kn) 332 { 333 struct kqueue *kq = kn->kn_fp->f_data; 334 335 knlist_remove(&kq->kq_sel.si_note, kn, 0); 336 } 337 338 /*ARGSUSED*/ 339 static int 340 filt_kqueue(struct knote *kn, long hint) 341 { 342 struct kqueue *kq = kn->kn_fp->f_data; 343 344 kn->kn_data = kq->kq_count; 345 return (kn->kn_data > 0); 346 } 347 348 /* XXX - move to kern_proc.c? */ 349 static int 350 filt_procattach(struct knote *kn) 351 { 352 struct proc *p; 353 int immediate; 354 int error; 355 356 immediate = 0; 357 p = pfind(kn->kn_id); 358 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { 359 p = zpfind(kn->kn_id); 360 immediate = 1; 361 } else if (p != NULL && (p->p_flag & P_WEXIT)) { 362 immediate = 1; 363 } 364 365 if (p == NULL) 366 return (ESRCH); 367 if ((error = p_cansee(curthread, p))) { 368 PROC_UNLOCK(p); 369 return (error); 370 } 371 372 kn->kn_ptr.p_proc = p; 373 kn->kn_flags |= EV_CLEAR; /* automatically set */ 374 375 /* 376 * Internal flag indicating registration done by kernel for the 377 * purposes of getting a NOTE_CHILD notification. 378 */ 379 if (kn->kn_flags & EV_FLAG2) { 380 kn->kn_flags &= ~EV_FLAG2; 381 kn->kn_data = kn->kn_sdata; /* ppid */ 382 kn->kn_fflags = NOTE_CHILD; 383 kn->kn_sfflags &= ~NOTE_EXIT; 384 immediate = 1; /* Force immediate activation of child note. */ 385 } 386 /* 387 * Internal flag indicating registration done by kernel (for other than 388 * NOTE_CHILD). 389 */ 390 if (kn->kn_flags & EV_FLAG1) { 391 kn->kn_flags &= ~EV_FLAG1; 392 } 393 394 if (immediate == 0) 395 knlist_add(&p->p_klist, kn, 1); 396 397 /* 398 * Immediately activate any child notes or, in the case of a zombie 399 * target process, exit notes. The latter is necessary to handle the 400 * case where the target process, e.g. a child, dies before the kevent 401 * is registered. 402 */ 403 if (immediate && filt_proc(kn, NOTE_EXIT)) 404 KNOTE_ACTIVATE(kn, 0); 405 406 PROC_UNLOCK(p); 407 408 return (0); 409 } 410 411 /* 412 * The knote may be attached to a different process, which may exit, 413 * leaving nothing for the knote to be attached to. 
 * So when the process exits, the knote is marked as DETACHED and also
 * flagged as ONESHOT so it will be deleted when read out. However, as part
 * of the knote deletion, this routine is called, so a check is needed to
 * avoid actually performing a detach, because the original process does
 * not exist any more.
 */
/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	p = kn->kn_ptr.p_proc;
	knlist_remove(&p->p_klist, kn, 0);
	kn->kn_ptr.p_proc = NULL;
}

/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
	struct proc *p;
	u_int event;

	p = kn->kn_ptr.p_proc;
	/* Mask off extra data. */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/* Process is gone, so flag the event as finished. */
	if (event == NOTE_EXIT) {
		if (!(kn->kn_status & KN_DETACHED))
			knlist_remove_inevent(&p->p_klist, kn);
		kn->kn_flags |= EV_EOF | EV_ONESHOT;
		kn->kn_ptr.p_proc = NULL;
		if (kn->kn_fflags & NOTE_EXIT)
			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
		if (kn->kn_fflags == 0)
			kn->kn_flags |= EV_DROP;
		return (1);
	}

	return (kn->kn_fflags != 0);
}

/*
 * Called when a process forks. It does mostly the same as knote(),
 * activating all knotes registered to be activated when the process
 * forks. Additionally, for each knote attached to the parent, check
 * whether the user wants to track the new process. If so, attach a new
 * knote to the child and immediately report an event with the child's
 * pid.
 */
void
knote_fork(struct knlist *list, int pid)
{
	struct kqueue *kq;
	struct knote *kn;
	struct kevent kev;
	int error;

	if (list == NULL)
		return;
	list->kl_lock(list->kl_lockarg);

	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		/*
		 * XXX - Why do we skip the kn if it is _INFLUX? Does this
		 * mean we will not properly wake up some notes?
		 */
		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
			continue;
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * The same as knote(), activate the event.
		 */
		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
			kn->kn_status |= KN_HASKQLOCK;
			if (kn->kn_fop->f_event(kn, NOTE_FORK))
				KNOTE_ACTIVATE(kn, 1);
			kn->kn_status &= ~KN_HASKQLOCK;
			KQ_UNLOCK(kq);
			continue;
		}

		/*
		 * The NOTE_TRACK case. In addition to the activation
		 * of the event, we need to register new events to
		 * track the child. Drop the locks in preparation for
		 * the call to kqueue_register().
		 */
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		list->kl_unlock(list->kl_lockarg);

		/*
		 * Activate the existing knote and register tracking knotes
		 * with the new process.
		 *
		 * First register a knote to get just the child notice. This
		 * must be a separate note from a potential NOTE_EXIT
		 * notification since both NOTE_CHILD and NOTE_EXIT are
		 * defined to use the data field (in conflicting ways).
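		 * (EV_FLAG2 below is the internal marker that makes
		 * filt_procattach() turn this registration into an
		 * immediately-activated NOTE_CHILD event carrying the parent
		 * pid in the data field.)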
525 */ 526 kev.ident = pid; 527 kev.filter = kn->kn_filter; 528 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; 529 kev.fflags = kn->kn_sfflags; 530 kev.data = kn->kn_id; /* parent */ 531 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 532 error = kqueue_register(kq, &kev, NULL, 0); 533 if (error) 534 kn->kn_fflags |= NOTE_TRACKERR; 535 536 /* 537 * Then register another knote to track other potential events 538 * from the new process. 539 */ 540 kev.ident = pid; 541 kev.filter = kn->kn_filter; 542 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; 543 kev.fflags = kn->kn_sfflags; 544 kev.data = kn->kn_id; /* parent */ 545 kev.udata = kn->kn_kevent.udata;/* preserve udata */ 546 error = kqueue_register(kq, &kev, NULL, 0); 547 if (error) 548 kn->kn_fflags |= NOTE_TRACKERR; 549 if (kn->kn_fop->f_event(kn, NOTE_FORK)) 550 KNOTE_ACTIVATE(kn, 0); 551 KQ_LOCK(kq); 552 kn->kn_status &= ~KN_INFLUX; 553 KQ_UNLOCK_FLUX(kq); 554 list->kl_lock(list->kl_lockarg); 555 } 556 list->kl_unlock(list->kl_lockarg); 557 } 558 559 /* 560 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the 561 * interval timer support code. 562 */ 563 564 #define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \ 565 NOTE_NSECONDS) 566 567 static __inline sbintime_t 568 timer2sbintime(intptr_t data, int flags) 569 { 570 sbintime_t modifier; 571 572 switch (flags & NOTE_TIMER_PRECMASK) { 573 case NOTE_SECONDS: 574 modifier = SBT_1S; 575 break; 576 case NOTE_MSECONDS: /* FALLTHROUGH */ 577 case 0: 578 modifier = SBT_1MS; 579 break; 580 case NOTE_USECONDS: 581 modifier = SBT_1US; 582 break; 583 case NOTE_NSECONDS: 584 modifier = SBT_1NS; 585 break; 586 default: 587 return (-1); 588 } 589 590 #ifdef __LP64__ 591 if (data > SBT_MAX / modifier) 592 return (SBT_MAX); 593 #endif 594 return (modifier * data); 595 } 596 597 static void 598 filt_timerexpire(void *knx) 599 { 600 struct callout *calloutp; 601 struct knote *kn; 602 603 kn = knx; 604 kn->kn_data++; 605 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ 606 607 if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { 608 calloutp = (struct callout *)kn->kn_hook; 609 *kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata, 610 kn->kn_sfflags); 611 callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0, 612 filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); 613 } 614 } 615 616 /* 617 * data contains amount of time to sleep 618 */ 619 static int 620 filt_timerattach(struct knote *kn) 621 { 622 struct callout *calloutp; 623 sbintime_t to; 624 unsigned int ncallouts; 625 626 if ((intptr_t)kn->kn_sdata < 0) 627 return (EINVAL); 628 if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) 629 kn->kn_sdata = 1; 630 /* Only precision unit are supported in flags so far */ 631 if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) 632 return (EINVAL); 633 634 to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); 635 if (to < 0) 636 return (EINVAL); 637 638 ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed); 639 do { 640 if (ncallouts >= kq_calloutmax) 641 return (ENOMEM); 642 } while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts, 643 &ncallouts, ncallouts + 1, memory_order_relaxed, 644 memory_order_relaxed)); 645 646 kn->kn_flags |= EV_CLEAR; /* automatically set */ 647 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ 648 kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK); 649 calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK); 650 callout_init(calloutp, 1); 651 kn->kn_hook = 
calloutp; 652 *kn->kn_ptr.p_nexttime = to + sbinuptime(); 653 callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0, 654 filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); 655 656 return (0); 657 } 658 659 static void 660 filt_timerdetach(struct knote *kn) 661 { 662 struct callout *calloutp; 663 unsigned int old; 664 665 calloutp = (struct callout *)kn->kn_hook; 666 callout_drain(calloutp); 667 free(calloutp, M_KQUEUE); 668 free(kn->kn_ptr.p_nexttime, M_KQUEUE); 669 old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed); 670 KASSERT(old > 0, ("Number of callouts cannot become negative")); 671 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ 672 } 673 674 static int 675 filt_timer(struct knote *kn, long hint) 676 { 677 678 return (kn->kn_data != 0); 679 } 680 681 static int 682 filt_userattach(struct knote *kn) 683 { 684 685 /* 686 * EVFILT_USER knotes are not attached to anything in the kernel. 687 */ 688 kn->kn_hook = NULL; 689 if (kn->kn_fflags & NOTE_TRIGGER) 690 kn->kn_hookid = 1; 691 else 692 kn->kn_hookid = 0; 693 return (0); 694 } 695 696 static void 697 filt_userdetach(__unused struct knote *kn) 698 { 699 700 /* 701 * EVFILT_USER knotes are not attached to anything in the kernel. 702 */ 703 } 704 705 static int 706 filt_user(struct knote *kn, __unused long hint) 707 { 708 709 return (kn->kn_hookid); 710 } 711 712 static void 713 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) 714 { 715 u_int ffctrl; 716 717 switch (type) { 718 case EVENT_REGISTER: 719 if (kev->fflags & NOTE_TRIGGER) 720 kn->kn_hookid = 1; 721 722 ffctrl = kev->fflags & NOTE_FFCTRLMASK; 723 kev->fflags &= NOTE_FFLAGSMASK; 724 switch (ffctrl) { 725 case NOTE_FFNOP: 726 break; 727 728 case NOTE_FFAND: 729 kn->kn_sfflags &= kev->fflags; 730 break; 731 732 case NOTE_FFOR: 733 kn->kn_sfflags |= kev->fflags; 734 break; 735 736 case NOTE_FFCOPY: 737 kn->kn_sfflags = kev->fflags; 738 break; 739 740 default: 741 /* XXX Return error? */ 742 break; 743 } 744 kn->kn_sdata = kev->data; 745 if (kev->flags & EV_CLEAR) { 746 kn->kn_hookid = 0; 747 kn->kn_data = 0; 748 kn->kn_fflags = 0; 749 } 750 break; 751 752 case EVENT_PROCESS: 753 *kev = kn->kn_kevent; 754 kev->fflags = kn->kn_sfflags; 755 kev->data = kn->kn_sdata; 756 if (kn->kn_flags & EV_CLEAR) { 757 kn->kn_hookid = 0; 758 kn->kn_data = 0; 759 kn->kn_fflags = 0; 760 } 761 break; 762 763 default: 764 panic("filt_usertouch() - invalid type (%ld)", type); 765 break; 766 } 767 } 768 769 int 770 sys_kqueue(struct thread *td, struct kqueue_args *uap) 771 { 772 773 return (kern_kqueue(td, 0, NULL)); 774 } 775 776 static void 777 kqueue_init(struct kqueue *kq) 778 { 779 780 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); 781 TAILQ_INIT(&kq->kq_head); 782 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); 783 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); 784 } 785 786 int 787 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) 788 { 789 struct filedesc *fdp; 790 struct kqueue *kq; 791 struct file *fp; 792 struct ucred *cred; 793 int fd, error; 794 795 fdp = td->td_proc->p_fd; 796 cred = td->td_ucred; 797 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) 798 return (ENOMEM); 799 800 error = falloc_caps(td, &fp, &fd, flags, fcaps); 801 if (error != 0) { 802 chgkqcnt(cred->cr_ruidinfo, -1, 0); 803 return (error); 804 } 805 806 /* An extra reference on `fp' has been held for us by falloc(). 
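	 * (The descriptor table keeps its own reference; the extra one is
	 * dropped with fdrop() below once finit() has set the file up.)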
*/ 807 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); 808 kqueue_init(kq); 809 kq->kq_fdp = fdp; 810 kq->kq_cred = crhold(cred); 811 812 FILEDESC_XLOCK(fdp); 813 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); 814 FILEDESC_XUNLOCK(fdp); 815 816 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); 817 fdrop(fp, td); 818 819 td->td_retval[0] = fd; 820 return (0); 821 } 822 823 #ifndef _SYS_SYSPROTO_H_ 824 struct kevent_args { 825 int fd; 826 const struct kevent *changelist; 827 int nchanges; 828 struct kevent *eventlist; 829 int nevents; 830 const struct timespec *timeout; 831 }; 832 #endif 833 int 834 sys_kevent(struct thread *td, struct kevent_args *uap) 835 { 836 struct timespec ts, *tsp; 837 struct kevent_copyops k_ops = { uap, 838 kevent_copyout, 839 kevent_copyin}; 840 int error; 841 #ifdef KTRACE 842 struct uio ktruio; 843 struct iovec ktriov; 844 struct uio *ktruioin = NULL; 845 struct uio *ktruioout = NULL; 846 #endif 847 848 if (uap->timeout != NULL) { 849 error = copyin(uap->timeout, &ts, sizeof(ts)); 850 if (error) 851 return (error); 852 tsp = &ts; 853 } else 854 tsp = NULL; 855 856 #ifdef KTRACE 857 if (KTRPOINT(td, KTR_GENIO)) { 858 ktriov.iov_base = uap->changelist; 859 ktriov.iov_len = uap->nchanges * sizeof(struct kevent); 860 ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, 861 .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, 862 .uio_td = td }; 863 ktruioin = cloneuio(&ktruio); 864 ktriov.iov_base = uap->eventlist; 865 ktriov.iov_len = uap->nevents * sizeof(struct kevent); 866 ktruioout = cloneuio(&ktruio); 867 } 868 #endif 869 870 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, 871 &k_ops, tsp); 872 873 #ifdef KTRACE 874 if (ktruioin != NULL) { 875 ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent); 876 ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); 877 ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent); 878 ktrgenio(uap->fd, UIO_READ, ktruioout, error); 879 } 880 #endif 881 882 return (error); 883 } 884 885 /* 886 * Copy 'count' items into the destination list pointed to by uap->eventlist. 887 */ 888 static int 889 kevent_copyout(void *arg, struct kevent *kevp, int count) 890 { 891 struct kevent_args *uap; 892 int error; 893 894 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 895 uap = (struct kevent_args *)arg; 896 897 error = copyout(kevp, uap->eventlist, count * sizeof *kevp); 898 if (error == 0) 899 uap->eventlist += count; 900 return (error); 901 } 902 903 /* 904 * Copy 'count' items from the list pointed to by uap->changelist. 
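 *
 * Together with kevent_copyout() above, this forms the kevent_copyops
 * vector consumed by kern_kevent(); sys_kevent() wires it up as
 *
 *	struct kevent_copyops k_ops = { uap, kevent_copyout, kevent_copyin };
 *
 * so that in-kernel callers can substitute their own copy routines.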
905 */ 906 static int 907 kevent_copyin(void *arg, struct kevent *kevp, int count) 908 { 909 struct kevent_args *uap; 910 int error; 911 912 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); 913 uap = (struct kevent_args *)arg; 914 915 error = copyin(uap->changelist, kevp, count * sizeof *kevp); 916 if (error == 0) 917 uap->changelist += count; 918 return (error); 919 } 920 921 int 922 kern_kevent(struct thread *td, int fd, int nchanges, int nevents, 923 struct kevent_copyops *k_ops, const struct timespec *timeout) 924 { 925 cap_rights_t rights; 926 struct file *fp; 927 int error; 928 929 cap_rights_init(&rights); 930 if (nchanges > 0) 931 cap_rights_set(&rights, CAP_KQUEUE_CHANGE); 932 if (nevents > 0) 933 cap_rights_set(&rights, CAP_KQUEUE_EVENT); 934 error = fget(td, fd, &rights, &fp); 935 if (error != 0) 936 return (error); 937 938 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); 939 fdrop(fp, td); 940 941 return (error); 942 } 943 944 static int 945 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, 946 struct kevent_copyops *k_ops, const struct timespec *timeout) 947 { 948 struct kevent keva[KQ_NEVENTS]; 949 struct kevent *kevp, *changes; 950 int i, n, nerrors, error; 951 952 nerrors = 0; 953 while (nchanges > 0) { 954 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; 955 error = k_ops->k_copyin(k_ops->arg, keva, n); 956 if (error) 957 return (error); 958 changes = keva; 959 for (i = 0; i < n; i++) { 960 kevp = &changes[i]; 961 if (!kevp->filter) 962 continue; 963 kevp->flags &= ~EV_SYSFLAGS; 964 error = kqueue_register(kq, kevp, td, 1); 965 if (error || (kevp->flags & EV_RECEIPT)) { 966 if (nevents == 0) 967 return (error); 968 kevp->flags = EV_ERROR; 969 kevp->data = error; 970 (void)k_ops->k_copyout(k_ops->arg, kevp, 1); 971 nevents--; 972 nerrors++; 973 } 974 } 975 nchanges -= n; 976 } 977 if (nerrors) { 978 td->td_retval[0] = nerrors; 979 return (0); 980 } 981 982 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); 983 } 984 985 int 986 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, 987 struct kevent_copyops *k_ops, const struct timespec *timeout) 988 { 989 struct kqueue *kq; 990 int error; 991 992 error = kqueue_acquire(fp, &kq); 993 if (error != 0) 994 return (error); 995 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); 996 kqueue_release(kq, 0); 997 return (error); 998 } 999 1000 /* 1001 * Performs a kevent() call on a temporarily created kqueue. This can be 1002 * used to perform one-shot polling, similar to poll() and select(). 
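 *
 * A minimal sketch of an in-kernel caller (my_copyin(), my_copyout() and
 * `state' are illustrative names, not part of this file):
 *
 *	struct kevent_copyops k_ops = { &state, my_copyout, my_copyin };
 *	error = kern_kevent_anonymous(td, 1, &k_ops);
 *
 * Note that the single nevents argument is used both as the number of
 * changes to copy in and as the size of the result buffer (see the
 * kqueue_kevent() call below).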
1003 */ 1004 int 1005 kern_kevent_anonymous(struct thread *td, int nevents, 1006 struct kevent_copyops *k_ops) 1007 { 1008 struct kqueue kq = {}; 1009 int error; 1010 1011 kqueue_init(&kq); 1012 kq.kq_refcnt = 1; 1013 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); 1014 kqueue_drain(&kq, td); 1015 kqueue_destroy(&kq); 1016 return (error); 1017 } 1018 1019 int 1020 kqueue_add_filteropts(int filt, struct filterops *filtops) 1021 { 1022 int error; 1023 1024 error = 0; 1025 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { 1026 printf( 1027 "trying to add a filterop that is out of range: %d is beyond %d\n", 1028 ~filt, EVFILT_SYSCOUNT); 1029 return EINVAL; 1030 } 1031 mtx_lock(&filterops_lock); 1032 if (sysfilt_ops[~filt].for_fop != &null_filtops && 1033 sysfilt_ops[~filt].for_fop != NULL) 1034 error = EEXIST; 1035 else { 1036 sysfilt_ops[~filt].for_fop = filtops; 1037 sysfilt_ops[~filt].for_refcnt = 0; 1038 } 1039 mtx_unlock(&filterops_lock); 1040 1041 return (error); 1042 } 1043 1044 int 1045 kqueue_del_filteropts(int filt) 1046 { 1047 int error; 1048 1049 error = 0; 1050 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1051 return EINVAL; 1052 1053 mtx_lock(&filterops_lock); 1054 if (sysfilt_ops[~filt].for_fop == &null_filtops || 1055 sysfilt_ops[~filt].for_fop == NULL) 1056 error = EINVAL; 1057 else if (sysfilt_ops[~filt].for_refcnt != 0) 1058 error = EBUSY; 1059 else { 1060 sysfilt_ops[~filt].for_fop = &null_filtops; 1061 sysfilt_ops[~filt].for_refcnt = 0; 1062 } 1063 mtx_unlock(&filterops_lock); 1064 1065 return error; 1066 } 1067 1068 static struct filterops * 1069 kqueue_fo_find(int filt) 1070 { 1071 1072 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1073 return NULL; 1074 1075 if (sysfilt_ops[~filt].for_nolock) 1076 return sysfilt_ops[~filt].for_fop; 1077 1078 mtx_lock(&filterops_lock); 1079 sysfilt_ops[~filt].for_refcnt++; 1080 if (sysfilt_ops[~filt].for_fop == NULL) 1081 sysfilt_ops[~filt].for_fop = &null_filtops; 1082 mtx_unlock(&filterops_lock); 1083 1084 return sysfilt_ops[~filt].for_fop; 1085 } 1086 1087 static void 1088 kqueue_fo_release(int filt) 1089 { 1090 1091 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) 1092 return; 1093 1094 if (sysfilt_ops[~filt].for_nolock) 1095 return; 1096 1097 mtx_lock(&filterops_lock); 1098 KASSERT(sysfilt_ops[~filt].for_refcnt > 0, 1099 ("filter object refcount not valid on release")); 1100 sysfilt_ops[~filt].for_refcnt--; 1101 mtx_unlock(&filterops_lock); 1102 } 1103 1104 /* 1105 * A ref to kq (obtained via kqueue_acquire) must be held. waitok will 1106 * influence if memory allocation should wait. Make sure it is 0 if you 1107 * hold any mutexes. 1108 */ 1109 static int 1110 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) 1111 { 1112 struct filterops *fops; 1113 struct file *fp; 1114 struct knote *kn, *tkn; 1115 cap_rights_t rights; 1116 int error, filt, event; 1117 int haskqglobal, filedesc_unlock; 1118 1119 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) 1120 return (EINVAL); 1121 1122 fp = NULL; 1123 kn = NULL; 1124 error = 0; 1125 haskqglobal = 0; 1126 filedesc_unlock = 0; 1127 1128 filt = kev->filter; 1129 fops = kqueue_fo_find(filt); 1130 if (fops == NULL) 1131 return EINVAL; 1132 1133 if (kev->flags & EV_ADD) { 1134 /* 1135 * Prevent waiting with locks. Non-sleepable 1136 * allocation failures are handled in the loop, only 1137 * if the spare knote appears to be actually required. 
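		 * (If the pre-allocated spare knote ends up unused, it is
		 * released by the knote_free(tkn) call at the done: label.)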
1138 */ 1139 tkn = knote_alloc(waitok); 1140 } else { 1141 tkn = NULL; 1142 } 1143 1144 findkn: 1145 if (fops->f_isfd) { 1146 KASSERT(td != NULL, ("td is NULL")); 1147 error = fget(td, kev->ident, 1148 cap_rights_init(&rights, CAP_EVENT), &fp); 1149 if (error) 1150 goto done; 1151 1152 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, 1153 kev->ident, 0) != 0) { 1154 /* try again */ 1155 fdrop(fp, td); 1156 fp = NULL; 1157 error = kqueue_expand(kq, fops, kev->ident, waitok); 1158 if (error) 1159 goto done; 1160 goto findkn; 1161 } 1162 1163 if (fp->f_type == DTYPE_KQUEUE) { 1164 /* 1165 * If we add some intelligence about what we are doing, 1166 * we should be able to support events on ourselves. 1167 * We need to know when we are doing this to prevent 1168 * getting both the knlist lock and the kq lock since 1169 * they are the same thing. 1170 */ 1171 if (fp->f_data == kq) { 1172 error = EINVAL; 1173 goto done; 1174 } 1175 1176 /* 1177 * Pre-lock the filedesc before the global 1178 * lock mutex, see the comment in 1179 * kqueue_close(). 1180 */ 1181 FILEDESC_XLOCK(td->td_proc->p_fd); 1182 filedesc_unlock = 1; 1183 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1184 } 1185 1186 KQ_LOCK(kq); 1187 if (kev->ident < kq->kq_knlistsize) { 1188 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) 1189 if (kev->filter == kn->kn_filter) 1190 break; 1191 } 1192 } else { 1193 if ((kev->flags & EV_ADD) == EV_ADD) 1194 kqueue_expand(kq, fops, kev->ident, waitok); 1195 1196 KQ_LOCK(kq); 1197 1198 /* 1199 * If possible, find an existing knote to use for this kevent. 1200 */ 1201 if (kev->filter == EVFILT_PROC && 1202 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { 1203 /* This is an internal creation of a process tracking 1204 * note. Don't attempt to coalesce this with an 1205 * existing note. 1206 */ 1207 ; 1208 } else if (kq->kq_knhashmask != 0) { 1209 struct klist *list; 1210 1211 list = &kq->kq_knhash[ 1212 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; 1213 SLIST_FOREACH(kn, list, kn_link) 1214 if (kev->ident == kn->kn_id && 1215 kev->filter == kn->kn_filter) 1216 break; 1217 } 1218 } 1219 1220 /* knote is in the process of changing, wait for it to stabilize. */ 1221 if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1222 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1223 if (filedesc_unlock) { 1224 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1225 filedesc_unlock = 0; 1226 } 1227 kq->kq_state |= KQ_FLUXWAIT; 1228 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); 1229 if (fp != NULL) { 1230 fdrop(fp, td); 1231 fp = NULL; 1232 } 1233 goto findkn; 1234 } 1235 1236 /* 1237 * kn now contains the matching knote, or NULL if no match 1238 */ 1239 if (kn == NULL) { 1240 if (kev->flags & EV_ADD) { 1241 kn = tkn; 1242 tkn = NULL; 1243 if (kn == NULL) { 1244 KQ_UNLOCK(kq); 1245 error = ENOMEM; 1246 goto done; 1247 } 1248 kn->kn_fp = fp; 1249 kn->kn_kq = kq; 1250 kn->kn_fop = fops; 1251 /* 1252 * apply reference counts to knote structure, and 1253 * do not release it at the end of this routine. 
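			 * (Clearing the local fops and fp pointers below
			 * hands their references over to the knote; they are
			 * released later by knote_drop().)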
1254 */ 1255 fops = NULL; 1256 fp = NULL; 1257 1258 kn->kn_sfflags = kev->fflags; 1259 kn->kn_sdata = kev->data; 1260 kev->fflags = 0; 1261 kev->data = 0; 1262 kn->kn_kevent = *kev; 1263 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | 1264 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); 1265 kn->kn_status = KN_INFLUX|KN_DETACHED; 1266 1267 error = knote_attach(kn, kq); 1268 KQ_UNLOCK(kq); 1269 if (error != 0) { 1270 tkn = kn; 1271 goto done; 1272 } 1273 1274 if ((error = kn->kn_fop->f_attach(kn)) != 0) { 1275 knote_drop(kn, td); 1276 goto done; 1277 } 1278 KN_LIST_LOCK(kn); 1279 goto done_ev_add; 1280 } else { 1281 /* No matching knote and the EV_ADD flag is not set. */ 1282 KQ_UNLOCK(kq); 1283 error = ENOENT; 1284 goto done; 1285 } 1286 } 1287 1288 if (kev->flags & EV_DELETE) { 1289 kn->kn_status |= KN_INFLUX; 1290 KQ_UNLOCK(kq); 1291 if (!(kn->kn_status & KN_DETACHED)) 1292 kn->kn_fop->f_detach(kn); 1293 knote_drop(kn, td); 1294 goto done; 1295 } 1296 1297 if (kev->flags & EV_FORCEONESHOT) { 1298 kn->kn_flags |= EV_ONESHOT; 1299 KNOTE_ACTIVATE(kn, 1); 1300 } 1301 1302 /* 1303 * The user may change some filter values after the initial EV_ADD, 1304 * but doing so will not reset any filter which has already been 1305 * triggered. 1306 */ 1307 kn->kn_status |= KN_INFLUX | KN_SCAN; 1308 KQ_UNLOCK(kq); 1309 KN_LIST_LOCK(kn); 1310 kn->kn_kevent.udata = kev->udata; 1311 if (!fops->f_isfd && fops->f_touch != NULL) { 1312 fops->f_touch(kn, kev, EVENT_REGISTER); 1313 } else { 1314 kn->kn_sfflags = kev->fflags; 1315 kn->kn_sdata = kev->data; 1316 } 1317 1318 /* 1319 * We can get here with kn->kn_knlist == NULL. This can happen when 1320 * the initial attach event decides that the event is "completed" 1321 * already. i.e. filt_procattach is called on a zombie process. It 1322 * will call filt_proc which will remove it from the list, and NULL 1323 * kn_knlist. 
1324 */ 1325 done_ev_add: 1326 if ((kev->flags & EV_ENABLE) != 0) 1327 kn->kn_status &= ~KN_DISABLED; 1328 else if ((kev->flags & EV_DISABLE) != 0) 1329 kn->kn_status |= KN_DISABLED; 1330 1331 if ((kn->kn_status & KN_DISABLED) == 0) 1332 event = kn->kn_fop->f_event(kn, 0); 1333 else 1334 event = 0; 1335 1336 KQ_LOCK(kq); 1337 if (event) 1338 kn->kn_status |= KN_ACTIVE; 1339 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == 1340 KN_ACTIVE) 1341 knote_enqueue(kn); 1342 kn->kn_status &= ~(KN_INFLUX | KN_SCAN); 1343 KN_LIST_UNLOCK(kn); 1344 KQ_UNLOCK_FLUX(kq); 1345 1346 done: 1347 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1348 if (filedesc_unlock) 1349 FILEDESC_XUNLOCK(td->td_proc->p_fd); 1350 if (fp != NULL) 1351 fdrop(fp, td); 1352 knote_free(tkn); 1353 if (fops != NULL) 1354 kqueue_fo_release(filt); 1355 return (error); 1356 } 1357 1358 static int 1359 kqueue_acquire(struct file *fp, struct kqueue **kqp) 1360 { 1361 int error; 1362 struct kqueue *kq; 1363 1364 error = 0; 1365 1366 kq = fp->f_data; 1367 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) 1368 return (EBADF); 1369 *kqp = kq; 1370 KQ_LOCK(kq); 1371 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { 1372 KQ_UNLOCK(kq); 1373 return (EBADF); 1374 } 1375 kq->kq_refcnt++; 1376 KQ_UNLOCK(kq); 1377 1378 return error; 1379 } 1380 1381 static void 1382 kqueue_release(struct kqueue *kq, int locked) 1383 { 1384 if (locked) 1385 KQ_OWNED(kq); 1386 else 1387 KQ_LOCK(kq); 1388 kq->kq_refcnt--; 1389 if (kq->kq_refcnt == 1) 1390 wakeup(&kq->kq_refcnt); 1391 if (!locked) 1392 KQ_UNLOCK(kq); 1393 } 1394 1395 static void 1396 kqueue_schedtask(struct kqueue *kq) 1397 { 1398 1399 KQ_OWNED(kq); 1400 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), 1401 ("scheduling kqueue task while draining")); 1402 1403 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { 1404 taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task); 1405 kq->kq_state |= KQ_TASKSCHED; 1406 } 1407 } 1408 1409 /* 1410 * Expand the kq to make sure we have storage for fops/ident pair. 1411 * 1412 * Return 0 on success (or no work necessary), return errno on failure. 1413 * 1414 * Not calling hashinit w/ waitok (proper malloc flag) should be safe. 1415 * If kqueue_register is called from a non-fd context, there usually/should 1416 * be no locks held. 1417 */ 1418 static int 1419 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, 1420 int waitok) 1421 { 1422 struct klist *list, *tmp_knhash, *to_free; 1423 u_long tmp_knhashmask; 1424 int size; 1425 int fd; 1426 int mflag = waitok ? 
M_WAITOK : M_NOWAIT; 1427 1428 KQ_NOTOWNED(kq); 1429 1430 to_free = NULL; 1431 if (fops->f_isfd) { 1432 fd = ident; 1433 if (kq->kq_knlistsize <= fd) { 1434 size = kq->kq_knlistsize; 1435 while (size <= fd) 1436 size += KQEXTENT; 1437 list = malloc(size * sizeof(*list), M_KQUEUE, mflag); 1438 if (list == NULL) 1439 return ENOMEM; 1440 KQ_LOCK(kq); 1441 if (kq->kq_knlistsize > fd) { 1442 to_free = list; 1443 list = NULL; 1444 } else { 1445 if (kq->kq_knlist != NULL) { 1446 bcopy(kq->kq_knlist, list, 1447 kq->kq_knlistsize * sizeof(*list)); 1448 to_free = kq->kq_knlist; 1449 kq->kq_knlist = NULL; 1450 } 1451 bzero((caddr_t)list + 1452 kq->kq_knlistsize * sizeof(*list), 1453 (size - kq->kq_knlistsize) * sizeof(*list)); 1454 kq->kq_knlistsize = size; 1455 kq->kq_knlist = list; 1456 } 1457 KQ_UNLOCK(kq); 1458 } 1459 } else { 1460 if (kq->kq_knhashmask == 0) { 1461 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, 1462 &tmp_knhashmask); 1463 if (tmp_knhash == NULL) 1464 return ENOMEM; 1465 KQ_LOCK(kq); 1466 if (kq->kq_knhashmask == 0) { 1467 kq->kq_knhash = tmp_knhash; 1468 kq->kq_knhashmask = tmp_knhashmask; 1469 } else { 1470 to_free = tmp_knhash; 1471 } 1472 KQ_UNLOCK(kq); 1473 } 1474 } 1475 free(to_free, M_KQUEUE); 1476 1477 KQ_NOTOWNED(kq); 1478 return 0; 1479 } 1480 1481 static void 1482 kqueue_task(void *arg, int pending) 1483 { 1484 struct kqueue *kq; 1485 int haskqglobal; 1486 1487 haskqglobal = 0; 1488 kq = arg; 1489 1490 KQ_GLOBAL_LOCK(&kq_global, haskqglobal); 1491 KQ_LOCK(kq); 1492 1493 KNOTE_LOCKED(&kq->kq_sel.si_note, 0); 1494 1495 kq->kq_state &= ~KQ_TASKSCHED; 1496 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { 1497 wakeup(&kq->kq_state); 1498 } 1499 KQ_UNLOCK(kq); 1500 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); 1501 } 1502 1503 /* 1504 * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 1505 * We treat KN_MARKER knotes as if they are INFLUX. 1506 */ 1507 static int 1508 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, 1509 const struct timespec *tsp, struct kevent *keva, struct thread *td) 1510 { 1511 struct kevent *kevp; 1512 struct knote *kn, *marker; 1513 sbintime_t asbt, rsbt; 1514 int count, error, haskqglobal, influx, nkev, touch; 1515 1516 count = maxevents; 1517 nkev = 0; 1518 error = 0; 1519 haskqglobal = 0; 1520 1521 if (maxevents == 0) 1522 goto done_nl; 1523 1524 rsbt = 0; 1525 if (tsp != NULL) { 1526 if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || 1527 tsp->tv_nsec >= 1000000000) { 1528 error = EINVAL; 1529 goto done_nl; 1530 } 1531 if (timespecisset(tsp)) { 1532 if (tsp->tv_sec <= INT32_MAX) { 1533 rsbt = tstosbt(*tsp); 1534 if (TIMESEL(&asbt, rsbt)) 1535 asbt += tc_tick_sbt; 1536 if (asbt <= SBT_MAX - rsbt) 1537 asbt += rsbt; 1538 else 1539 asbt = 0; 1540 rsbt >>= tc_precexp; 1541 } else 1542 asbt = 0; 1543 } else 1544 asbt = -1; 1545 } else 1546 asbt = 0; 1547 marker = knote_alloc(1); 1548 marker->kn_status = KN_MARKER; 1549 KQ_LOCK(kq); 1550 1551 retry: 1552 kevp = keva; 1553 if (kq->kq_count == 0) { 1554 if (asbt == -1) { 1555 error = EWOULDBLOCK; 1556 } else { 1557 kq->kq_state |= KQ_SLEEP; 1558 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, 1559 "kqread", asbt, rsbt, C_ABSOLUTE); 1560 } 1561 if (error == 0) 1562 goto retry; 1563 /* don't restart after signals... 
 */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	influx = 0;
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			if (influx) {
				influx = 0;
				KQ_FLUX_WAKEUP(kq);
			}
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK,
			    "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not supposed to be"));

		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			continue;
		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			kn->kn_status |= KN_INFLUX | KN_SCAN;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				KQ_LOCK(kq);
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
				    KN_SCAN);
				kq->kq_count--;
				KN_LIST_UNLOCK(kn);
				influx = 1;
				continue;
			}
			touch = (!kn->kn_fop->f_isfd &&
			    kn->kn_fop->f_touch != NULL);
			if (touch)
				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
				/*
				 * Manually clear knotes that weren't
				 * 'touch'ed.
				 */
				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
					kn->kn_data = 0;
					kn->kn_fflags = 0;
				}
				if (kn->kn_flags & EV_DISPATCH)
					kn->kn_status |= KN_DISABLED;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);

			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
			KN_LIST_UNLOCK(kn);
			influx = 1;
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		if (nkev == KQ_NEVENTS) {
			influx = 0;
			KQ_UNLOCK_FLUX(kq);
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	if (nkev != 0)
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}

/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{
	/*
	 * Enabling sigio causes two major problems:
	 * 1) infinite recursion:
	 * Synopsis: kevent is being used to track signals and has FIOASYNC
	 * set. On receipt of a signal this will cause a kqueue to recurse
	 * into itself over and over. Sending the sigio causes the kqueue
	 * to become ready, which in turn posts sigio again, forever.
	 * Solution: this can be solved by setting a flag in the kqueue that
	 * we have a SIGIO in progress.
	 * 2) locking problems:
	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
	 * us above the proc and pgrp locks.
	 * Solution: Post a signal using an async mechanism, being sure to
	 * record a generation count in the delivery so that we do not
	 * deliver a signal to the wrong process.
	 *
	 * Note, these two mechanisms are somewhat mutually exclusive!
	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	return (ENOTTY);
}

/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_acquire(fp, &kq)))
		return POLLERR;

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(td, &kq->kq_sel);
			if (SEL_WAITING(&kq->kq_sel))
				kq->kq_state |= KQ_SEL;
		}
	}
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}

/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	bzero((void *)st, sizeof *st);
	/*
	 * We no longer return kq_count because the unlocked value is useless.
	 * If you spent all this time getting the count, why not spend your
	 * syscall better by calling kevent?
1794 * 1795 * XXX - This is needed for libc_r. 1796 */ 1797 st->st_mode = S_IFIFO; 1798 return (0); 1799 } 1800 1801 static void 1802 kqueue_drain(struct kqueue *kq, struct thread *td) 1803 { 1804 struct knote *kn; 1805 int i; 1806 1807 KQ_LOCK(kq); 1808 1809 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, 1810 ("kqueue already closing")); 1811 kq->kq_state |= KQ_CLOSING; 1812 if (kq->kq_refcnt > 1) 1813 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); 1814 1815 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); 1816 1817 KASSERT(knlist_empty(&kq->kq_sel.si_note), 1818 ("kqueue's knlist not empty")); 1819 1820 for (i = 0; i < kq->kq_knlistsize; i++) { 1821 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { 1822 if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1823 kq->kq_state |= KQ_FLUXWAIT; 1824 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); 1825 continue; 1826 } 1827 kn->kn_status |= KN_INFLUX; 1828 KQ_UNLOCK(kq); 1829 if (!(kn->kn_status & KN_DETACHED)) 1830 kn->kn_fop->f_detach(kn); 1831 knote_drop(kn, td); 1832 KQ_LOCK(kq); 1833 } 1834 } 1835 if (kq->kq_knhashmask != 0) { 1836 for (i = 0; i <= kq->kq_knhashmask; i++) { 1837 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { 1838 if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) { 1839 kq->kq_state |= KQ_FLUXWAIT; 1840 msleep(kq, &kq->kq_lock, PSOCK, 1841 "kqclo2", 0); 1842 continue; 1843 } 1844 kn->kn_status |= KN_INFLUX; 1845 KQ_UNLOCK(kq); 1846 if (!(kn->kn_status & KN_DETACHED)) 1847 kn->kn_fop->f_detach(kn); 1848 knote_drop(kn, td); 1849 KQ_LOCK(kq); 1850 } 1851 } 1852 } 1853 1854 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { 1855 kq->kq_state |= KQ_TASKDRAIN; 1856 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); 1857 } 1858 1859 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 1860 selwakeuppri(&kq->kq_sel, PSOCK); 1861 if (!SEL_WAITING(&kq->kq_sel)) 1862 kq->kq_state &= ~KQ_SEL; 1863 } 1864 1865 KQ_UNLOCK(kq); 1866 } 1867 1868 static void 1869 kqueue_destroy(struct kqueue *kq) 1870 { 1871 1872 KASSERT(kq->kq_fdp == NULL, 1873 ("kqueue still attached to a file descriptor")); 1874 seldrain(&kq->kq_sel); 1875 knlist_destroy(&kq->kq_sel.si_note); 1876 mtx_destroy(&kq->kq_lock); 1877 1878 if (kq->kq_knhash != NULL) 1879 free(kq->kq_knhash, M_KQUEUE); 1880 if (kq->kq_knlist != NULL) 1881 free(kq->kq_knlist, M_KQUEUE); 1882 1883 funsetown(&kq->kq_sigio); 1884 } 1885 1886 /*ARGSUSED*/ 1887 static int 1888 kqueue_close(struct file *fp, struct thread *td) 1889 { 1890 struct kqueue *kq = fp->f_data; 1891 struct filedesc *fdp; 1892 int error; 1893 int filedesc_unlock; 1894 1895 if ((error = kqueue_acquire(fp, &kq))) 1896 return error; 1897 kqueue_drain(kq, td); 1898 1899 /* 1900 * We could be called due to the knote_drop() doing fdrop(), 1901 * called from kqueue_register(). In this case the global 1902 * lock is owned, and filedesc sx is locked before, to not 1903 * take the sleepable lock after non-sleepable. 
1904 */ 1905 fdp = kq->kq_fdp; 1906 kq->kq_fdp = NULL; 1907 if (!sx_xlocked(FILEDESC_LOCK(fdp))) { 1908 FILEDESC_XLOCK(fdp); 1909 filedesc_unlock = 1; 1910 } else 1911 filedesc_unlock = 0; 1912 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); 1913 if (filedesc_unlock) 1914 FILEDESC_XUNLOCK(fdp); 1915 1916 kqueue_destroy(kq); 1917 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); 1918 crfree(kq->kq_cred); 1919 free(kq, M_KQUEUE); 1920 fp->f_data = NULL; 1921 1922 return (0); 1923 } 1924 1925 static int 1926 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1927 { 1928 1929 kif->kf_type = KF_TYPE_KQUEUE; 1930 return (0); 1931 } 1932 1933 static void 1934 kqueue_wakeup(struct kqueue *kq) 1935 { 1936 KQ_OWNED(kq); 1937 1938 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { 1939 kq->kq_state &= ~KQ_SLEEP; 1940 wakeup(kq); 1941 } 1942 if ((kq->kq_state & KQ_SEL) == KQ_SEL) { 1943 selwakeuppri(&kq->kq_sel, PSOCK); 1944 if (!SEL_WAITING(&kq->kq_sel)) 1945 kq->kq_state &= ~KQ_SEL; 1946 } 1947 if (!knlist_empty(&kq->kq_sel.si_note)) 1948 kqueue_schedtask(kq); 1949 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { 1950 pgsigio(&kq->kq_sigio, SIGIO, 0); 1951 } 1952 } 1953 1954 /* 1955 * Walk down a list of knotes, activating them if their event has triggered. 1956 * 1957 * There is a possibility to optimize in the case of one kq watching another. 1958 * Instead of scheduling a task to wake it up, you could pass enough state 1959 * down the chain to make up the parent kqueue. Make this code functional 1960 * first. 1961 */ 1962 void 1963 knote(struct knlist *list, long hint, int lockflags) 1964 { 1965 struct kqueue *kq; 1966 struct knote *kn, *tkn; 1967 int error; 1968 1969 if (list == NULL) 1970 return; 1971 1972 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); 1973 1974 if ((lockflags & KNF_LISTLOCKED) == 0) 1975 list->kl_lock(list->kl_lockarg); 1976 1977 /* 1978 * If we unlock the list lock (and set KN_INFLUX), we can 1979 * eliminate the kqueue scheduling, but this will introduce 1980 * four lock/unlock's for each knote to test. Also, marker 1981 * would be needed to keep iteration position, since filters 1982 * or other threads could remove events. 1983 */ 1984 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { 1985 kq = kn->kn_kq; 1986 KQ_LOCK(kq); 1987 if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) { 1988 /* 1989 * Do not process the influx notes, except for 1990 * the influx coming from the kq unlock in the 1991 * kqueue_scan(). In the later case, we do 1992 * not interfere with the scan, since the code 1993 * fragment in kqueue_scan() locks the knlist, 1994 * and cannot proceed until we finished. 
1995 */ 1996 KQ_UNLOCK(kq); 1997 } else if ((lockflags & KNF_NOKQLOCK) != 0) { 1998 kn->kn_status |= KN_INFLUX; 1999 KQ_UNLOCK(kq); 2000 error = kn->kn_fop->f_event(kn, hint); 2001 KQ_LOCK(kq); 2002 kn->kn_status &= ~KN_INFLUX; 2003 if (error) 2004 KNOTE_ACTIVATE(kn, 1); 2005 KQ_UNLOCK_FLUX(kq); 2006 } else { 2007 kn->kn_status |= KN_HASKQLOCK; 2008 if (kn->kn_fop->f_event(kn, hint)) 2009 KNOTE_ACTIVATE(kn, 1); 2010 kn->kn_status &= ~KN_HASKQLOCK; 2011 KQ_UNLOCK(kq); 2012 } 2013 } 2014 if ((lockflags & KNF_LISTLOCKED) == 0) 2015 list->kl_unlock(list->kl_lockarg); 2016 } 2017 2018 /* 2019 * add a knote to a knlist 2020 */ 2021 void 2022 knlist_add(struct knlist *knl, struct knote *kn, int islocked) 2023 { 2024 KNL_ASSERT_LOCK(knl, islocked); 2025 KQ_NOTOWNED(kn->kn_kq); 2026 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == 2027 (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED")); 2028 if (!islocked) 2029 knl->kl_lock(knl->kl_lockarg); 2030 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); 2031 if (!islocked) 2032 knl->kl_unlock(knl->kl_lockarg); 2033 KQ_LOCK(kn->kn_kq); 2034 kn->kn_knlist = knl; 2035 kn->kn_status &= ~KN_DETACHED; 2036 KQ_UNLOCK(kn->kn_kq); 2037 } 2038 2039 static void 2040 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) 2041 { 2042 KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked")); 2043 KNL_ASSERT_LOCK(knl, knlislocked); 2044 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); 2045 if (!kqislocked) 2046 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX, 2047 ("knlist_remove called w/o knote being KN_INFLUX or already removed")); 2048 if (!knlislocked) 2049 knl->kl_lock(knl->kl_lockarg); 2050 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); 2051 kn->kn_knlist = NULL; 2052 if (!knlislocked) 2053 knl->kl_unlock(knl->kl_lockarg); 2054 if (!kqislocked) 2055 KQ_LOCK(kn->kn_kq); 2056 kn->kn_status |= KN_DETACHED; 2057 if (!kqislocked) 2058 KQ_UNLOCK(kn->kn_kq); 2059 } 2060 2061 /* 2062 * remove knote from the specified knlist 2063 */ 2064 void 2065 knlist_remove(struct knlist *knl, struct knote *kn, int islocked) 2066 { 2067 2068 knlist_remove_kq(knl, kn, islocked, 0); 2069 } 2070 2071 /* 2072 * remove knote from the specified knlist while in f_event handler. 
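 * (filt_proc(), for example, uses this to detach from the exiting process
 * when it sees NOTE_EXIT.)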
2073 */ 2074 void 2075 knlist_remove_inevent(struct knlist *knl, struct knote *kn) 2076 { 2077 2078 knlist_remove_kq(knl, kn, 1, 2079 (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK); 2080 } 2081 2082 int 2083 knlist_empty(struct knlist *knl) 2084 { 2085 2086 KNL_ASSERT_LOCKED(knl); 2087 return SLIST_EMPTY(&knl->kl_list); 2088 } 2089 2090 static struct mtx knlist_lock; 2091 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", 2092 MTX_DEF); 2093 static void knlist_mtx_lock(void *arg); 2094 static void knlist_mtx_unlock(void *arg); 2095 2096 static void 2097 knlist_mtx_lock(void *arg) 2098 { 2099 2100 mtx_lock((struct mtx *)arg); 2101 } 2102 2103 static void 2104 knlist_mtx_unlock(void *arg) 2105 { 2106 2107 mtx_unlock((struct mtx *)arg); 2108 } 2109 2110 static void 2111 knlist_mtx_assert_locked(void *arg) 2112 { 2113 2114 mtx_assert((struct mtx *)arg, MA_OWNED); 2115 } 2116 2117 static void 2118 knlist_mtx_assert_unlocked(void *arg) 2119 { 2120 2121 mtx_assert((struct mtx *)arg, MA_NOTOWNED); 2122 } 2123 2124 static void 2125 knlist_rw_rlock(void *arg) 2126 { 2127 2128 rw_rlock((struct rwlock *)arg); 2129 } 2130 2131 static void 2132 knlist_rw_runlock(void *arg) 2133 { 2134 2135 rw_runlock((struct rwlock *)arg); 2136 } 2137 2138 static void 2139 knlist_rw_assert_locked(void *arg) 2140 { 2141 2142 rw_assert((struct rwlock *)arg, RA_LOCKED); 2143 } 2144 2145 static void 2146 knlist_rw_assert_unlocked(void *arg) 2147 { 2148 2149 rw_assert((struct rwlock *)arg, RA_UNLOCKED); 2150 } 2151 2152 void 2153 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), 2154 void (*kl_unlock)(void *), 2155 void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) 2156 { 2157 2158 if (lock == NULL) 2159 knl->kl_lockarg = &knlist_lock; 2160 else 2161 knl->kl_lockarg = lock; 2162 2163 if (kl_lock == NULL) 2164 knl->kl_lock = knlist_mtx_lock; 2165 else 2166 knl->kl_lock = kl_lock; 2167 if (kl_unlock == NULL) 2168 knl->kl_unlock = knlist_mtx_unlock; 2169 else 2170 knl->kl_unlock = kl_unlock; 2171 if (kl_assert_locked == NULL) 2172 knl->kl_assert_locked = knlist_mtx_assert_locked; 2173 else 2174 knl->kl_assert_locked = kl_assert_locked; 2175 if (kl_assert_unlocked == NULL) 2176 knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; 2177 else 2178 knl->kl_assert_unlocked = kl_assert_unlocked; 2179 2180 SLIST_INIT(&knl->kl_list); 2181 } 2182 2183 void 2184 knlist_init_mtx(struct knlist *knl, struct mtx *lock) 2185 { 2186 2187 knlist_init(knl, lock, NULL, NULL, NULL, NULL); 2188 } 2189 2190 void 2191 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) 2192 { 2193 2194 knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, 2195 knlist_rw_assert_locked, knlist_rw_assert_unlocked); 2196 } 2197 2198 void 2199 knlist_destroy(struct knlist *knl) 2200 { 2201 2202 #ifdef INVARIANTS 2203 /* 2204 * if we run across this error, we need to find the offending 2205 * driver and have it call knlist_clear or knlist_delete. 2206 */ 2207 if (!SLIST_EMPTY(&knl->kl_list)) 2208 printf("WARNING: destroying knlist w/ knotes on it!\n"); 2209 #endif 2210 2211 knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL; 2212 SLIST_INIT(&knl->kl_list); 2213 } 2214 2215 /* 2216 * Even if we are locked, we may need to drop the lock to allow any influx 2217 * knotes time to "settle". 
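 * Most callers reach this through the knlist_clear() and knlist_delete()
 * wrappers (macros in <sys/event.h>, assuming the usual definitions), which
 * pass killkn as 0 and 1 respectively.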

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".  If killkn is set the knotes are dropped
 * outright; otherwise they are flagged EV_EOF | EV_ONESHOT so that they are
 * discarded soon.
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & KN_INFLUX)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn->kn_status |= KN_INFLUX | KN_DETACHED;
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still KN_INFLUX remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn->kn_status & KN_INFLUX,
		    ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}

/*
 * Remove all knotes referencing a specified fd.  This must be called with
 * the FILEDESC lock held; that prevents a race where a new fd comes along,
 * occupies the entry and we attach a knote to the stale fd.
 */
void
knote_fdclose(struct thread *td, int fd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	int influx;

	FILEDESC_XLOCK_ASSERT(fdp);

	/*
	 * We shouldn't have to worry about new kevents appearing on fd
	 * since filedesc is locked.
	 */
	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
		KQ_LOCK(kq);

again:
		influx = 0;
		while (kq->kq_knlistsize > fd &&
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
			if (kn->kn_status & KN_INFLUX) {
				/* someone else might be waiting on our knote */
				if (influx)
					wakeup(kq);
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
				goto again;
			}
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			influx = 1;
			KQ_LOCK(kq);
		}
		KQ_UNLOCK_FLUX(kq);
	}
}
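
/*
 * Example (userland illustration, not part of this file): knote_fdclose()
 * above is what gives close(2) its kqueue semantics -- closing a descriptor
 * removes any knotes that reference it from every kqueue, so the
 * registration is simply gone afterwards.  A rough sketch, error checking
 * omitted; "kq", "fds", "kev" and "ts" are local names for illustration.
 *
 *	int kq = kqueue();
 *	int fds[2];
 *	struct kevent kev;
 *	struct timespec ts = { 0, 0 };
 *
 *	pipe(fds);
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	close(fds[0]);
 *	kevent(kq, NULL, 0, &kev, 1, &ts);	returns 0: no stale event
 */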

static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
	KQ_OWNED(kq);

	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return ENOMEM;
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return ENOMEM;
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}

	SLIST_INSERT_HEAD(list, kn, kn_link);

	return 0;
}

/*
 * The knote must already have been detached using the f_detach method.
 * No lock need be held; it is assumed that the KN_INFLUX flag is set to
 * prevent other threads from removing the knote.
 */
static void
knote_drop(struct knote *kn, struct thread *td)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	KQ_NOTOWNED(kq);
	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
	    ("knote_drop called without KN_INFLUX set in kn_status"));

	KQ_LOCK(kq);
	if (kn->kn_fop->f_isfd)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	if (!SLIST_EMPTY(list))
		SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	KQ_UNLOCK_FLUX(kq);

	if (kn->kn_fop->f_isfd) {
		fdrop(kn->kn_fp, td);
		kn->kn_fp = NULL;
	}
	kqueue_fo_release(kn->kn_kevent.filter);
	kn->kn_fop = NULL;
	knote_free(kn);
}

static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_wakeup(kq);
}

static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);

static struct knote *
knote_alloc(int waitok)
{

	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
	    M_ZERO));
}

static void
knote_free(struct knote *kn)
{

	uma_zfree(knote_zone, kn);
}

/*
 * Register the kevent with the kqueue specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
	struct kqueue *kq;
	struct file *fp;
	cap_rights_t rights;
	int error;

	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
	if (error != 0)
		return (error);
	if ((error = kqueue_acquire(fp, &kq)) != 0)
		goto noacquire;

	error = kqueue_register(kq, kev, td, waitok);

	kqueue_release(kq, 0);

noacquire:
	fdrop(fp, td);

	return error;
}
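
/*
 * Example (illustrative sketch, not part of this file): in-kernel code that
 * holds a kqueue file descriptor can register an event with it through
 * kqfd_register() much like userland uses kevent(2).  "kqfd" and "MY_IDENT"
 * are hypothetical; a one-shot EVFILT_USER registration might look like:
 *
 *	struct kevent kev;
 *	int error;
 *
 *	EV_SET(&kev, MY_IDENT, EVFILT_USER, EV_ADD | EV_ONESHOT,
 *	    0, 0, NULL);
 *	error = kqfd_register(kqfd, &kev, curthread, 1);
 *
 * Passing waitok == 1 allows the registration to sleep for memory; callers
 * in contexts that may not sleep would pass 0 instead.
 */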