/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/uma.h>

static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

/*
 * This lock is used if multiple kq locks are required.  This possibly
 * should be made into a per proc lock.
 */
static struct mtx	kq_global;
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
	if (!haslck)				\
		mtx_lock(lck);			\
	haslck = 1;				\
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
	if (haslck)				\
		mtx_unlock(lck);		\
	haslck = 0;				\
} while (0)

TASKQUEUE_DEFINE_THREAD(kqueue);

static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
		    struct thread *td, int waitok);
static int	kqueue_aquire(struct file *fp, struct kqueue **kqp);
static void	kqueue_release(struct kqueue *kq, int locked);
static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
		    uintptr_t ident, int waitok);
static void	kqueue_task(void *arg, int pending);
static int	kqueue_scan(struct kqueue *kq, int maxevents,
		    struct kevent_copyops *k_ops,
		    const struct timespec *timeout,
		    struct kevent *keva, struct thread *td);
static void	kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void	kqueue_fo_release(int filt);

static fo_rdwr_t	kqueue_read;
static fo_rdwr_t	kqueue_write;
static fo_ioctl_t	kqueue_ioctl;
static fo_poll_t	kqueue_poll;
static fo_kqfilter_t	kqueue_kqfilter;
static fo_stat_t	kqueue_stat;
static fo_close_t	kqueue_close;

static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_poll = kqueue_poll,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
};

static int	knote_attach(struct knote *kn, struct kqueue *kq);
static void	knote_drop(struct knote *kn, struct thread *td);
static void	knote_enqueue(struct knote *kn);
static void	knote_dequeue(struct knote *kn);
static void	knote_init(void);
static struct knote *knote_alloc(int waitok);
static void	knote_free(struct knote *kn);

static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);

static struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
/* XXX - move to kern_proc.c? */
static struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
	{ 0, filt_timerattach, filt_timerdetach, filt_timer };

static uma_zone_t	knote_zone;
static int		kq_ncallouts = 0;
static int		kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");

/* XXX - ensure not KN_INFLUX?? */
#define KNOTE_ACTIVATE(kn, islock) do {					\
	if ((islock))							\
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
	else								\
		KQ_LOCK((kn)->kn_kq);					\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue((kn));					\
	if (!(islock))							\
		KQ_UNLOCK((kn)->kn_kq);					\
} while(0)
#define KQ_LOCK(kq) do {						\
	mtx_lock(&(kq)->kq_lock);					\
} while (0)
#define KQ_FLUX_WAKEUP(kq) do {						\
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
		wakeup((kq));						\
	}								\
} while (0)
#define KQ_UNLOCK_FLUX(kq) do {						\
	KQ_FLUX_WAKEUP(kq);						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_UNLOCK(kq) do {						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
#define KQ_OWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
} while (0)
#define KQ_NOTOWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
} while (0)
#define KN_LIST_LOCK(kn) do {						\
	if (kn->kn_knlist != NULL)					\
		kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg);	\
} while (0)
#define KN_LIST_UNLOCK(kn) do {						\
	if (kn->kn_knlist != NULL)					\
		kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg);	\
} while (0)
#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
	if (islocked)							\
		KNL_ASSERT_LOCKED(knl);					\
	else								\
		KNL_ASSERT_UNLOCKED(knl);				\
} while (0)
#ifdef INVARIANTS
#define	KNL_ASSERT_LOCKED(knl) do {					\
	if (!knl->kl_locked((knl)->kl_lockarg))				\
		panic("knlist not locked, but should be");		\
} while (0)
#define	KNL_ASSERT_UNLOCKED(knl) do {					\
	if (knl->kl_locked((knl)->kl_lockarg))				\
		panic("knlist locked, but should not be");		\
} while (0)
#else /* !INVARIANTS */
#define	KNL_ASSERT_LOCKED(knl) do {} while(0)
#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
};

struct filterops null_filtops =
	{ 0, filt_nullattach, NULL, NULL };

/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;

/*
 * Table for all system-defined filters.
 */
static struct mtx	filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
	MTX_DEF);
static struct {
	struct filterops *for_fop;
	int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
	{ &file_filtops },			/* EVFILT_READ */
	{ &file_filtops },			/* EVFILT_WRITE */
	{ &null_filtops },			/* EVFILT_AIO */
	{ &file_filtops },			/* EVFILT_VNODE */
	{ &proc_filtops },			/* EVFILT_PROC */
	{ &sig_filtops },			/* EVFILT_SIGNAL */
	{ &timer_filtops },			/* EVFILT_TIMER */
	{ &file_filtops },			/* EVFILT_NETDEV */
	{ &fs_filtops },			/* EVFILT_FS */
	{ &null_filtops },			/* EVFILT_LIO */
};
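
/*
 * Note on the indexing above (illustration, in case the one's-complement
 * trick is unfamiliar): the system filters in <sys/event.h> are small
 * negative constants, so ~filt maps them onto the table.  For example,
 * EVFILT_READ is (-1) and ~EVFILT_READ == 0, selecting the first slot;
 * kqueue_fo_find() and the add/del routines below use the same ~filt
 * arithmetic.
 */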

/*
 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
 * method.
 */
static int
filt_fileattach(struct knote *kn)
{

	return (fo_kqfilter(kn->kn_fp, kn));
}

/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);

	kn->kn_status |= KN_KQUEUE;
	kn->kn_fop = &kqread_filtops;
	knlist_add(&kq->kq_sel.si_note, kn, 0);

	return (0);
}

static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	knlist_remove(&kq->kq_sel.si_note, kn, 0);
}

/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_fp->f_data;

	kn->kn_data = kq->kq_count;
	return (kn->kn_data > 0);
}

/* XXX - move to kern_proc.c? */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;
	int immediate;
	int error;

	immediate = 0;
	p = pfind(kn->kn_id);
	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
		p = zpfind(kn->kn_id);
		immediate = 1;
	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
		immediate = 1;
	}

	if (p == NULL)
		return (ESRCH);
	if ((error = p_cansee(curthread, p)))
		return (error);

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}

	if (immediate == 0)
		knlist_add(&p->p_klist, kn, 1);

	/*
	 * Immediately activate any exit notes if the target process is a
	 * zombie.  This is necessary to handle the case where the target
	 * process, e.g. a child, dies before the kevent is registered.
	 */
	if (immediate && filt_proc(kn, NOTE_EXIT))
		KNOTE_ACTIVATE(kn, 0);

	PROC_UNLOCK(p);

	return (0);
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out.  However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process does not exist any more.
 */
/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	p = kn->kn_ptr.p_proc;
	knlist_remove(&p->p_klist, kn, 0);
	kn->kn_ptr.p_proc = NULL;
}

/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
	struct proc *p = kn->kn_ptr.p_proc;
	u_int event;

	/*
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

	/*
	 * process is gone, so flag the event as finished.
	 */
	if (event == NOTE_EXIT) {
		if (!(kn->kn_status & KN_DETACHED))
			knlist_remove_inevent(&p->p_klist, kn);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_data = p->p_xstat;
		kn->kn_ptr.p_proc = NULL;
		return (1);
	}

	/*
	 * process forked, and user wants to track the new process,
	 * so attach a new knote to it, and immediately report an
	 * event with the parent's pid.
	 */
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		struct kevent kev;
		int error;

		/*
		 * register knote with new process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		error = kqueue_register(kn->kn_kq, &kev, NULL, 0);
		if (error)
			kn->kn_fflags |= NOTE_TRACKERR;
	}

	return (kn->kn_fflags != 0);
}

static int
timertoticks(intptr_t data)
{
	struct timeval tv;
	int tticks;

	tv.tv_sec = data / 1000;
	tv.tv_usec = (data % 1000) * 1000;
	tticks = tvtohz(&tv);

	return tticks;
}
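
/*
 * Worked example for timertoticks() above: EVFILT_TIMER's kn_sdata is in
 * milliseconds, so kn_sdata == 1500 becomes tv = { 1, 500000 }, which
 * tvtohz() turns into roughly 1.5 * hz ticks (tvtohz() rounds up, so the
 * timer never fires early).
 */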

/* XXX - move to kern_timeout.c? */
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct callout *calloutp;

	kn->kn_data++;
	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */

	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
		calloutp = (struct callout *)kn->kn_hook;
		callout_reset(calloutp, timertoticks(kn->kn_sdata),
		    filt_timerexpire, kn);
	}
}

/*
 * data contains amount of time to sleep, in milliseconds
 */
/* XXX - move to kern_timeout.c? */
static int
filt_timerattach(struct knote *kn)
{
	struct callout *calloutp;

	atomic_add_int(&kq_ncallouts, 1);

	if (kq_ncallouts >= kq_calloutmax) {
		atomic_add_int(&kq_ncallouts, -1);
		return (ENOMEM);
	}

	kn->kn_flags |= EV_CLEAR;		/* automatically set */
	kn->kn_status &= ~KN_DETACHED;		/* knlist_add usually sets it */
	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
	    M_KQUEUE, M_WAITOK);
	callout_init(calloutp, CALLOUT_MPSAFE);
	kn->kn_hook = calloutp;
	callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire,
	    kn);

	return (0);
}

/* XXX - move to kern_timeout.c? */
static void
filt_timerdetach(struct knote *kn)
{
	struct callout *calloutp;

	calloutp = (struct callout *)kn->kn_hook;
	callout_drain(calloutp);
	FREE(calloutp, M_KQUEUE);
	atomic_add_int(&kq_ncallouts, -1);
	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
}

/* XXX - move to kern_timeout.c? */
static int
filt_timer(struct knote *kn, long hint)
{

	return (kn->kn_data != 0);
}

/*
 * MPSAFE
 */
int
kqueue(struct thread *td, struct kqueue_args *uap)
{
	struct filedesc *fdp;
	struct kqueue *kq;
	struct file *fp;
	int fd, error;

	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd);
	if (error)
		goto done2;

	/* An extra reference on `fp' has been held for us by falloc(). */
	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
	TAILQ_INIT(&kq->kq_head);
	kq->kq_fdp = fdp;
	knlist_init(&kq->kq_sel.si_note, &kq->kq_lock, NULL, NULL, NULL);
	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);

	FILEDESC_LOCK_FAST(fdp);
	SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
	FILEDESC_UNLOCK_FAST(fdp);

	FILE_LOCK(fp);
	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	FILE_UNLOCK(fp);
	fdrop(fp, td);

	td->td_retval[0] = fd;
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct kevent_args {
	int	fd;
	const struct kevent *changelist;
	int	nchanges;
	struct	kevent *eventlist;
	int	nevents;
	const struct timespec *timeout;
};
#endif
/*
 * MPSAFE
 */
int
kevent(struct thread *td, struct kevent_args *uap)
{
	struct timespec ts, *tsp;
	struct kevent_copyops k_ops = { uap,
					kevent_copyout,
					kevent_copyin};
	int error;
#ifdef KTRACE
	struct uio ktruio;
	struct iovec ktriov;
	struct uio *ktruioin = NULL;
	struct uio *ktruioout = NULL;
#endif

	if (uap->timeout != NULL) {
		error = copyin(uap->timeout, &ts, sizeof(ts));
		if (error)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov.iov_base = uap->changelist;
		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
		    .uio_td = td };
		ktruioin = cloneuio(&ktruio);
		ktriov.iov_base = uap->eventlist;
		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
		ktruioout = cloneuio(&ktruio);
	}
#endif

	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
	    &k_ops, tsp);

#ifdef KTRACE
	if (ktruioin != NULL) {
		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
	}
#endif

	return (error);
}
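
/*
 * Illustrative sketch (not part of this file) of how userland drives the
 * syscall above; sock_fd is a placeholder for some already-open descriptor.
 * A single kevent(2) call may both submit changes and collect events, which
 * is what exercises the copyin/copyout helpers below.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *
 *	int kq = kqueue();
 *	struct kevent chg, ev;
 *	EV_SET(&chg, sock_fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	int n = kevent(kq, &chg, 1, &ev, 1, NULL);
 *	if (n > 0 && (ev.flags & EV_ERROR))
 *		errno = ev.data;	// per-event error reporting
 */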

/*
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
 */
static int
kevent_copyout(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
	if (error == 0)
		uap->eventlist += count;
	return (error);
}

/*
 * Copy 'count' items from the list pointed to by uap->changelist.
 */
static int
kevent_copyin(void *arg, struct kevent *kevp, int count)
{
	struct kevent_args *uap;
	int error;

	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
	uap = (struct kevent_args *)arg;

	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
	if (error == 0)
		uap->changelist += count;
	return (error);
}

int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
    struct kevent_copyops *k_ops, const struct timespec *timeout)
{
	struct kevent keva[KQ_NEVENTS];
	struct kevent *kevp, *changes;
	struct kqueue *kq;
	struct file *fp;
	int i, n, nerrors, error;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((error = kqueue_aquire(fp, &kq)) != 0)
		goto done_norel;

	nerrors = 0;

	while (nchanges > 0) {
		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
		error = k_ops->k_copyin(k_ops->arg, keva, n);
		if (error)
			goto done;
		changes = keva;
		for (i = 0; i < n; i++) {
			kevp = &changes[i];
			if (!kevp->filter)
				continue;
			kevp->flags &= ~EV_SYSFLAGS;
			error = kqueue_register(kq, kevp, td, 1);
			if (error) {
				if (nevents != 0) {
					kevp->flags = EV_ERROR;
					kevp->data = error;
					(void) k_ops->k_copyout(k_ops->arg,
					    kevp, 1);
					nevents--;
					nerrors++;
				} else {
					goto done;
				}
			}
		}
		nchanges -= n;
	}
	if (nerrors) {
		td->td_retval[0] = nerrors;
		error = 0;
		goto done;
	}

	error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
done:
	kqueue_release(kq, 0);
done_norel:
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}

int
kqueue_add_filteropts(int filt, struct filterops *filtops)
{
	int error;

	error = 0;
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
		printf(
"trying to add a filterop that is out of range: %d is beyond %d\n",
		    ~filt, EVFILT_SYSCOUNT);
		return EINVAL;
	}
	mtx_lock(&filterops_lock);
	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
	    sysfilt_ops[~filt].for_fop != NULL)
		error = EEXIST;
	else {
		sysfilt_ops[~filt].for_fop = filtops;
		sysfilt_ops[~filt].for_refcnt = 0;
	}
	mtx_unlock(&filterops_lock);

	return (error);
}

int
kqueue_del_filteropts(int filt)
{
	int error;

	error = 0;
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return EINVAL;

	mtx_lock(&filterops_lock);
	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
	    sysfilt_ops[~filt].for_fop == NULL)
		error = EINVAL;
	else if (sysfilt_ops[~filt].for_refcnt != 0)
		error = EBUSY;
	else {
		sysfilt_ops[~filt].for_fop = &null_filtops;
		sysfilt_ops[~filt].for_refcnt = 0;
	}
	mtx_unlock(&filterops_lock);

	return error;
}

static struct filterops *
kqueue_fo_find(int filt)
{

	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return NULL;

	mtx_lock(&filterops_lock);
	sysfilt_ops[~filt].for_refcnt++;
	if (sysfilt_ops[~filt].for_fop == NULL)
		sysfilt_ops[~filt].for_fop = &null_filtops;
	mtx_unlock(&filterops_lock);

	return sysfilt_ops[~filt].for_fop;
}

static void
kqueue_fo_release(int filt)
{

	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
		return;

	mtx_lock(&filterops_lock);
	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
	    ("filter object refcount not valid on release"));
	sysfilt_ops[~filt].for_refcnt--;
	mtx_unlock(&filterops_lock);
}
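
/*
 * Illustrative sketch (placeholder names my_filtops/my_attach/my_detach/
 * my_event) of how a module installs its own filter into the table above;
 * the AIO code registers its EVFILT_AIO/EVFILT_LIO filters roughly this way.
 *
 *	static struct filterops my_filtops =
 *		{ 0, my_attach, my_detach, my_event };
 *
 *	// module load:
 *	kqueue_add_filteropts(EVFILT_LIO, &my_filtops);
 *	// module unload, once no knotes reference the filter:
 *	kqueue_del_filteropts(EVFILT_LIO);
 */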

/*
 * A ref to kq (obtained via kqueue_aquire) must be held.  waitok will
 * influence if memory allocation should wait.  Make sure it is 0 if you
 * hold any mutexes.
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
	struct filterops *fops;
	struct file *fp;
	struct knote *kn, *tkn;
	int error, filt, event;
	int haskqglobal;

	fp = NULL;
	kn = NULL;
	error = 0;
	haskqglobal = 0;

	filt = kev->filter;
	fops = kqueue_fo_find(filt);
	if (fops == NULL)
		return EINVAL;

	tkn = knote_alloc(waitok);		/* prevent waiting with locks */

findkn:
	if (fops->f_isfd) {
		KASSERT(td != NULL, ("td is NULL"));
		error = fget(td, kev->ident, &fp);
		if (error)
			goto done;

		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
		    kev->ident, 0) != 0) {
			/* try again */
			fdrop(fp, td);
			fp = NULL;
			error = kqueue_expand(kq, fops, kev->ident, waitok);
			if (error)
				goto done;
			goto findkn;
		}

		if (fp->f_type == DTYPE_KQUEUE) {
			/*
			 * if we add some intelligence about what we are doing,
			 * we should be able to support events on ourselves.
			 * We need to know when we are doing this to prevent
			 * getting both the knlist lock and the kq lock since
			 * they are the same thing.
			 */
			if (fp->f_data == kq) {
				error = EINVAL;
				goto done;
			}

			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
		}

		KQ_LOCK(kq);
		if (kev->ident < kq->kq_knlistsize) {
			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
				if (kev->filter == kn->kn_filter)
					break;
		}
	} else {
		if ((kev->flags & EV_ADD) == EV_ADD)
			kqueue_expand(kq, fops, kev->ident, waitok);

		KQ_LOCK(kq);
		if (kq->kq_knhashmask != 0) {
			struct klist *list;

			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	/* knote is in the process of changing, wait for it to stabilize. */
	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
		if (fp != NULL) {
			fdrop(fp, td);
			fp = NULL;
		}
		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
		goto findkn;
	}

	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
		KQ_UNLOCK(kq);
		error = ENOENT;
		goto done;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			kn = tkn;
			tkn = NULL;
			if (kn == NULL) {
				KQ_UNLOCK(kq);
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;
			/*
			 * apply reference counts to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fops = NULL;
			fp = NULL;

			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
			    EV_ENABLE | EV_DISABLE);
			kn->kn_status = KN_INFLUX|KN_DETACHED;

			error = knote_attach(kn, kq);
			KQ_UNLOCK(kq);
			if (error != 0) {
				tkn = kn;
				goto done;
			}

			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
				knote_drop(kn, td);
				goto done;
			}
			KN_LIST_LOCK(kn);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filter which has already been triggered.
			 */
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			KN_LIST_LOCK(kn);
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}

		/*
		 * We can get here with kn->kn_knlist == NULL.
		 * This can happen when the initial attach event decides that
		 * the event is "completed" already.  i.e. filt_procattach
		 * is called on a zombie process.  It will call filt_proc
		 * which will remove it from the list, and NULL kn_knlist.
		 */
		event = kn->kn_fop->f_event(kn, 0);
		KQ_LOCK(kq);
		if (event)
			KNOTE_ACTIVATE(kn, 1);
		kn->kn_status &= ~KN_INFLUX;
		KN_LIST_UNLOCK(kn);
	} else if (kev->flags & EV_DELETE) {
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		if (!(kn->kn_status & KN_DETACHED))
			kn->kn_fop->f_detach(kn);
		knote_drop(kn, td);
		goto done;
	}

	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
	}
	KQ_UNLOCK_FLUX(kq);

done:
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
	if (fp != NULL)
		fdrop(fp, td);
	if (tkn != NULL)
		knote_free(tkn);
	if (fops != NULL)
		kqueue_fo_release(filt);
	return (error);
}

static int
kqueue_aquire(struct file *fp, struct kqueue **kqp)
{
	int error;
	struct kqueue *kq;

	error = 0;

	FILE_LOCK(fp);
	do {
		kq = fp->f_data;
		if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
			error = EBADF;
			break;
		}
		*kqp = kq;
		KQ_LOCK(kq);
		if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
			KQ_UNLOCK(kq);
			error = EBADF;
			break;
		}
		kq->kq_refcnt++;
		KQ_UNLOCK(kq);
	} while (0);
	FILE_UNLOCK(fp);

	return error;
}

static void
kqueue_release(struct kqueue *kq, int locked)
{
	if (locked)
		KQ_OWNED(kq);
	else
		KQ_LOCK(kq);
	kq->kq_refcnt--;
	if (kq->kq_refcnt == 1)
		wakeup(&kq->kq_refcnt);
	if (!locked)
		KQ_UNLOCK(kq);
}

static void
kqueue_schedtask(struct kqueue *kq)
{

	KQ_OWNED(kq);
	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
	    ("scheduling kqueue task while draining"));

	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
		taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
		kq->kq_state |= KQ_TASKSCHED;
	}
}

/*
 * Expand the kq to make sure we have storage for fops/ident pair.
 *
 * Return 0 on success (or no work necessary), return errno on failure.
 *
 * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
 * If kqueue_register is called from a non-fd context, there usually/should
 * be no locks held.
 */
static int
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
	int waitok)
{
	struct klist *list, *tmp_knhash;
	u_long tmp_knhashmask;
	int size;
	int fd;
	int mflag = waitok ? M_WAITOK : M_NOWAIT;

	KQ_NOTOWNED(kq);

	if (fops->f_isfd) {
		fd = ident;
		if (kq->kq_knlistsize <= fd) {
			size = kq->kq_knlistsize;
			while (size <= fd)
				size += KQEXTENT;
			MALLOC(list, struct klist *,
			    size * sizeof list, M_KQUEUE, mflag);
			if (list == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knlistsize > fd) {
				FREE(list, M_KQUEUE);
				list = NULL;
			} else {
				if (kq->kq_knlist != NULL) {
					bcopy(kq->kq_knlist, list,
					    kq->kq_knlistsize * sizeof list);
					FREE(kq->kq_knlist, M_KQUEUE);
					kq->kq_knlist = NULL;
				}
				bzero((caddr_t)list +
				    kq->kq_knlistsize * sizeof list,
				    (size - kq->kq_knlistsize) * sizeof list);
				kq->kq_knlistsize = size;
				kq->kq_knlist = list;
			}
			KQ_UNLOCK(kq);
		}
	} else {
		if (kq->kq_knhashmask == 0) {
			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
			    &tmp_knhashmask);
			if (tmp_knhash == NULL)
				return ENOMEM;
			KQ_LOCK(kq);
			if (kq->kq_knhashmask == 0) {
				kq->kq_knhash = tmp_knhash;
				kq->kq_knhashmask = tmp_knhashmask;
			} else {
				free(tmp_knhash, M_KQUEUE);
			}
			KQ_UNLOCK(kq);
		}
	}

	KQ_NOTOWNED(kq);
	return 0;
}

static void
kqueue_task(void *arg, int pending)
{
	struct kqueue *kq;
	int haskqglobal;

	haskqglobal = 0;
	kq = arg;

	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
	KQ_LOCK(kq);

	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);

	kq->kq_state &= ~KQ_TASKSCHED;
	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
		wakeup(&kq->kq_state);
	}
	KQ_UNLOCK(kq);
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
}

/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are INFLUX.
 */
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
    const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
	struct kevent *kevp;
	struct timeval atv, rtv, ttv;
	struct knote *kn, *marker;
	int count, timeout, nkev, error;
	int haskqglobal;

	count = maxevents;
	nkev = 0;
	error = 0;
	haskqglobal = 0;

	if (maxevents == 0)
		goto done_nl;

	if (tsp != NULL) {
		TIMESPEC_TO_TIMEVAL(&atv, tsp);
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nl;
		}
		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
			timeout = -1;
		else
			timeout = atv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&atv);
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
		timeout = 0;
	}
	marker = knote_alloc(1);
	if (marker == NULL) {
		error = ENOMEM;
		goto done_nl;
	}
	marker->kn_status = KN_MARKER;
	KQ_LOCK(kq);
	goto start;

retry:
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timeout = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

start:
	kevp = keva;
	if (kq->kq_count == 0) {
		if (timeout < 0) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
			    "kqread", timeout);
		}
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK,
			    "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not supposed to be"));

		if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				KQ_LOCK(kq);
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
				kq->kq_count--;
				KN_LIST_UNLOCK(kn);
				continue;
			}
			*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			if (kn->kn_flags & EV_CLEAR) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);

			kn->kn_status &= ~(KN_INFLUX);
			KN_LIST_UNLOCK(kn);
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		if (nkev == KQ_NEVENTS) {
			KQ_UNLOCK_FLUX(kq);
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	if (nkev != 0)
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
	int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
	int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
	struct ucred *active_cred, struct thread *td)
{
	/*
	 * Enabling sigio causes two major problems:
	 * 1) infinite recursion:
	 * Synopsis: kevent is being used to track signals and has FIOASYNC
	 * set.  On receipt of a signal this will cause a kqueue to recurse
	 * into itself over and over.  Sending the sigio causes the kqueue
	 * to become ready, which in turn posts sigio again, forever.
	 * Solution: this can be solved by setting a flag in the kqueue that
	 * we have a SIGIO in progress.
	 * 2) locking problems:
	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
	 * us above the proc and pgrp locks.
	 * Solution: Post a signal using an async mechanism, being sure to
	 * record a generation count in the delivery so that we do not deliver
	 * a signal to the wrong process.
	 *
	 * Note, these two mechanisms are somewhat mutually exclusive!
	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	return (ENOTTY);
}

/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
	struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_aquire(fp, &kq)))
		return POLLERR;

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(td, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}

/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
	struct thread *td)
{

	bzero((void *)st, sizeof *st);
	/*
	 * We no longer return kq_count because the unlocked value is useless.
	 * If you spent all this time getting the count, why not spend your
	 * syscall better by calling kevent?
	 *
	 * XXX - This is needed for libc_r.
	 */
	st->st_mode = S_IFIFO;
	return (0);
}

/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	struct knote *kn;
	int i;
	int error;

	if ((error = kqueue_aquire(fp, &kq)))
		return error;

	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	kq->kq_state |= KQ_CLOSING;
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
	fdp = kq->kq_fdp;

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			KASSERT((kn->kn_status & KN_INFLUX) == 0,
			    ("KN_INFLUX set when not supposed to be"));
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				KASSERT((kn->kn_status & KN_INFLUX) == 0,
				    ("KN_INFLUX set when not supposed to be"));
				kn->kn_status |= KN_INFLUX;
				KQ_UNLOCK(kq);
				if (!(kn->kn_status & KN_DETACHED))
					kn->kn_fop->f_detach(kn);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeuppri(&kq->kq_sel, PSOCK);
	}

	KQ_UNLOCK(kq);

	FILEDESC_LOCK_FAST(fdp);
	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
	FILEDESC_UNLOCK_FAST(fdp);

	knlist_destroy(&kq->kq_sel.si_note);

	mtx_destroy(&kq->kq_lock);
	kq->kq_fdp = NULL;

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}

static void
kqueue_wakeup(struct kqueue *kq)
{
	KQ_OWNED(kq);

	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeuppri(&kq->kq_sel, PSOCK);
	}
	if (!knlist_empty(&kq->kq_sel.si_note))
		kqueue_schedtask(kq);
	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
		pgsigio(&kq->kq_sigio, SIGIO, 0);
	}
}

/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * There is a possibility to optimize in the case of one kq watching another.
 * Instead of scheduling a task to wake it up, you could pass enough state
 * down the chain to make up the parent kqueue.  Make this code functional
 * first.
 */
void
knote(struct knlist *list, long hint, int islocked)
{
	struct kqueue *kq;
	struct knote *kn;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, islocked);

	if (!islocked)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
	 * the kqueue scheduling, but this will introduce four
	 * lock/unlock's for each knote to test.  If we do, continue to use
	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
	 * only safe if you want to remove the current item, which we are
	 * not doing.
	 */
	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		kq = kn->kn_kq;
		if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
			KQ_LOCK(kq);
			if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
				kn->kn_status |= KN_HASKQLOCK;
				if (kn->kn_fop->f_event(kn, hint))
					KNOTE_ACTIVATE(kn, 1);
				kn->kn_status &= ~KN_HASKQLOCK;
			}
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}
	if (!islocked)
		list->kl_unlock(list->kl_lockarg);
}

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}
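
/*
 * Illustrative sketch (placeholder names sc, sc_mtx, sc_rsel and
 * mydev_read_filtops; not from this file) of how a driver typically uses
 * the knlist API: initialize the knlist embedded in its struct selinfo,
 * attach knotes from its d_kqfilter routine, and fire knote() when data
 * arrives.
 *
 *	// at attach time:
 *	knlist_init(&sc->sc_rsel.si_note, &sc->sc_mtx, NULL, NULL, NULL);
 *
 *	// in the d_kqfilter method:
 *	kn->kn_fop = &mydev_read_filtops;
 *	kn->kn_hook = sc;
 *	knlist_add(&sc->sc_rsel.si_note, kn, 0);
 *
 *	// when new data shows up, with sc_mtx held:
 *	knote(&sc->sc_rsel.si_note, 0, 1);
 */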

static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
{
	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
	if (!kqislocked)
		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
		    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
	if (!knlislocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
	kn->kn_knlist = NULL;
	if (!knlislocked)
		knl->kl_unlock(knl->kl_lockarg);
	if (!kqislocked)
		KQ_LOCK(kn->kn_kq);
	kn->kn_status |= KN_DETACHED;
	if (!kqislocked)
		KQ_UNLOCK(kn->kn_kq);
}

/*
 * remove a knote from a specified knlist
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{

	knlist_remove_kq(knl, kn, islocked, 0);
}

/*
 * remove knote from a specified klist while in f_event handler.
 */
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
{

	knlist_remove_kq(knl, kn, 1,
	    (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
}

int
knlist_empty(struct knlist *knl)
{
	KNL_ASSERT_LOCKED(knl);
	return SLIST_EMPTY(&knl->kl_list);
}

static struct mtx	knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
	MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
static int knlist_mtx_locked(void *arg);

static void
knlist_mtx_lock(void *arg)
{
	mtx_lock((struct mtx *)arg);
}

static void
knlist_mtx_unlock(void *arg)
{
	mtx_unlock((struct mtx *)arg);
}

static int
knlist_mtx_locked(void *arg)
{
	return (mtx_owned((struct mtx *)arg));
}

void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
    void (*kl_unlock)(void *), int (*kl_locked)(void *))
{

	if (lock == NULL)
		knl->kl_lockarg = &knlist_lock;
	else
		knl->kl_lockarg = lock;

	if (kl_lock == NULL)
		knl->kl_lock = knlist_mtx_lock;
	else
		knl->kl_lock = kl_lock;
	if (kl_unlock == NULL)
		knl->kl_unlock = knlist_mtx_unlock;
	else
		knl->kl_unlock = kl_unlock;
	if (kl_locked == NULL)
		knl->kl_locked = knlist_mtx_locked;
	else
		knl->kl_locked = kl_locked;

	SLIST_INIT(&knl->kl_list);
}

void
knlist_destroy(struct knlist *knl)
{

#ifdef INVARIANTS
	/*
	 * if we run across this error, we need to find the offending
	 * driver and have it call knlist_clear.
	 */
	if (!SLIST_EMPTY(&knl->kl_list))
		printf("WARNING: destroying knlist w/ knotes on it!\n");
#endif

	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
	SLIST_INIT(&knl->kl_list);
}

/*
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & KN_INFLUX)) {
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn->kn_status |= KN_INFLUX | KN_DETACHED;
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still KN_INFLUX remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn->kn_status & KN_INFLUX,
		    ("knote removed w/o list lock"));
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}

/*
 * remove all knotes referencing a specified fd
 * must be called with FILEDESC lock.  This prevents a race where a new fd
 * comes along and occupies the entry and we attach a knote to the fd.
 */
void
knote_fdclose(struct thread *td, int fd)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct kqueue *kq;
	struct knote *kn;
	int influx;

	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);

	/*
	 * We shouldn't have to worry about new kevents appearing on fd
	 * since filedesc is locked.
	 */
	SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
		KQ_LOCK(kq);

again:
		influx = 0;
		while (kq->kq_knlistsize > fd &&
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
			if (kn->kn_status & KN_INFLUX) {
				/* someone else might be waiting on our knote */
				if (influx)
					wakeup(kq);
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
				goto again;
			}
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			influx = 1;
			KQ_LOCK(kq);
		}
		KQ_UNLOCK_FLUX(kq);
	}
}

static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
	struct klist *list;

	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
	KQ_OWNED(kq);

	if (kn->kn_fop->f_isfd) {
		if (kn->kn_id >= kq->kq_knlistsize)
			return ENOMEM;
		list = &kq->kq_knlist[kn->kn_id];
	} else {
		if (kq->kq_knhash == NULL)
			return ENOMEM;
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
	}

	SLIST_INSERT_HEAD(list, kn, kn_link);

	return 0;
}

/*
 * knote must already have been detached using the f_detach method.
 * no lock needs to be held, it is assumed that the KN_INFLUX flag is set
 * to prevent other removal.
 */
static void
knote_drop(struct knote *kn, struct thread *td)
{
	struct kqueue *kq;
	struct klist *list;

	kq = kn->kn_kq;

	KQ_NOTOWNED(kq);
	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
	    ("knote_drop called without KN_INFLUX set in kn_status"));

	KQ_LOCK(kq);
	if (kn->kn_fop->f_isfd)
		list = &kq->kq_knlist[kn->kn_id];
	else
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];

	if (!SLIST_EMPTY(list))
		SLIST_REMOVE(list, kn, knote, kn_link);
	if (kn->kn_status & KN_QUEUED)
		knote_dequeue(kn);
	KQ_UNLOCK_FLUX(kq);

	if (kn->kn_fop->f_isfd) {
		fdrop(kn->kn_fp, td);
		kn->kn_fp = NULL;
	}
	kqueue_fo_release(kn->kn_kevent.filter);
	kn->kn_fop = NULL;
	knote_free(kn);
}

static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));

	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
	kn->kn_status |= KN_QUEUED;
	kq->kq_count++;
	kqueue_wakeup(kq);
}

static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	KQ_OWNED(kn->kn_kq);
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));

	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}

static void
knote_init(void)
{

	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)

static struct knote *
knote_alloc(int waitok)
{
	return ((struct knote *)uma_zalloc(knote_zone,
	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
}

static void
knote_free(struct knote *kn)
{
	if (kn != NULL)
		uma_zfree(knote_zone, kn);
}

/*
 * Register the kev w/ the kq specified by fd.
 */
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
	struct kqueue *kq;
	struct file *fp;
	int error;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((error = kqueue_aquire(fp, &kq)) != 0)
		goto noaquire;

	error = kqueue_register(kq, kev, td, waitok);

	kqueue_release(kq, 0);

noaquire:
	fdrop(fp, td);

	return error;
}
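
/*
 * Illustrative sketch (placeholder values cookie, user_cookie, kqfd) of
 * kernel code handing a kevent to a kqueue that userland already owns,
 * which is what kqfd_register() is for; the AIO code, for example, posts
 * SIGEV_KEVENT completion notes roughly this way.
 *
 *	struct kevent kev;
 *
 *	kev.ident = (uintptr_t)cookie;	// identifies the completed object
 *	kev.filter = EVFILT_LIO;
 *	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 *	kev.fflags = 0;
 *	kev.data = 0;
 *	kev.udata = user_cookie;	// returned verbatim to userland
 *	error = kqfd_register(kqfd, &kev, td, 1);
 */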