xref: /freebsd/sys/kern/kern_event.c (revision 718cf2ccb9956613756ab15d7a0e28f2c8e91cab)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4  * Copyright (c) 2009 Apple, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 #include "opt_ktrace.h"
34 #include "opt_kqueue.h"
35 
36 #ifdef COMPAT_FREEBSD11
37 #define	_WANT_FREEBSD11_KEVENT
38 #endif
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/capsicum.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/mutex.h>
46 #include <sys/rwlock.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/unistd.h>
50 #include <sys/file.h>
51 #include <sys/filedesc.h>
52 #include <sys/filio.h>
53 #include <sys/fcntl.h>
54 #include <sys/kthread.h>
55 #include <sys/selinfo.h>
56 #include <sys/queue.h>
57 #include <sys/event.h>
58 #include <sys/eventvar.h>
59 #include <sys/poll.h>
60 #include <sys/protosw.h>
61 #include <sys/resourcevar.h>
62 #include <sys/sigio.h>
63 #include <sys/signalvar.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/stat.h>
67 #include <sys/sysctl.h>
68 #include <sys/sysproto.h>
69 #include <sys/syscallsubr.h>
70 #include <sys/taskqueue.h>
71 #include <sys/uio.h>
72 #include <sys/user.h>
73 #ifdef KTRACE
74 #include <sys/ktrace.h>
75 #endif
76 #include <machine/atomic.h>
77 
78 #include <vm/uma.h>
79 
80 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
81 
82 /*
83  * This lock is used if multiple kq locks are required.  This possibly
84  * should be made into a per proc lock.
85  */
86 static struct mtx	kq_global;
87 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
88 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
89 	if (!haslck)				\
90 		mtx_lock(lck);			\
91 	haslck = 1;				\
92 } while (0)
93 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
94 	if (haslck)				\
95 		mtx_unlock(lck);			\
96 	haslck = 0;				\
97 } while (0)
98 
99 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
100 
101 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
102 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
103 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
104 		    struct thread *td, int waitok);
105 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
106 static void	kqueue_release(struct kqueue *kq, int locked);
107 static void	kqueue_destroy(struct kqueue *kq);
108 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
109 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
110 		    uintptr_t ident, int waitok);
111 static void	kqueue_task(void *arg, int pending);
112 static int	kqueue_scan(struct kqueue *kq, int maxevents,
113 		    struct kevent_copyops *k_ops,
114 		    const struct timespec *timeout,
115 		    struct kevent *keva, struct thread *td);
116 static void 	kqueue_wakeup(struct kqueue *kq);
117 static struct filterops *kqueue_fo_find(int filt);
118 static void	kqueue_fo_release(int filt);
119 struct g_kevent_args;
120 static int	kern_kevent_generic(struct thread *td,
121 		    struct g_kevent_args *uap,
122 		    struct kevent_copyops *k_ops, const char *struct_name);
123 
124 static fo_ioctl_t	kqueue_ioctl;
125 static fo_poll_t	kqueue_poll;
126 static fo_kqfilter_t	kqueue_kqfilter;
127 static fo_stat_t	kqueue_stat;
128 static fo_close_t	kqueue_close;
129 static fo_fill_kinfo_t	kqueue_fill_kinfo;
130 
131 static struct fileops kqueueops = {
132 	.fo_read = invfo_rdwr,
133 	.fo_write = invfo_rdwr,
134 	.fo_truncate = invfo_truncate,
135 	.fo_ioctl = kqueue_ioctl,
136 	.fo_poll = kqueue_poll,
137 	.fo_kqfilter = kqueue_kqfilter,
138 	.fo_stat = kqueue_stat,
139 	.fo_close = kqueue_close,
140 	.fo_chmod = invfo_chmod,
141 	.fo_chown = invfo_chown,
142 	.fo_sendfile = invfo_sendfile,
143 	.fo_fill_kinfo = kqueue_fill_kinfo,
144 };
145 
146 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
147 static void 	knote_drop(struct knote *kn, struct thread *td);
148 static void 	knote_drop_detached(struct knote *kn, struct thread *td);
149 static void 	knote_enqueue(struct knote *kn);
150 static void 	knote_dequeue(struct knote *kn);
151 static void 	knote_init(void);
152 static struct 	knote *knote_alloc(int waitok);
153 static void 	knote_free(struct knote *kn);
154 
155 static void	filt_kqdetach(struct knote *kn);
156 static int	filt_kqueue(struct knote *kn, long hint);
157 static int	filt_procattach(struct knote *kn);
158 static void	filt_procdetach(struct knote *kn);
159 static int	filt_proc(struct knote *kn, long hint);
160 static int	filt_fileattach(struct knote *kn);
161 static void	filt_timerexpire(void *knx);
162 static int	filt_timerattach(struct knote *kn);
163 static void	filt_timerdetach(struct knote *kn);
164 static int	filt_timer(struct knote *kn, long hint);
165 static int	filt_userattach(struct knote *kn);
166 static void	filt_userdetach(struct knote *kn);
167 static int	filt_user(struct knote *kn, long hint);
168 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
169 		    u_long type);
170 
171 static struct filterops file_filtops = {
172 	.f_isfd = 1,
173 	.f_attach = filt_fileattach,
174 };
175 static struct filterops kqread_filtops = {
176 	.f_isfd = 1,
177 	.f_detach = filt_kqdetach,
178 	.f_event = filt_kqueue,
179 };
180 /* XXX - move to kern_proc.c?  */
181 static struct filterops proc_filtops = {
182 	.f_isfd = 0,
183 	.f_attach = filt_procattach,
184 	.f_detach = filt_procdetach,
185 	.f_event = filt_proc,
186 };
187 static struct filterops timer_filtops = {
188 	.f_isfd = 0,
189 	.f_attach = filt_timerattach,
190 	.f_detach = filt_timerdetach,
191 	.f_event = filt_timer,
192 };
193 static struct filterops user_filtops = {
194 	.f_attach = filt_userattach,
195 	.f_detach = filt_userdetach,
196 	.f_event = filt_user,
197 	.f_touch = filt_usertouch,
198 };
199 
200 static uma_zone_t	knote_zone;
201 static unsigned int	kq_ncallouts = 0;
202 static unsigned int 	kq_calloutmax = 4 * 1024;
203 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
204     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
205 
206 /* XXX - ensure not influx ? */
207 #define KNOTE_ACTIVATE(kn, islock) do { 				\
208 	if ((islock))							\
209 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
210 	else								\
211 		KQ_LOCK((kn)->kn_kq);					\
212 	(kn)->kn_status |= KN_ACTIVE;					\
213 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
214 		knote_enqueue((kn));					\
215 	if (!(islock))							\
216 		KQ_UNLOCK((kn)->kn_kq);					\
217 } while(0)
218 #define KQ_LOCK(kq) do {						\
219 	mtx_lock(&(kq)->kq_lock);					\
220 } while (0)
221 #define KQ_FLUX_WAKEUP(kq) do {						\
222 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
223 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
224 		wakeup((kq));						\
225 	}								\
226 } while (0)
227 #define KQ_UNLOCK_FLUX(kq) do {						\
228 	KQ_FLUX_WAKEUP(kq);						\
229 	mtx_unlock(&(kq)->kq_lock);					\
230 } while (0)
231 #define KQ_UNLOCK(kq) do {						\
232 	mtx_unlock(&(kq)->kq_lock);					\
233 } while (0)
234 #define KQ_OWNED(kq) do {						\
235 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
236 } while (0)
237 #define KQ_NOTOWNED(kq) do {						\
238 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
239 } while (0)
240 
241 static struct knlist *
242 kn_list_lock(struct knote *kn)
243 {
244 	struct knlist *knl;
245 
246 	knl = kn->kn_knlist;
247 	if (knl != NULL)
248 		knl->kl_lock(knl->kl_lockarg);
249 	return (knl);
250 }
251 
252 static void
253 kn_list_unlock(struct knlist *knl)
254 {
255 	bool do_free;
256 
257 	if (knl == NULL)
258 		return;
259 	do_free = knl->kl_autodestroy && knlist_empty(knl);
260 	knl->kl_unlock(knl->kl_lockarg);
261 	if (do_free) {
262 		knlist_destroy(knl);
263 		free(knl, M_KQUEUE);
264 	}
265 }
266 
267 static bool
268 kn_in_flux(struct knote *kn)
269 {
270 
271 	return (kn->kn_influx > 0);
272 }
273 
274 static void
275 kn_enter_flux(struct knote *kn)
276 {
277 
278 	KQ_OWNED(kn->kn_kq);
279 	MPASS(kn->kn_influx < INT_MAX);
280 	kn->kn_influx++;
281 }
282 
283 static bool
284 kn_leave_flux(struct knote *kn)
285 {
286 
287 	KQ_OWNED(kn->kn_kq);
288 	MPASS(kn->kn_influx > 0);
289 	kn->kn_influx--;
290 	return (kn->kn_influx == 0);
291 }
292 
293 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
294 	if (islocked)							\
295 		KNL_ASSERT_LOCKED(knl);				\
296 	else								\
297 		KNL_ASSERT_UNLOCKED(knl);				\
298 } while (0)
299 #ifdef INVARIANTS
300 #define	KNL_ASSERT_LOCKED(knl) do {					\
301 	knl->kl_assert_locked((knl)->kl_lockarg);			\
302 } while (0)
303 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
304 	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
305 } while (0)
306 #else /* !INVARIANTS */
307 #define	KNL_ASSERT_LOCKED(knl) do {} while(0)
308 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
309 #endif /* INVARIANTS */
310 
311 #ifndef	KN_HASHSIZE
312 #define	KN_HASHSIZE		64		/* XXX should be tunable */
313 #endif
314 
315 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
316 
317 static int
318 filt_nullattach(struct knote *kn)
319 {
320 
321 	return (ENXIO);
322 };
323 
324 struct filterops null_filtops = {
325 	.f_isfd = 0,
326 	.f_attach = filt_nullattach,
327 };
328 
329 /* XXX - make SYSINIT to add these, and move into respective modules. */
330 extern struct filterops sig_filtops;
331 extern struct filterops fs_filtops;
332 
333 /*
334  * Table for for all system-defined filters.
335  */
336 static struct mtx	filterops_lock;
337 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
338 	MTX_DEF);
339 static struct {
340 	struct filterops *for_fop;
341 	int for_nolock;
342 	int for_refcnt;
343 } sysfilt_ops[EVFILT_SYSCOUNT] = {
344 	{ &file_filtops, 1 },			/* EVFILT_READ */
345 	{ &file_filtops, 1 },			/* EVFILT_WRITE */
346 	{ &null_filtops },			/* EVFILT_AIO */
347 	{ &file_filtops, 1 },			/* EVFILT_VNODE */
348 	{ &proc_filtops, 1 },			/* EVFILT_PROC */
349 	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
350 	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
351 	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
352 	{ &fs_filtops, 1 },			/* EVFILT_FS */
353 	{ &null_filtops },			/* EVFILT_LIO */
354 	{ &user_filtops, 1 },			/* EVFILT_USER */
355 	{ &null_filtops },			/* EVFILT_SENDFILE */
356 	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
357 };
358 
359 /*
360  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
361  * method.
362  */
363 static int
364 filt_fileattach(struct knote *kn)
365 {
366 
367 	return (fo_kqfilter(kn->kn_fp, kn));
368 }
369 
370 /*ARGSUSED*/
371 static int
372 kqueue_kqfilter(struct file *fp, struct knote *kn)
373 {
374 	struct kqueue *kq = kn->kn_fp->f_data;
375 
376 	if (kn->kn_filter != EVFILT_READ)
377 		return (EINVAL);
378 
379 	kn->kn_status |= KN_KQUEUE;
380 	kn->kn_fop = &kqread_filtops;
381 	knlist_add(&kq->kq_sel.si_note, kn, 0);
382 
383 	return (0);
384 }
385 
386 static void
387 filt_kqdetach(struct knote *kn)
388 {
389 	struct kqueue *kq = kn->kn_fp->f_data;
390 
391 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
392 }
393 
394 /*ARGSUSED*/
395 static int
396 filt_kqueue(struct knote *kn, long hint)
397 {
398 	struct kqueue *kq = kn->kn_fp->f_data;
399 
400 	kn->kn_data = kq->kq_count;
401 	return (kn->kn_data > 0);
402 }
403 
404 /* XXX - move to kern_proc.c?  */
405 static int
406 filt_procattach(struct knote *kn)
407 {
408 	struct proc *p;
409 	int error;
410 	bool exiting, immediate;
411 
412 	exiting = immediate = false;
413 	if (kn->kn_sfflags & NOTE_EXIT)
414 		p = pfind_any(kn->kn_id);
415 	else
416 		p = pfind(kn->kn_id);
417 	if (p == NULL)
418 		return (ESRCH);
419 	if (p->p_flag & P_WEXIT)
420 		exiting = true;
421 
422 	if ((error = p_cansee(curthread, p))) {
423 		PROC_UNLOCK(p);
424 		return (error);
425 	}
426 
427 	kn->kn_ptr.p_proc = p;
428 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
429 
430 	/*
431 	 * Internal flag indicating registration done by kernel for the
432 	 * purposes of getting a NOTE_CHILD notification.
433 	 */
434 	if (kn->kn_flags & EV_FLAG2) {
435 		kn->kn_flags &= ~EV_FLAG2;
436 		kn->kn_data = kn->kn_sdata;		/* ppid */
437 		kn->kn_fflags = NOTE_CHILD;
438 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
439 		immediate = true; /* Force immediate activation of child note. */
440 	}
441 	/*
442 	 * Internal flag indicating registration done by kernel (for other than
443 	 * NOTE_CHILD).
444 	 */
445 	if (kn->kn_flags & EV_FLAG1) {
446 		kn->kn_flags &= ~EV_FLAG1;
447 	}
448 
449 	knlist_add(p->p_klist, kn, 1);
450 
451 	/*
452 	 * Immediately activate any child notes or, in the case of a zombie
453 	 * target process, exit notes.  The latter is necessary to handle the
454 	 * case where the target process, e.g. a child, dies before the kevent
455 	 * is registered.
456 	 */
457 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
458 		KNOTE_ACTIVATE(kn, 0);
459 
460 	PROC_UNLOCK(p);
461 
462 	return (0);
463 }
464 
465 /*
466  * The knote may be attached to a different process, which may exit,
467  * leaving nothing for the knote to be attached to.  So when the process
468  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
469  * it will be deleted when read out.  However, as part of the knote deletion,
470  * this routine is called, so a check is needed to avoid actually performing
471  * a detach, because the original process does not exist any more.
472  */
473 /* XXX - move to kern_proc.c?  */
474 static void
475 filt_procdetach(struct knote *kn)
476 {
477 
478 	knlist_remove(kn->kn_knlist, kn, 0);
479 	kn->kn_ptr.p_proc = NULL;
480 }
481 
482 /* XXX - move to kern_proc.c?  */
483 static int
484 filt_proc(struct knote *kn, long hint)
485 {
486 	struct proc *p;
487 	u_int event;
488 
489 	p = kn->kn_ptr.p_proc;
490 	if (p == NULL) /* already activated, from attach filter */
491 		return (0);
492 
493 	/* Mask off extra data. */
494 	event = (u_int)hint & NOTE_PCTRLMASK;
495 
496 	/* If the user is interested in this event, record it. */
497 	if (kn->kn_sfflags & event)
498 		kn->kn_fflags |= event;
499 
500 	/* Process is gone, so flag the event as finished. */
501 	if (event == NOTE_EXIT) {
502 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
503 		kn->kn_ptr.p_proc = NULL;
504 		if (kn->kn_fflags & NOTE_EXIT)
505 			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
506 		if (kn->kn_fflags == 0)
507 			kn->kn_flags |= EV_DROP;
508 		return (1);
509 	}
510 
511 	return (kn->kn_fflags != 0);
512 }
513 
514 /*
515  * Called when the process forked. It mostly does the same as the
516  * knote(), activating all knotes registered to be activated when the
517  * process forked. Additionally, for each knote attached to the
518  * parent, check whether user wants to track the new process. If so
519  * attach a new knote to it, and immediately report an event with the
520  * child's pid.
521  */
522 void
523 knote_fork(struct knlist *list, int pid)
524 {
525 	struct kqueue *kq;
526 	struct knote *kn;
527 	struct kevent kev;
528 	int error;
529 
530 	if (list == NULL)
531 		return;
532 	list->kl_lock(list->kl_lockarg);
533 
534 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
535 		kq = kn->kn_kq;
536 		KQ_LOCK(kq);
537 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
538 			KQ_UNLOCK(kq);
539 			continue;
540 		}
541 
542 		/*
543 		 * The same as knote(), activate the event.
544 		 */
545 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
546 			kn->kn_status |= KN_HASKQLOCK;
547 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
548 				KNOTE_ACTIVATE(kn, 1);
549 			kn->kn_status &= ~KN_HASKQLOCK;
550 			KQ_UNLOCK(kq);
551 			continue;
552 		}
553 
554 		/*
555 		 * The NOTE_TRACK case. In addition to the activation
556 		 * of the event, we need to register new events to
557 		 * track the child. Drop the locks in preparation for
558 		 * the call to kqueue_register().
559 		 */
560 		kn_enter_flux(kn);
561 		KQ_UNLOCK(kq);
562 		list->kl_unlock(list->kl_lockarg);
563 
564 		/*
565 		 * Activate existing knote and register tracking knotes with
566 		 * new process.
567 		 *
568 		 * First register a knote to get just the child notice. This
569 		 * must be a separate note from a potential NOTE_EXIT
570 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
571 		 * to use the data field (in conflicting ways).
572 		 */
573 		kev.ident = pid;
574 		kev.filter = kn->kn_filter;
575 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
576 		    EV_FLAG2;
577 		kev.fflags = kn->kn_sfflags;
578 		kev.data = kn->kn_id;		/* parent */
579 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
580 		error = kqueue_register(kq, &kev, NULL, 0);
581 		if (error)
582 			kn->kn_fflags |= NOTE_TRACKERR;
583 
584 		/*
585 		 * Then register another knote to track other potential events
586 		 * from the new process.
587 		 */
588 		kev.ident = pid;
589 		kev.filter = kn->kn_filter;
590 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
591 		kev.fflags = kn->kn_sfflags;
592 		kev.data = kn->kn_id;		/* parent */
593 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
594 		error = kqueue_register(kq, &kev, NULL, 0);
595 		if (error)
596 			kn->kn_fflags |= NOTE_TRACKERR;
597 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
598 			KNOTE_ACTIVATE(kn, 0);
599 		KQ_LOCK(kq);
600 		kn_leave_flux(kn);
601 		KQ_UNLOCK_FLUX(kq);
602 		list->kl_lock(list->kl_lockarg);
603 	}
604 	list->kl_unlock(list->kl_lockarg);
605 }
606 
607 /*
608  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
609  * interval timer support code.
610  */
611 
612 #define NOTE_TIMER_PRECMASK						\
613     (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
614 
615 static sbintime_t
616 timer2sbintime(intptr_t data, int flags)
617 {
618 	int64_t secs;
619 
620         /*
621          * Macros for converting to the fractional second portion of an
622          * sbintime_t using 64bit multiplication to improve precision.
623          */
624 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
625 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
626 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
627 	switch (flags & NOTE_TIMER_PRECMASK) {
628 	case NOTE_SECONDS:
629 #ifdef __LP64__
630 		if (data > (SBT_MAX / SBT_1S))
631 			return (SBT_MAX);
632 #endif
633 		return ((sbintime_t)data << 32);
634 	case NOTE_MSECONDS: /* FALLTHROUGH */
635 	case 0:
636 		if (data >= 1000) {
637 			secs = data / 1000;
638 #ifdef __LP64__
639 			if (secs > (SBT_MAX / SBT_1S))
640 				return (SBT_MAX);
641 #endif
642 			return (secs << 32 | MS_TO_SBT(data % 1000));
643 		}
644 		return (MS_TO_SBT(data));
645 	case NOTE_USECONDS:
646 		if (data >= 1000000) {
647 			secs = data / 1000000;
648 #ifdef __LP64__
649 			if (secs > (SBT_MAX / SBT_1S))
650 				return (SBT_MAX);
651 #endif
652 			return (secs << 32 | US_TO_SBT(data % 1000000));
653 		}
654 		return (US_TO_SBT(data));
655 	case NOTE_NSECONDS:
656 		if (data >= 1000000000) {
657 			secs = data / 1000000000;
658 #ifdef __LP64__
659 			if (secs > (SBT_MAX / SBT_1S))
660 				return (SBT_MAX);
661 #endif
662 			return (secs << 32 | US_TO_SBT(data % 1000000000));
663 		}
664 		return (NS_TO_SBT(data));
665 	default:
666 		break;
667 	}
668 	return (-1);
669 }
670 
671 struct kq_timer_cb_data {
672 	struct callout c;
673 	sbintime_t next;	/* next timer event fires at */
674 	sbintime_t to;		/* precalculated timer period, 0 for abs */
675 };
676 
677 static void
678 filt_timerexpire(void *knx)
679 {
680 	struct knote *kn;
681 	struct kq_timer_cb_data *kc;
682 
683 	kn = knx;
684 	kn->kn_data++;
685 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
686 
687 	if ((kn->kn_flags & EV_ONESHOT) != 0)
688 		return;
689 	kc = kn->kn_ptr.p_v;
690 	if (kc->to == 0)
691 		return;
692 	kc->next += kc->to;
693 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
694 	    PCPU_GET(cpuid), C_ABSOLUTE);
695 }
696 
697 /*
698  * data contains amount of time to sleep
699  */
700 static int
701 filt_timerattach(struct knote *kn)
702 {
703 	struct kq_timer_cb_data *kc;
704 	struct bintime bt;
705 	sbintime_t to, sbt;
706 	unsigned int ncallouts;
707 
708 	if (kn->kn_sdata < 0)
709 		return (EINVAL);
710 	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
711 		kn->kn_sdata = 1;
712 	/* Only precision unit are supported in flags so far */
713 	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
714 		return (EINVAL);
715 
716 	to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
717 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
718 		getboottimebin(&bt);
719 		sbt = bttosbt(bt);
720 		to -= sbt;
721 	}
722 	if (to < 0)
723 		return (EINVAL);
724 
725 	do {
726 		ncallouts = kq_ncallouts;
727 		if (ncallouts >= kq_calloutmax)
728 			return (ENOMEM);
729 	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
730 
731 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
732 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
733 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
734 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
735 	callout_init(&kc->c, 1);
736 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
737 		kc->next = to;
738 		kc->to = 0;
739 	} else {
740 		kc->next = to + sbinuptime();
741 		kc->to = to;
742 	}
743 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
744 	    PCPU_GET(cpuid), C_ABSOLUTE);
745 
746 	return (0);
747 }
748 
749 static void
750 filt_timerdetach(struct knote *kn)
751 {
752 	struct kq_timer_cb_data *kc;
753 	unsigned int old;
754 
755 	kc = kn->kn_ptr.p_v;
756 	callout_drain(&kc->c);
757 	free(kc, M_KQUEUE);
758 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
759 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
760 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
761 }
762 
763 static int
764 filt_timer(struct knote *kn, long hint)
765 {
766 
767 	return (kn->kn_data != 0);
768 }
769 
770 static int
771 filt_userattach(struct knote *kn)
772 {
773 
774 	/*
775 	 * EVFILT_USER knotes are not attached to anything in the kernel.
776 	 */
777 	kn->kn_hook = NULL;
778 	if (kn->kn_fflags & NOTE_TRIGGER)
779 		kn->kn_hookid = 1;
780 	else
781 		kn->kn_hookid = 0;
782 	return (0);
783 }
784 
785 static void
786 filt_userdetach(__unused struct knote *kn)
787 {
788 
789 	/*
790 	 * EVFILT_USER knotes are not attached to anything in the kernel.
791 	 */
792 }
793 
794 static int
795 filt_user(struct knote *kn, __unused long hint)
796 {
797 
798 	return (kn->kn_hookid);
799 }
800 
801 static void
802 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
803 {
804 	u_int ffctrl;
805 
806 	switch (type) {
807 	case EVENT_REGISTER:
808 		if (kev->fflags & NOTE_TRIGGER)
809 			kn->kn_hookid = 1;
810 
811 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
812 		kev->fflags &= NOTE_FFLAGSMASK;
813 		switch (ffctrl) {
814 		case NOTE_FFNOP:
815 			break;
816 
817 		case NOTE_FFAND:
818 			kn->kn_sfflags &= kev->fflags;
819 			break;
820 
821 		case NOTE_FFOR:
822 			kn->kn_sfflags |= kev->fflags;
823 			break;
824 
825 		case NOTE_FFCOPY:
826 			kn->kn_sfflags = kev->fflags;
827 			break;
828 
829 		default:
830 			/* XXX Return error? */
831 			break;
832 		}
833 		kn->kn_sdata = kev->data;
834 		if (kev->flags & EV_CLEAR) {
835 			kn->kn_hookid = 0;
836 			kn->kn_data = 0;
837 			kn->kn_fflags = 0;
838 		}
839 		break;
840 
841         case EVENT_PROCESS:
842 		*kev = kn->kn_kevent;
843 		kev->fflags = kn->kn_sfflags;
844 		kev->data = kn->kn_sdata;
845 		if (kn->kn_flags & EV_CLEAR) {
846 			kn->kn_hookid = 0;
847 			kn->kn_data = 0;
848 			kn->kn_fflags = 0;
849 		}
850 		break;
851 
852 	default:
853 		panic("filt_usertouch() - invalid type (%ld)", type);
854 		break;
855 	}
856 }
857 
858 int
859 sys_kqueue(struct thread *td, struct kqueue_args *uap)
860 {
861 
862 	return (kern_kqueue(td, 0, NULL));
863 }
864 
865 static void
866 kqueue_init(struct kqueue *kq)
867 {
868 
869 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
870 	TAILQ_INIT(&kq->kq_head);
871 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
872 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
873 }
874 
875 int
876 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
877 {
878 	struct filedesc *fdp;
879 	struct kqueue *kq;
880 	struct file *fp;
881 	struct ucred *cred;
882 	int fd, error;
883 
884 	fdp = td->td_proc->p_fd;
885 	cred = td->td_ucred;
886 	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
887 		return (ENOMEM);
888 
889 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
890 	if (error != 0) {
891 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
892 		return (error);
893 	}
894 
895 	/* An extra reference on `fp' has been held for us by falloc(). */
896 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
897 	kqueue_init(kq);
898 	kq->kq_fdp = fdp;
899 	kq->kq_cred = crhold(cred);
900 
901 	FILEDESC_XLOCK(fdp);
902 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
903 	FILEDESC_XUNLOCK(fdp);
904 
905 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
906 	fdrop(fp, td);
907 
908 	td->td_retval[0] = fd;
909 	return (0);
910 }
911 
912 struct g_kevent_args {
913 	int	fd;
914 	void	*changelist;
915 	int	nchanges;
916 	void	*eventlist;
917 	int	nevents;
918 	const struct timespec *timeout;
919 };
920 
921 int
922 sys_kevent(struct thread *td, struct kevent_args *uap)
923 {
924 	struct kevent_copyops k_ops = {
925 		.arg = uap,
926 		.k_copyout = kevent_copyout,
927 		.k_copyin = kevent_copyin,
928 		.kevent_size = sizeof(struct kevent),
929 	};
930 	struct g_kevent_args gk_args = {
931 		.fd = uap->fd,
932 		.changelist = uap->changelist,
933 		.nchanges = uap->nchanges,
934 		.eventlist = uap->eventlist,
935 		.nevents = uap->nevents,
936 		.timeout = uap->timeout,
937 	};
938 
939 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
940 }
941 
942 static int
943 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
944     struct kevent_copyops *k_ops, const char *struct_name)
945 {
946 	struct timespec ts, *tsp;
947 #ifdef KTRACE
948 	struct kevent *eventlist = uap->eventlist;
949 #endif
950 	int error;
951 
952 	if (uap->timeout != NULL) {
953 		error = copyin(uap->timeout, &ts, sizeof(ts));
954 		if (error)
955 			return (error);
956 		tsp = &ts;
957 	} else
958 		tsp = NULL;
959 
960 #ifdef KTRACE
961 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
962 		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
963 		    uap->nchanges, k_ops->kevent_size);
964 #endif
965 
966 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
967 	    k_ops, tsp);
968 
969 #ifdef KTRACE
970 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
971 		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
972 		    td->td_retval[0], k_ops->kevent_size);
973 #endif
974 
975 	return (error);
976 }
977 
978 /*
979  * Copy 'count' items into the destination list pointed to by uap->eventlist.
980  */
981 static int
982 kevent_copyout(void *arg, struct kevent *kevp, int count)
983 {
984 	struct kevent_args *uap;
985 	int error;
986 
987 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
988 	uap = (struct kevent_args *)arg;
989 
990 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
991 	if (error == 0)
992 		uap->eventlist += count;
993 	return (error);
994 }
995 
996 /*
997  * Copy 'count' items from the list pointed to by uap->changelist.
998  */
999 static int
1000 kevent_copyin(void *arg, struct kevent *kevp, int count)
1001 {
1002 	struct kevent_args *uap;
1003 	int error;
1004 
1005 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1006 	uap = (struct kevent_args *)arg;
1007 
1008 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1009 	if (error == 0)
1010 		uap->changelist += count;
1011 	return (error);
1012 }
1013 
1014 #ifdef COMPAT_FREEBSD11
1015 static int
1016 kevent11_copyout(void *arg, struct kevent *kevp, int count)
1017 {
1018 	struct freebsd11_kevent_args *uap;
1019 	struct kevent_freebsd11 kev11;
1020 	int error, i;
1021 
1022 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1023 	uap = (struct freebsd11_kevent_args *)arg;
1024 
1025 	for (i = 0; i < count; i++) {
1026 		kev11.ident = kevp->ident;
1027 		kev11.filter = kevp->filter;
1028 		kev11.flags = kevp->flags;
1029 		kev11.fflags = kevp->fflags;
1030 		kev11.data = kevp->data;
1031 		kev11.udata = kevp->udata;
1032 		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1033 		if (error != 0)
1034 			break;
1035 		uap->eventlist++;
1036 		kevp++;
1037 	}
1038 	return (error);
1039 }
1040 
1041 /*
1042  * Copy 'count' items from the list pointed to by uap->changelist.
1043  */
1044 static int
1045 kevent11_copyin(void *arg, struct kevent *kevp, int count)
1046 {
1047 	struct freebsd11_kevent_args *uap;
1048 	struct kevent_freebsd11 kev11;
1049 	int error, i;
1050 
1051 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1052 	uap = (struct freebsd11_kevent_args *)arg;
1053 
1054 	for (i = 0; i < count; i++) {
1055 		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1056 		if (error != 0)
1057 			break;
1058 		kevp->ident = kev11.ident;
1059 		kevp->filter = kev11.filter;
1060 		kevp->flags = kev11.flags;
1061 		kevp->fflags = kev11.fflags;
1062 		kevp->data = (uintptr_t)kev11.data;
1063 		kevp->udata = kev11.udata;
1064 		bzero(&kevp->ext, sizeof(kevp->ext));
1065 		uap->changelist++;
1066 		kevp++;
1067 	}
1068 	return (error);
1069 }
1070 
1071 int
1072 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1073 {
1074 	struct kevent_copyops k_ops = {
1075 		.arg = uap,
1076 		.k_copyout = kevent11_copyout,
1077 		.k_copyin = kevent11_copyin,
1078 		.kevent_size = sizeof(struct kevent_freebsd11),
1079 	};
1080 	struct g_kevent_args gk_args = {
1081 		.fd = uap->fd,
1082 		.changelist = uap->changelist,
1083 		.nchanges = uap->nchanges,
1084 		.eventlist = uap->eventlist,
1085 		.nevents = uap->nevents,
1086 		.timeout = uap->timeout,
1087 	};
1088 
1089 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
1090 }
1091 #endif
1092 
1093 int
1094 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1095     struct kevent_copyops *k_ops, const struct timespec *timeout)
1096 {
1097 	cap_rights_t rights;
1098 	struct file *fp;
1099 	int error;
1100 
1101 	cap_rights_init(&rights);
1102 	if (nchanges > 0)
1103 		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
1104 	if (nevents > 0)
1105 		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
1106 	error = fget(td, fd, &rights, &fp);
1107 	if (error != 0)
1108 		return (error);
1109 
1110 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1111 	fdrop(fp, td);
1112 
1113 	return (error);
1114 }
1115 
1116 static int
1117 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1118     struct kevent_copyops *k_ops, const struct timespec *timeout)
1119 {
1120 	struct kevent keva[KQ_NEVENTS];
1121 	struct kevent *kevp, *changes;
1122 	int i, n, nerrors, error;
1123 
1124 	nerrors = 0;
1125 	while (nchanges > 0) {
1126 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1127 		error = k_ops->k_copyin(k_ops->arg, keva, n);
1128 		if (error)
1129 			return (error);
1130 		changes = keva;
1131 		for (i = 0; i < n; i++) {
1132 			kevp = &changes[i];
1133 			if (!kevp->filter)
1134 				continue;
1135 			kevp->flags &= ~EV_SYSFLAGS;
1136 			error = kqueue_register(kq, kevp, td, 1);
1137 			if (error || (kevp->flags & EV_RECEIPT)) {
1138 				if (nevents == 0)
1139 					return (error);
1140 				kevp->flags = EV_ERROR;
1141 				kevp->data = error;
1142 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1143 				nevents--;
1144 				nerrors++;
1145 			}
1146 		}
1147 		nchanges -= n;
1148 	}
1149 	if (nerrors) {
1150 		td->td_retval[0] = nerrors;
1151 		return (0);
1152 	}
1153 
1154 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1155 }
1156 
1157 int
1158 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1159     struct kevent_copyops *k_ops, const struct timespec *timeout)
1160 {
1161 	struct kqueue *kq;
1162 	int error;
1163 
1164 	error = kqueue_acquire(fp, &kq);
1165 	if (error != 0)
1166 		return (error);
1167 	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1168 	kqueue_release(kq, 0);
1169 	return (error);
1170 }
1171 
1172 /*
1173  * Performs a kevent() call on a temporarily created kqueue. This can be
1174  * used to perform one-shot polling, similar to poll() and select().
1175  */
1176 int
1177 kern_kevent_anonymous(struct thread *td, int nevents,
1178     struct kevent_copyops *k_ops)
1179 {
1180 	struct kqueue kq = {};
1181 	int error;
1182 
1183 	kqueue_init(&kq);
1184 	kq.kq_refcnt = 1;
1185 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1186 	kqueue_drain(&kq, td);
1187 	kqueue_destroy(&kq);
1188 	return (error);
1189 }
1190 
1191 int
1192 kqueue_add_filteropts(int filt, struct filterops *filtops)
1193 {
1194 	int error;
1195 
1196 	error = 0;
1197 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1198 		printf(
1199 "trying to add a filterop that is out of range: %d is beyond %d\n",
1200 		    ~filt, EVFILT_SYSCOUNT);
1201 		return EINVAL;
1202 	}
1203 	mtx_lock(&filterops_lock);
1204 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1205 	    sysfilt_ops[~filt].for_fop != NULL)
1206 		error = EEXIST;
1207 	else {
1208 		sysfilt_ops[~filt].for_fop = filtops;
1209 		sysfilt_ops[~filt].for_refcnt = 0;
1210 	}
1211 	mtx_unlock(&filterops_lock);
1212 
1213 	return (error);
1214 }
1215 
1216 int
1217 kqueue_del_filteropts(int filt)
1218 {
1219 	int error;
1220 
1221 	error = 0;
1222 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1223 		return EINVAL;
1224 
1225 	mtx_lock(&filterops_lock);
1226 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1227 	    sysfilt_ops[~filt].for_fop == NULL)
1228 		error = EINVAL;
1229 	else if (sysfilt_ops[~filt].for_refcnt != 0)
1230 		error = EBUSY;
1231 	else {
1232 		sysfilt_ops[~filt].for_fop = &null_filtops;
1233 		sysfilt_ops[~filt].for_refcnt = 0;
1234 	}
1235 	mtx_unlock(&filterops_lock);
1236 
1237 	return error;
1238 }
1239 
1240 static struct filterops *
1241 kqueue_fo_find(int filt)
1242 {
1243 
1244 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1245 		return NULL;
1246 
1247 	if (sysfilt_ops[~filt].for_nolock)
1248 		return sysfilt_ops[~filt].for_fop;
1249 
1250 	mtx_lock(&filterops_lock);
1251 	sysfilt_ops[~filt].for_refcnt++;
1252 	if (sysfilt_ops[~filt].for_fop == NULL)
1253 		sysfilt_ops[~filt].for_fop = &null_filtops;
1254 	mtx_unlock(&filterops_lock);
1255 
1256 	return sysfilt_ops[~filt].for_fop;
1257 }
1258 
1259 static void
1260 kqueue_fo_release(int filt)
1261 {
1262 
1263 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1264 		return;
1265 
1266 	if (sysfilt_ops[~filt].for_nolock)
1267 		return;
1268 
1269 	mtx_lock(&filterops_lock);
1270 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1271 	    ("filter object refcount not valid on release"));
1272 	sysfilt_ops[~filt].for_refcnt--;
1273 	mtx_unlock(&filterops_lock);
1274 }
1275 
1276 /*
1277  * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
1278  * influence if memory allocation should wait.  Make sure it is 0 if you
1279  * hold any mutexes.
1280  */
1281 static int
1282 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
1283 {
1284 	struct filterops *fops;
1285 	struct file *fp;
1286 	struct knote *kn, *tkn;
1287 	struct knlist *knl;
1288 	cap_rights_t rights;
1289 	int error, filt, event;
1290 	int haskqglobal, filedesc_unlock;
1291 
1292 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1293 		return (EINVAL);
1294 
1295 	fp = NULL;
1296 	kn = NULL;
1297 	knl = NULL;
1298 	error = 0;
1299 	haskqglobal = 0;
1300 	filedesc_unlock = 0;
1301 
1302 	filt = kev->filter;
1303 	fops = kqueue_fo_find(filt);
1304 	if (fops == NULL)
1305 		return EINVAL;
1306 
1307 	if (kev->flags & EV_ADD) {
1308 		/*
1309 		 * Prevent waiting with locks.  Non-sleepable
1310 		 * allocation failures are handled in the loop, only
1311 		 * if the spare knote appears to be actually required.
1312 		 */
1313 		tkn = knote_alloc(waitok);
1314 	} else {
1315 		tkn = NULL;
1316 	}
1317 
1318 findkn:
1319 	if (fops->f_isfd) {
1320 		KASSERT(td != NULL, ("td is NULL"));
1321 		if (kev->ident > INT_MAX)
1322 			error = EBADF;
1323 		else
1324 			error = fget(td, kev->ident,
1325 			    cap_rights_init(&rights, CAP_EVENT), &fp);
1326 		if (error)
1327 			goto done;
1328 
1329 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1330 		    kev->ident, 0) != 0) {
1331 			/* try again */
1332 			fdrop(fp, td);
1333 			fp = NULL;
1334 			error = kqueue_expand(kq, fops, kev->ident, waitok);
1335 			if (error)
1336 				goto done;
1337 			goto findkn;
1338 		}
1339 
1340 		if (fp->f_type == DTYPE_KQUEUE) {
1341 			/*
1342 			 * If we add some intelligence about what we are doing,
1343 			 * we should be able to support events on ourselves.
1344 			 * We need to know when we are doing this to prevent
1345 			 * getting both the knlist lock and the kq lock since
1346 			 * they are the same thing.
1347 			 */
1348 			if (fp->f_data == kq) {
1349 				error = EINVAL;
1350 				goto done;
1351 			}
1352 
1353 			/*
1354 			 * Pre-lock the filedesc before the global
1355 			 * lock mutex, see the comment in
1356 			 * kqueue_close().
1357 			 */
1358 			FILEDESC_XLOCK(td->td_proc->p_fd);
1359 			filedesc_unlock = 1;
1360 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1361 		}
1362 
1363 		KQ_LOCK(kq);
1364 		if (kev->ident < kq->kq_knlistsize) {
1365 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1366 				if (kev->filter == kn->kn_filter)
1367 					break;
1368 		}
1369 	} else {
1370 		if ((kev->flags & EV_ADD) == EV_ADD)
1371 			kqueue_expand(kq, fops, kev->ident, waitok);
1372 
1373 		KQ_LOCK(kq);
1374 
1375 		/*
1376 		 * If possible, find an existing knote to use for this kevent.
1377 		 */
1378 		if (kev->filter == EVFILT_PROC &&
1379 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
1380 			/* This is an internal creation of a process tracking
1381 			 * note. Don't attempt to coalesce this with an
1382 			 * existing note.
1383 			 */
1384 			;
1385 		} else if (kq->kq_knhashmask != 0) {
1386 			struct klist *list;
1387 
1388 			list = &kq->kq_knhash[
1389 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1390 			SLIST_FOREACH(kn, list, kn_link)
1391 				if (kev->ident == kn->kn_id &&
1392 				    kev->filter == kn->kn_filter)
1393 					break;
1394 		}
1395 	}
1396 
1397 	/* knote is in the process of changing, wait for it to stabilize. */
1398 	if (kn != NULL && kn_in_flux(kn)) {
1399 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1400 		if (filedesc_unlock) {
1401 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1402 			filedesc_unlock = 0;
1403 		}
1404 		kq->kq_state |= KQ_FLUXWAIT;
1405 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1406 		if (fp != NULL) {
1407 			fdrop(fp, td);
1408 			fp = NULL;
1409 		}
1410 		goto findkn;
1411 	}
1412 
1413 	/*
1414 	 * kn now contains the matching knote, or NULL if no match
1415 	 */
1416 	if (kn == NULL) {
1417 		if (kev->flags & EV_ADD) {
1418 			kn = tkn;
1419 			tkn = NULL;
1420 			if (kn == NULL) {
1421 				KQ_UNLOCK(kq);
1422 				error = ENOMEM;
1423 				goto done;
1424 			}
1425 			kn->kn_fp = fp;
1426 			kn->kn_kq = kq;
1427 			kn->kn_fop = fops;
1428 			/*
1429 			 * apply reference counts to knote structure, and
1430 			 * do not release it at the end of this routine.
1431 			 */
1432 			fops = NULL;
1433 			fp = NULL;
1434 
1435 			kn->kn_sfflags = kev->fflags;
1436 			kn->kn_sdata = kev->data;
1437 			kev->fflags = 0;
1438 			kev->data = 0;
1439 			kn->kn_kevent = *kev;
1440 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1441 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1442 			kn->kn_status = KN_DETACHED;
1443 			kn_enter_flux(kn);
1444 
1445 			error = knote_attach(kn, kq);
1446 			KQ_UNLOCK(kq);
1447 			if (error != 0) {
1448 				tkn = kn;
1449 				goto done;
1450 			}
1451 
1452 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1453 				knote_drop_detached(kn, td);
1454 				goto done;
1455 			}
1456 			knl = kn_list_lock(kn);
1457 			goto done_ev_add;
1458 		} else {
1459 			/* No matching knote and the EV_ADD flag is not set. */
1460 			KQ_UNLOCK(kq);
1461 			error = ENOENT;
1462 			goto done;
1463 		}
1464 	}
1465 
1466 	if (kev->flags & EV_DELETE) {
1467 		kn_enter_flux(kn);
1468 		KQ_UNLOCK(kq);
1469 		knote_drop(kn, td);
1470 		goto done;
1471 	}
1472 
1473 	if (kev->flags & EV_FORCEONESHOT) {
1474 		kn->kn_flags |= EV_ONESHOT;
1475 		KNOTE_ACTIVATE(kn, 1);
1476 	}
1477 
1478 	/*
1479 	 * The user may change some filter values after the initial EV_ADD,
1480 	 * but doing so will not reset any filter which has already been
1481 	 * triggered.
1482 	 */
1483 	kn->kn_status |= KN_SCAN;
1484 	kn_enter_flux(kn);
1485 	KQ_UNLOCK(kq);
1486 	knl = kn_list_lock(kn);
1487 	kn->kn_kevent.udata = kev->udata;
1488 	if (!fops->f_isfd && fops->f_touch != NULL) {
1489 		fops->f_touch(kn, kev, EVENT_REGISTER);
1490 	} else {
1491 		kn->kn_sfflags = kev->fflags;
1492 		kn->kn_sdata = kev->data;
1493 	}
1494 
1495 	/*
1496 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1497 	 * the initial attach event decides that the event is "completed"
1498 	 * already.  i.e. filt_procattach is called on a zombie process.  It
1499 	 * will call filt_proc which will remove it from the list, and NULL
1500 	 * kn_knlist.
1501 	 */
1502 done_ev_add:
1503 	if ((kev->flags & EV_ENABLE) != 0)
1504 		kn->kn_status &= ~KN_DISABLED;
1505 	else if ((kev->flags & EV_DISABLE) != 0)
1506 		kn->kn_status |= KN_DISABLED;
1507 
1508 	if ((kn->kn_status & KN_DISABLED) == 0)
1509 		event = kn->kn_fop->f_event(kn, 0);
1510 	else
1511 		event = 0;
1512 
1513 	KQ_LOCK(kq);
1514 	if (event)
1515 		kn->kn_status |= KN_ACTIVE;
1516 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1517 	    KN_ACTIVE)
1518 		knote_enqueue(kn);
1519 	kn->kn_status &= ~KN_SCAN;
1520 	kn_leave_flux(kn);
1521 	kn_list_unlock(knl);
1522 	KQ_UNLOCK_FLUX(kq);
1523 
1524 done:
1525 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1526 	if (filedesc_unlock)
1527 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1528 	if (fp != NULL)
1529 		fdrop(fp, td);
1530 	knote_free(tkn);
1531 	if (fops != NULL)
1532 		kqueue_fo_release(filt);
1533 	return (error);
1534 }
1535 
1536 static int
1537 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1538 {
1539 	int error;
1540 	struct kqueue *kq;
1541 
1542 	error = 0;
1543 
1544 	kq = fp->f_data;
1545 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1546 		return (EBADF);
1547 	*kqp = kq;
1548 	KQ_LOCK(kq);
1549 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1550 		KQ_UNLOCK(kq);
1551 		return (EBADF);
1552 	}
1553 	kq->kq_refcnt++;
1554 	KQ_UNLOCK(kq);
1555 
1556 	return error;
1557 }
1558 
1559 static void
1560 kqueue_release(struct kqueue *kq, int locked)
1561 {
1562 	if (locked)
1563 		KQ_OWNED(kq);
1564 	else
1565 		KQ_LOCK(kq);
1566 	kq->kq_refcnt--;
1567 	if (kq->kq_refcnt == 1)
1568 		wakeup(&kq->kq_refcnt);
1569 	if (!locked)
1570 		KQ_UNLOCK(kq);
1571 }
1572 
1573 static void
1574 kqueue_schedtask(struct kqueue *kq)
1575 {
1576 
1577 	KQ_OWNED(kq);
1578 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1579 	    ("scheduling kqueue task while draining"));
1580 
1581 	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1582 		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1583 		kq->kq_state |= KQ_TASKSCHED;
1584 	}
1585 }
1586 
1587 /*
1588  * Expand the kq to make sure we have storage for fops/ident pair.
1589  *
1590  * Return 0 on success (or no work necessary), return errno on failure.
1591  *
1592  * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1593  * If kqueue_register is called from a non-fd context, there usually/should
1594  * be no locks held.
1595  */
1596 static int
1597 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1598 	int waitok)
1599 {
1600 	struct klist *list, *tmp_knhash, *to_free;
1601 	u_long tmp_knhashmask;
1602 	int size;
1603 	int fd;
1604 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
1605 
1606 	KQ_NOTOWNED(kq);
1607 
1608 	to_free = NULL;
1609 	if (fops->f_isfd) {
1610 		fd = ident;
1611 		if (kq->kq_knlistsize <= fd) {
1612 			size = kq->kq_knlistsize;
1613 			while (size <= fd)
1614 				size += KQEXTENT;
1615 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1616 			if (list == NULL)
1617 				return ENOMEM;
1618 			KQ_LOCK(kq);
1619 			if (kq->kq_knlistsize > fd) {
1620 				to_free = list;
1621 				list = NULL;
1622 			} else {
1623 				if (kq->kq_knlist != NULL) {
1624 					bcopy(kq->kq_knlist, list,
1625 					    kq->kq_knlistsize * sizeof(*list));
1626 					to_free = kq->kq_knlist;
1627 					kq->kq_knlist = NULL;
1628 				}
1629 				bzero((caddr_t)list +
1630 				    kq->kq_knlistsize * sizeof(*list),
1631 				    (size - kq->kq_knlistsize) * sizeof(*list));
1632 				kq->kq_knlistsize = size;
1633 				kq->kq_knlist = list;
1634 			}
1635 			KQ_UNLOCK(kq);
1636 		}
1637 	} else {
1638 		if (kq->kq_knhashmask == 0) {
1639 			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1640 			    &tmp_knhashmask);
1641 			if (tmp_knhash == NULL)
1642 				return ENOMEM;
1643 			KQ_LOCK(kq);
1644 			if (kq->kq_knhashmask == 0) {
1645 				kq->kq_knhash = tmp_knhash;
1646 				kq->kq_knhashmask = tmp_knhashmask;
1647 			} else {
1648 				to_free = tmp_knhash;
1649 			}
1650 			KQ_UNLOCK(kq);
1651 		}
1652 	}
1653 	free(to_free, M_KQUEUE);
1654 
1655 	KQ_NOTOWNED(kq);
1656 	return 0;
1657 }
1658 
1659 static void
1660 kqueue_task(void *arg, int pending)
1661 {
1662 	struct kqueue *kq;
1663 	int haskqglobal;
1664 
1665 	haskqglobal = 0;
1666 	kq = arg;
1667 
1668 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1669 	KQ_LOCK(kq);
1670 
1671 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1672 
1673 	kq->kq_state &= ~KQ_TASKSCHED;
1674 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1675 		wakeup(&kq->kq_state);
1676 	}
1677 	KQ_UNLOCK(kq);
1678 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1679 }
1680 
1681 /*
1682  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1683  * We treat KN_MARKER knotes as if they are in flux.
1684  */
1685 static int
1686 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1687     const struct timespec *tsp, struct kevent *keva, struct thread *td)
1688 {
1689 	struct kevent *kevp;
1690 	struct knote *kn, *marker;
1691 	struct knlist *knl;
1692 	sbintime_t asbt, rsbt;
1693 	int count, error, haskqglobal, influx, nkev, touch;
1694 
1695 	count = maxevents;
1696 	nkev = 0;
1697 	error = 0;
1698 	haskqglobal = 0;
1699 
1700 	if (maxevents == 0)
1701 		goto done_nl;
1702 
1703 	rsbt = 0;
1704 	if (tsp != NULL) {
1705 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
1706 		    tsp->tv_nsec >= 1000000000) {
1707 			error = EINVAL;
1708 			goto done_nl;
1709 		}
1710 		if (timespecisset(tsp)) {
1711 			if (tsp->tv_sec <= INT32_MAX) {
1712 				rsbt = tstosbt(*tsp);
1713 				if (TIMESEL(&asbt, rsbt))
1714 					asbt += tc_tick_sbt;
1715 				if (asbt <= SBT_MAX - rsbt)
1716 					asbt += rsbt;
1717 				else
1718 					asbt = 0;
1719 				rsbt >>= tc_precexp;
1720 			} else
1721 				asbt = 0;
1722 		} else
1723 			asbt = -1;
1724 	} else
1725 		asbt = 0;
1726 	marker = knote_alloc(1);
1727 	marker->kn_status = KN_MARKER;
1728 	KQ_LOCK(kq);
1729 
1730 retry:
1731 	kevp = keva;
1732 	if (kq->kq_count == 0) {
1733 		if (asbt == -1) {
1734 			error = EWOULDBLOCK;
1735 		} else {
1736 			kq->kq_state |= KQ_SLEEP;
1737 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1738 			    "kqread", asbt, rsbt, C_ABSOLUTE);
1739 		}
1740 		if (error == 0)
1741 			goto retry;
1742 		/* don't restart after signals... */
1743 		if (error == ERESTART)
1744 			error = EINTR;
1745 		else if (error == EWOULDBLOCK)
1746 			error = 0;
1747 		goto done;
1748 	}
1749 
1750 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1751 	influx = 0;
1752 	while (count) {
1753 		KQ_OWNED(kq);
1754 		kn = TAILQ_FIRST(&kq->kq_head);
1755 
1756 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1757 		    kn_in_flux(kn)) {
1758 			if (influx) {
1759 				influx = 0;
1760 				KQ_FLUX_WAKEUP(kq);
1761 			}
1762 			kq->kq_state |= KQ_FLUXWAIT;
1763 			error = msleep(kq, &kq->kq_lock, PSOCK,
1764 			    "kqflxwt", 0);
1765 			continue;
1766 		}
1767 
1768 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1769 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1770 			kn->kn_status &= ~KN_QUEUED;
1771 			kq->kq_count--;
1772 			continue;
1773 		}
1774 		if (kn == marker) {
1775 			KQ_FLUX_WAKEUP(kq);
1776 			if (count == maxevents)
1777 				goto retry;
1778 			goto done;
1779 		}
1780 		KASSERT(!kn_in_flux(kn),
1781 		    ("knote %p is unexpectedly in flux", kn));
1782 
1783 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1784 			kn->kn_status &= ~KN_QUEUED;
1785 			kn_enter_flux(kn);
1786 			kq->kq_count--;
1787 			KQ_UNLOCK(kq);
1788 			/*
1789 			 * We don't need to lock the list since we've
1790 			 * marked it as in flux.
1791 			 */
1792 			knote_drop(kn, td);
1793 			KQ_LOCK(kq);
1794 			continue;
1795 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1796 			kn->kn_status &= ~KN_QUEUED;
1797 			kn_enter_flux(kn);
1798 			kq->kq_count--;
1799 			KQ_UNLOCK(kq);
1800 			/*
1801 			 * We don't need to lock the list since we've
1802 			 * marked the knote as being in flux.
1803 			 */
1804 			*kevp = kn->kn_kevent;
1805 			knote_drop(kn, td);
1806 			KQ_LOCK(kq);
1807 			kn = NULL;
1808 		} else {
1809 			kn->kn_status |= KN_SCAN;
1810 			kn_enter_flux(kn);
1811 			KQ_UNLOCK(kq);
1812 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1813 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1814 			knl = kn_list_lock(kn);
1815 			if (kn->kn_fop->f_event(kn, 0) == 0) {
1816 				KQ_LOCK(kq);
1817 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1818 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
1819 				    KN_SCAN);
1820 				kn_leave_flux(kn);
1821 				kq->kq_count--;
1822 				kn_list_unlock(knl);
1823 				influx = 1;
1824 				continue;
1825 			}
1826 			touch = (!kn->kn_fop->f_isfd &&
1827 			    kn->kn_fop->f_touch != NULL);
1828 			if (touch)
1829 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1830 			else
1831 				*kevp = kn->kn_kevent;
1832 			KQ_LOCK(kq);
1833 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1834 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1835 				/*
1836 				 * Manually clear knotes who weren't
1837 				 * 'touch'ed.
1838 				 */
1839 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1840 					kn->kn_data = 0;
1841 					kn->kn_fflags = 0;
1842 				}
1843 				if (kn->kn_flags & EV_DISPATCH)
1844 					kn->kn_status |= KN_DISABLED;
1845 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1846 				kq->kq_count--;
1847 			} else
1848 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1849 
1850 			kn->kn_status &= ~KN_SCAN;
1851 			kn_leave_flux(kn);
1852 			kn_list_unlock(knl);
1853 			influx = 1;
1854 		}
1855 
1856 		/* we are returning a copy to the user */
1857 		kevp++;
1858 		nkev++;
1859 		count--;
1860 
1861 		if (nkev == KQ_NEVENTS) {
1862 			influx = 0;
1863 			KQ_UNLOCK_FLUX(kq);
1864 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1865 			nkev = 0;
1866 			kevp = keva;
1867 			KQ_LOCK(kq);
1868 			if (error)
1869 				break;
1870 		}
1871 	}
1872 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1873 done:
1874 	KQ_OWNED(kq);
1875 	KQ_UNLOCK_FLUX(kq);
1876 	knote_free(marker);
1877 done_nl:
1878 	KQ_NOTOWNED(kq);
1879 	if (nkev != 0)
1880 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1881 	td->td_retval[0] = maxevents - count;
1882 	return (error);
1883 }
1884 
1885 /*ARGSUSED*/
1886 static int
1887 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1888 	struct ucred *active_cred, struct thread *td)
1889 {
1890 	/*
1891 	 * Enabling sigio causes two major problems:
1892 	 * 1) infinite recursion:
1893 	 * Synopsys: kevent is being used to track signals and have FIOASYNC
1894 	 * set.  On receipt of a signal this will cause a kqueue to recurse
1895 	 * into itself over and over.  Sending the sigio causes the kqueue
1896 	 * to become ready, which in turn posts sigio again, forever.
1897 	 * Solution: this can be solved by setting a flag in the kqueue that
1898 	 * we have a SIGIO in progress.
1899 	 * 2) locking problems:
1900 	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
1901 	 * us above the proc and pgrp locks.
1902 	 * Solution: Post a signal using an async mechanism, being sure to
1903 	 * record a generation count in the delivery so that we do not deliver
1904 	 * a signal to the wrong process.
1905 	 *
1906 	 * Note, these two mechanisms are somewhat mutually exclusive!
1907 	 */
1908 #if 0
1909 	struct kqueue *kq;
1910 
1911 	kq = fp->f_data;
1912 	switch (cmd) {
1913 	case FIOASYNC:
1914 		if (*(int *)data) {
1915 			kq->kq_state |= KQ_ASYNC;
1916 		} else {
1917 			kq->kq_state &= ~KQ_ASYNC;
1918 		}
1919 		return (0);
1920 
1921 	case FIOSETOWN:
1922 		return (fsetown(*(int *)data, &kq->kq_sigio));
1923 
1924 	case FIOGETOWN:
1925 		*(int *)data = fgetown(&kq->kq_sigio);
1926 		return (0);
1927 	}
1928 #endif
1929 
1930 	return (ENOTTY);
1931 }
1932 
1933 /*ARGSUSED*/
1934 static int
1935 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1936 	struct thread *td)
1937 {
1938 	struct kqueue *kq;
1939 	int revents = 0;
1940 	int error;
1941 
1942 	if ((error = kqueue_acquire(fp, &kq)))
1943 		return POLLERR;
1944 
1945 	KQ_LOCK(kq);
1946 	if (events & (POLLIN | POLLRDNORM)) {
1947 		if (kq->kq_count) {
1948 			revents |= events & (POLLIN | POLLRDNORM);
1949 		} else {
1950 			selrecord(td, &kq->kq_sel);
1951 			if (SEL_WAITING(&kq->kq_sel))
1952 				kq->kq_state |= KQ_SEL;
1953 		}
1954 	}
1955 	kqueue_release(kq, 1);
1956 	KQ_UNLOCK(kq);
1957 	return (revents);
1958 }
1959 
1960 /*ARGSUSED*/
1961 static int
1962 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1963 	struct thread *td)
1964 {
1965 
1966 	bzero((void *)st, sizeof *st);
1967 	/*
1968 	 * We no longer return kq_count because the unlocked value is useless.
1969 	 * If you spent all this time getting the count, why not spend your
1970 	 * syscall better by calling kevent?
1971 	 *
1972 	 * XXX - This is needed for libc_r.
1973 	 */
1974 	st->st_mode = S_IFIFO;
1975 	return (0);
1976 }
1977 
1978 static void
1979 kqueue_drain(struct kqueue *kq, struct thread *td)
1980 {
1981 	struct knote *kn;
1982 	int i;
1983 
1984 	KQ_LOCK(kq);
1985 
1986 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1987 	    ("kqueue already closing"));
1988 	kq->kq_state |= KQ_CLOSING;
1989 	if (kq->kq_refcnt > 1)
1990 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1991 
1992 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1993 
1994 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
1995 	    ("kqueue's knlist not empty"));
1996 
1997 	for (i = 0; i < kq->kq_knlistsize; i++) {
1998 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1999 			if (kn_in_flux(kn)) {
2000 				kq->kq_state |= KQ_FLUXWAIT;
2001 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2002 				continue;
2003 			}
2004 			kn_enter_flux(kn);
2005 			KQ_UNLOCK(kq);
2006 			knote_drop(kn, td);
2007 			KQ_LOCK(kq);
2008 		}
2009 	}
2010 	if (kq->kq_knhashmask != 0) {
2011 		for (i = 0; i <= kq->kq_knhashmask; i++) {
2012 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2013 				if (kn_in_flux(kn)) {
2014 					kq->kq_state |= KQ_FLUXWAIT;
2015 					msleep(kq, &kq->kq_lock, PSOCK,
2016 					       "kqclo2", 0);
2017 					continue;
2018 				}
2019 				kn_enter_flux(kn);
2020 				KQ_UNLOCK(kq);
2021 				knote_drop(kn, td);
2022 				KQ_LOCK(kq);
2023 			}
2024 		}
2025 	}
2026 
2027 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2028 		kq->kq_state |= KQ_TASKDRAIN;
2029 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2030 	}
2031 
2032 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2033 		selwakeuppri(&kq->kq_sel, PSOCK);
2034 		if (!SEL_WAITING(&kq->kq_sel))
2035 			kq->kq_state &= ~KQ_SEL;
2036 	}
2037 
2038 	KQ_UNLOCK(kq);
2039 }
2040 
2041 static void
2042 kqueue_destroy(struct kqueue *kq)
2043 {
2044 
2045 	KASSERT(kq->kq_fdp == NULL,
2046 	    ("kqueue still attached to a file descriptor"));
2047 	seldrain(&kq->kq_sel);
2048 	knlist_destroy(&kq->kq_sel.si_note);
2049 	mtx_destroy(&kq->kq_lock);
2050 
2051 	if (kq->kq_knhash != NULL)
2052 		free(kq->kq_knhash, M_KQUEUE);
2053 	if (kq->kq_knlist != NULL)
2054 		free(kq->kq_knlist, M_KQUEUE);
2055 
2056 	funsetown(&kq->kq_sigio);
2057 }
2058 
2059 /*ARGSUSED*/
2060 static int
2061 kqueue_close(struct file *fp, struct thread *td)
2062 {
2063 	struct kqueue *kq = fp->f_data;
2064 	struct filedesc *fdp;
2065 	int error;
2066 	int filedesc_unlock;
2067 
2068 	if ((error = kqueue_acquire(fp, &kq)))
2069 		return error;
2070 	kqueue_drain(kq, td);
2071 
2072 	/*
2073 	 * We could be called due to the knote_drop() doing fdrop(),
2074 	 * called from kqueue_register().  In this case the global
2075 	 * lock is owned, and filedesc sx is locked before, to not
2076 	 * take the sleepable lock after non-sleepable.
2077 	 */
2078 	fdp = kq->kq_fdp;
2079 	kq->kq_fdp = NULL;
2080 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2081 		FILEDESC_XLOCK(fdp);
2082 		filedesc_unlock = 1;
2083 	} else
2084 		filedesc_unlock = 0;
2085 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2086 	if (filedesc_unlock)
2087 		FILEDESC_XUNLOCK(fdp);
2088 
2089 	kqueue_destroy(kq);
2090 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2091 	crfree(kq->kq_cred);
2092 	free(kq, M_KQUEUE);
2093 	fp->f_data = NULL;
2094 
2095 	return (0);
2096 }
2097 
2098 static int
2099 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2100 {
2101 
2102 	kif->kf_type = KF_TYPE_KQUEUE;
2103 	return (0);
2104 }
2105 
2106 static void
2107 kqueue_wakeup(struct kqueue *kq)
2108 {
2109 	KQ_OWNED(kq);
2110 
2111 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2112 		kq->kq_state &= ~KQ_SLEEP;
2113 		wakeup(kq);
2114 	}
2115 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2116 		selwakeuppri(&kq->kq_sel, PSOCK);
2117 		if (!SEL_WAITING(&kq->kq_sel))
2118 			kq->kq_state &= ~KQ_SEL;
2119 	}
2120 	if (!knlist_empty(&kq->kq_sel.si_note))
2121 		kqueue_schedtask(kq);
2122 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2123 		pgsigio(&kq->kq_sigio, SIGIO, 0);
2124 	}
2125 }
2126 
2127 /*
2128  * Walk down a list of knotes, activating them if their event has triggered.
2129  *
2130  * There is a possibility to optimize in the case of one kq watching another.
2131  * Instead of scheduling a task to wake it up, you could pass enough state
2132  * down the chain to make up the parent kqueue.  Make this code functional
2133  * first.
2134  */
2135 void
2136 knote(struct knlist *list, long hint, int lockflags)
2137 {
2138 	struct kqueue *kq;
2139 	struct knote *kn, *tkn;
2140 	int error;
2141 
2142 	if (list == NULL)
2143 		return;
2144 
2145 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2146 
2147 	if ((lockflags & KNF_LISTLOCKED) == 0)
2148 		list->kl_lock(list->kl_lockarg);
2149 
2150 	/*
2151 	 * If we unlock the list lock (and enter influx), we can
2152 	 * eliminate the kqueue scheduling, but this will introduce
2153 	 * four lock/unlock's for each knote to test.  Also, marker
2154 	 * would be needed to keep iteration position, since filters
2155 	 * or other threads could remove events.
2156 	 */
2157 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2158 		kq = kn->kn_kq;
2159 		KQ_LOCK(kq);
2160 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2161 			/*
2162 			 * Do not process the influx notes, except for
2163 			 * the influx coming from the kq unlock in the
2164 			 * kqueue_scan().  In the later case, we do
2165 			 * not interfere with the scan, since the code
2166 			 * fragment in kqueue_scan() locks the knlist,
2167 			 * and cannot proceed until we finished.
2168 			 */
2169 			KQ_UNLOCK(kq);
2170 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2171 			kn_enter_flux(kn);
2172 			KQ_UNLOCK(kq);
2173 			error = kn->kn_fop->f_event(kn, hint);
2174 			KQ_LOCK(kq);
2175 			kn_leave_flux(kn);
2176 			if (error)
2177 				KNOTE_ACTIVATE(kn, 1);
2178 			KQ_UNLOCK_FLUX(kq);
2179 		} else {
2180 			kn->kn_status |= KN_HASKQLOCK;
2181 			if (kn->kn_fop->f_event(kn, hint))
2182 				KNOTE_ACTIVATE(kn, 1);
2183 			kn->kn_status &= ~KN_HASKQLOCK;
2184 			KQ_UNLOCK(kq);
2185 		}
2186 	}
2187 	if ((lockflags & KNF_LISTLOCKED) == 0)
2188 		list->kl_unlock(list->kl_lockarg);
2189 }
2190 
2191 /*
2192  * add a knote to a knlist
2193  */
2194 void
2195 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2196 {
2197 
2198 	KNL_ASSERT_LOCK(knl, islocked);
2199 	KQ_NOTOWNED(kn->kn_kq);
2200 	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2201 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2202 	    ("knote %p was not detached", kn));
2203 	if (!islocked)
2204 		knl->kl_lock(knl->kl_lockarg);
2205 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2206 	if (!islocked)
2207 		knl->kl_unlock(knl->kl_lockarg);
2208 	KQ_LOCK(kn->kn_kq);
2209 	kn->kn_knlist = knl;
2210 	kn->kn_status &= ~KN_DETACHED;
2211 	KQ_UNLOCK(kn->kn_kq);
2212 }
2213 
2214 static void
2215 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2216     int kqislocked)
2217 {
2218 
2219 	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2220 	KNL_ASSERT_LOCK(knl, knlislocked);
2221 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2222 	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2223 	KASSERT((kn->kn_status & KN_DETACHED) == 0,
2224 	    ("knote %p was already detached", kn));
2225 	if (!knlislocked)
2226 		knl->kl_lock(knl->kl_lockarg);
2227 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2228 	kn->kn_knlist = NULL;
2229 	if (!knlislocked)
2230 		kn_list_unlock(knl);
2231 	if (!kqislocked)
2232 		KQ_LOCK(kn->kn_kq);
2233 	kn->kn_status |= KN_DETACHED;
2234 	if (!kqislocked)
2235 		KQ_UNLOCK(kn->kn_kq);
2236 }
2237 
2238 /*
2239  * remove knote from the specified knlist
2240  */
2241 void
2242 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2243 {
2244 
2245 	knlist_remove_kq(knl, kn, islocked, 0);
2246 }
2247 
2248 int
2249 knlist_empty(struct knlist *knl)
2250 {
2251 
2252 	KNL_ASSERT_LOCKED(knl);
2253 	return (SLIST_EMPTY(&knl->kl_list));
2254 }
2255 
2256 static struct mtx knlist_lock;
2257 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2258     MTX_DEF);
2259 static void knlist_mtx_lock(void *arg);
2260 static void knlist_mtx_unlock(void *arg);
2261 
2262 static void
2263 knlist_mtx_lock(void *arg)
2264 {
2265 
2266 	mtx_lock((struct mtx *)arg);
2267 }
2268 
2269 static void
2270 knlist_mtx_unlock(void *arg)
2271 {
2272 
2273 	mtx_unlock((struct mtx *)arg);
2274 }
2275 
2276 static void
2277 knlist_mtx_assert_locked(void *arg)
2278 {
2279 
2280 	mtx_assert((struct mtx *)arg, MA_OWNED);
2281 }
2282 
2283 static void
2284 knlist_mtx_assert_unlocked(void *arg)
2285 {
2286 
2287 	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2288 }
2289 
2290 static void
2291 knlist_rw_rlock(void *arg)
2292 {
2293 
2294 	rw_rlock((struct rwlock *)arg);
2295 }
2296 
2297 static void
2298 knlist_rw_runlock(void *arg)
2299 {
2300 
2301 	rw_runlock((struct rwlock *)arg);
2302 }
2303 
2304 static void
2305 knlist_rw_assert_locked(void *arg)
2306 {
2307 
2308 	rw_assert((struct rwlock *)arg, RA_LOCKED);
2309 }
2310 
2311 static void
2312 knlist_rw_assert_unlocked(void *arg)
2313 {
2314 
2315 	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
2316 }
2317 
2318 void
2319 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2320     void (*kl_unlock)(void *),
2321     void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
2322 {
2323 
2324 	if (lock == NULL)
2325 		knl->kl_lockarg = &knlist_lock;
2326 	else
2327 		knl->kl_lockarg = lock;
2328 
2329 	if (kl_lock == NULL)
2330 		knl->kl_lock = knlist_mtx_lock;
2331 	else
2332 		knl->kl_lock = kl_lock;
2333 	if (kl_unlock == NULL)
2334 		knl->kl_unlock = knlist_mtx_unlock;
2335 	else
2336 		knl->kl_unlock = kl_unlock;
2337 	if (kl_assert_locked == NULL)
2338 		knl->kl_assert_locked = knlist_mtx_assert_locked;
2339 	else
2340 		knl->kl_assert_locked = kl_assert_locked;
2341 	if (kl_assert_unlocked == NULL)
2342 		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
2343 	else
2344 		knl->kl_assert_unlocked = kl_assert_unlocked;
2345 
2346 	knl->kl_autodestroy = 0;
2347 	SLIST_INIT(&knl->kl_list);
2348 }
2349 
2350 void
2351 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2352 {
2353 
2354 	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
2355 }
2356 
2357 struct knlist *
2358 knlist_alloc(struct mtx *lock)
2359 {
2360 	struct knlist *knl;
2361 
2362 	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2363 	knlist_init_mtx(knl, lock);
2364 	return (knl);
2365 }
2366 
2367 void
2368 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
2369 {
2370 
2371 	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
2372 	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
2373 }
2374 
2375 void
2376 knlist_destroy(struct knlist *knl)
2377 {
2378 
2379 	KASSERT(KNLIST_EMPTY(knl),
2380 	    ("destroying knlist %p with knotes on it", knl));
2381 }
2382 
2383 void
2384 knlist_detach(struct knlist *knl)
2385 {
2386 
2387 	KNL_ASSERT_LOCKED(knl);
2388 	knl->kl_autodestroy = 1;
2389 	if (knlist_empty(knl)) {
2390 		knlist_destroy(knl);
2391 		free(knl, M_KQUEUE);
2392 	}
2393 }
2394 
2395 /*
2396  * Even if we are locked, we may need to drop the lock to allow any influx
2397  * knotes time to "settle".
2398  */
2399 void
2400 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2401 {
2402 	struct knote *kn, *kn2;
2403 	struct kqueue *kq;
2404 
2405 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2406 	if (islocked)
2407 		KNL_ASSERT_LOCKED(knl);
2408 	else {
2409 		KNL_ASSERT_UNLOCKED(knl);
2410 again:		/* need to reacquire lock since we have dropped it */
2411 		knl->kl_lock(knl->kl_lockarg);
2412 	}
2413 
2414 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2415 		kq = kn->kn_kq;
2416 		KQ_LOCK(kq);
2417 		if (kn_in_flux(kn)) {
2418 			KQ_UNLOCK(kq);
2419 			continue;
2420 		}
2421 		knlist_remove_kq(knl, kn, 1, 1);
2422 		if (killkn) {
2423 			kn_enter_flux(kn);
2424 			KQ_UNLOCK(kq);
2425 			knote_drop_detached(kn, td);
2426 		} else {
2427 			/* Make sure cleared knotes disappear soon */
2428 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
2429 			KQ_UNLOCK(kq);
2430 		}
2431 		kq = NULL;
2432 	}
2433 
2434 	if (!SLIST_EMPTY(&knl->kl_list)) {
2435 		/* there are still in flux knotes remaining */
2436 		kn = SLIST_FIRST(&knl->kl_list);
2437 		kq = kn->kn_kq;
2438 		KQ_LOCK(kq);
2439 		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2440 		knl->kl_unlock(knl->kl_lockarg);
2441 		kq->kq_state |= KQ_FLUXWAIT;
2442 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2443 		kq = NULL;
2444 		goto again;
2445 	}
2446 
2447 	if (islocked)
2448 		KNL_ASSERT_LOCKED(knl);
2449 	else {
2450 		knl->kl_unlock(knl->kl_lockarg);
2451 		KNL_ASSERT_UNLOCKED(knl);
2452 	}
2453 }
2454 
2455 /*
2456  * Remove all knotes referencing a specified fd must be called with FILEDESC
2457  * lock.  This prevents a race where a new fd comes along and occupies the
2458  * entry and we attach a knote to the fd.
2459  */
2460 void
2461 knote_fdclose(struct thread *td, int fd)
2462 {
2463 	struct filedesc *fdp = td->td_proc->p_fd;
2464 	struct kqueue *kq;
2465 	struct knote *kn;
2466 	int influx;
2467 
2468 	FILEDESC_XLOCK_ASSERT(fdp);
2469 
2470 	/*
2471 	 * We shouldn't have to worry about new kevents appearing on fd
2472 	 * since filedesc is locked.
2473 	 */
2474 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2475 		KQ_LOCK(kq);
2476 
2477 again:
2478 		influx = 0;
2479 		while (kq->kq_knlistsize > fd &&
2480 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2481 			if (kn_in_flux(kn)) {
2482 				/* someone else might be waiting on our knote */
2483 				if (influx)
2484 					wakeup(kq);
2485 				kq->kq_state |= KQ_FLUXWAIT;
2486 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2487 				goto again;
2488 			}
2489 			kn_enter_flux(kn);
2490 			KQ_UNLOCK(kq);
2491 			influx = 1;
2492 			knote_drop(kn, td);
2493 			KQ_LOCK(kq);
2494 		}
2495 		KQ_UNLOCK_FLUX(kq);
2496 	}
2497 }
2498 
2499 static int
2500 knote_attach(struct knote *kn, struct kqueue *kq)
2501 {
2502 	struct klist *list;
2503 
2504 	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2505 	KQ_OWNED(kq);
2506 
2507 	if (kn->kn_fop->f_isfd) {
2508 		if (kn->kn_id >= kq->kq_knlistsize)
2509 			return (ENOMEM);
2510 		list = &kq->kq_knlist[kn->kn_id];
2511 	} else {
2512 		if (kq->kq_knhash == NULL)
2513 			return (ENOMEM);
2514 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2515 	}
2516 	SLIST_INSERT_HEAD(list, kn, kn_link);
2517 	return (0);
2518 }
2519 
2520 static void
2521 knote_drop(struct knote *kn, struct thread *td)
2522 {
2523 
2524 	if ((kn->kn_status & KN_DETACHED) == 0)
2525 		kn->kn_fop->f_detach(kn);
2526 	knote_drop_detached(kn, td);
2527 }
2528 
2529 static void
2530 knote_drop_detached(struct knote *kn, struct thread *td)
2531 {
2532 	struct kqueue *kq;
2533 	struct klist *list;
2534 
2535 	kq = kn->kn_kq;
2536 
2537 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2538 	    ("knote %p still attached", kn));
2539 	KQ_NOTOWNED(kq);
2540 
2541 	KQ_LOCK(kq);
2542 	KASSERT(kn->kn_influx == 1,
2543 	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
2544 
2545 	if (kn->kn_fop->f_isfd)
2546 		list = &kq->kq_knlist[kn->kn_id];
2547 	else
2548 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2549 
2550 	if (!SLIST_EMPTY(list))
2551 		SLIST_REMOVE(list, kn, knote, kn_link);
2552 	if (kn->kn_status & KN_QUEUED)
2553 		knote_dequeue(kn);
2554 	KQ_UNLOCK_FLUX(kq);
2555 
2556 	if (kn->kn_fop->f_isfd) {
2557 		fdrop(kn->kn_fp, td);
2558 		kn->kn_fp = NULL;
2559 	}
2560 	kqueue_fo_release(kn->kn_kevent.filter);
2561 	kn->kn_fop = NULL;
2562 	knote_free(kn);
2563 }
2564 
2565 static void
2566 knote_enqueue(struct knote *kn)
2567 {
2568 	struct kqueue *kq = kn->kn_kq;
2569 
2570 	KQ_OWNED(kn->kn_kq);
2571 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2572 
2573 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2574 	kn->kn_status |= KN_QUEUED;
2575 	kq->kq_count++;
2576 	kqueue_wakeup(kq);
2577 }
2578 
2579 static void
2580 knote_dequeue(struct knote *kn)
2581 {
2582 	struct kqueue *kq = kn->kn_kq;
2583 
2584 	KQ_OWNED(kn->kn_kq);
2585 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2586 
2587 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2588 	kn->kn_status &= ~KN_QUEUED;
2589 	kq->kq_count--;
2590 }
2591 
2592 static void
2593 knote_init(void)
2594 {
2595 
2596 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2597 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2598 }
2599 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2600 
2601 static struct knote *
2602 knote_alloc(int waitok)
2603 {
2604 
2605 	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
2606 	    M_ZERO));
2607 }
2608 
2609 static void
2610 knote_free(struct knote *kn)
2611 {
2612 
2613 	uma_zfree(knote_zone, kn);
2614 }
2615 
2616 /*
2617  * Register the kev w/ the kq specified by fd.
2618  */
2619 int
2620 kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
2621 {
2622 	struct kqueue *kq;
2623 	struct file *fp;
2624 	cap_rights_t rights;
2625 	int error;
2626 
2627 	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
2628 	if (error != 0)
2629 		return (error);
2630 	if ((error = kqueue_acquire(fp, &kq)) != 0)
2631 		goto noacquire;
2632 
2633 	error = kqueue_register(kq, kev, td, waitok);
2634 	kqueue_release(kq, 0);
2635 
2636 noacquire:
2637 	fdrop(fp, td);
2638 	return (error);
2639 }
2640