xref: /freebsd/sys/kern/kern_event.c (revision 9ff086544d5f85b58349e28ed36a9811b8fe5cf9)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4  * Copyright (c) 2009 Apple, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_ktrace.h"
33 #include "opt_kqueue.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/capsicum.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/rwlock.h>
42 #include <sys/proc.h>
43 #include <sys/malloc.h>
44 #include <sys/unistd.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/kthread.h>
50 #include <sys/selinfo.h>
51 #include <sys/queue.h>
52 #include <sys/event.h>
53 #include <sys/eventvar.h>
54 #include <sys/poll.h>
55 #include <sys/protosw.h>
56 #include <sys/resourcevar.h>
57 #include <sys/sigio.h>
58 #include <sys/signalvar.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/stat.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysproto.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/taskqueue.h>
66 #include <sys/uio.h>
67 #include <sys/user.h>
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 #include <machine/atomic.h>
72 
73 #include <vm/uma.h>
74 
75 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
76 
77 /*
78  * This lock is used if multiple kq locks are required.  This possibly
79  * should be made into a per proc lock.
80  */
81 static struct mtx	kq_global;
82 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
83 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
84 	if (!haslck)				\
85 		mtx_lock(lck);			\
86 	haslck = 1;				\
87 } while (0)
88 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
89 	if (haslck)				\
90 		mtx_unlock(lck);			\
91 	haslck = 0;				\
92 } while (0)
93 
94 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
95 
96 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
97 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
98 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
99 		    struct thread *td, int waitok);
100 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
101 static void	kqueue_release(struct kqueue *kq, int locked);
102 static void	kqueue_destroy(struct kqueue *kq);
103 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
104 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
105 		    uintptr_t ident, int waitok);
106 static void	kqueue_task(void *arg, int pending);
107 static int	kqueue_scan(struct kqueue *kq, int maxevents,
108 		    struct kevent_copyops *k_ops,
109 		    const struct timespec *timeout,
110 		    struct kevent *keva, struct thread *td);
111 static void 	kqueue_wakeup(struct kqueue *kq);
112 static struct filterops *kqueue_fo_find(int filt);
113 static void	kqueue_fo_release(int filt);
114 
115 static fo_ioctl_t	kqueue_ioctl;
116 static fo_poll_t	kqueue_poll;
117 static fo_kqfilter_t	kqueue_kqfilter;
118 static fo_stat_t	kqueue_stat;
119 static fo_close_t	kqueue_close;
120 static fo_fill_kinfo_t	kqueue_fill_kinfo;
121 
122 static struct fileops kqueueops = {
123 	.fo_read = invfo_rdwr,
124 	.fo_write = invfo_rdwr,
125 	.fo_truncate = invfo_truncate,
126 	.fo_ioctl = kqueue_ioctl,
127 	.fo_poll = kqueue_poll,
128 	.fo_kqfilter = kqueue_kqfilter,
129 	.fo_stat = kqueue_stat,
130 	.fo_close = kqueue_close,
131 	.fo_chmod = invfo_chmod,
132 	.fo_chown = invfo_chown,
133 	.fo_sendfile = invfo_sendfile,
134 	.fo_fill_kinfo = kqueue_fill_kinfo,
135 };
136 
137 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
138 static void 	knote_drop(struct knote *kn, struct thread *td);
139 static void 	knote_enqueue(struct knote *kn);
140 static void 	knote_dequeue(struct knote *kn);
141 static void 	knote_init(void);
142 static struct 	knote *knote_alloc(int waitok);
143 static void 	knote_free(struct knote *kn);
144 
145 static void	filt_kqdetach(struct knote *kn);
146 static int	filt_kqueue(struct knote *kn, long hint);
147 static int	filt_procattach(struct knote *kn);
148 static void	filt_procdetach(struct knote *kn);
149 static int	filt_proc(struct knote *kn, long hint);
150 static int	filt_fileattach(struct knote *kn);
151 static void	filt_timerexpire(void *knx);
152 static int	filt_timerattach(struct knote *kn);
153 static void	filt_timerdetach(struct knote *kn);
154 static int	filt_timer(struct knote *kn, long hint);
155 static int	filt_userattach(struct knote *kn);
156 static void	filt_userdetach(struct knote *kn);
157 static int	filt_user(struct knote *kn, long hint);
158 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
159 		    u_long type);
160 
161 static struct filterops file_filtops = {
162 	.f_isfd = 1,
163 	.f_attach = filt_fileattach,
164 };
165 static struct filterops kqread_filtops = {
166 	.f_isfd = 1,
167 	.f_detach = filt_kqdetach,
168 	.f_event = filt_kqueue,
169 };
170 /* XXX - move to kern_proc.c?  */
171 static struct filterops proc_filtops = {
172 	.f_isfd = 0,
173 	.f_attach = filt_procattach,
174 	.f_detach = filt_procdetach,
175 	.f_event = filt_proc,
176 };
177 static struct filterops timer_filtops = {
178 	.f_isfd = 0,
179 	.f_attach = filt_timerattach,
180 	.f_detach = filt_timerdetach,
181 	.f_event = filt_timer,
182 };
183 static struct filterops user_filtops = {
184 	.f_attach = filt_userattach,
185 	.f_detach = filt_userdetach,
186 	.f_event = filt_user,
187 	.f_touch = filt_usertouch,
188 };
189 
190 static uma_zone_t	knote_zone;
191 static unsigned int	kq_ncallouts = 0;
192 static unsigned int 	kq_calloutmax = 4 * 1024;
193 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
194     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
195 
196 /* XXX - ensure not KN_INFLUX?? */
197 #define KNOTE_ACTIVATE(kn, islock) do { 				\
198 	if ((islock))							\
199 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
200 	else								\
201 		KQ_LOCK((kn)->kn_kq);					\
202 	(kn)->kn_status |= KN_ACTIVE;					\
203 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
204 		knote_enqueue((kn));					\
205 	if (!(islock))							\
206 		KQ_UNLOCK((kn)->kn_kq);					\
207 } while(0)
208 #define KQ_LOCK(kq) do {						\
209 	mtx_lock(&(kq)->kq_lock);					\
210 } while (0)
211 #define KQ_FLUX_WAKEUP(kq) do {						\
212 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
213 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
214 		wakeup((kq));						\
215 	}								\
216 } while (0)
217 #define KQ_UNLOCK_FLUX(kq) do {						\
218 	KQ_FLUX_WAKEUP(kq);						\
219 	mtx_unlock(&(kq)->kq_lock);					\
220 } while (0)
221 #define KQ_UNLOCK(kq) do {						\
222 	mtx_unlock(&(kq)->kq_lock);					\
223 } while (0)
224 #define KQ_OWNED(kq) do {						\
225 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
226 } while (0)
227 #define KQ_NOTOWNED(kq) do {						\
228 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
229 } while (0)
230 
231 static struct knlist *
232 kn_list_lock(struct knote *kn)
233 {
234 	struct knlist *knl;
235 
236 	knl = kn->kn_knlist;
237 	if (knl != NULL)
238 		knl->kl_lock(knl->kl_lockarg);
239 	return (knl);
240 }
241 
242 static void
243 kn_list_unlock(struct knlist *knl)
244 {
245 	bool do_free;
246 
247 	if (knl == NULL)
248 		return;
249 	do_free = knl->kl_autodestroy && knlist_empty(knl);
250 	knl->kl_unlock(knl->kl_lockarg);
251 	if (do_free) {
252 		knlist_destroy(knl);
253 		free(knl, M_KQUEUE);
254 	}
255 }
256 
257 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
258 	if (islocked)							\
259 		KNL_ASSERT_LOCKED(knl);				\
260 	else								\
261 		KNL_ASSERT_UNLOCKED(knl);				\
262 } while (0)
263 #ifdef INVARIANTS
264 #define	KNL_ASSERT_LOCKED(knl) do {					\
265 	knl->kl_assert_locked((knl)->kl_lockarg);			\
266 } while (0)
267 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
268 	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
269 } while (0)
270 #else /* !INVARIANTS */
271 #define	KNL_ASSERT_LOCKED(knl) do {} while(0)
272 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
273 #endif /* INVARIANTS */
274 
275 #ifndef	KN_HASHSIZE
276 #define	KN_HASHSIZE		64		/* XXX should be tunable */
277 #endif
278 
279 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
280 
281 static int
282 filt_nullattach(struct knote *kn)
283 {
284 
285 	return (ENXIO);
286 };
287 
288 struct filterops null_filtops = {
289 	.f_isfd = 0,
290 	.f_attach = filt_nullattach,
291 };
292 
293 /* XXX - make SYSINIT to add these, and move into respective modules. */
294 extern struct filterops sig_filtops;
295 extern struct filterops fs_filtops;
296 
297 /*
298  * Table for for all system-defined filters.
299  */
300 static struct mtx	filterops_lock;
301 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
302 	MTX_DEF);
303 static struct {
304 	struct filterops *for_fop;
305 	int for_nolock;
306 	int for_refcnt;
307 } sysfilt_ops[EVFILT_SYSCOUNT] = {
308 	{ &file_filtops, 1 },			/* EVFILT_READ */
309 	{ &file_filtops, 1 },			/* EVFILT_WRITE */
310 	{ &null_filtops },			/* EVFILT_AIO */
311 	{ &file_filtops, 1 },			/* EVFILT_VNODE */
312 	{ &proc_filtops, 1 },			/* EVFILT_PROC */
313 	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
314 	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
315 	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
316 	{ &fs_filtops, 1 },			/* EVFILT_FS */
317 	{ &null_filtops },			/* EVFILT_LIO */
318 	{ &user_filtops, 1 },			/* EVFILT_USER */
319 	{ &null_filtops },			/* EVFILT_SENDFILE */
320 };
321 
322 /*
323  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
324  * method.
325  */
326 static int
327 filt_fileattach(struct knote *kn)
328 {
329 
330 	return (fo_kqfilter(kn->kn_fp, kn));
331 }
332 
333 /*ARGSUSED*/
334 static int
335 kqueue_kqfilter(struct file *fp, struct knote *kn)
336 {
337 	struct kqueue *kq = kn->kn_fp->f_data;
338 
339 	if (kn->kn_filter != EVFILT_READ)
340 		return (EINVAL);
341 
342 	kn->kn_status |= KN_KQUEUE;
343 	kn->kn_fop = &kqread_filtops;
344 	knlist_add(&kq->kq_sel.si_note, kn, 0);
345 
346 	return (0);
347 }
348 
349 static void
350 filt_kqdetach(struct knote *kn)
351 {
352 	struct kqueue *kq = kn->kn_fp->f_data;
353 
354 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
355 }
356 
357 /*ARGSUSED*/
358 static int
359 filt_kqueue(struct knote *kn, long hint)
360 {
361 	struct kqueue *kq = kn->kn_fp->f_data;
362 
363 	kn->kn_data = kq->kq_count;
364 	return (kn->kn_data > 0);
365 }
366 
367 /* XXX - move to kern_proc.c?  */
368 static int
369 filt_procattach(struct knote *kn)
370 {
371 	struct proc *p;
372 	int error;
373 	bool exiting, immediate;
374 
375 	exiting = immediate = false;
376 	p = pfind(kn->kn_id);
377 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
378 		p = zpfind(kn->kn_id);
379 		exiting = true;
380 	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
381 		exiting = true;
382 	}
383 
384 	if (p == NULL)
385 		return (ESRCH);
386 	if ((error = p_cansee(curthread, p))) {
387 		PROC_UNLOCK(p);
388 		return (error);
389 	}
390 
391 	kn->kn_ptr.p_proc = p;
392 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
393 
394 	/*
395 	 * Internal flag indicating registration done by kernel for the
396 	 * purposes of getting a NOTE_CHILD notification.
397 	 */
398 	if (kn->kn_flags & EV_FLAG2) {
399 		kn->kn_flags &= ~EV_FLAG2;
400 		kn->kn_data = kn->kn_sdata;		/* ppid */
401 		kn->kn_fflags = NOTE_CHILD;
402 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
403 		immediate = true; /* Force immediate activation of child note. */
404 	}
405 	/*
406 	 * Internal flag indicating registration done by kernel (for other than
407 	 * NOTE_CHILD).
408 	 */
409 	if (kn->kn_flags & EV_FLAG1) {
410 		kn->kn_flags &= ~EV_FLAG1;
411 	}
412 
413 	knlist_add(p->p_klist, kn, 1);
414 
415 	/*
416 	 * Immediately activate any child notes or, in the case of a zombie
417 	 * target process, exit notes.  The latter is necessary to handle the
418 	 * case where the target process, e.g. a child, dies before the kevent
419 	 * is registered.
420 	 */
421 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
422 		KNOTE_ACTIVATE(kn, 0);
423 
424 	PROC_UNLOCK(p);
425 
426 	return (0);
427 }
428 
429 /*
430  * The knote may be attached to a different process, which may exit,
431  * leaving nothing for the knote to be attached to.  So when the process
432  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
433  * it will be deleted when read out.  However, as part of the knote deletion,
434  * this routine is called, so a check is needed to avoid actually performing
435  * a detach, because the original process does not exist any more.
436  */
437 /* XXX - move to kern_proc.c?  */
438 static void
439 filt_procdetach(struct knote *kn)
440 {
441 
442 	knlist_remove(kn->kn_knlist, kn, 0);
443 	kn->kn_ptr.p_proc = NULL;
444 }
445 
446 /* XXX - move to kern_proc.c?  */
447 static int
448 filt_proc(struct knote *kn, long hint)
449 {
450 	struct proc *p;
451 	u_int event;
452 
453 	p = kn->kn_ptr.p_proc;
454 	if (p == NULL) /* already activated, from attach filter */
455 		return (0);
456 
457 	/* Mask off extra data. */
458 	event = (u_int)hint & NOTE_PCTRLMASK;
459 
460 	/* If the user is interested in this event, record it. */
461 	if (kn->kn_sfflags & event)
462 		kn->kn_fflags |= event;
463 
464 	/* Process is gone, so flag the event as finished. */
465 	if (event == NOTE_EXIT) {
466 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
467 		kn->kn_ptr.p_proc = NULL;
468 		if (kn->kn_fflags & NOTE_EXIT)
469 			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
470 		if (kn->kn_fflags == 0)
471 			kn->kn_flags |= EV_DROP;
472 		return (1);
473 	}
474 
475 	return (kn->kn_fflags != 0);
476 }
477 
478 /*
479  * Called when the process forked. It mostly does the same as the
480  * knote(), activating all knotes registered to be activated when the
481  * process forked. Additionally, for each knote attached to the
482  * parent, check whether user wants to track the new process. If so
483  * attach a new knote to it, and immediately report an event with the
484  * child's pid.
485  */
486 void
487 knote_fork(struct knlist *list, int pid)
488 {
489 	struct kqueue *kq;
490 	struct knote *kn;
491 	struct kevent kev;
492 	int error;
493 
494 	if (list == NULL)
495 		return;
496 	list->kl_lock(list->kl_lockarg);
497 
498 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
499 		kq = kn->kn_kq;
500 		KQ_LOCK(kq);
501 		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
502 			KQ_UNLOCK(kq);
503 			continue;
504 		}
505 
506 		/*
507 		 * The same as knote(), activate the event.
508 		 */
509 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
510 			kn->kn_status |= KN_HASKQLOCK;
511 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
512 				KNOTE_ACTIVATE(kn, 1);
513 			kn->kn_status &= ~KN_HASKQLOCK;
514 			KQ_UNLOCK(kq);
515 			continue;
516 		}
517 
518 		/*
519 		 * The NOTE_TRACK case. In addition to the activation
520 		 * of the event, we need to register new events to
521 		 * track the child. Drop the locks in preparation for
522 		 * the call to kqueue_register().
523 		 */
524 		kn->kn_status |= KN_INFLUX;
525 		KQ_UNLOCK(kq);
526 		list->kl_unlock(list->kl_lockarg);
527 
528 		/*
529 		 * Activate existing knote and register tracking knotes with
530 		 * new process.
531 		 *
532 		 * First register a knote to get just the child notice. This
533 		 * must be a separate note from a potential NOTE_EXIT
534 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
535 		 * to use the data field (in conflicting ways).
536 		 */
537 		kev.ident = pid;
538 		kev.filter = kn->kn_filter;
539 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
540 		    EV_FLAG2;
541 		kev.fflags = kn->kn_sfflags;
542 		kev.data = kn->kn_id;		/* parent */
543 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
544 		error = kqueue_register(kq, &kev, NULL, 0);
545 		if (error)
546 			kn->kn_fflags |= NOTE_TRACKERR;
547 
548 		/*
549 		 * Then register another knote to track other potential events
550 		 * from the new process.
551 		 */
552 		kev.ident = pid;
553 		kev.filter = kn->kn_filter;
554 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
555 		kev.fflags = kn->kn_sfflags;
556 		kev.data = kn->kn_id;		/* parent */
557 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
558 		error = kqueue_register(kq, &kev, NULL, 0);
559 		if (error)
560 			kn->kn_fflags |= NOTE_TRACKERR;
561 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
562 			KNOTE_ACTIVATE(kn, 0);
563 		KQ_LOCK(kq);
564 		kn->kn_status &= ~KN_INFLUX;
565 		KQ_UNLOCK_FLUX(kq);
566 		list->kl_lock(list->kl_lockarg);
567 	}
568 	list->kl_unlock(list->kl_lockarg);
569 }
570 
571 /*
572  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
573  * interval timer support code.
574  */
575 
576 #define NOTE_TIMER_PRECMASK	(NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
577 				NOTE_NSECONDS)
578 
579 static sbintime_t
580 timer2sbintime(intptr_t data, int flags)
581 {
582 
583         /*
584          * Macros for converting to the fractional second portion of an
585          * sbintime_t using 64bit multiplication to improve precision.
586          */
587 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
588 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
589 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
590 	switch (flags & NOTE_TIMER_PRECMASK) {
591 	case NOTE_SECONDS:
592 #ifdef __LP64__
593 		if (data > (SBT_MAX / SBT_1S))
594 			return SBT_MAX;
595 #endif
596 		return ((sbintime_t)data << 32);
597 	case NOTE_MSECONDS: /* FALLTHROUGH */
598 	case 0:
599 		if (data >= 1000) {
600 			int64_t secs = data / 1000;
601 #ifdef __LP64__
602 			if (secs > (SBT_MAX / SBT_1S))
603 				return SBT_MAX;
604 #endif
605 			return (secs << 32 | MS_TO_SBT(data % 1000));
606 		}
607 		return MS_TO_SBT(data);
608 	case NOTE_USECONDS:
609 		if (data >= 1000000) {
610 			int64_t secs = data / 1000000;
611 #ifdef __LP64__
612 			if (secs > (SBT_MAX / SBT_1S))
613 				return SBT_MAX;
614 #endif
615 			return (secs << 32 | US_TO_SBT(data % 1000000));
616 		}
617 		return US_TO_SBT(data);
618 	case NOTE_NSECONDS:
619 		if (data >= 1000000000) {
620 			int64_t secs = data / 1000000000;
621 #ifdef __LP64__
622 			if (secs > (SBT_MAX / SBT_1S))
623 				return SBT_MAX;
624 #endif
625 			return (secs << 32 | US_TO_SBT(data % 1000000000));
626 		}
627 		return NS_TO_SBT(data);
628 	default:
629 		break;
630 	}
631 	return (-1);
632 }
633 
634 static void
635 filt_timerexpire(void *knx)
636 {
637 	struct callout *calloutp;
638 	struct knote *kn;
639 
640 	kn = knx;
641 	kn->kn_data++;
642 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
643 
644 	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
645 		calloutp = (struct callout *)kn->kn_hook;
646 		*kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata,
647 		    kn->kn_sfflags);
648 		callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
649 		    filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
650 	}
651 }
652 
653 /*
654  * data contains amount of time to sleep
655  */
656 static int
657 filt_timerattach(struct knote *kn)
658 {
659 	struct callout *calloutp;
660 	sbintime_t to;
661 	unsigned int ncallouts;
662 
663 	if ((intptr_t)kn->kn_sdata < 0)
664 		return (EINVAL);
665 	if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
666 		kn->kn_sdata = 1;
667 	/* Only precision unit are supported in flags so far */
668 	if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK)
669 		return (EINVAL);
670 
671 	to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
672 	if (to < 0)
673 		return (EINVAL);
674 
675 	do {
676 		ncallouts = kq_ncallouts;
677 		if (ncallouts >= kq_calloutmax)
678 			return (ENOMEM);
679 	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
680 
681 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
682 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
683 	kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK);
684 	calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
685 	callout_init(calloutp, 1);
686 	kn->kn_hook = calloutp;
687 	*kn->kn_ptr.p_nexttime = to + sbinuptime();
688 	callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
689 	    filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
690 
691 	return (0);
692 }
693 
694 static void
695 filt_timerdetach(struct knote *kn)
696 {
697 	struct callout *calloutp;
698 	unsigned int old;
699 
700 	calloutp = (struct callout *)kn->kn_hook;
701 	callout_drain(calloutp);
702 	free(calloutp, M_KQUEUE);
703 	free(kn->kn_ptr.p_nexttime, M_KQUEUE);
704 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
705 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
706 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
707 }
708 
709 static int
710 filt_timer(struct knote *kn, long hint)
711 {
712 
713 	return (kn->kn_data != 0);
714 }
715 
716 static int
717 filt_userattach(struct knote *kn)
718 {
719 
720 	/*
721 	 * EVFILT_USER knotes are not attached to anything in the kernel.
722 	 */
723 	kn->kn_hook = NULL;
724 	if (kn->kn_fflags & NOTE_TRIGGER)
725 		kn->kn_hookid = 1;
726 	else
727 		kn->kn_hookid = 0;
728 	return (0);
729 }
730 
731 static void
732 filt_userdetach(__unused struct knote *kn)
733 {
734 
735 	/*
736 	 * EVFILT_USER knotes are not attached to anything in the kernel.
737 	 */
738 }
739 
740 static int
741 filt_user(struct knote *kn, __unused long hint)
742 {
743 
744 	return (kn->kn_hookid);
745 }
746 
747 static void
748 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
749 {
750 	u_int ffctrl;
751 
752 	switch (type) {
753 	case EVENT_REGISTER:
754 		if (kev->fflags & NOTE_TRIGGER)
755 			kn->kn_hookid = 1;
756 
757 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
758 		kev->fflags &= NOTE_FFLAGSMASK;
759 		switch (ffctrl) {
760 		case NOTE_FFNOP:
761 			break;
762 
763 		case NOTE_FFAND:
764 			kn->kn_sfflags &= kev->fflags;
765 			break;
766 
767 		case NOTE_FFOR:
768 			kn->kn_sfflags |= kev->fflags;
769 			break;
770 
771 		case NOTE_FFCOPY:
772 			kn->kn_sfflags = kev->fflags;
773 			break;
774 
775 		default:
776 			/* XXX Return error? */
777 			break;
778 		}
779 		kn->kn_sdata = kev->data;
780 		if (kev->flags & EV_CLEAR) {
781 			kn->kn_hookid = 0;
782 			kn->kn_data = 0;
783 			kn->kn_fflags = 0;
784 		}
785 		break;
786 
787         case EVENT_PROCESS:
788 		*kev = kn->kn_kevent;
789 		kev->fflags = kn->kn_sfflags;
790 		kev->data = kn->kn_sdata;
791 		if (kn->kn_flags & EV_CLEAR) {
792 			kn->kn_hookid = 0;
793 			kn->kn_data = 0;
794 			kn->kn_fflags = 0;
795 		}
796 		break;
797 
798 	default:
799 		panic("filt_usertouch() - invalid type (%ld)", type);
800 		break;
801 	}
802 }
803 
804 int
805 sys_kqueue(struct thread *td, struct kqueue_args *uap)
806 {
807 
808 	return (kern_kqueue(td, 0, NULL));
809 }
810 
811 static void
812 kqueue_init(struct kqueue *kq)
813 {
814 
815 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
816 	TAILQ_INIT(&kq->kq_head);
817 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
818 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
819 }
820 
821 int
822 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
823 {
824 	struct filedesc *fdp;
825 	struct kqueue *kq;
826 	struct file *fp;
827 	struct ucred *cred;
828 	int fd, error;
829 
830 	fdp = td->td_proc->p_fd;
831 	cred = td->td_ucred;
832 	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
833 		return (ENOMEM);
834 
835 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
836 	if (error != 0) {
837 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
838 		return (error);
839 	}
840 
841 	/* An extra reference on `fp' has been held for us by falloc(). */
842 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
843 	kqueue_init(kq);
844 	kq->kq_fdp = fdp;
845 	kq->kq_cred = crhold(cred);
846 
847 	FILEDESC_XLOCK(fdp);
848 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
849 	FILEDESC_XUNLOCK(fdp);
850 
851 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
852 	fdrop(fp, td);
853 
854 	td->td_retval[0] = fd;
855 	return (0);
856 }
857 
858 #ifndef _SYS_SYSPROTO_H_
859 struct kevent_args {
860 	int	fd;
861 	const struct kevent *changelist;
862 	int	nchanges;
863 	struct	kevent *eventlist;
864 	int	nevents;
865 	const struct timespec *timeout;
866 };
867 #endif
868 int
869 sys_kevent(struct thread *td, struct kevent_args *uap)
870 {
871 	struct timespec ts, *tsp;
872 	struct kevent_copyops k_ops = { uap,
873 					kevent_copyout,
874 					kevent_copyin};
875 	int error;
876 #ifdef KTRACE
877 	struct uio ktruio;
878 	struct iovec ktriov;
879 	struct uio *ktruioin = NULL;
880 	struct uio *ktruioout = NULL;
881 #endif
882 
883 	if (uap->timeout != NULL) {
884 		error = copyin(uap->timeout, &ts, sizeof(ts));
885 		if (error)
886 			return (error);
887 		tsp = &ts;
888 	} else
889 		tsp = NULL;
890 
891 #ifdef KTRACE
892 	if (KTRPOINT(td, KTR_GENIO)) {
893 		ktriov.iov_base = uap->changelist;
894 		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
895 		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
896 		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
897 		    .uio_td = td };
898 		ktruioin = cloneuio(&ktruio);
899 		ktriov.iov_base = uap->eventlist;
900 		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
901 		ktruioout = cloneuio(&ktruio);
902 	}
903 #endif
904 
905 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
906 	    &k_ops, tsp);
907 
908 #ifdef KTRACE
909 	if (ktruioin != NULL) {
910 		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
911 		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
912 		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
913 		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
914 	}
915 #endif
916 
917 	return (error);
918 }
919 
920 /*
921  * Copy 'count' items into the destination list pointed to by uap->eventlist.
922  */
923 static int
924 kevent_copyout(void *arg, struct kevent *kevp, int count)
925 {
926 	struct kevent_args *uap;
927 	int error;
928 
929 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
930 	uap = (struct kevent_args *)arg;
931 
932 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
933 	if (error == 0)
934 		uap->eventlist += count;
935 	return (error);
936 }
937 
938 /*
939  * Copy 'count' items from the list pointed to by uap->changelist.
940  */
941 static int
942 kevent_copyin(void *arg, struct kevent *kevp, int count)
943 {
944 	struct kevent_args *uap;
945 	int error;
946 
947 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
948 	uap = (struct kevent_args *)arg;
949 
950 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
951 	if (error == 0)
952 		uap->changelist += count;
953 	return (error);
954 }
955 
956 int
957 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
958     struct kevent_copyops *k_ops, const struct timespec *timeout)
959 {
960 	cap_rights_t rights;
961 	struct file *fp;
962 	int error;
963 
964 	cap_rights_init(&rights);
965 	if (nchanges > 0)
966 		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
967 	if (nevents > 0)
968 		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
969 	error = fget(td, fd, &rights, &fp);
970 	if (error != 0)
971 		return (error);
972 
973 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
974 	fdrop(fp, td);
975 
976 	return (error);
977 }
978 
979 static int
980 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
981     struct kevent_copyops *k_ops, const struct timespec *timeout)
982 {
983 	struct kevent keva[KQ_NEVENTS];
984 	struct kevent *kevp, *changes;
985 	int i, n, nerrors, error;
986 
987 	nerrors = 0;
988 	while (nchanges > 0) {
989 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
990 		error = k_ops->k_copyin(k_ops->arg, keva, n);
991 		if (error)
992 			return (error);
993 		changes = keva;
994 		for (i = 0; i < n; i++) {
995 			kevp = &changes[i];
996 			if (!kevp->filter)
997 				continue;
998 			kevp->flags &= ~EV_SYSFLAGS;
999 			error = kqueue_register(kq, kevp, td, 1);
1000 			if (error || (kevp->flags & EV_RECEIPT)) {
1001 				if (nevents == 0)
1002 					return (error);
1003 				kevp->flags = EV_ERROR;
1004 				kevp->data = error;
1005 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1006 				nevents--;
1007 				nerrors++;
1008 			}
1009 		}
1010 		nchanges -= n;
1011 	}
1012 	if (nerrors) {
1013 		td->td_retval[0] = nerrors;
1014 		return (0);
1015 	}
1016 
1017 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1018 }
1019 
1020 int
1021 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1022     struct kevent_copyops *k_ops, const struct timespec *timeout)
1023 {
1024 	struct kqueue *kq;
1025 	int error;
1026 
1027 	error = kqueue_acquire(fp, &kq);
1028 	if (error != 0)
1029 		return (error);
1030 	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1031 	kqueue_release(kq, 0);
1032 	return (error);
1033 }
1034 
1035 /*
1036  * Performs a kevent() call on a temporarily created kqueue. This can be
1037  * used to perform one-shot polling, similar to poll() and select().
1038  */
1039 int
1040 kern_kevent_anonymous(struct thread *td, int nevents,
1041     struct kevent_copyops *k_ops)
1042 {
1043 	struct kqueue kq = {};
1044 	int error;
1045 
1046 	kqueue_init(&kq);
1047 	kq.kq_refcnt = 1;
1048 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1049 	kqueue_drain(&kq, td);
1050 	kqueue_destroy(&kq);
1051 	return (error);
1052 }
1053 
1054 int
1055 kqueue_add_filteropts(int filt, struct filterops *filtops)
1056 {
1057 	int error;
1058 
1059 	error = 0;
1060 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1061 		printf(
1062 "trying to add a filterop that is out of range: %d is beyond %d\n",
1063 		    ~filt, EVFILT_SYSCOUNT);
1064 		return EINVAL;
1065 	}
1066 	mtx_lock(&filterops_lock);
1067 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1068 	    sysfilt_ops[~filt].for_fop != NULL)
1069 		error = EEXIST;
1070 	else {
1071 		sysfilt_ops[~filt].for_fop = filtops;
1072 		sysfilt_ops[~filt].for_refcnt = 0;
1073 	}
1074 	mtx_unlock(&filterops_lock);
1075 
1076 	return (error);
1077 }
1078 
1079 int
1080 kqueue_del_filteropts(int filt)
1081 {
1082 	int error;
1083 
1084 	error = 0;
1085 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1086 		return EINVAL;
1087 
1088 	mtx_lock(&filterops_lock);
1089 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1090 	    sysfilt_ops[~filt].for_fop == NULL)
1091 		error = EINVAL;
1092 	else if (sysfilt_ops[~filt].for_refcnt != 0)
1093 		error = EBUSY;
1094 	else {
1095 		sysfilt_ops[~filt].for_fop = &null_filtops;
1096 		sysfilt_ops[~filt].for_refcnt = 0;
1097 	}
1098 	mtx_unlock(&filterops_lock);
1099 
1100 	return error;
1101 }
1102 
1103 static struct filterops *
1104 kqueue_fo_find(int filt)
1105 {
1106 
1107 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1108 		return NULL;
1109 
1110 	if (sysfilt_ops[~filt].for_nolock)
1111 		return sysfilt_ops[~filt].for_fop;
1112 
1113 	mtx_lock(&filterops_lock);
1114 	sysfilt_ops[~filt].for_refcnt++;
1115 	if (sysfilt_ops[~filt].for_fop == NULL)
1116 		sysfilt_ops[~filt].for_fop = &null_filtops;
1117 	mtx_unlock(&filterops_lock);
1118 
1119 	return sysfilt_ops[~filt].for_fop;
1120 }
1121 
1122 static void
1123 kqueue_fo_release(int filt)
1124 {
1125 
1126 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1127 		return;
1128 
1129 	if (sysfilt_ops[~filt].for_nolock)
1130 		return;
1131 
1132 	mtx_lock(&filterops_lock);
1133 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1134 	    ("filter object refcount not valid on release"));
1135 	sysfilt_ops[~filt].for_refcnt--;
1136 	mtx_unlock(&filterops_lock);
1137 }
1138 
1139 /*
1140  * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
1141  * influence if memory allocation should wait.  Make sure it is 0 if you
1142  * hold any mutexes.
1143  */
1144 static int
1145 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
1146 {
1147 	struct filterops *fops;
1148 	struct file *fp;
1149 	struct knote *kn, *tkn;
1150 	struct knlist *knl;
1151 	cap_rights_t rights;
1152 	int error, filt, event;
1153 	int haskqglobal, filedesc_unlock;
1154 
1155 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1156 		return (EINVAL);
1157 
1158 	fp = NULL;
1159 	kn = NULL;
1160 	knl = NULL;
1161 	error = 0;
1162 	haskqglobal = 0;
1163 	filedesc_unlock = 0;
1164 
1165 	filt = kev->filter;
1166 	fops = kqueue_fo_find(filt);
1167 	if (fops == NULL)
1168 		return EINVAL;
1169 
1170 	if (kev->flags & EV_ADD) {
1171 		/*
1172 		 * Prevent waiting with locks.  Non-sleepable
1173 		 * allocation failures are handled in the loop, only
1174 		 * if the spare knote appears to be actually required.
1175 		 */
1176 		tkn = knote_alloc(waitok);
1177 	} else {
1178 		tkn = NULL;
1179 	}
1180 
1181 findkn:
1182 	if (fops->f_isfd) {
1183 		KASSERT(td != NULL, ("td is NULL"));
1184 		if (kev->ident > INT_MAX)
1185 			error = EBADF;
1186 		else
1187 			error = fget(td, kev->ident,
1188 			    cap_rights_init(&rights, CAP_EVENT), &fp);
1189 		if (error)
1190 			goto done;
1191 
1192 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1193 		    kev->ident, 0) != 0) {
1194 			/* try again */
1195 			fdrop(fp, td);
1196 			fp = NULL;
1197 			error = kqueue_expand(kq, fops, kev->ident, waitok);
1198 			if (error)
1199 				goto done;
1200 			goto findkn;
1201 		}
1202 
1203 		if (fp->f_type == DTYPE_KQUEUE) {
1204 			/*
1205 			 * If we add some intelligence about what we are doing,
1206 			 * we should be able to support events on ourselves.
1207 			 * We need to know when we are doing this to prevent
1208 			 * getting both the knlist lock and the kq lock since
1209 			 * they are the same thing.
1210 			 */
1211 			if (fp->f_data == kq) {
1212 				error = EINVAL;
1213 				goto done;
1214 			}
1215 
1216 			/*
1217 			 * Pre-lock the filedesc before the global
1218 			 * lock mutex, see the comment in
1219 			 * kqueue_close().
1220 			 */
1221 			FILEDESC_XLOCK(td->td_proc->p_fd);
1222 			filedesc_unlock = 1;
1223 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1224 		}
1225 
1226 		KQ_LOCK(kq);
1227 		if (kev->ident < kq->kq_knlistsize) {
1228 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1229 				if (kev->filter == kn->kn_filter)
1230 					break;
1231 		}
1232 	} else {
1233 		if ((kev->flags & EV_ADD) == EV_ADD)
1234 			kqueue_expand(kq, fops, kev->ident, waitok);
1235 
1236 		KQ_LOCK(kq);
1237 
1238 		/*
1239 		 * If possible, find an existing knote to use for this kevent.
1240 		 */
1241 		if (kev->filter == EVFILT_PROC &&
1242 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
1243 			/* This is an internal creation of a process tracking
1244 			 * note. Don't attempt to coalesce this with an
1245 			 * existing note.
1246 			 */
1247 			;
1248 		} else if (kq->kq_knhashmask != 0) {
1249 			struct klist *list;
1250 
1251 			list = &kq->kq_knhash[
1252 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1253 			SLIST_FOREACH(kn, list, kn_link)
1254 				if (kev->ident == kn->kn_id &&
1255 				    kev->filter == kn->kn_filter)
1256 					break;
1257 		}
1258 	}
1259 
1260 	/* knote is in the process of changing, wait for it to stabilize. */
1261 	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1262 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1263 		if (filedesc_unlock) {
1264 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1265 			filedesc_unlock = 0;
1266 		}
1267 		kq->kq_state |= KQ_FLUXWAIT;
1268 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1269 		if (fp != NULL) {
1270 			fdrop(fp, td);
1271 			fp = NULL;
1272 		}
1273 		goto findkn;
1274 	}
1275 
1276 	/*
1277 	 * kn now contains the matching knote, or NULL if no match
1278 	 */
1279 	if (kn == NULL) {
1280 		if (kev->flags & EV_ADD) {
1281 			kn = tkn;
1282 			tkn = NULL;
1283 			if (kn == NULL) {
1284 				KQ_UNLOCK(kq);
1285 				error = ENOMEM;
1286 				goto done;
1287 			}
1288 			kn->kn_fp = fp;
1289 			kn->kn_kq = kq;
1290 			kn->kn_fop = fops;
1291 			/*
1292 			 * apply reference counts to knote structure, and
1293 			 * do not release it at the end of this routine.
1294 			 */
1295 			fops = NULL;
1296 			fp = NULL;
1297 
1298 			kn->kn_sfflags = kev->fflags;
1299 			kn->kn_sdata = kev->data;
1300 			kev->fflags = 0;
1301 			kev->data = 0;
1302 			kn->kn_kevent = *kev;
1303 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1304 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1305 			kn->kn_status = KN_INFLUX|KN_DETACHED;
1306 
1307 			error = knote_attach(kn, kq);
1308 			KQ_UNLOCK(kq);
1309 			if (error != 0) {
1310 				tkn = kn;
1311 				goto done;
1312 			}
1313 
1314 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1315 				knote_drop(kn, td);
1316 				goto done;
1317 			}
1318 			knl = kn_list_lock(kn);
1319 			goto done_ev_add;
1320 		} else {
1321 			/* No matching knote and the EV_ADD flag is not set. */
1322 			KQ_UNLOCK(kq);
1323 			error = ENOENT;
1324 			goto done;
1325 		}
1326 	}
1327 
1328 	if (kev->flags & EV_DELETE) {
1329 		kn->kn_status |= KN_INFLUX;
1330 		KQ_UNLOCK(kq);
1331 		if (!(kn->kn_status & KN_DETACHED))
1332 			kn->kn_fop->f_detach(kn);
1333 		knote_drop(kn, td);
1334 		goto done;
1335 	}
1336 
1337 	if (kev->flags & EV_FORCEONESHOT) {
1338 		kn->kn_flags |= EV_ONESHOT;
1339 		KNOTE_ACTIVATE(kn, 1);
1340 	}
1341 
1342 	/*
1343 	 * The user may change some filter values after the initial EV_ADD,
1344 	 * but doing so will not reset any filter which has already been
1345 	 * triggered.
1346 	 */
1347 	kn->kn_status |= KN_INFLUX | KN_SCAN;
1348 	KQ_UNLOCK(kq);
1349 	knl = kn_list_lock(kn);
1350 	kn->kn_kevent.udata = kev->udata;
1351 	if (!fops->f_isfd && fops->f_touch != NULL) {
1352 		fops->f_touch(kn, kev, EVENT_REGISTER);
1353 	} else {
1354 		kn->kn_sfflags = kev->fflags;
1355 		kn->kn_sdata = kev->data;
1356 	}
1357 
1358 	/*
1359 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1360 	 * the initial attach event decides that the event is "completed"
1361 	 * already.  i.e. filt_procattach is called on a zombie process.  It
1362 	 * will call filt_proc which will remove it from the list, and NULL
1363 	 * kn_knlist.
1364 	 */
1365 done_ev_add:
1366 	if ((kev->flags & EV_ENABLE) != 0)
1367 		kn->kn_status &= ~KN_DISABLED;
1368 	else if ((kev->flags & EV_DISABLE) != 0)
1369 		kn->kn_status |= KN_DISABLED;
1370 
1371 	if ((kn->kn_status & KN_DISABLED) == 0)
1372 		event = kn->kn_fop->f_event(kn, 0);
1373 	else
1374 		event = 0;
1375 
1376 	KQ_LOCK(kq);
1377 	if (event)
1378 		kn->kn_status |= KN_ACTIVE;
1379 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1380 	    KN_ACTIVE)
1381 		knote_enqueue(kn);
1382 	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1383 	kn_list_unlock(knl);
1384 	KQ_UNLOCK_FLUX(kq);
1385 
1386 done:
1387 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1388 	if (filedesc_unlock)
1389 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1390 	if (fp != NULL)
1391 		fdrop(fp, td);
1392 	knote_free(tkn);
1393 	if (fops != NULL)
1394 		kqueue_fo_release(filt);
1395 	return (error);
1396 }
1397 
1398 static int
1399 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1400 {
1401 	int error;
1402 	struct kqueue *kq;
1403 
1404 	error = 0;
1405 
1406 	kq = fp->f_data;
1407 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1408 		return (EBADF);
1409 	*kqp = kq;
1410 	KQ_LOCK(kq);
1411 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1412 		KQ_UNLOCK(kq);
1413 		return (EBADF);
1414 	}
1415 	kq->kq_refcnt++;
1416 	KQ_UNLOCK(kq);
1417 
1418 	return error;
1419 }
1420 
1421 static void
1422 kqueue_release(struct kqueue *kq, int locked)
1423 {
1424 	if (locked)
1425 		KQ_OWNED(kq);
1426 	else
1427 		KQ_LOCK(kq);
1428 	kq->kq_refcnt--;
1429 	if (kq->kq_refcnt == 1)
1430 		wakeup(&kq->kq_refcnt);
1431 	if (!locked)
1432 		KQ_UNLOCK(kq);
1433 }
1434 
1435 static void
1436 kqueue_schedtask(struct kqueue *kq)
1437 {
1438 
1439 	KQ_OWNED(kq);
1440 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1441 	    ("scheduling kqueue task while draining"));
1442 
1443 	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1444 		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1445 		kq->kq_state |= KQ_TASKSCHED;
1446 	}
1447 }
1448 
1449 /*
1450  * Expand the kq to make sure we have storage for fops/ident pair.
1451  *
1452  * Return 0 on success (or no work necessary), return errno on failure.
1453  *
1454  * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1455  * If kqueue_register is called from a non-fd context, there usually/should
1456  * be no locks held.
1457  */
1458 static int
1459 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1460 	int waitok)
1461 {
1462 	struct klist *list, *tmp_knhash, *to_free;
1463 	u_long tmp_knhashmask;
1464 	int size;
1465 	int fd;
1466 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
1467 
1468 	KQ_NOTOWNED(kq);
1469 
1470 	to_free = NULL;
1471 	if (fops->f_isfd) {
1472 		fd = ident;
1473 		if (kq->kq_knlistsize <= fd) {
1474 			size = kq->kq_knlistsize;
1475 			while (size <= fd)
1476 				size += KQEXTENT;
1477 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1478 			if (list == NULL)
1479 				return ENOMEM;
1480 			KQ_LOCK(kq);
1481 			if (kq->kq_knlistsize > fd) {
1482 				to_free = list;
1483 				list = NULL;
1484 			} else {
1485 				if (kq->kq_knlist != NULL) {
1486 					bcopy(kq->kq_knlist, list,
1487 					    kq->kq_knlistsize * sizeof(*list));
1488 					to_free = kq->kq_knlist;
1489 					kq->kq_knlist = NULL;
1490 				}
1491 				bzero((caddr_t)list +
1492 				    kq->kq_knlistsize * sizeof(*list),
1493 				    (size - kq->kq_knlistsize) * sizeof(*list));
1494 				kq->kq_knlistsize = size;
1495 				kq->kq_knlist = list;
1496 			}
1497 			KQ_UNLOCK(kq);
1498 		}
1499 	} else {
1500 		if (kq->kq_knhashmask == 0) {
1501 			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1502 			    &tmp_knhashmask);
1503 			if (tmp_knhash == NULL)
1504 				return ENOMEM;
1505 			KQ_LOCK(kq);
1506 			if (kq->kq_knhashmask == 0) {
1507 				kq->kq_knhash = tmp_knhash;
1508 				kq->kq_knhashmask = tmp_knhashmask;
1509 			} else {
1510 				to_free = tmp_knhash;
1511 			}
1512 			KQ_UNLOCK(kq);
1513 		}
1514 	}
1515 	free(to_free, M_KQUEUE);
1516 
1517 	KQ_NOTOWNED(kq);
1518 	return 0;
1519 }
1520 
1521 static void
1522 kqueue_task(void *arg, int pending)
1523 {
1524 	struct kqueue *kq;
1525 	int haskqglobal;
1526 
1527 	haskqglobal = 0;
1528 	kq = arg;
1529 
1530 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1531 	KQ_LOCK(kq);
1532 
1533 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1534 
1535 	kq->kq_state &= ~KQ_TASKSCHED;
1536 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1537 		wakeup(&kq->kq_state);
1538 	}
1539 	KQ_UNLOCK(kq);
1540 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1541 }
1542 
1543 /*
1544  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1545  * We treat KN_MARKER knotes as if they are INFLUX.
1546  */
1547 static int
1548 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1549     const struct timespec *tsp, struct kevent *keva, struct thread *td)
1550 {
1551 	struct kevent *kevp;
1552 	struct knote *kn, *marker;
1553 	struct knlist *knl;
1554 	sbintime_t asbt, rsbt;
1555 	int count, error, haskqglobal, influx, nkev, touch;
1556 
1557 	count = maxevents;
1558 	nkev = 0;
1559 	error = 0;
1560 	haskqglobal = 0;
1561 
1562 	if (maxevents == 0)
1563 		goto done_nl;
1564 
1565 	rsbt = 0;
1566 	if (tsp != NULL) {
1567 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
1568 		    tsp->tv_nsec >= 1000000000) {
1569 			error = EINVAL;
1570 			goto done_nl;
1571 		}
1572 		if (timespecisset(tsp)) {
1573 			if (tsp->tv_sec <= INT32_MAX) {
1574 				rsbt = tstosbt(*tsp);
1575 				if (TIMESEL(&asbt, rsbt))
1576 					asbt += tc_tick_sbt;
1577 				if (asbt <= SBT_MAX - rsbt)
1578 					asbt += rsbt;
1579 				else
1580 					asbt = 0;
1581 				rsbt >>= tc_precexp;
1582 			} else
1583 				asbt = 0;
1584 		} else
1585 			asbt = -1;
1586 	} else
1587 		asbt = 0;
1588 	marker = knote_alloc(1);
1589 	marker->kn_status = KN_MARKER;
1590 	KQ_LOCK(kq);
1591 
1592 retry:
1593 	kevp = keva;
1594 	if (kq->kq_count == 0) {
1595 		if (asbt == -1) {
1596 			error = EWOULDBLOCK;
1597 		} else {
1598 			kq->kq_state |= KQ_SLEEP;
1599 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1600 			    "kqread", asbt, rsbt, C_ABSOLUTE);
1601 		}
1602 		if (error == 0)
1603 			goto retry;
1604 		/* don't restart after signals... */
1605 		if (error == ERESTART)
1606 			error = EINTR;
1607 		else if (error == EWOULDBLOCK)
1608 			error = 0;
1609 		goto done;
1610 	}
1611 
1612 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1613 	influx = 0;
1614 	while (count) {
1615 		KQ_OWNED(kq);
1616 		kn = TAILQ_FIRST(&kq->kq_head);
1617 
1618 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1619 		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1620 			if (influx) {
1621 				influx = 0;
1622 				KQ_FLUX_WAKEUP(kq);
1623 			}
1624 			kq->kq_state |= KQ_FLUXWAIT;
1625 			error = msleep(kq, &kq->kq_lock, PSOCK,
1626 			    "kqflxwt", 0);
1627 			continue;
1628 		}
1629 
1630 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1631 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1632 			kn->kn_status &= ~KN_QUEUED;
1633 			kq->kq_count--;
1634 			continue;
1635 		}
1636 		if (kn == marker) {
1637 			KQ_FLUX_WAKEUP(kq);
1638 			if (count == maxevents)
1639 				goto retry;
1640 			goto done;
1641 		}
1642 		KASSERT((kn->kn_status & KN_INFLUX) == 0,
1643 		    ("KN_INFLUX set when not suppose to be"));
1644 
1645 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1646 			kn->kn_status &= ~KN_QUEUED;
1647 			kn->kn_status |= KN_INFLUX;
1648 			kq->kq_count--;
1649 			KQ_UNLOCK(kq);
1650 			/*
1651 			 * We don't need to lock the list since we've marked
1652 			 * it _INFLUX.
1653 			 */
1654 			if (!(kn->kn_status & KN_DETACHED))
1655 				kn->kn_fop->f_detach(kn);
1656 			knote_drop(kn, td);
1657 			KQ_LOCK(kq);
1658 			continue;
1659 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1660 			kn->kn_status &= ~KN_QUEUED;
1661 			kn->kn_status |= KN_INFLUX;
1662 			kq->kq_count--;
1663 			KQ_UNLOCK(kq);
1664 			/*
1665 			 * We don't need to lock the list since we've marked
1666 			 * it _INFLUX.
1667 			 */
1668 			*kevp = kn->kn_kevent;
1669 			if (!(kn->kn_status & KN_DETACHED))
1670 				kn->kn_fop->f_detach(kn);
1671 			knote_drop(kn, td);
1672 			KQ_LOCK(kq);
1673 			kn = NULL;
1674 		} else {
1675 			kn->kn_status |= KN_INFLUX | KN_SCAN;
1676 			KQ_UNLOCK(kq);
1677 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1678 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1679 			knl = kn_list_lock(kn);
1680 			if (kn->kn_fop->f_event(kn, 0) == 0) {
1681 				KQ_LOCK(kq);
1682 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1683 				kn->kn_status &=
1684 				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
1685 				    KN_SCAN);
1686 				kq->kq_count--;
1687 				kn_list_unlock(knl);
1688 				influx = 1;
1689 				continue;
1690 			}
1691 			touch = (!kn->kn_fop->f_isfd &&
1692 			    kn->kn_fop->f_touch != NULL);
1693 			if (touch)
1694 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1695 			else
1696 				*kevp = kn->kn_kevent;
1697 			KQ_LOCK(kq);
1698 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1699 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1700 				/*
1701 				 * Manually clear knotes who weren't
1702 				 * 'touch'ed.
1703 				 */
1704 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1705 					kn->kn_data = 0;
1706 					kn->kn_fflags = 0;
1707 				}
1708 				if (kn->kn_flags & EV_DISPATCH)
1709 					kn->kn_status |= KN_DISABLED;
1710 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1711 				kq->kq_count--;
1712 			} else
1713 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1714 
1715 			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1716 			kn_list_unlock(knl);
1717 			influx = 1;
1718 		}
1719 
1720 		/* we are returning a copy to the user */
1721 		kevp++;
1722 		nkev++;
1723 		count--;
1724 
1725 		if (nkev == KQ_NEVENTS) {
1726 			influx = 0;
1727 			KQ_UNLOCK_FLUX(kq);
1728 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1729 			nkev = 0;
1730 			kevp = keva;
1731 			KQ_LOCK(kq);
1732 			if (error)
1733 				break;
1734 		}
1735 	}
1736 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1737 done:
1738 	KQ_OWNED(kq);
1739 	KQ_UNLOCK_FLUX(kq);
1740 	knote_free(marker);
1741 done_nl:
1742 	KQ_NOTOWNED(kq);
1743 	if (nkev != 0)
1744 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1745 	td->td_retval[0] = maxevents - count;
1746 	return (error);
1747 }
1748 
1749 /*ARGSUSED*/
1750 static int
1751 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1752 	struct ucred *active_cred, struct thread *td)
1753 {
1754 	/*
1755 	 * Enabling sigio causes two major problems:
1756 	 * 1) infinite recursion:
1757 	 * Synopsys: kevent is being used to track signals and have FIOASYNC
1758 	 * set.  On receipt of a signal this will cause a kqueue to recurse
1759 	 * into itself over and over.  Sending the sigio causes the kqueue
1760 	 * to become ready, which in turn posts sigio again, forever.
1761 	 * Solution: this can be solved by setting a flag in the kqueue that
1762 	 * we have a SIGIO in progress.
1763 	 * 2) locking problems:
1764 	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
1765 	 * us above the proc and pgrp locks.
1766 	 * Solution: Post a signal using an async mechanism, being sure to
1767 	 * record a generation count in the delivery so that we do not deliver
1768 	 * a signal to the wrong process.
1769 	 *
1770 	 * Note, these two mechanisms are somewhat mutually exclusive!
1771 	 */
1772 #if 0
1773 	struct kqueue *kq;
1774 
1775 	kq = fp->f_data;
1776 	switch (cmd) {
1777 	case FIOASYNC:
1778 		if (*(int *)data) {
1779 			kq->kq_state |= KQ_ASYNC;
1780 		} else {
1781 			kq->kq_state &= ~KQ_ASYNC;
1782 		}
1783 		return (0);
1784 
1785 	case FIOSETOWN:
1786 		return (fsetown(*(int *)data, &kq->kq_sigio));
1787 
1788 	case FIOGETOWN:
1789 		*(int *)data = fgetown(&kq->kq_sigio);
1790 		return (0);
1791 	}
1792 #endif
1793 
1794 	return (ENOTTY);
1795 }
1796 
1797 /*ARGSUSED*/
1798 static int
1799 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1800 	struct thread *td)
1801 {
1802 	struct kqueue *kq;
1803 	int revents = 0;
1804 	int error;
1805 
1806 	if ((error = kqueue_acquire(fp, &kq)))
1807 		return POLLERR;
1808 
1809 	KQ_LOCK(kq);
1810 	if (events & (POLLIN | POLLRDNORM)) {
1811 		if (kq->kq_count) {
1812 			revents |= events & (POLLIN | POLLRDNORM);
1813 		} else {
1814 			selrecord(td, &kq->kq_sel);
1815 			if (SEL_WAITING(&kq->kq_sel))
1816 				kq->kq_state |= KQ_SEL;
1817 		}
1818 	}
1819 	kqueue_release(kq, 1);
1820 	KQ_UNLOCK(kq);
1821 	return (revents);
1822 }
1823 
1824 /*ARGSUSED*/
1825 static int
1826 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1827 	struct thread *td)
1828 {
1829 
1830 	bzero((void *)st, sizeof *st);
1831 	/*
1832 	 * We no longer return kq_count because the unlocked value is useless.
1833 	 * If you spent all this time getting the count, why not spend your
1834 	 * syscall better by calling kevent?
1835 	 *
1836 	 * XXX - This is needed for libc_r.
1837 	 */
1838 	st->st_mode = S_IFIFO;
1839 	return (0);
1840 }
1841 
1842 static void
1843 kqueue_drain(struct kqueue *kq, struct thread *td)
1844 {
1845 	struct knote *kn;
1846 	int i;
1847 
1848 	KQ_LOCK(kq);
1849 
1850 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1851 	    ("kqueue already closing"));
1852 	kq->kq_state |= KQ_CLOSING;
1853 	if (kq->kq_refcnt > 1)
1854 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1855 
1856 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1857 
1858 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
1859 	    ("kqueue's knlist not empty"));
1860 
1861 	for (i = 0; i < kq->kq_knlistsize; i++) {
1862 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1863 			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1864 				kq->kq_state |= KQ_FLUXWAIT;
1865 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
1866 				continue;
1867 			}
1868 			kn->kn_status |= KN_INFLUX;
1869 			KQ_UNLOCK(kq);
1870 			if (!(kn->kn_status & KN_DETACHED))
1871 				kn->kn_fop->f_detach(kn);
1872 			knote_drop(kn, td);
1873 			KQ_LOCK(kq);
1874 		}
1875 	}
1876 	if (kq->kq_knhashmask != 0) {
1877 		for (i = 0; i <= kq->kq_knhashmask; i++) {
1878 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1879 				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1880 					kq->kq_state |= KQ_FLUXWAIT;
1881 					msleep(kq, &kq->kq_lock, PSOCK,
1882 					       "kqclo2", 0);
1883 					continue;
1884 				}
1885 				kn->kn_status |= KN_INFLUX;
1886 				KQ_UNLOCK(kq);
1887 				if (!(kn->kn_status & KN_DETACHED))
1888 					kn->kn_fop->f_detach(kn);
1889 				knote_drop(kn, td);
1890 				KQ_LOCK(kq);
1891 			}
1892 		}
1893 	}
1894 
1895 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1896 		kq->kq_state |= KQ_TASKDRAIN;
1897 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1898 	}
1899 
1900 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1901 		selwakeuppri(&kq->kq_sel, PSOCK);
1902 		if (!SEL_WAITING(&kq->kq_sel))
1903 			kq->kq_state &= ~KQ_SEL;
1904 	}
1905 
1906 	KQ_UNLOCK(kq);
1907 }
1908 
1909 static void
1910 kqueue_destroy(struct kqueue *kq)
1911 {
1912 
1913 	KASSERT(kq->kq_fdp == NULL,
1914 	    ("kqueue still attached to a file descriptor"));
1915 	seldrain(&kq->kq_sel);
1916 	knlist_destroy(&kq->kq_sel.si_note);
1917 	mtx_destroy(&kq->kq_lock);
1918 
1919 	if (kq->kq_knhash != NULL)
1920 		free(kq->kq_knhash, M_KQUEUE);
1921 	if (kq->kq_knlist != NULL)
1922 		free(kq->kq_knlist, M_KQUEUE);
1923 
1924 	funsetown(&kq->kq_sigio);
1925 }
1926 
1927 /*ARGSUSED*/
1928 static int
1929 kqueue_close(struct file *fp, struct thread *td)
1930 {
1931 	struct kqueue *kq = fp->f_data;
1932 	struct filedesc *fdp;
1933 	int error;
1934 	int filedesc_unlock;
1935 
1936 	if ((error = kqueue_acquire(fp, &kq)))
1937 		return error;
1938 	kqueue_drain(kq, td);
1939 
1940 	/*
1941 	 * We could be called due to the knote_drop() doing fdrop(),
1942 	 * called from kqueue_register().  In this case the global
1943 	 * lock is owned, and filedesc sx is locked before, to not
1944 	 * take the sleepable lock after non-sleepable.
1945 	 */
1946 	fdp = kq->kq_fdp;
1947 	kq->kq_fdp = NULL;
1948 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
1949 		FILEDESC_XLOCK(fdp);
1950 		filedesc_unlock = 1;
1951 	} else
1952 		filedesc_unlock = 0;
1953 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
1954 	if (filedesc_unlock)
1955 		FILEDESC_XUNLOCK(fdp);
1956 
1957 	kqueue_destroy(kq);
1958 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
1959 	crfree(kq->kq_cred);
1960 	free(kq, M_KQUEUE);
1961 	fp->f_data = NULL;
1962 
1963 	return (0);
1964 }
1965 
1966 static int
1967 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
1968 {
1969 
1970 	kif->kf_type = KF_TYPE_KQUEUE;
1971 	return (0);
1972 }
1973 
1974 static void
1975 kqueue_wakeup(struct kqueue *kq)
1976 {
1977 	KQ_OWNED(kq);
1978 
1979 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1980 		kq->kq_state &= ~KQ_SLEEP;
1981 		wakeup(kq);
1982 	}
1983 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1984 		selwakeuppri(&kq->kq_sel, PSOCK);
1985 		if (!SEL_WAITING(&kq->kq_sel))
1986 			kq->kq_state &= ~KQ_SEL;
1987 	}
1988 	if (!knlist_empty(&kq->kq_sel.si_note))
1989 		kqueue_schedtask(kq);
1990 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1991 		pgsigio(&kq->kq_sigio, SIGIO, 0);
1992 	}
1993 }
1994 
1995 /*
1996  * Walk down a list of knotes, activating them if their event has triggered.
1997  *
1998  * There is a possibility to optimize in the case of one kq watching another.
1999  * Instead of scheduling a task to wake it up, you could pass enough state
2000  * down the chain to make up the parent kqueue.  Make this code functional
2001  * first.
2002  */
2003 void
2004 knote(struct knlist *list, long hint, int lockflags)
2005 {
2006 	struct kqueue *kq;
2007 	struct knote *kn, *tkn;
2008 	int error;
2009 	bool own_influx;
2010 
2011 	if (list == NULL)
2012 		return;
2013 
2014 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2015 
2016 	if ((lockflags & KNF_LISTLOCKED) == 0)
2017 		list->kl_lock(list->kl_lockarg);
2018 
2019 	/*
2020 	 * If we unlock the list lock (and set KN_INFLUX), we can
2021 	 * eliminate the kqueue scheduling, but this will introduce
2022 	 * four lock/unlock's for each knote to test.  Also, marker
2023 	 * would be needed to keep iteration position, since filters
2024 	 * or other threads could remove events.
2025 	 */
2026 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2027 		kq = kn->kn_kq;
2028 		KQ_LOCK(kq);
2029 		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
2030 			/*
2031 			 * Do not process the influx notes, except for
2032 			 * the influx coming from the kq unlock in the
2033 			 * kqueue_scan().  In the later case, we do
2034 			 * not interfere with the scan, since the code
2035 			 * fragment in kqueue_scan() locks the knlist,
2036 			 * and cannot proceed until we finished.
2037 			 */
2038 			KQ_UNLOCK(kq);
2039 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2040 			own_influx = (kn->kn_status & KN_INFLUX) == 0;
2041 			if (own_influx)
2042 				kn->kn_status |= KN_INFLUX;
2043 			KQ_UNLOCK(kq);
2044 			error = kn->kn_fop->f_event(kn, hint);
2045 			KQ_LOCK(kq);
2046 			if (own_influx)
2047 				kn->kn_status &= ~KN_INFLUX;
2048 			if (error)
2049 				KNOTE_ACTIVATE(kn, 1);
2050 			KQ_UNLOCK_FLUX(kq);
2051 		} else {
2052 			kn->kn_status |= KN_HASKQLOCK;
2053 			if (kn->kn_fop->f_event(kn, hint))
2054 				KNOTE_ACTIVATE(kn, 1);
2055 			kn->kn_status &= ~KN_HASKQLOCK;
2056 			KQ_UNLOCK(kq);
2057 		}
2058 	}
2059 	if ((lockflags & KNF_LISTLOCKED) == 0)
2060 		list->kl_unlock(list->kl_lockarg);
2061 }
2062 
2063 /*
2064  * add a knote to a knlist
2065  */
2066 void
2067 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2068 {
2069 	KNL_ASSERT_LOCK(knl, islocked);
2070 	KQ_NOTOWNED(kn->kn_kq);
2071 	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
2072 	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
2073 	if (!islocked)
2074 		knl->kl_lock(knl->kl_lockarg);
2075 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2076 	if (!islocked)
2077 		knl->kl_unlock(knl->kl_lockarg);
2078 	KQ_LOCK(kn->kn_kq);
2079 	kn->kn_knlist = knl;
2080 	kn->kn_status &= ~KN_DETACHED;
2081 	KQ_UNLOCK(kn->kn_kq);
2082 }
2083 
2084 static void
2085 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2086     int kqislocked)
2087 {
2088 	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
2089 	KNL_ASSERT_LOCK(knl, knlislocked);
2090 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2091 	if (!kqislocked)
2092 		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
2093     ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
2094 	if (!knlislocked)
2095 		knl->kl_lock(knl->kl_lockarg);
2096 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2097 	kn->kn_knlist = NULL;
2098 	if (!knlislocked)
2099 		kn_list_unlock(knl);
2100 	if (!kqislocked)
2101 		KQ_LOCK(kn->kn_kq);
2102 	kn->kn_status |= KN_DETACHED;
2103 	if (!kqislocked)
2104 		KQ_UNLOCK(kn->kn_kq);
2105 }
2106 
2107 /*
2108  * remove knote from the specified knlist
2109  */
2110 void
2111 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2112 {
2113 
2114 	knlist_remove_kq(knl, kn, islocked, 0);
2115 }
2116 
2117 int
2118 knlist_empty(struct knlist *knl)
2119 {
2120 
2121 	KNL_ASSERT_LOCKED(knl);
2122 	return SLIST_EMPTY(&knl->kl_list);
2123 }
2124 
2125 static struct mtx	knlist_lock;
2126 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2127 	MTX_DEF);
2128 static void knlist_mtx_lock(void *arg);
2129 static void knlist_mtx_unlock(void *arg);
2130 
2131 static void
2132 knlist_mtx_lock(void *arg)
2133 {
2134 
2135 	mtx_lock((struct mtx *)arg);
2136 }
2137 
2138 static void
2139 knlist_mtx_unlock(void *arg)
2140 {
2141 
2142 	mtx_unlock((struct mtx *)arg);
2143 }
2144 
2145 static void
2146 knlist_mtx_assert_locked(void *arg)
2147 {
2148 
2149 	mtx_assert((struct mtx *)arg, MA_OWNED);
2150 }
2151 
2152 static void
2153 knlist_mtx_assert_unlocked(void *arg)
2154 {
2155 
2156 	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2157 }
2158 
2159 static void
2160 knlist_rw_rlock(void *arg)
2161 {
2162 
2163 	rw_rlock((struct rwlock *)arg);
2164 }
2165 
2166 static void
2167 knlist_rw_runlock(void *arg)
2168 {
2169 
2170 	rw_runlock((struct rwlock *)arg);
2171 }
2172 
2173 static void
2174 knlist_rw_assert_locked(void *arg)
2175 {
2176 
2177 	rw_assert((struct rwlock *)arg, RA_LOCKED);
2178 }
2179 
2180 static void
2181 knlist_rw_assert_unlocked(void *arg)
2182 {
2183 
2184 	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
2185 }
2186 
2187 void
2188 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2189     void (*kl_unlock)(void *),
2190     void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
2191 {
2192 
2193 	if (lock == NULL)
2194 		knl->kl_lockarg = &knlist_lock;
2195 	else
2196 		knl->kl_lockarg = lock;
2197 
2198 	if (kl_lock == NULL)
2199 		knl->kl_lock = knlist_mtx_lock;
2200 	else
2201 		knl->kl_lock = kl_lock;
2202 	if (kl_unlock == NULL)
2203 		knl->kl_unlock = knlist_mtx_unlock;
2204 	else
2205 		knl->kl_unlock = kl_unlock;
2206 	if (kl_assert_locked == NULL)
2207 		knl->kl_assert_locked = knlist_mtx_assert_locked;
2208 	else
2209 		knl->kl_assert_locked = kl_assert_locked;
2210 	if (kl_assert_unlocked == NULL)
2211 		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
2212 	else
2213 		knl->kl_assert_unlocked = kl_assert_unlocked;
2214 
2215 	knl->kl_autodestroy = 0;
2216 	SLIST_INIT(&knl->kl_list);
2217 }
2218 
2219 void
2220 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2221 {
2222 
2223 	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
2224 }
2225 
2226 struct knlist *
2227 knlist_alloc(struct mtx *lock)
2228 {
2229 	struct knlist *knl;
2230 
2231 	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2232 	knlist_init_mtx(knl, lock);
2233 	return (knl);
2234 }
2235 
2236 void
2237 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
2238 {
2239 
2240 	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
2241 	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
2242 }
2243 
2244 void
2245 knlist_destroy(struct knlist *knl)
2246 {
2247 
2248 #ifdef INVARIANTS
2249 	/*
2250 	 * if we run across this error, we need to find the offending
2251 	 * driver and have it call knlist_clear or knlist_delete.
2252 	 */
2253 	if (!SLIST_EMPTY(&knl->kl_list))
2254 		printf("WARNING: destroying knlist w/ knotes on it!\n");
2255 #endif
2256 
2257 	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
2258 	SLIST_INIT(&knl->kl_list);
2259 }
2260 
2261 void
2262 knlist_detach(struct knlist *knl)
2263 {
2264 
2265 	KNL_ASSERT_LOCKED(knl);
2266 	knl->kl_autodestroy = 1;
2267 	if (knlist_empty(knl)) {
2268 		knlist_destroy(knl);
2269 		free(knl, M_KQUEUE);
2270 	}
2271 }
2272 
2273 /*
2274  * Even if we are locked, we may need to drop the lock to allow any influx
2275  * knotes time to "settle".
2276  */
2277 void
2278 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2279 {
2280 	struct knote *kn, *kn2;
2281 	struct kqueue *kq;
2282 
2283 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2284 	if (islocked)
2285 		KNL_ASSERT_LOCKED(knl);
2286 	else {
2287 		KNL_ASSERT_UNLOCKED(knl);
2288 again:		/* need to reacquire lock since we have dropped it */
2289 		knl->kl_lock(knl->kl_lockarg);
2290 	}
2291 
2292 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2293 		kq = kn->kn_kq;
2294 		KQ_LOCK(kq);
2295 		if ((kn->kn_status & KN_INFLUX)) {
2296 			KQ_UNLOCK(kq);
2297 			continue;
2298 		}
2299 		knlist_remove_kq(knl, kn, 1, 1);
2300 		if (killkn) {
2301 			kn->kn_status |= KN_INFLUX | KN_DETACHED;
2302 			KQ_UNLOCK(kq);
2303 			knote_drop(kn, td);
2304 		} else {
2305 			/* Make sure cleared knotes disappear soon */
2306 			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2307 			KQ_UNLOCK(kq);
2308 		}
2309 		kq = NULL;
2310 	}
2311 
2312 	if (!SLIST_EMPTY(&knl->kl_list)) {
2313 		/* there are still KN_INFLUX remaining */
2314 		kn = SLIST_FIRST(&knl->kl_list);
2315 		kq = kn->kn_kq;
2316 		KQ_LOCK(kq);
2317 		KASSERT(kn->kn_status & KN_INFLUX,
2318 		    ("knote removed w/o list lock"));
2319 		knl->kl_unlock(knl->kl_lockarg);
2320 		kq->kq_state |= KQ_FLUXWAIT;
2321 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2322 		kq = NULL;
2323 		goto again;
2324 	}
2325 
2326 	if (islocked)
2327 		KNL_ASSERT_LOCKED(knl);
2328 	else {
2329 		knl->kl_unlock(knl->kl_lockarg);
2330 		KNL_ASSERT_UNLOCKED(knl);
2331 	}
2332 }
2333 
2334 /*
2335  * Remove all knotes referencing a specified fd must be called with FILEDESC
2336  * lock.  This prevents a race where a new fd comes along and occupies the
2337  * entry and we attach a knote to the fd.
2338  */
2339 void
2340 knote_fdclose(struct thread *td, int fd)
2341 {
2342 	struct filedesc *fdp = td->td_proc->p_fd;
2343 	struct kqueue *kq;
2344 	struct knote *kn;
2345 	int influx;
2346 
2347 	FILEDESC_XLOCK_ASSERT(fdp);
2348 
2349 	/*
2350 	 * We shouldn't have to worry about new kevents appearing on fd
2351 	 * since filedesc is locked.
2352 	 */
2353 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2354 		KQ_LOCK(kq);
2355 
2356 again:
2357 		influx = 0;
2358 		while (kq->kq_knlistsize > fd &&
2359 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2360 			if (kn->kn_status & KN_INFLUX) {
2361 				/* someone else might be waiting on our knote */
2362 				if (influx)
2363 					wakeup(kq);
2364 				kq->kq_state |= KQ_FLUXWAIT;
2365 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2366 				goto again;
2367 			}
2368 			kn->kn_status |= KN_INFLUX;
2369 			KQ_UNLOCK(kq);
2370 			if (!(kn->kn_status & KN_DETACHED))
2371 				kn->kn_fop->f_detach(kn);
2372 			knote_drop(kn, td);
2373 			influx = 1;
2374 			KQ_LOCK(kq);
2375 		}
2376 		KQ_UNLOCK_FLUX(kq);
2377 	}
2378 }
2379 
2380 static int
2381 knote_attach(struct knote *kn, struct kqueue *kq)
2382 {
2383 	struct klist *list;
2384 
2385 	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
2386 	KQ_OWNED(kq);
2387 
2388 	if (kn->kn_fop->f_isfd) {
2389 		if (kn->kn_id >= kq->kq_knlistsize)
2390 			return ENOMEM;
2391 		list = &kq->kq_knlist[kn->kn_id];
2392 	} else {
2393 		if (kq->kq_knhash == NULL)
2394 			return ENOMEM;
2395 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2396 	}
2397 
2398 	SLIST_INSERT_HEAD(list, kn, kn_link);
2399 
2400 	return 0;
2401 }
2402 
2403 /*
2404  * knote must already have been detached using the f_detach method.
2405  * no lock need to be held, it is assumed that the KN_INFLUX flag is set
2406  * to prevent other removal.
2407  */
2408 static void
2409 knote_drop(struct knote *kn, struct thread *td)
2410 {
2411 	struct kqueue *kq;
2412 	struct klist *list;
2413 
2414 	kq = kn->kn_kq;
2415 
2416 	KQ_NOTOWNED(kq);
2417 	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2418 	    ("knote_drop called without KN_INFLUX set in kn_status"));
2419 
2420 	KQ_LOCK(kq);
2421 	if (kn->kn_fop->f_isfd)
2422 		list = &kq->kq_knlist[kn->kn_id];
2423 	else
2424 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2425 
2426 	if (!SLIST_EMPTY(list))
2427 		SLIST_REMOVE(list, kn, knote, kn_link);
2428 	if (kn->kn_status & KN_QUEUED)
2429 		knote_dequeue(kn);
2430 	KQ_UNLOCK_FLUX(kq);
2431 
2432 	if (kn->kn_fop->f_isfd) {
2433 		fdrop(kn->kn_fp, td);
2434 		kn->kn_fp = NULL;
2435 	}
2436 	kqueue_fo_release(kn->kn_kevent.filter);
2437 	kn->kn_fop = NULL;
2438 	knote_free(kn);
2439 }
2440 
2441 static void
2442 knote_enqueue(struct knote *kn)
2443 {
2444 	struct kqueue *kq = kn->kn_kq;
2445 
2446 	KQ_OWNED(kn->kn_kq);
2447 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2448 
2449 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2450 	kn->kn_status |= KN_QUEUED;
2451 	kq->kq_count++;
2452 	kqueue_wakeup(kq);
2453 }
2454 
2455 static void
2456 knote_dequeue(struct knote *kn)
2457 {
2458 	struct kqueue *kq = kn->kn_kq;
2459 
2460 	KQ_OWNED(kn->kn_kq);
2461 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2462 
2463 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2464 	kn->kn_status &= ~KN_QUEUED;
2465 	kq->kq_count--;
2466 }
2467 
2468 static void
2469 knote_init(void)
2470 {
2471 
2472 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2473 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2474 }
2475 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2476 
2477 static struct knote *
2478 knote_alloc(int waitok)
2479 {
2480 
2481 	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
2482 	    M_ZERO));
2483 }
2484 
2485 static void
2486 knote_free(struct knote *kn)
2487 {
2488 
2489 	uma_zfree(knote_zone, kn);
2490 }
2491 
2492 /*
2493  * Register the kev w/ the kq specified by fd.
2494  */
2495 int
2496 kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
2497 {
2498 	struct kqueue *kq;
2499 	struct file *fp;
2500 	cap_rights_t rights;
2501 	int error;
2502 
2503 	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
2504 	if (error != 0)
2505 		return (error);
2506 	if ((error = kqueue_acquire(fp, &kq)) != 0)
2507 		goto noacquire;
2508 
2509 	error = kqueue_register(kq, kev, td, waitok);
2510 
2511 	kqueue_release(kq, 0);
2512 
2513 noacquire:
2514 	fdrop(fp, td);
2515 
2516 	return error;
2517 }
2518