xref: /freebsd/sys/kern/kern_event.c (revision 2546665afcaf0d53dc2c7058fee96354b3680f5a)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/malloc.h>
37 #include <sys/unistd.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/filio.h>
41 #include <sys/fcntl.h>
42 #include <sys/selinfo.h>
43 #include <sys/queue.h>
44 #include <sys/event.h>
45 #include <sys/eventvar.h>
46 #include <sys/poll.h>
47 #include <sys/protosw.h>
48 #include <sys/sigio.h>
49 #include <sys/signalvar.h>
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/stat.h>
53 #include <sys/sysctl.h>
54 #include <sys/sysproto.h>
55 #include <sys/uio.h>
56 
57 #include <vm/uma.h>
58 
59 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
60 
61 static int	kqueue_scan(struct file *fp, int maxevents,
62 		    struct kevent *ulistp, const struct timespec *timeout,
63 		    struct thread *td);
64 static void 	kqueue_wakeup(struct kqueue *kq);
65 
66 static fo_rdwr_t	kqueue_read;
67 static fo_rdwr_t	kqueue_write;
68 static fo_ioctl_t	kqueue_ioctl;
69 static fo_poll_t	kqueue_poll;
70 static fo_kqfilter_t	kqueue_kqfilter;
71 static fo_stat_t	kqueue_stat;
72 static fo_close_t	kqueue_close;
73 
74 static struct fileops kqueueops = {
75 	.fo_read = kqueue_read,
76 	.fo_write = kqueue_write,
77 	.fo_ioctl = kqueue_ioctl,
78 	.fo_poll = kqueue_poll,
79 	.fo_kqfilter = kqueue_kqfilter,
80 	.fo_stat = kqueue_stat,
81 	.fo_close = kqueue_close,
82 };
83 
84 static void 	knote_attach(struct knote *kn, struct filedesc *fdp);
85 static void 	knote_drop(struct knote *kn, struct thread *td);
86 static void 	knote_enqueue(struct knote *kn);
87 static void 	knote_dequeue(struct knote *kn);
88 static void 	knote_init(void);
89 static struct 	knote *knote_alloc(void);
90 static void 	knote_free(struct knote *kn);
91 
92 static void	filt_kqdetach(struct knote *kn);
93 static int	filt_kqueue(struct knote *kn, long hint);
94 static int	filt_procattach(struct knote *kn);
95 static void	filt_procdetach(struct knote *kn);
96 static int	filt_proc(struct knote *kn, long hint);
97 static int	filt_fileattach(struct knote *kn);
98 static void	filt_timerexpire(void *knx);
99 static int	filt_timerattach(struct knote *kn);
100 static void	filt_timerdetach(struct knote *kn);
101 static int	filt_timer(struct knote *kn, long hint);
102 
103 static struct filterops file_filtops =
104 	{ 1, filt_fileattach, NULL, NULL };
105 static struct filterops kqread_filtops =
106 	{ 1, NULL, filt_kqdetach, filt_kqueue };
107 static struct filterops proc_filtops =
108 	{ 0, filt_procattach, filt_procdetach, filt_proc };
109 static struct filterops timer_filtops =
110 	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
111 
112 static uma_zone_t	knote_zone;
113 static int 		kq_ncallouts = 0;
114 static int 		kq_calloutmax = (4 * 1024);
115 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
116     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
117 
118 #define KNOTE_ACTIVATE(kn) do { 					\
119 	kn->kn_status |= KN_ACTIVE;					\
120 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
121 		knote_enqueue(kn);					\
122 } while(0)
123 
124 #define	KN_HASHSIZE		64		/* XXX should be tunable */
125 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
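/*
 * Illustrative arithmetic (not from the original source): with the default
 * 64-bucket table (mask 63), an ident of 300 hashes to
 * (300 ^ (300 >> 8)) & 63 = (300 ^ 1) & 63 = 45.
 */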
126 
127 static int
128 filt_nullattach(struct knote *kn)
129 {
130 
131 	return (ENXIO);
132 }
133 
134 struct filterops null_filtops =
135 	{ 0, filt_nullattach, NULL, NULL };
136 
137 extern struct filterops sig_filtops;
138 extern struct filterops fs_filtops;
139 
140 /*
141  * Table for all system-defined filters.
142  */
143 static struct filterops *sysfilt_ops[] = {
144 	&file_filtops,			/* EVFILT_READ */
145 	&file_filtops,			/* EVFILT_WRITE */
146 	&null_filtops,			/* EVFILT_AIO */
147 	&file_filtops,			/* EVFILT_VNODE */
148 	&proc_filtops,			/* EVFILT_PROC */
149 	&sig_filtops,			/* EVFILT_SIGNAL */
150 	&timer_filtops,			/* EVFILT_TIMER */
151 	&file_filtops,			/* EVFILT_NETDEV */
152 	&fs_filtops,			/* EVFILT_FS */
153 };
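
/*
 * The system filters are small negative numbers (EVFILT_READ is -1), so the
 * table above is indexed by the one's complement of the filter value; for
 * example, ~EVFILT_READ == ~(-1) == 0 selects file_filtops, the first entry.
 */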
154 
155 static int
156 filt_fileattach(struct knote *kn)
157 {
158 
159 	return (fo_kqfilter(kn->kn_fp, kn));
160 }
161 
162 /*ARGSUSED*/
163 static int
164 kqueue_kqfilter(struct file *fp, struct knote *kn)
165 {
166 	struct kqueue *kq = kn->kn_fp->f_data;
167 
168 	if (kn->kn_filter != EVFILT_READ)
169 		return (1);
170 
171 	kn->kn_fop = &kqread_filtops;
172 	SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
173 	return (0);
174 }
175 
176 static void
177 filt_kqdetach(struct knote *kn)
178 {
179 	struct kqueue *kq = kn->kn_fp->f_data;
180 
181 	SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
182 }
183 
184 /*ARGSUSED*/
185 static int
186 filt_kqueue(struct knote *kn, long hint)
187 {
188 	struct kqueue *kq = kn->kn_fp->f_data;
189 
190 	kn->kn_data = kq->kq_count;
191 	return (kn->kn_data > 0);
192 }
193 
194 static int
195 filt_procattach(struct knote *kn)
196 {
197 	struct proc *p;
198 	int immediate;
199 	int error;
200 
201 	immediate = 0;
202 	p = pfind(kn->kn_id);
203 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
204 		p = zpfind(kn->kn_id);
205 		immediate = 1;
206 	}
207 	if (p == NULL)
208 		return (ESRCH);
209 	if ((error = p_cansee(curthread, p))) {
210 		PROC_UNLOCK(p);
211 		return (error);
212 	}
213 
214 	kn->kn_ptr.p_proc = p;
215 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
216 
217 	/*
218 	 * internal flag indicating registration done by kernel
219 	 */
220 	if (kn->kn_flags & EV_FLAG1) {
221 		kn->kn_data = kn->kn_sdata;		/* ppid */
222 		kn->kn_fflags = NOTE_CHILD;
223 		kn->kn_flags &= ~EV_FLAG1;
224 	}
225 
226 	if (immediate == 0)
227 		SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
228 
229 	/*
230 	 * Immediately activate any exit notes if the target process is a
231 	 * zombie.  This is necessary to handle the case where the target
232 	 * process, e.g. a child, dies before the kevent is registered.
233 	 */
234 	if (immediate && filt_proc(kn, NOTE_EXIT))
235 		KNOTE_ACTIVATE(kn);
236 
237 	PROC_UNLOCK(p);
238 
239 	return (0);
240 }
241 
242 /*
243  * The knote may be attached to a different process, which may exit,
244  * leaving nothing for the knote to be attached to.  So when the process
245  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
246  * it will be deleted when read out.  However, as part of the knote deletion,
247  * this routine is called, so a check is needed to avoid actually performing
248  * a detach, because the original process does not exist any more.
249  */
250 static void
251 filt_procdetach(struct knote *kn)
252 {
253 	struct proc *p = kn->kn_ptr.p_proc;
254 
255 	if (kn->kn_status & KN_DETACHED)
256 		return;
257 
258 	PROC_LOCK(p);
259 	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
260 	PROC_UNLOCK(p);
261 }
262 
263 static int
264 filt_proc(struct knote *kn, long hint)
265 {
266 	u_int event;
267 
268 	/*
269 	 * mask off extra data
270 	 */
271 	event = (u_int)hint & NOTE_PCTRLMASK;
272 
273 	/*
274 	 * if the user is interested in this event, record it.
275 	 */
276 	if (kn->kn_sfflags & event)
277 		kn->kn_fflags |= event;
278 
279 	/*
280 	 * process is gone, so flag the event as finished.
281 	 */
282 	if (event == NOTE_EXIT) {
283 		kn->kn_status |= KN_DETACHED;
284 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
285 		return (1);
286 	}
287 
288 	/*
289 	 * process forked, and user wants to track the new process,
290 	 * so attach a new knote to it, and immediately report an
291 	 * event with the parent's pid.
292 	 */
293 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
294 		struct kevent kev;
295 		int error;
296 
297 		/*
298 		 * register knote with new process.
299 		 */
300 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
301 		kev.filter = kn->kn_filter;
302 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
303 		kev.fflags = kn->kn_sfflags;
304 		kev.data = kn->kn_id;			/* parent */
305 		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
306 		error = kqueue_register(kn->kn_kq, &kev, NULL);
307 		if (error)
308 			kn->kn_fflags |= NOTE_TRACKERR;
309 	}
310 
311 	return (kn->kn_fflags != 0);
312 }
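
/*
 * Illustrative userland registration for the fork-tracking case above
 * (kq, child_pid and ev are hypothetical caller variables):
 *
 *	EV_SET(&ev, child_pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * When the watched process forks, the code above attaches a knote to the
 * new child and reports NOTE_CHILD with the parent's pid in data.
 */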
313 
314 static void
315 filt_timerexpire(void *knx)
316 {
317 	struct knote *kn = knx;
318 	struct callout *calloutp;
319 	struct timeval tv;
320 	int tticks;
321 
322 	kn->kn_data++;
323 	KNOTE_ACTIVATE(kn);
324 
325 	if ((kn->kn_flags & EV_ONESHOT) == 0) {
326 		tv.tv_sec = kn->kn_sdata / 1000;
327 		tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
328 		tticks = tvtohz(&tv);
329 		calloutp = (struct callout *)kn->kn_hook;
330 		callout_reset(calloutp, tticks, filt_timerexpire, kn);
331 	}
332 }
333 
334 /*
335  * data contains the timer interval, in milliseconds
336  */
337 static int
338 filt_timerattach(struct knote *kn)
339 {
340 	struct callout *calloutp;
341 	struct timeval tv;
342 	int tticks;
343 
344 	if (kq_ncallouts >= kq_calloutmax)
345 		return (ENOMEM);
346 	kq_ncallouts++;
347 
348 	tv.tv_sec = kn->kn_sdata / 1000;
349 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
350 	tticks = tvtohz(&tv);
351 
352 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
353 	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
354 	    M_KQUEUE, M_WAITOK);
355 	callout_init(calloutp, 0);
356 	kn->kn_hook = calloutp;
357 	callout_reset(calloutp, tticks, filt_timerexpire, kn);
358 
359 	return (0);
360 }
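
/*
 * Illustrative userland registration of a periodic 500 ms timer (kq and ev
 * are hypothetical caller variables; the ident is an arbitrary token):
 *
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * The data field becomes kn_sdata above, and EV_CLEAR is forced so that
 * kn_data (the count of expirations) is reset each time it is collected.
 */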
361 
362 static void
363 filt_timerdetach(struct knote *kn)
364 {
365 	struct callout *calloutp;
366 
367 	calloutp = (struct callout *)kn->kn_hook;
368 	callout_drain(calloutp);
369 	FREE(calloutp, M_KQUEUE);
370 	kq_ncallouts--;
371 }
372 
373 static int
374 filt_timer(struct knote *kn, long hint)
375 {
376 
377 	return (kn->kn_data != 0);
378 }
379 
380 /*
381  * MPSAFE
382  */
383 int
384 kqueue(struct thread *td, struct kqueue_args *uap)
385 {
386 	struct filedesc *fdp;
387 	struct kqueue *kq;
388 	struct file *fp;
389 	int fd, error;
390 
391 	mtx_lock(&Giant);
392 	fdp = td->td_proc->p_fd;
393 	error = falloc(td, &fp, &fd);
394 	if (error)
395 		goto done2;
396 	/* An extra reference on `fp' has been held for us by falloc(). */
397 	kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
398 	TAILQ_INIT(&kq->kq_head);
399 	FILE_LOCK(fp);
400 	fp->f_flag = FREAD | FWRITE;
401 	fp->f_type = DTYPE_KQUEUE;
402 	fp->f_ops = &kqueueops;
403 	fp->f_data = kq;
404 	FILE_UNLOCK(fp);
405 	fdrop(fp, td);
406 	FILEDESC_LOCK(fdp);
407 	td->td_retval[0] = fd;
408 	if (fdp->fd_knlistsize < 0)
409 		fdp->fd_knlistsize = 0;		/* this process has a kq */
410 	FILEDESC_UNLOCK(fdp);
411 	kq->kq_fdp = fdp;
412 done2:
413 	mtx_unlock(&Giant);
414 	return (error);
415 }
416 
417 #ifndef _SYS_SYSPROTO_H_
418 struct kevent_args {
419 	int	fd;
420 	const struct kevent *changelist;
421 	int	nchanges;
422 	struct	kevent *eventlist;
423 	int	nevents;
424 	const struct timespec *timeout;
425 };
426 #endif
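
/*
 * Illustrative userland calling sequence (sock_fd is a hypothetical
 * descriptor): create the queue, register interest, then block for events.
 * A single kevent() call may both apply a changelist and collect events;
 * the changes are processed first, then kqueue_scan() gathers results.
 *
 *	int kq = kqueue();
 *	struct kevent change, result;
 *	EV_SET(&change, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &change, 1, NULL, 0, NULL);
 *	int n = kevent(kq, NULL, 0, &result, 1, NULL);
 */
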
427 /*
428  * MPSAFE
429  */
430 int
431 kevent(struct thread *td, struct kevent_args *uap)
432 {
433 	struct kevent *kevp;
434 	struct kqueue *kq;
435 	struct file *fp;
436 	struct timespec ts;
437 	int i, n, nerrors, error;
438 
439 	if ((error = fget(td, uap->fd, &fp)) != 0)
440 		return (error);
441 	if (fp->f_type != DTYPE_KQUEUE) {
442 		fdrop(fp, td);
443 		return (EBADF);
444 	}
445 	if (uap->timeout != NULL) {
446 		error = copyin(uap->timeout, &ts, sizeof(ts));
447 		if (error)
448 			goto done_nogiant;
449 		uap->timeout = &ts;
450 	}
451 	mtx_lock(&Giant);
452 
453 	kq = fp->f_data;
454 	nerrors = 0;
455 
456 	while (uap->nchanges > 0) {
457 		n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
458 		error = copyin(uap->changelist, kq->kq_kev,
459 		    n * sizeof(struct kevent));
460 		if (error)
461 			goto done;
462 		for (i = 0; i < n; i++) {
463 			kevp = &kq->kq_kev[i];
464 			kevp->flags &= ~EV_SYSFLAGS;
465 			error = kqueue_register(kq, kevp, td);
466 			if (error) {
467 				if (uap->nevents != 0) {
468 					kevp->flags = EV_ERROR;
469 					kevp->data = error;
470 					(void) copyout(kevp,
471 					    uap->eventlist,
472 					    sizeof(*kevp));
473 					uap->eventlist++;
474 					uap->nevents--;
475 					nerrors++;
476 				} else {
477 					goto done;
478 				}
479 			}
480 		}
481 		uap->nchanges -= n;
482 		uap->changelist += n;
483 	}
484 	if (nerrors) {
485         	td->td_retval[0] = nerrors;
486 		error = 0;
487 		goto done;
488 	}
489 
490 	error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td);
491 done:
492 	mtx_unlock(&Giant);
493 done_nogiant:
494 	if (fp != NULL)
495 		fdrop(fp, td);
496 	return (error);
497 }
498 
499 int
500 kqueue_add_filteropts(int filt, struct filterops *filtops)
501 {
502 
503 	if (filt > 0)
504 		panic("filt(%d) > 0", filt);
505 	if (filt + EVFILT_SYSCOUNT < 0)
506 		panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
507 		    filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
508 	if (sysfilt_ops[~filt] != &null_filtops)
509 		panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
510 	sysfilt_ops[~filt] = filtops;
511 	return (0);
512 }
513 
514 int
515 kqueue_del_filteropts(int filt)
516 {
517 
518 	if (filt > 0)
519 		panic("filt(%d) > 0", filt);
520 	if (filt + EVFILT_SYSCOUNT < 0)
521 		panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
522 		    filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
523 	if (sysfilt_ops[~filt] == &null_filtops)
524 		panic("sysfilt_ops[~filt(%d)] == &null_filtops", filt);
525 	sysfilt_ops[~filt] = &null_filtops;
526 	return (0);
527 }
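
/*
 * Illustrative use: a loadable subsystem whose slot in sysfilt_ops is
 * stubbed with null_filtops can claim it at load time, e.g. the AIO code
 * might register its filter roughly as
 *
 *	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 *
 * and give the slot back with kqueue_del_filteropts(EVFILT_AIO) on unload.
 * (aio_filtops here stands for whatever filterops the subsystem defines.)
 */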
528 
529 int
530 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
531 {
532 	struct filedesc *fdp = kq->kq_fdp;
533 	struct filterops *fops;
534 	struct file *fp = NULL;
535 	struct knote *kn = NULL;
536 	int s, error = 0;
537 
538 	if (kev->filter < 0) {
539 		if (kev->filter + EVFILT_SYSCOUNT < 0)
540 			return (EINVAL);
541 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
542 	} else {
543 		/*
544 		 * XXX
545 		 * filter attach routine is responsible for ensuring that
546 		 * the identifier can be attached to it.
547 		 */
548 		printf("unknown filter: %d\n", kev->filter);
549 		return (EINVAL);
550 	}
551 
552 	FILEDESC_LOCK(fdp);
553 	if (fops->f_isfd) {
554 		/* validate descriptor */
555 		if ((u_int)kev->ident >= fdp->fd_nfiles ||
556 		    (fp = fdp->fd_ofiles[kev->ident]) == NULL) {
557 			FILEDESC_UNLOCK(fdp);
558 			return (EBADF);
559 		}
560 		fhold(fp);
561 
562 		if (kev->ident < fdp->fd_knlistsize) {
563 			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
564 				if (kq == kn->kn_kq &&
565 				    kev->filter == kn->kn_filter)
566 					break;
567 		}
568 	} else {
569 		if (fdp->fd_knhashmask != 0) {
570 			struct klist *list;
571 
572 			list = &fdp->fd_knhash[
573 			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
574 			SLIST_FOREACH(kn, list, kn_link)
575 				if (kev->ident == kn->kn_id &&
576 				    kq == kn->kn_kq &&
577 				    kev->filter == kn->kn_filter)
578 					break;
579 		}
580 	}
581 	FILEDESC_UNLOCK(fdp);
582 
583 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
584 		error = ENOENT;
585 		goto done;
586 	}
587 
588 	/*
589 	 * kn now contains the matching knote, or NULL if no match
590 	 */
591 	if (kev->flags & EV_ADD) {
592 
593 		if (kn == NULL) {
594 			kn = knote_alloc();
595 			if (kn == NULL) {
596 				error = ENOMEM;
597 				goto done;
598 			}
599 			kn->kn_fp = fp;
600 			kn->kn_kq = kq;
601 			kn->kn_fop = fops;
602 
603 			/*
604 			 * apply reference count to knote structure, and
605 			 * do not release it at the end of this routine.
606 			 */
607 			fp = NULL;
608 
609 			kn->kn_sfflags = kev->fflags;
610 			kn->kn_sdata = kev->data;
611 			kev->fflags = 0;
612 			kev->data = 0;
613 			kn->kn_kevent = *kev;
614 
615 			knote_attach(kn, fdp);
616 			if ((error = fops->f_attach(kn)) != 0) {
617 				knote_drop(kn, td);
618 				goto done;
619 			}
620 		} else {
621 			/*
622 			 * The user may change some filter values after the
623 			 * initial EV_ADD, but doing so will not reset any
624 			 * filter which has already been triggered.
625 			 */
626 			kn->kn_sfflags = kev->fflags;
627 			kn->kn_sdata = kev->data;
628 			kn->kn_kevent.udata = kev->udata;
629 		}
630 
631 		s = splhigh();
632 		if (kn->kn_fop->f_event(kn, 0))
633 			KNOTE_ACTIVATE(kn);
634 		splx(s);
635 
636 	} else if (kev->flags & EV_DELETE) {
637 		kn->kn_fop->f_detach(kn);
638 		knote_drop(kn, td);
639 		goto done;
640 	}
641 
642 	if ((kev->flags & EV_DISABLE) &&
643 	    ((kn->kn_status & KN_DISABLED) == 0)) {
644 		s = splhigh();
645 		kn->kn_status |= KN_DISABLED;
646 		splx(s);
647 	}
648 
649 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
650 		s = splhigh();
651 		kn->kn_status &= ~KN_DISABLED;
652 		if ((kn->kn_status & KN_ACTIVE) &&
653 		    ((kn->kn_status & KN_QUEUED) == 0))
654 			knote_enqueue(kn);
655 		splx(s);
656 	}
657 
658 done:
659 	if (fp != NULL)
660 		fdrop(fp, td);
661 	return (error);
662 }
663 
664 static int
665 kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
666 	const struct timespec *tsp, struct thread *td)
667 {
668 	struct kqueue *kq;
669 	struct kevent *kevp;
670 	struct timeval atv, rtv, ttv;
671 	struct knote *kn, marker;
672 	int s, count, timeout, nkev = 0, error = 0;
673 
674 	FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
675 
676 	kq = fp->f_data;
677 	count = maxevents;
678 	if (count == 0)
679 		goto done;
680 
681 	if (tsp != NULL) {
682 		TIMESPEC_TO_TIMEVAL(&atv, tsp);
683 		if (itimerfix(&atv)) {
684 			error = EINVAL;
685 			goto done;
686 		}
687 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
688 			timeout = -1;
689 		else
690 			timeout = atv.tv_sec > 24 * 60 * 60 ?
691 			    24 * 60 * 60 * hz : tvtohz(&atv);
692 		getmicrouptime(&rtv);
693 		timevaladd(&atv, &rtv);
694 	} else {
695 		atv.tv_sec = 0;
696 		atv.tv_usec = 0;
697 		timeout = 0;
698 	}
699 	goto start;
700 
701 retry:
702 	if (atv.tv_sec || atv.tv_usec) {
703 		getmicrouptime(&rtv);
704 		if (timevalcmp(&rtv, &atv, >=))
705 			goto done;
706 		ttv = atv;
707 		timevalsub(&ttv, &rtv);
708 		timeout = ttv.tv_sec > 24 * 60 * 60 ?
709 			24 * 60 * 60 * hz : tvtohz(&ttv);
710 	}
711 
712 start:
713 	kevp = kq->kq_kev;
714 	s = splhigh();
715 	if (kq->kq_count == 0) {
716 		if (timeout < 0) {
717 			error = EWOULDBLOCK;
718 		} else {
719 			kq->kq_state |= KQ_SLEEP;
720 			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
721 		}
722 		splx(s);
723 		if (error == 0)
724 			goto retry;
725 		/* don't restart after signals... */
726 		if (error == ERESTART)
727 			error = EINTR;
728 		else if (error == EWOULDBLOCK)
729 			error = 0;
730 		goto done;
731 	}
732 
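	/*
	 * Queue a marker knote at the tail so the scan makes exactly one
	 * pass over the events that were pending when it started; knotes
	 * that remain active are re-queued behind the marker.
	 */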
733 	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
734 	while (count) {
735 		kn = TAILQ_FIRST(&kq->kq_head);
736 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
737 		if (kn == &marker) {
738 			splx(s);
739 			if (count == maxevents)
740 				goto retry;
741 			goto done;
742 		}
743 		if (kn->kn_status & KN_DISABLED) {
744 			kn->kn_status &= ~KN_QUEUED;
745 			kq->kq_count--;
746 			continue;
747 		}
748 		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
749 		    kn->kn_fop->f_event(kn, 0) == 0) {
750 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
751 			kq->kq_count--;
752 			continue;
753 		}
754 		*kevp = kn->kn_kevent;
755 		kevp++;
756 		nkev++;
757 		if (kn->kn_flags & EV_ONESHOT) {
758 			kn->kn_status &= ~KN_QUEUED;
759 			kq->kq_count--;
760 			splx(s);
761 			kn->kn_fop->f_detach(kn);
762 			knote_drop(kn, td);
763 			s = splhigh();
764 		} else if (kn->kn_flags & EV_CLEAR) {
765 			kn->kn_data = 0;
766 			kn->kn_fflags = 0;
767 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
768 			kq->kq_count--;
769 		} else {
770 			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
771 		}
772 		count--;
773 		if (nkev == KQ_NEVENTS) {
774 			splx(s);
775 			error = copyout(&kq->kq_kev, ulistp,
776 			    sizeof(struct kevent) * nkev);
777 			ulistp += nkev;
778 			nkev = 0;
779 			kevp = kq->kq_kev;
780 			s = splhigh();
781 			if (error)
782 				break;
783 		}
784 	}
785 	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
786 	splx(s);
787 done:
788 	if (nkev != 0)
789 		error = copyout(&kq->kq_kev, ulistp,
790 		    sizeof(struct kevent) * nkev);
791         td->td_retval[0] = maxevents - count;
792 	return (error);
793 }
794 
795 /*
796  * XXX
797  * This could be expanded to call kqueue_scan, if desired.
798  */
799 /*ARGSUSED*/
800 static int
801 kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
802 	int flags, struct thread *td)
803 {
804 	return (ENXIO);
805 }
806 
807 /*ARGSUSED*/
808 static int
809 kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
810 	 int flags, struct thread *td)
811 {
812 	return (ENXIO);
813 }
814 
815 /*ARGSUSED*/
816 static int
817 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
818 	struct ucred *active_cred, struct thread *td)
819 {
820 	/*
821 	 * Enabling sigio causes two major problems:
822 	 * 1) infinite recursion:
823 	 * Synopsis: kevent is being used to track signals and has FIOASYNC
824 	 * set.  On receipt of a signal this will cause a kqueue to recurse
825 	 * into itself over and over.  Sending the sigio causes the kqueue
826 	 * to become ready, which in turn posts sigio again, forever.
827 	 * Solution: this can be solved by setting a flag in the kqueue
828 	 * indicating that a SIGIO delivery is in progress.
829 	 * 2) locking problems:
830 	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
831 	 * us above the proc and pgrp locks.
832 	 * Solution: Post a signal using an async mechanism, being sure to
833 	 * record a generation count in the delivery so that we do not deliver
834 	 * a signal to the wrong process.
835 	 *
836 	 * Note, these two mechanisms are somewhat mutually exclusive!
837 	 */
838 #if 0
839 	struct kqueue *kq;
840 
841 	kq = fp->f_data;
842 	switch (cmd) {
843 	case FIOASYNC:
844 		if (*(int *)data) {
845 			kq->kq_state |= KQ_ASYNC;
846 		} else {
847 			kq->kq_state &= ~KQ_ASYNC;
848 		}
849 		return (0);
850 
851 	case FIOSETOWN:
852 		return (fsetown(*(int *)data, &kq->kq_sigio));
853 
854 	case FIOGETOWN:
855 		*(int *)data = fgetown(&kq->kq_sigio);
856 		return (0);
857 	}
858 #endif
859 
860 	return (ENOTTY);
861 }
862 
863 /*ARGSUSED*/
864 static int
865 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
866 	struct thread *td)
867 {
868 	struct kqueue *kq;
869 	int revents = 0;
870 	int s = splnet();
871 
872 	kq = fp->f_data;
873         if (events & (POLLIN | POLLRDNORM)) {
874                 if (kq->kq_count) {
875                         revents |= events & (POLLIN | POLLRDNORM);
876 		} else {
877                         selrecord(td, &kq->kq_sel);
878 			kq->kq_state |= KQ_SEL;
879 		}
880 	}
881 	splx(s);
882 	return (revents);
883 }
884 
885 /*ARGSUSED*/
886 static int
887 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
888 	struct thread *td)
889 {
890 	struct kqueue *kq;
891 
892 	/* Unlocked read. */
893 	kq = fp->f_data;
894 	bzero((void *)st, sizeof(*st));
895 	st->st_size = kq->kq_count;
896 	st->st_blksize = sizeof(struct kevent);
897 	st->st_mode = S_IFIFO;
898 	return (0);
899 }
900 
901 /*ARGSUSED*/
902 static int
903 kqueue_close(struct file *fp, struct thread *td)
904 {
905 	struct kqueue *kq = fp->f_data;
906 	struct filedesc *fdp = kq->kq_fdp;
907 	struct knote **knp, *kn, *kn0;
908 	int i;
909 
910 	mtx_lock(&Giant);
911 
912 	FILEDESC_LOCK(fdp);
913 	for (i = 0; i < fdp->fd_knlistsize; i++) {
914 		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
915 		kn = *knp;
916 		while (kn != NULL) {
917 			kn0 = SLIST_NEXT(kn, kn_link);
918 			if (kq == kn->kn_kq) {
919 				kn->kn_fop->f_detach(kn);
920 				*knp = kn0;
921 				FILE_LOCK(kn->kn_fp);
922 				FILEDESC_UNLOCK(fdp);
923 				fdrop_locked(kn->kn_fp, td);
924 				knote_free(kn);
925 				FILEDESC_LOCK(fdp);
926 			} else {
927 				knp = &SLIST_NEXT(kn, kn_link);
928 			}
929 			kn = kn0;
930 		}
931 	}
932 	if (fdp->fd_knhashmask != 0) {
933 		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
934 			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
935 			kn = *knp;
936 			while (kn != NULL) {
937 				kn0 = SLIST_NEXT(kn, kn_link);
938 				if (kq == kn->kn_kq) {
939 					kn->kn_fop->f_detach(kn);
940 					*knp = kn0;
941 		/* XXX non-fd release of kn->kn_ptr */
942 					FILEDESC_UNLOCK(fdp);
943 					knote_free(kn);
944 					FILEDESC_LOCK(fdp);
945 				} else {
946 					knp = &SLIST_NEXT(kn, kn_link);
947 				}
948 				kn = kn0;
949 			}
950 		}
951 	}
952 	FILEDESC_UNLOCK(fdp);
953 	if (kq->kq_state & KQ_SEL) {
954 		kq->kq_state &= ~KQ_SEL;
955 		selwakeuppri(&kq->kq_sel, PSOCK);
956 	}
957 	funsetown(&kq->kq_sigio);
958 	free(kq, M_KQUEUE);
959 	fp->f_data = NULL;
960 
961 	mtx_unlock(&Giant);
962 	return (0);
963 }
964 
965 static void
966 kqueue_wakeup(struct kqueue *kq)
967 {
968 
969 	if (kq->kq_state & KQ_SLEEP) {
970 		kq->kq_state &= ~KQ_SLEEP;
971 		wakeup(kq);
972 	}
973 	if (kq->kq_state & KQ_SEL) {
974 		kq->kq_state &= ~KQ_SEL;
975 		selwakeuppri(&kq->kq_sel, PSOCK);
976 	}
977 	if (kq->kq_state & KQ_ASYNC) {
978 		pgsigio(&kq->kq_sigio, SIGIO, 0);
979 	}
980 	KNOTE(&kq->kq_sel.si_note, 0);
981 }
982 
983 /*
984  * walk down a list of knotes, activating them if their event has triggered.
985  */
986 void
987 knote(struct klist *list, long hint)
988 {
989 	struct knote *kn;
990 
991 	SLIST_FOREACH(kn, list, kn_selnext)
992 		if (kn->kn_fop->f_event(kn, hint))
993 			KNOTE_ACTIVATE(kn);
994 }
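
/*
 * Illustrative caller: an event source keeps a klist (typically the si_note
 * member of its selinfo) and invokes the KNOTE() macro from its wakeup
 * path, roughly as
 *
 *	KNOTE(&sc->sc_rsel.si_note, 0);
 *
 * (sc and sc_rsel are hypothetical driver fields.)  That runs each attached
 * filter's f_event routine via knote() above and activates any knote whose
 * condition now holds.
 */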
995 
996 /*
997  * remove all knotes from a specified klist
998  */
999 void
1000 knote_remove(struct thread *td, struct klist *list)
1001 {
1002 	struct knote *kn;
1003 
1004 	while ((kn = SLIST_FIRST(list)) != NULL) {
1005 		kn->kn_fop->f_detach(kn);
1006 		knote_drop(kn, td);
1007 	}
1008 }
1009 
1010 /*
1011  * remove all knotes referencing a specified fd
1012  */
1013 void
1014 knote_fdclose(struct thread *td, int fd)
1015 {
1016 	struct filedesc *fdp = td->td_proc->p_fd;
1017 	struct klist *list;
1018 
1019 	FILEDESC_LOCK(fdp);
1020 	list = &fdp->fd_knlist[fd];
1021 	FILEDESC_UNLOCK(fdp);
1022 	knote_remove(td, list);
1023 }
1024 
1025 static void
1026 knote_attach(struct knote *kn, struct filedesc *fdp)
1027 {
1028 	struct klist *list, *tmp_knhash;
1029 	u_long tmp_knhashmask;
1030 	int size;
1031 
1032 	FILEDESC_LOCK(fdp);
1033 
1034 	if (! kn->kn_fop->f_isfd) {
1035 		if (fdp->fd_knhashmask == 0) {
1036 			FILEDESC_UNLOCK(fdp);
1037 			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1038 			    &tmp_knhashmask);
1039 			FILEDESC_LOCK(fdp);
1040 			if (fdp->fd_knhashmask == 0) {
1041 				fdp->fd_knhash = tmp_knhash;
1042 				fdp->fd_knhashmask = tmp_knhashmask;
1043 			} else {
1044 				free(tmp_knhash, M_KQUEUE);
1045 			}
1046 		}
1047 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1048 		goto done;
1049 	}
1050 
1051 	if (fdp->fd_knlistsize <= kn->kn_id) {
1052 		size = fdp->fd_knlistsize;
1053 		while (size <= kn->kn_id)
1054 			size += KQEXTENT;
1055 		FILEDESC_UNLOCK(fdp);
1056 		MALLOC(list, struct klist *,
1057 		    size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
1058 		FILEDESC_LOCK(fdp);
1059 		if (fdp->fd_knlistsize > kn->kn_id) {
1060 			FREE(list, M_KQUEUE);
1061 			goto bigenough;
1062 		}
1063 		if (fdp->fd_knlist != NULL) {
1064 			bcopy(fdp->fd_knlist, list,
1065 			    fdp->fd_knlistsize * sizeof(struct klist *));
1066 			FREE(fdp->fd_knlist, M_KQUEUE);
1067 		}
1068 		bzero((caddr_t)list +
1069 		    fdp->fd_knlistsize * sizeof(struct klist *),
1070 		    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
1071 		fdp->fd_knlistsize = size;
1072 		fdp->fd_knlist = list;
1073 	}
1074 bigenough:
1075 	list = &fdp->fd_knlist[kn->kn_id];
1076 done:
1077 	FILEDESC_UNLOCK(fdp);
1078 	SLIST_INSERT_HEAD(list, kn, kn_link);
1079 	kn->kn_status = 0;
1080 }
1081 
1082 /*
1083  * should be called at spl == 0, since we don't want to hold spl
1084  * while calling fdrop and free.
1085  */
1086 static void
1087 knote_drop(struct knote *kn, struct thread *td)
1088 {
1089         struct filedesc *fdp = td->td_proc->p_fd;
1090 	struct klist *list;
1091 
1092 	FILEDESC_LOCK(fdp);
1093 	if (kn->kn_fop->f_isfd)
1094 		list = &fdp->fd_knlist[kn->kn_id];
1095 	else
1096 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1097 	if (kn->kn_fop->f_isfd)
1098 		FILE_LOCK(kn->kn_fp);
1099 	FILEDESC_UNLOCK(fdp);
1100 
1101 	SLIST_REMOVE(list, kn, knote, kn_link);
1102 	if (kn->kn_status & KN_QUEUED)
1103 		knote_dequeue(kn);
1104 	if (kn->kn_fop->f_isfd)
1105 		fdrop_locked(kn->kn_fp, td);
1106 	knote_free(kn);
1107 }
1108 
1109 
1110 static void
1111 knote_enqueue(struct knote *kn)
1112 {
1113 	struct kqueue *kq = kn->kn_kq;
1114 	int s = splhigh();
1115 
1116 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1117 
1118 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1119 	kn->kn_status |= KN_QUEUED;
1120 	kq->kq_count++;
1121 	splx(s);
1122 	kqueue_wakeup(kq);
1123 }
1124 
1125 static void
1126 knote_dequeue(struct knote *kn)
1127 {
1128 	struct kqueue *kq = kn->kn_kq;
1129 	int s = splhigh();
1130 
1131 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1132 
1133 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1134 	kn->kn_status &= ~KN_QUEUED;
1135 	kq->kq_count--;
1136 	splx(s);
1137 }
1138 
1139 static void
1140 knote_init(void)
1141 {
1142 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
1143 	    NULL, NULL, UMA_ALIGN_PTR, 0);
1144 
1145 }
1146 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
1147 
1148 static struct knote *
1149 knote_alloc(void)
1150 {
1151 	return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK));
1152 }
1153 
1154 static void
1155 knote_free(struct knote *kn)
1156 {
1157 	uma_zfree(knote_zone, kn);
1158 }
1159