xref: /freebsd/sys/kern/kern_event.c (revision c18a16ebcf5bf0bad19be10f58d9f42cbc079057)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
6  * Copyright (c) 2009 Apple, Inc.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_ktrace.h"
33 #include "opt_kqueue.h"
34 
35 #ifdef COMPAT_FREEBSD11
36 #define	_WANT_FREEBSD11_KEVENT
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/capsicum.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/lock.h>
45 #include <sys/mutex.h>
46 #include <sys/proc.h>
47 #include <sys/malloc.h>
48 #include <sys/unistd.h>
49 #include <sys/file.h>
50 #include <sys/filedesc.h>
51 #include <sys/filio.h>
52 #include <sys/fcntl.h>
53 #include <sys/kthread.h>
54 #include <sys/selinfo.h>
55 #include <sys/queue.h>
56 #include <sys/event.h>
57 #include <sys/eventvar.h>
58 #include <sys/poll.h>
59 #include <sys/protosw.h>
60 #include <sys/resourcevar.h>
61 #include <sys/sbuf.h>
62 #include <sys/sigio.h>
63 #include <sys/signalvar.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/stat.h>
67 #include <sys/sysctl.h>
68 #include <sys/sysent.h>
69 #include <sys/sysproto.h>
70 #include <sys/syscallsubr.h>
71 #include <sys/taskqueue.h>
72 #include <sys/uio.h>
73 #include <sys/user.h>
74 #ifdef KTRACE
75 #include <sys/ktrace.h>
76 #endif
77 #include <machine/atomic.h>
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32.h>
80 #include <compat/freebsd32/freebsd32_util.h>
81 #endif
82 
83 #include <vm/uma.h>
84 
85 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
86 
87 /*
88  * This lock is used if multiple kq locks are required.  This possibly
89  * should be made into a per proc lock.
90  */
91 static struct mtx	kq_global;
92 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
93 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
94 	if (!haslck)				\
95 		mtx_lock(lck);			\
96 	haslck = 1;				\
97 } while (0)
98 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
99 	if (haslck)				\
100 		mtx_unlock(lck);			\
101 	haslck = 0;				\
102 } while (0)
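/*
 * Rough usage sketch (descriptive, not normative): a path that may have to
 * hold two kqueue locks at once first takes kq_global via
 * KQ_GLOBAL_LOCK(&kq_global, haskqglobal), performs the cross-kqueue work,
 * and releases it with KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); the
 * haskqglobal flag lets the pair be used from code that may or may not
 * already hold the lock.
 */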
103 
104 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
105 
106 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
107 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
108 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
109 		    struct thread *td, int mflag);
110 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
111 static void	kqueue_release(struct kqueue *kq, int locked);
112 static void	kqueue_destroy(struct kqueue *kq);
113 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
114 static int	kqueue_expand(struct kqueue *kq, const struct filterops *fops,
115 		    uintptr_t ident, int mflag);
116 static void	kqueue_task(void *arg, int pending);
117 static int	kqueue_scan(struct kqueue *kq, int maxevents,
118 		    struct kevent_copyops *k_ops,
119 		    const struct timespec *timeout,
120 		    struct kevent *keva, struct thread *td);
121 static void 	kqueue_wakeup(struct kqueue *kq);
122 static const struct filterops *kqueue_fo_find(int filt);
123 static void	kqueue_fo_release(int filt);
124 struct g_kevent_args;
125 static int	kern_kevent_generic(struct thread *td,
126 		    struct g_kevent_args *uap,
127 		    struct kevent_copyops *k_ops, const char *struct_name);
128 
129 static fo_ioctl_t	kqueue_ioctl;
130 static fo_poll_t	kqueue_poll;
131 static fo_kqfilter_t	kqueue_kqfilter;
132 static fo_stat_t	kqueue_stat;
133 static fo_close_t	kqueue_close;
134 static fo_fill_kinfo_t	kqueue_fill_kinfo;
135 
136 static const struct fileops kqueueops = {
137 	.fo_read = invfo_rdwr,
138 	.fo_write = invfo_rdwr,
139 	.fo_truncate = invfo_truncate,
140 	.fo_ioctl = kqueue_ioctl,
141 	.fo_poll = kqueue_poll,
142 	.fo_kqfilter = kqueue_kqfilter,
143 	.fo_stat = kqueue_stat,
144 	.fo_close = kqueue_close,
145 	.fo_chmod = invfo_chmod,
146 	.fo_chown = invfo_chown,
147 	.fo_sendfile = invfo_sendfile,
148 	.fo_cmp = file_kcmp_generic,
149 	.fo_fill_kinfo = kqueue_fill_kinfo,
150 };
151 
152 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
153 static void 	knote_drop(struct knote *kn, struct thread *td);
154 static void 	knote_drop_detached(struct knote *kn, struct thread *td);
155 static void 	knote_enqueue(struct knote *kn);
156 static void 	knote_dequeue(struct knote *kn);
157 static void 	knote_init(void);
158 static struct 	knote *knote_alloc(int mflag);
159 static void 	knote_free(struct knote *kn);
160 
161 static void	filt_kqdetach(struct knote *kn);
162 static int	filt_kqueue(struct knote *kn, long hint);
163 static int	filt_procattach(struct knote *kn);
164 static void	filt_procdetach(struct knote *kn);
165 static int	filt_proc(struct knote *kn, long hint);
166 static int	filt_fileattach(struct knote *kn);
167 static void	filt_timerexpire(void *knx);
168 static void	filt_timerexpire_l(struct knote *kn, bool proc_locked);
169 static int	filt_timerattach(struct knote *kn);
170 static void	filt_timerdetach(struct knote *kn);
171 static void	filt_timerstart(struct knote *kn, sbintime_t to);
172 static void	filt_timertouch(struct knote *kn, struct kevent *kev,
173 		    u_long type);
174 static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
175 static int	filt_timer(struct knote *kn, long hint);
176 static int	filt_userattach(struct knote *kn);
177 static void	filt_userdetach(struct knote *kn);
178 static int	filt_user(struct knote *kn, long hint);
179 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
180 		    u_long type);
181 
182 static const struct filterops file_filtops = {
183 	.f_isfd = 1,
184 	.f_attach = filt_fileattach,
185 };
186 static const struct filterops kqread_filtops = {
187 	.f_isfd = 1,
188 	.f_detach = filt_kqdetach,
189 	.f_event = filt_kqueue,
190 };
191 /* XXX - move to kern_proc.c?  */
192 static const struct filterops proc_filtops = {
193 	.f_isfd = 0,
194 	.f_attach = filt_procattach,
195 	.f_detach = filt_procdetach,
196 	.f_event = filt_proc,
197 };
198 static const struct filterops timer_filtops = {
199 	.f_isfd = 0,
200 	.f_attach = filt_timerattach,
201 	.f_detach = filt_timerdetach,
202 	.f_event = filt_timer,
203 	.f_touch = filt_timertouch,
204 };
205 static const struct filterops user_filtops = {
206 	.f_attach = filt_userattach,
207 	.f_detach = filt_userdetach,
208 	.f_event = filt_user,
209 	.f_touch = filt_usertouch,
210 };
211 
212 static uma_zone_t	knote_zone;
213 static unsigned int __exclusive_cache_line	kq_ncallouts;
214 static unsigned int 	kq_calloutmax = 4 * 1024;
215 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
216     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
217 
218 /* XXX - ensure not influx ? */
219 #define KNOTE_ACTIVATE(kn, islock) do { 				\
220 	if ((islock))							\
221 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
222 	else								\
223 		KQ_LOCK((kn)->kn_kq);					\
224 	(kn)->kn_status |= KN_ACTIVE;					\
225 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
226 		knote_enqueue((kn));					\
227 	if (!(islock))							\
228 		KQ_UNLOCK((kn)->kn_kq);					\
229 } while (0)
230 #define KQ_LOCK(kq) do {						\
231 	mtx_lock(&(kq)->kq_lock);					\
232 } while (0)
233 #define KQ_FLUX_WAKEUP(kq) do {						\
234 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
235 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
236 		wakeup((kq));						\
237 	}								\
238 } while (0)
239 #define KQ_UNLOCK_FLUX(kq) do {						\
240 	KQ_FLUX_WAKEUP(kq);						\
241 	mtx_unlock(&(kq)->kq_lock);					\
242 } while (0)
243 #define KQ_UNLOCK(kq) do {						\
244 	mtx_unlock(&(kq)->kq_lock);					\
245 } while (0)
246 #define KQ_OWNED(kq) do {						\
247 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
248 } while (0)
249 #define KQ_NOTOWNED(kq) do {						\
250 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
251 } while (0)
252 
253 static struct knlist *
254 kn_list_lock(struct knote *kn)
255 {
256 	struct knlist *knl;
257 
258 	knl = kn->kn_knlist;
259 	if (knl != NULL)
260 		knl->kl_lock(knl->kl_lockarg);
261 	return (knl);
262 }
263 
264 static void
265 kn_list_unlock(struct knlist *knl)
266 {
267 	bool do_free;
268 
269 	if (knl == NULL)
270 		return;
271 	do_free = knl->kl_autodestroy && knlist_empty(knl);
272 	knl->kl_unlock(knl->kl_lockarg);
273 	if (do_free) {
274 		knlist_destroy(knl);
275 		free(knl, M_KQUEUE);
276 	}
277 }
278 
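/*
 * A brief note on the "flux" helpers below (descriptive sketch): a knote
 * whose kn_influx is positive is being changed by some thread, and other
 * threads must not free or requeue it; they instead set KQ_FLUXWAIT on the
 * kqueue and sleep until KQ_FLUX_WAKEUP() is issued once the knote leaves
 * flux.
 */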
279 static bool
280 kn_in_flux(struct knote *kn)
281 {
282 
283 	return (kn->kn_influx > 0);
284 }
285 
286 static void
287 kn_enter_flux(struct knote *kn)
288 {
289 
290 	KQ_OWNED(kn->kn_kq);
291 	MPASS(kn->kn_influx < INT_MAX);
292 	kn->kn_influx++;
293 }
294 
295 static bool
296 kn_leave_flux(struct knote *kn)
297 {
298 
299 	KQ_OWNED(kn->kn_kq);
300 	MPASS(kn->kn_influx > 0);
301 	kn->kn_influx--;
302 	return (kn->kn_influx == 0);
303 }
304 
305 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
306 	if (islocked)							\
307 		KNL_ASSERT_LOCKED(knl);				\
308 	else								\
309 		KNL_ASSERT_UNLOCKED(knl);				\
310 } while (0)
311 #ifdef INVARIANTS
312 #define	KNL_ASSERT_LOCKED(knl) do {					\
313 	knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
314 } while (0)
315 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
316 	knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
317 } while (0)
318 #else /* !INVARIANTS */
319 #define	KNL_ASSERT_LOCKED(knl) do {} while (0)
320 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
321 #endif /* INVARIANTS */
322 
323 #ifndef	KN_HASHSIZE
324 #define	KN_HASHSIZE		64		/* XXX should be tunable */
325 #endif
326 
327 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
328 
329 static int
330 filt_nullattach(struct knote *kn)
331 {
332 
333 	return (ENXIO);
334 };
335 
336 static const struct filterops null_filtops = {
337 	.f_isfd = 0,
338 	.f_attach = filt_nullattach,
339 };
340 
341 /* XXX - make SYSINIT to add these, and move into respective modules. */
342 extern const struct filterops sig_filtops;
343 extern const struct filterops fs_filtops;
344 
345 /*
346  * Table for all system-defined filters.
347  */
348 static struct mtx	filterops_lock;
349 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF);
350 static struct {
351 	const struct filterops *for_fop;
352 	int for_nolock;
353 	int for_refcnt;
354 } sysfilt_ops[EVFILT_SYSCOUNT] = {
355 	[~EVFILT_READ] = { &file_filtops, 1 },
356 	[~EVFILT_WRITE] = { &file_filtops, 1 },
357 	[~EVFILT_AIO] = { &null_filtops },
358 	[~EVFILT_VNODE] = { &file_filtops, 1 },
359 	[~EVFILT_PROC] = { &proc_filtops, 1 },
360 	[~EVFILT_SIGNAL] = { &sig_filtops, 1 },
361 	[~EVFILT_TIMER] = { &timer_filtops, 1 },
362 	[~EVFILT_PROCDESC] = { &file_filtops, 1 },
363 	[~EVFILT_FS] = { &fs_filtops, 1 },
364 	[~EVFILT_LIO] = { &null_filtops },
365 	[~EVFILT_USER] = { &user_filtops, 1 },
366 	[~EVFILT_SENDFILE] = { &null_filtops },
367 	[~EVFILT_EMPTY] = { &file_filtops, 1 },
368 };
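/*
 * The ~filt indexing above works because the system filter identifiers are
 * small negative integers (EVFILT_READ is -1, so ~EVFILT_READ is 0, and so
 * on up to EVFILT_SYSCOUNT - 1).  Entries whose second initializer is 1 set
 * for_nolock, allowing kqueue_fo_find() and kqueue_fo_release() to skip
 * filterops_lock for filters that are known statically.
 */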
369 
370 /*
371  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
372  * method.
373  */
374 static int
375 filt_fileattach(struct knote *kn)
376 {
377 
378 	return (fo_kqfilter(kn->kn_fp, kn));
379 }
380 
381 /*ARGSUSED*/
382 static int
383 kqueue_kqfilter(struct file *fp, struct knote *kn)
384 {
385 	struct kqueue *kq = kn->kn_fp->f_data;
386 
387 	if (kn->kn_filter != EVFILT_READ)
388 		return (EINVAL);
389 
390 	kn->kn_status |= KN_KQUEUE;
391 	kn->kn_fop = &kqread_filtops;
392 	knlist_add(&kq->kq_sel.si_note, kn, 0);
393 
394 	return (0);
395 }
396 
397 static void
398 filt_kqdetach(struct knote *kn)
399 {
400 	struct kqueue *kq = kn->kn_fp->f_data;
401 
402 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
403 }
404 
405 /*ARGSUSED*/
406 static int
407 filt_kqueue(struct knote *kn, long hint)
408 {
409 	struct kqueue *kq = kn->kn_fp->f_data;
410 
411 	kn->kn_data = kq->kq_count;
412 	return (kn->kn_data > 0);
413 }
414 
415 /* XXX - move to kern_proc.c?  */
416 static int
417 filt_procattach(struct knote *kn)
418 {
419 	struct proc *p;
420 	int error;
421 	bool exiting, immediate;
422 
423 	exiting = immediate = false;
424 	if (kn->kn_sfflags & NOTE_EXIT)
425 		p = pfind_any(kn->kn_id);
426 	else
427 		p = pfind(kn->kn_id);
428 	if (p == NULL)
429 		return (ESRCH);
430 	if (p->p_flag & P_WEXIT)
431 		exiting = true;
432 
433 	if ((error = p_cansee(curthread, p))) {
434 		PROC_UNLOCK(p);
435 		return (error);
436 	}
437 
438 	kn->kn_ptr.p_proc = p;
439 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
440 
441 	/*
442 	 * Internal flag indicating registration done by kernel for the
443 	 * purposes of getting a NOTE_CHILD notification.
444 	 */
445 	if (kn->kn_flags & EV_FLAG2) {
446 		kn->kn_flags &= ~EV_FLAG2;
447 		kn->kn_data = kn->kn_sdata;		/* ppid */
448 		kn->kn_fflags = NOTE_CHILD;
449 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
450 		immediate = true; /* Force immediate activation of child note. */
451 	}
452 	/*
453 	 * Internal flag indicating registration done by kernel (for other than
454 	 * NOTE_CHILD).
455 	 */
456 	if (kn->kn_flags & EV_FLAG1) {
457 		kn->kn_flags &= ~EV_FLAG1;
458 	}
459 
460 	knlist_add(p->p_klist, kn, 1);
461 
462 	/*
463 	 * Immediately activate any child notes or, in the case of a zombie
464 	 * target process, exit notes.  The latter is necessary to handle the
465 	 * case where the target process, e.g. a child, dies before the kevent
466 	 * is registered.
467 	 */
468 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
469 		KNOTE_ACTIVATE(kn, 0);
470 
471 	PROC_UNLOCK(p);
472 
473 	return (0);
474 }
475 
476 /*
477  * The knote may be attached to a different process, which may exit,
478  * leaving nothing for the knote to be attached to.  So when the process
479  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
480  * it will be deleted when read out.  However, as part of the knote deletion,
481  * this routine is called, so a check is needed to avoid actually performing
482  * a detach, because the original process does not exist any more.
483  */
484 /* XXX - move to kern_proc.c?  */
485 static void
486 filt_procdetach(struct knote *kn)
487 {
488 
489 	knlist_remove(kn->kn_knlist, kn, 0);
490 	kn->kn_ptr.p_proc = NULL;
491 }
492 
493 /* XXX - move to kern_proc.c?  */
494 static int
495 filt_proc(struct knote *kn, long hint)
496 {
497 	struct proc *p;
498 	u_int event;
499 
500 	p = kn->kn_ptr.p_proc;
501 	if (p == NULL) /* already activated, from attach filter */
502 		return (0);
503 
504 	/* Mask off extra data. */
505 	event = (u_int)hint & NOTE_PCTRLMASK;
506 
507 	/* If the user is interested in this event, record it. */
508 	if (kn->kn_sfflags & event)
509 		kn->kn_fflags |= event;
510 
511 	/* Process is gone, so flag the event as finished. */
512 	if (event == NOTE_EXIT) {
513 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
514 		kn->kn_ptr.p_proc = NULL;
515 		if (kn->kn_fflags & NOTE_EXIT)
516 			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
517 		if (kn->kn_fflags == 0)
518 			kn->kn_flags |= EV_DROP;
519 		return (1);
520 	}
521 
522 	return (kn->kn_fflags != 0);
523 }
524 
525 /*
526  * Called when the process has forked.  It mostly does the same as
527  * knote(), activating all knotes registered to be activated when the
528  * process forks.  Additionally, for each knote attached to the
529  * parent, check whether the user wants to track the new process.  If
530  * so, attach a new knote to it, and immediately report an event with
531  * the child's pid.
532  */
533 void
534 knote_fork(struct knlist *list, int pid)
535 {
536 	struct kqueue *kq;
537 	struct knote *kn;
538 	struct kevent kev;
539 	int error;
540 
541 	MPASS(list != NULL);
542 	KNL_ASSERT_LOCKED(list);
543 	if (SLIST_EMPTY(&list->kl_list))
544 		return;
545 
546 	memset(&kev, 0, sizeof(kev));
547 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
548 		kq = kn->kn_kq;
549 		KQ_LOCK(kq);
550 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
551 			KQ_UNLOCK(kq);
552 			continue;
553 		}
554 
555 		/*
556 		 * The same as knote(), activate the event.
557 		 */
558 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
559 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
560 				KNOTE_ACTIVATE(kn, 1);
561 			KQ_UNLOCK(kq);
562 			continue;
563 		}
564 
565 		/*
566 		 * The NOTE_TRACK case. In addition to the activation
567 		 * of the event, we need to register new events to
568 		 * track the child. Drop the locks in preparation for
569 		 * the call to kqueue_register().
570 		 */
571 		kn_enter_flux(kn);
572 		KQ_UNLOCK(kq);
573 		list->kl_unlock(list->kl_lockarg);
574 
575 		/*
576 		 * Activate existing knote and register tracking knotes with
577 		 * new process.
578 		 *
579 		 * First register a knote to get just the child notice. This
580 		 * must be a separate note from a potential NOTE_EXIT
581 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
582 		 * to use the data field (in conflicting ways).
583 		 */
584 		kev.ident = pid;
585 		kev.filter = kn->kn_filter;
586 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
587 		    EV_FLAG2;
588 		kev.fflags = kn->kn_sfflags;
589 		kev.data = kn->kn_id;		/* parent */
590 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
591 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
592 		if (error)
593 			kn->kn_fflags |= NOTE_TRACKERR;
594 
595 		/*
596 		 * Then register another knote to track other potential events
597 		 * from the new process.
598 		 */
599 		kev.ident = pid;
600 		kev.filter = kn->kn_filter;
601 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
602 		kev.fflags = kn->kn_sfflags;
603 		kev.data = kn->kn_id;		/* parent */
604 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
605 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
606 		if (error)
607 			kn->kn_fflags |= NOTE_TRACKERR;
608 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
609 			KNOTE_ACTIVATE(kn, 0);
610 		list->kl_lock(list->kl_lockarg);
611 		KQ_LOCK(kq);
612 		kn_leave_flux(kn);
613 		KQ_UNLOCK_FLUX(kq);
614 	}
615 }
616 
617 /*
618  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
619  * interval timer support code.
620  */
621 
622 #define NOTE_TIMER_PRECMASK						\
623     (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
624 
625 static sbintime_t
626 timer2sbintime(int64_t data, int flags)
627 {
628 	int64_t secs;
629 
630         /*
631          * Macros for converting to the fractional second portion of an
632          * sbintime_t using 64bit multiplication to improve precision.
633          */
634 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
635 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
636 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
637 	switch (flags & NOTE_TIMER_PRECMASK) {
638 	case NOTE_SECONDS:
639 #ifdef __LP64__
640 		if (data > (SBT_MAX / SBT_1S))
641 			return (SBT_MAX);
642 #endif
643 		return ((sbintime_t)data << 32);
644 	case NOTE_MSECONDS: /* FALLTHROUGH */
645 	case 0:
646 		if (data >= 1000) {
647 			secs = data / 1000;
648 #ifdef __LP64__
649 			if (secs > (SBT_MAX / SBT_1S))
650 				return (SBT_MAX);
651 #endif
652 			return (secs << 32 | MS_TO_SBT(data % 1000));
653 		}
654 		return (MS_TO_SBT(data));
655 	case NOTE_USECONDS:
656 		if (data >= 1000000) {
657 			secs = data / 1000000;
658 #ifdef __LP64__
659 			if (secs > (SBT_MAX / SBT_1S))
660 				return (SBT_MAX);
661 #endif
662 			return (secs << 32 | US_TO_SBT(data % 1000000));
663 		}
664 		return (US_TO_SBT(data));
665 	case NOTE_NSECONDS:
666 		if (data >= 1000000000) {
667 			secs = data / 1000000000;
668 #ifdef __LP64__
669 			if (secs > (SBT_MAX / SBT_1S))
670 				return (SBT_MAX);
671 #endif
672 			return (secs << 32 | NS_TO_SBT(data % 1000000000));
673 		}
674 		return (NS_TO_SBT(data));
675 	default:
676 		break;
677 	}
678 	return (-1);
679 }
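/*
 * A worked example for the conversion macros above (informal): sbintime_t
 * is a 32.32 fixed-point count of seconds, so N nanoseconds should map to
 * roughly N * 2^32 / 10^9 in the fractional part.  NS_TO_SBT(N) computes
 * (N * (2^63 / 500000000)) >> 32, and 2^63 / 500000000 equals 2^64 / 10^9,
 * so the result is the same value while keeping the scale factor inside a
 * 64-bit multiplication for precision.  US_TO_SBT() and MS_TO_SBT() follow
 * the same pattern for microseconds and milliseconds.
 */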
680 
681 struct kq_timer_cb_data {
682 	struct callout c;
683 	struct proc *p;
684 	struct knote *kn;
685 	int cpuid;
686 	int flags;
687 	TAILQ_ENTRY(kq_timer_cb_data) link;
688 	sbintime_t next;	/* next timer event fires at */
689 	sbintime_t to;		/* precalculated timer period, 0 for abs */
690 };
691 
692 #define	KQ_TIMER_CB_ENQUEUED	0x01
693 
694 static void
695 kqtimer_sched_callout(struct kq_timer_cb_data *kc)
696 {
697 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
698 	    kc->cpuid, C_ABSOLUTE);
699 }
700 
701 void
702 kqtimer_proc_continue(struct proc *p)
703 {
704 	struct kq_timer_cb_data *kc, *kc1;
705 	struct bintime bt;
706 	sbintime_t now;
707 
708 	PROC_LOCK_ASSERT(p, MA_OWNED);
709 
710 	getboottimebin(&bt);
711 	now = bttosbt(bt);
712 
713 	TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
714 		TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
715 		kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
716 		if (kc->next <= now)
717 			filt_timerexpire_l(kc->kn, true);
718 		else
719 			kqtimer_sched_callout(kc);
720 	}
721 }
722 
723 static void
724 filt_timerexpire_l(struct knote *kn, bool proc_locked)
725 {
726 	struct kq_timer_cb_data *kc;
727 	struct proc *p;
728 	uint64_t delta;
729 	sbintime_t now;
730 
731 	kc = kn->kn_ptr.p_v;
732 
733 	if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
734 		kn->kn_data++;
735 		KNOTE_ACTIVATE(kn, 0);
736 		return;
737 	}
738 
739 	now = sbinuptime();
740 	if (now >= kc->next) {
741 		delta = (now - kc->next) / kc->to;
742 		if (delta == 0)
743 			delta = 1;
744 		kn->kn_data += delta;
745 		kc->next += delta * kc->to;
746 		if (now >= kc->next)	/* overflow */
747 			kc->next = now + kc->to;
748 		KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
749 	}
750 
751 	/*
752 	 * Initial check for stopped kc->p is racy.  It is fine to
753 	 * miss the set of the stop flags, at worst we would schedule
754 	 * one more callout.  On the other hand, it is not fine to not
755  * schedule when we missed clearing of the flags, so we
756 	 * recheck them under the lock and observe consistent state.
757 	 */
758 	p = kc->p;
759 	if (P_SHOULDSTOP(p) || P_KILLED(p)) {
760 		if (!proc_locked)
761 			PROC_LOCK(p);
762 		if (P_SHOULDSTOP(p) || P_KILLED(p)) {
763 			if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
764 				kc->flags |= KQ_TIMER_CB_ENQUEUED;
765 				TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
766 			}
767 			if (!proc_locked)
768 				PROC_UNLOCK(p);
769 			return;
770 		}
771 		if (!proc_locked)
772 			PROC_UNLOCK(p);
773 	}
774 	kqtimer_sched_callout(kc);
775 }
776 
777 static void
778 filt_timerexpire(void *knx)
779 {
780 	filt_timerexpire_l(knx, false);
781 }
782 
783 /*
784  * data contains amount of time to sleep
785  */
786 static int
787 filt_timervalidate(struct knote *kn, sbintime_t *to)
788 {
789 	struct bintime bt;
790 	sbintime_t sbt;
791 
792 	if (kn->kn_sdata < 0)
793 		return (EINVAL);
794 	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
795 		kn->kn_sdata = 1;
796 	/*
797 	 * The only fflags values supported are the timer unit
798 	 * (precision) and the absolute time indicator.
799 	 */
800 	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
801 		return (EINVAL);
802 
803 	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
804 	if (*to < 0)
805 		return (EINVAL);
806 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
807 		getboottimebin(&bt);
808 		sbt = bttosbt(bt);
809 		*to = MAX(0, *to - sbt);
810 	}
811 	return (0);
812 }
813 
814 static int
815 filt_timerattach(struct knote *kn)
816 {
817 	struct kq_timer_cb_data *kc;
818 	sbintime_t to;
819 	int error;
820 
821 	to = -1;
822 	error = filt_timervalidate(kn, &to);
823 	if (error != 0)
824 		return (error);
825 	KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 ||
826 	    (kn->kn_sfflags & NOTE_ABSTIME) != 0,
827 	    ("%s: periodic timer has a calculated zero timeout", __func__));
828 	KASSERT(to >= 0,
829 	    ("%s: timer has a calculated negative timeout", __func__));
830 
831 	if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
832 		atomic_subtract_int(&kq_ncallouts, 1);
833 		return (ENOMEM);
834 	}
835 
836 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
837 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
838 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
839 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
840 	kc->kn = kn;
841 	kc->p = curproc;
842 	kc->cpuid = PCPU_GET(cpuid);
843 	kc->flags = 0;
844 	callout_init(&kc->c, 1);
845 	filt_timerstart(kn, to);
846 
847 	return (0);
848 }
849 
850 static void
851 filt_timerstart(struct knote *kn, sbintime_t to)
852 {
853 	struct kq_timer_cb_data *kc;
854 
855 	kc = kn->kn_ptr.p_v;
856 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
857 		kc->next = to;
858 		kc->to = 0;
859 	} else {
860 		kc->next = to + sbinuptime();
861 		kc->to = to;
862 	}
863 	kqtimer_sched_callout(kc);
864 }
865 
866 static void
867 filt_timerdetach(struct knote *kn)
868 {
869 	struct kq_timer_cb_data *kc;
870 	unsigned int old __unused;
871 	bool pending;
872 
873 	kc = kn->kn_ptr.p_v;
874 	do {
875 		callout_drain(&kc->c);
876 
877 		/*
878 		 * kqtimer_proc_continue() might have rescheduled this callout.
879 		 * Double-check, using the process mutex as an interlock.
880 		 */
881 		PROC_LOCK(kc->p);
882 		if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
883 			kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
884 			TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
885 		}
886 		pending = callout_pending(&kc->c);
887 		PROC_UNLOCK(kc->p);
888 	} while (pending);
889 	free(kc, M_KQUEUE);
890 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
891 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
892 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
893 }
894 
895 static void
896 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
897 {
898 	struct kq_timer_cb_data *kc;
899 	struct kqueue *kq;
900 	sbintime_t to;
901 	int error;
902 
903 	switch (type) {
904 	case EVENT_REGISTER:
905 		/* Handle re-added timers that update data/fflags */
906 		if (kev->flags & EV_ADD) {
907 			kc = kn->kn_ptr.p_v;
908 
909 			/* Drain any existing callout. */
910 			callout_drain(&kc->c);
911 
912 			/* Throw away any existing undelivered record
913 			 * of the timer expiration. This is done under
914 			 * the presumption that if a process is
915 			 * re-adding this timer with new parameters,
916 			 * it is no longer interested in what may have
917 			 * happened under the old parameters. If it is
918 			 * interested, it can wait for the expiration,
919 			 * delete the old timer definition, and then
920 			 * add the new one.
921 			 *
922 			 * This has to be done while the kq is locked:
923 			 *   - if enqueued, dequeue
924 			 *   - make it no longer active
925 			 *   - clear the count of expiration events
926 			 */
927 			kq = kn->kn_kq;
928 			KQ_LOCK(kq);
929 			if (kn->kn_status & KN_QUEUED)
930 				knote_dequeue(kn);
931 
932 			kn->kn_status &= ~KN_ACTIVE;
933 			kn->kn_data = 0;
934 			KQ_UNLOCK(kq);
935 
936 			/* Reschedule timer based on new data/fflags */
937 			kn->kn_sfflags = kev->fflags;
938 			kn->kn_sdata = kev->data;
939 			error = filt_timervalidate(kn, &to);
940 			if (error != 0) {
941 			  	kn->kn_flags |= EV_ERROR;
942 				kn->kn_data = error;
943 			} else
944 			  	filt_timerstart(kn, to);
945 		}
946 		break;
947 
948         case EVENT_PROCESS:
949 		*kev = kn->kn_kevent;
950 		if (kn->kn_flags & EV_CLEAR) {
951 			kn->kn_data = 0;
952 			kn->kn_fflags = 0;
953 		}
954 		break;
955 
956 	default:
957 		panic("filt_timertouch() - invalid type (%ld)", type);
958 		break;
959 	}
960 }
961 
962 static int
963 filt_timer(struct knote *kn, long hint)
964 {
965 
966 	return (kn->kn_data != 0);
967 }
968 
969 static int
970 filt_userattach(struct knote *kn)
971 {
972 
973 	/*
974 	 * EVFILT_USER knotes are not attached to anything in the kernel.
975 	 */
976 	kn->kn_hook = NULL;
977 	if (kn->kn_fflags & NOTE_TRIGGER)
978 		kn->kn_hookid = 1;
979 	else
980 		kn->kn_hookid = 0;
981 	return (0);
982 }
983 
984 static void
985 filt_userdetach(__unused struct knote *kn)
986 {
987 
988 	/*
989 	 * EVFILT_USER knotes are not attached to anything in the kernel.
990 	 */
991 }
992 
993 static int
994 filt_user(struct knote *kn, __unused long hint)
995 {
996 
997 	return (kn->kn_hookid);
998 }
999 
1000 static void
1001 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
1002 {
1003 	u_int ffctrl;
1004 
1005 	switch (type) {
1006 	case EVENT_REGISTER:
1007 		if (kev->fflags & NOTE_TRIGGER)
1008 			kn->kn_hookid = 1;
1009 
1010 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1011 		kev->fflags &= NOTE_FFLAGSMASK;
1012 		switch (ffctrl) {
1013 		case NOTE_FFNOP:
1014 			break;
1015 
1016 		case NOTE_FFAND:
1017 			kn->kn_sfflags &= kev->fflags;
1018 			break;
1019 
1020 		case NOTE_FFOR:
1021 			kn->kn_sfflags |= kev->fflags;
1022 			break;
1023 
1024 		case NOTE_FFCOPY:
1025 			kn->kn_sfflags = kev->fflags;
1026 			break;
1027 
1028 		default:
1029 			/* XXX Return error? */
1030 			break;
1031 		}
1032 		kn->kn_sdata = kev->data;
1033 		if (kev->flags & EV_CLEAR) {
1034 			kn->kn_hookid = 0;
1035 			kn->kn_data = 0;
1036 			kn->kn_fflags = 0;
1037 		}
1038 		break;
1039 
1040         case EVENT_PROCESS:
1041 		*kev = kn->kn_kevent;
1042 		kev->fflags = kn->kn_sfflags;
1043 		kev->data = kn->kn_sdata;
1044 		if (kn->kn_flags & EV_CLEAR) {
1045 			kn->kn_hookid = 0;
1046 			kn->kn_data = 0;
1047 			kn->kn_fflags = 0;
1048 		}
1049 		break;
1050 
1051 	default:
1052 		panic("filt_usertouch() - invalid type (%ld)", type);
1053 		break;
1054 	}
1055 }
1056 
1057 int
1058 sys_kqueue(struct thread *td, struct kqueue_args *uap)
1059 {
1060 
1061 	return (kern_kqueue(td, 0, NULL));
1062 }
1063 
1064 int
1065 sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
1066 {
1067 	int flags;
1068 
1069 	if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
1070 		return (EINVAL);
1071 	flags = 0;
1072 	if ((uap->flags & KQUEUE_CLOEXEC) != 0)
1073 		flags |= O_CLOEXEC;
1074 	return (kern_kqueue(td, flags, NULL));
1075 }
1076 
1077 static void
1078 kqueue_init(struct kqueue *kq)
1079 {
1080 
1081 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
1082 	TAILQ_INIT(&kq->kq_head);
1083 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
1084 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
1085 }
1086 
1087 int
1088 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
1089 {
1090 	struct filedesc *fdp;
1091 	struct kqueue *kq;
1092 	struct file *fp;
1093 	struct ucred *cred;
1094 	int fd, error;
1095 
1096 	fdp = td->td_proc->p_fd;
1097 	cred = td->td_ucred;
1098 	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
1099 		return (ENOMEM);
1100 
1101 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
1102 	if (error != 0) {
1103 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
1104 		return (error);
1105 	}
1106 
1107 	/* An extra reference on `fp' has been held for us by falloc(). */
1108 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
1109 	kqueue_init(kq);
1110 	kq->kq_fdp = fdp;
1111 	kq->kq_cred = crhold(cred);
1112 
1113 	FILEDESC_XLOCK(fdp);
1114 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1115 	FILEDESC_XUNLOCK(fdp);
1116 
1117 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1118 	fdrop(fp, td);
1119 
1120 	td->td_retval[0] = fd;
1121 	return (0);
1122 }
1123 
1124 struct g_kevent_args {
1125 	int	fd;
1126 	const void *changelist;
1127 	int	nchanges;
1128 	void	*eventlist;
1129 	int	nevents;
1130 	const struct timespec *timeout;
1131 };
1132 
1133 int
1134 sys_kevent(struct thread *td, struct kevent_args *uap)
1135 {
1136 	struct kevent_copyops k_ops = {
1137 		.arg = uap,
1138 		.k_copyout = kevent_copyout,
1139 		.k_copyin = kevent_copyin,
1140 		.kevent_size = sizeof(struct kevent),
1141 	};
1142 	struct g_kevent_args gk_args = {
1143 		.fd = uap->fd,
1144 		.changelist = uap->changelist,
1145 		.nchanges = uap->nchanges,
1146 		.eventlist = uap->eventlist,
1147 		.nevents = uap->nevents,
1148 		.timeout = uap->timeout,
1149 	};
1150 
1151 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1152 }
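/*
 * Illustrative userland use of the interface entered above (a sketch only,
 * not kernel code; "fd" stands for some descriptor of interest):
 *
 *	struct kevent ch, ev;
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ch, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &ch, 1, NULL, 0, NULL);	(register the change)
 *	n = kevent(kq, NULL, 0, &ev, 1, NULL);		(wait for one event)
 */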
1153 
1154 static int
1155 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1156     struct kevent_copyops *k_ops, const char *struct_name)
1157 {
1158 	struct timespec ts, *tsp;
1159 #ifdef KTRACE
1160 	struct kevent *eventlist = uap->eventlist;
1161 #endif
1162 	int error;
1163 
1164 	if (uap->timeout != NULL) {
1165 		error = copyin(uap->timeout, &ts, sizeof(ts));
1166 		if (error)
1167 			return (error);
1168 		tsp = &ts;
1169 	} else
1170 		tsp = NULL;
1171 
1172 #ifdef KTRACE
1173 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1174 		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1175 		    uap->nchanges, k_ops->kevent_size);
1176 #endif
1177 
1178 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1179 	    k_ops, tsp);
1180 
1181 #ifdef KTRACE
1182 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1183 		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1184 		    td->td_retval[0], k_ops->kevent_size);
1185 #endif
1186 
1187 	return (error);
1188 }
1189 
1190 /*
1191  * Copy 'count' items into the destination list pointed to by uap->eventlist.
1192  */
1193 static int
1194 kevent_copyout(void *arg, struct kevent *kevp, int count)
1195 {
1196 	struct kevent_args *uap;
1197 	int error;
1198 
1199 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1200 	uap = (struct kevent_args *)arg;
1201 
1202 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1203 	if (error == 0)
1204 		uap->eventlist += count;
1205 	return (error);
1206 }
1207 
1208 /*
1209  * Copy 'count' items from the list pointed to by uap->changelist.
1210  */
1211 static int
1212 kevent_copyin(void *arg, struct kevent *kevp, int count)
1213 {
1214 	struct kevent_args *uap;
1215 	int error;
1216 
1217 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1218 	uap = (struct kevent_args *)arg;
1219 
1220 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1221 	if (error == 0)
1222 		uap->changelist += count;
1223 	return (error);
1224 }
1225 
1226 #ifdef COMPAT_FREEBSD11
1227 static int
1228 kevent11_copyout(void *arg, struct kevent *kevp, int count)
1229 {
1230 	struct freebsd11_kevent_args *uap;
1231 	struct freebsd11_kevent kev11;
1232 	int error, i;
1233 
1234 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1235 	uap = (struct freebsd11_kevent_args *)arg;
1236 
1237 	for (i = 0; i < count; i++) {
1238 		kev11.ident = kevp->ident;
1239 		kev11.filter = kevp->filter;
1240 		kev11.flags = kevp->flags;
1241 		kev11.fflags = kevp->fflags;
1242 		kev11.data = kevp->data;
1243 		kev11.udata = kevp->udata;
1244 		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1245 		if (error != 0)
1246 			break;
1247 		uap->eventlist++;
1248 		kevp++;
1249 	}
1250 	return (error);
1251 }
1252 
1253 /*
1254  * Copy 'count' items from the list pointed to by uap->changelist.
1255  */
1256 static int
1257 kevent11_copyin(void *arg, struct kevent *kevp, int count)
1258 {
1259 	struct freebsd11_kevent_args *uap;
1260 	struct freebsd11_kevent kev11;
1261 	int error, i;
1262 
1263 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1264 	uap = (struct freebsd11_kevent_args *)arg;
1265 
1266 	for (i = 0; i < count; i++) {
1267 		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1268 		if (error != 0)
1269 			break;
1270 		kevp->ident = kev11.ident;
1271 		kevp->filter = kev11.filter;
1272 		kevp->flags = kev11.flags;
1273 		kevp->fflags = kev11.fflags;
1274 		kevp->data = (uintptr_t)kev11.data;
1275 		kevp->udata = kev11.udata;
1276 		bzero(&kevp->ext, sizeof(kevp->ext));
1277 		uap->changelist++;
1278 		kevp++;
1279 	}
1280 	return (error);
1281 }
1282 
1283 int
1284 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1285 {
1286 	struct kevent_copyops k_ops = {
1287 		.arg = uap,
1288 		.k_copyout = kevent11_copyout,
1289 		.k_copyin = kevent11_copyin,
1290 		.kevent_size = sizeof(struct freebsd11_kevent),
1291 	};
1292 	struct g_kevent_args gk_args = {
1293 		.fd = uap->fd,
1294 		.changelist = uap->changelist,
1295 		.nchanges = uap->nchanges,
1296 		.eventlist = uap->eventlist,
1297 		.nevents = uap->nevents,
1298 		.timeout = uap->timeout,
1299 	};
1300 
1301 	return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent"));
1302 }
1303 #endif
1304 
1305 int
1306 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1307     struct kevent_copyops *k_ops, const struct timespec *timeout)
1308 {
1309 	cap_rights_t rights;
1310 	struct file *fp;
1311 	int error;
1312 
1313 	cap_rights_init_zero(&rights);
1314 	if (nchanges > 0)
1315 		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
1316 	if (nevents > 0)
1317 		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
1318 	error = fget(td, fd, &rights, &fp);
1319 	if (error != 0)
1320 		return (error);
1321 
1322 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1323 	fdrop(fp, td);
1324 
1325 	return (error);
1326 }
1327 
1328 static int
1329 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1330     struct kevent_copyops *k_ops, const struct timespec *timeout)
1331 {
1332 	struct kevent keva[KQ_NEVENTS];
1333 	struct kevent *kevp, *changes;
1334 	int i, n, nerrors, error;
1335 
1336 	if (nchanges < 0)
1337 		return (EINVAL);
1338 
1339 	nerrors = 0;
1340 	while (nchanges > 0) {
1341 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1342 		error = k_ops->k_copyin(k_ops->arg, keva, n);
1343 		if (error)
1344 			return (error);
1345 		changes = keva;
1346 		for (i = 0; i < n; i++) {
1347 			kevp = &changes[i];
1348 			if (!kevp->filter)
1349 				continue;
1350 			kevp->flags &= ~EV_SYSFLAGS;
1351 			error = kqueue_register(kq, kevp, td, M_WAITOK);
1352 			if (error || (kevp->flags & EV_RECEIPT)) {
1353 				if (nevents == 0)
1354 					return (error);
1355 				kevp->flags = EV_ERROR;
1356 				kevp->data = error;
1357 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1358 				nevents--;
1359 				nerrors++;
1360 			}
1361 		}
1362 		nchanges -= n;
1363 	}
1364 	if (nerrors) {
1365 		td->td_retval[0] = nerrors;
1366 		return (0);
1367 	}
1368 
1369 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1370 }
1371 
1372 int
1373 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1374     struct kevent_copyops *k_ops, const struct timespec *timeout)
1375 {
1376 	struct kqueue *kq;
1377 	int error;
1378 
1379 	error = kqueue_acquire(fp, &kq);
1380 	if (error != 0)
1381 		return (error);
1382 	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1383 	kqueue_release(kq, 0);
1384 	return (error);
1385 }
1386 
1387 /*
1388  * Performs a kevent() call on a temporarily created kqueue. This can be
1389  * used to perform one-shot polling, similar to poll() and select().
1390  */
1391 int
1392 kern_kevent_anonymous(struct thread *td, int nevents,
1393     struct kevent_copyops *k_ops)
1394 {
1395 	struct kqueue kq = {};
1396 	int error;
1397 
1398 	kqueue_init(&kq);
1399 	kq.kq_refcnt = 1;
1400 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1401 	kqueue_drain(&kq, td);
1402 	kqueue_destroy(&kq);
1403 	return (error);
1404 }
1405 
1406 int
1407 kqueue_add_filteropts(int filt, const struct filterops *filtops)
1408 {
1409 	int error;
1410 
1411 	error = 0;
1412 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1413 		printf(
1414 "trying to add a filterop that is out of range: %d is beyond %d\n",
1415 		    ~filt, EVFILT_SYSCOUNT);
1416 		return EINVAL;
1417 	}
1418 	mtx_lock(&filterops_lock);
1419 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1420 	    sysfilt_ops[~filt].for_fop != NULL)
1421 		error = EEXIST;
1422 	else {
1423 		sysfilt_ops[~filt].for_fop = filtops;
1424 		sysfilt_ops[~filt].for_refcnt = 0;
1425 	}
1426 	mtx_unlock(&filterops_lock);
1427 
1428 	return (error);
1429 }
1430 
1431 int
1432 kqueue_del_filteropts(int filt)
1433 {
1434 	int error;
1435 
1436 	error = 0;
1437 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1438 		return EINVAL;
1439 
1440 	mtx_lock(&filterops_lock);
1441 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1442 	    sysfilt_ops[~filt].for_fop == NULL)
1443 		error = EINVAL;
1444 	else if (sysfilt_ops[~filt].for_refcnt != 0)
1445 		error = EBUSY;
1446 	else {
1447 		sysfilt_ops[~filt].for_fop = &null_filtops;
1448 		sysfilt_ops[~filt].for_refcnt = 0;
1449 	}
1450 	mtx_unlock(&filterops_lock);
1451 
1452 	return error;
1453 }
1454 
1455 static const struct filterops *
1456 kqueue_fo_find(int filt)
1457 {
1458 
1459 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1460 		return NULL;
1461 
1462 	if (sysfilt_ops[~filt].for_nolock)
1463 		return sysfilt_ops[~filt].for_fop;
1464 
1465 	mtx_lock(&filterops_lock);
1466 	sysfilt_ops[~filt].for_refcnt++;
1467 	if (sysfilt_ops[~filt].for_fop == NULL)
1468 		sysfilt_ops[~filt].for_fop = &null_filtops;
1469 	mtx_unlock(&filterops_lock);
1470 
1471 	return sysfilt_ops[~filt].for_fop;
1472 }
1473 
1474 static void
1475 kqueue_fo_release(int filt)
1476 {
1477 
1478 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1479 		return;
1480 
1481 	if (sysfilt_ops[~filt].for_nolock)
1482 		return;
1483 
1484 	mtx_lock(&filterops_lock);
1485 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1486 	    ("filter object refcount not valid on release"));
1487 	sysfilt_ops[~filt].for_refcnt--;
1488 	mtx_unlock(&filterops_lock);
1489 }
1490 
1491 /*
1492  * A ref to kq (obtained via kqueue_acquire) must be held.
1493  */
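/*
 * In outline (descriptive sketch): look up the knote matching
 * (kev->ident, kev->filter), allocating and attaching a new one when
 * EV_ADD is set, then apply EV_DELETE, EV_ENABLE/EV_DISABLE and
 * EV_FORCEONESHOT, hand the kevent to the filter via f_touch or
 * kn_sfflags/kn_sdata, and finally re-evaluate f_event to decide whether
 * the knote should be activated and enqueued.
 */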
1494 static int
1495 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1496     int mflag)
1497 {
1498 	const struct filterops *fops;
1499 	struct file *fp;
1500 	struct knote *kn, *tkn;
1501 	struct knlist *knl;
1502 	int error, filt, event;
1503 	int haskqglobal, filedesc_unlock;
1504 
1505 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1506 		return (EINVAL);
1507 
1508 	fp = NULL;
1509 	kn = NULL;
1510 	knl = NULL;
1511 	error = 0;
1512 	haskqglobal = 0;
1513 	filedesc_unlock = 0;
1514 
1515 	filt = kev->filter;
1516 	fops = kqueue_fo_find(filt);
1517 	if (fops == NULL)
1518 		return EINVAL;
1519 
1520 	if (kev->flags & EV_ADD) {
1521 		/* Reject an invalid flag pair early */
1522 		if (kev->flags & EV_KEEPUDATA) {
1523 			tkn = NULL;
1524 			error = EINVAL;
1525 			goto done;
1526 		}
1527 
1528 		/*
1529 		 * Prevent waiting with locks.  Non-sleepable
1530 		 * allocation failures are handled in the loop, only
1531 		 * if the spare knote appears to be actually required.
1532 		 */
1533 		tkn = knote_alloc(mflag);
1534 	} else {
1535 		tkn = NULL;
1536 	}
1537 
1538 findkn:
1539 	if (fops->f_isfd) {
1540 		KASSERT(td != NULL, ("td is NULL"));
1541 		if (kev->ident > INT_MAX)
1542 			error = EBADF;
1543 		else
1544 			error = fget(td, kev->ident, &cap_event_rights, &fp);
1545 		if (error)
1546 			goto done;
1547 
1548 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1549 		    kev->ident, M_NOWAIT) != 0) {
1550 			/* try again */
1551 			fdrop(fp, td);
1552 			fp = NULL;
1553 			error = kqueue_expand(kq, fops, kev->ident, mflag);
1554 			if (error)
1555 				goto done;
1556 			goto findkn;
1557 		}
1558 
1559 		if (fp->f_type == DTYPE_KQUEUE) {
1560 			/*
1561 			 * If we add some intelligence about what we are doing,
1562 			 * we should be able to support events on ourselves.
1563 			 * We need to know when we are doing this to prevent
1564 			 * getting both the knlist lock and the kq lock since
1565 			 * they are the same thing.
1566 			 */
1567 			if (fp->f_data == kq) {
1568 				error = EINVAL;
1569 				goto done;
1570 			}
1571 
1572 			/*
1573 			 * Pre-lock the filedesc before the global
1574 			 * lock mutex, see the comment in
1575 			 * kqueue_close().
1576 			 */
1577 			FILEDESC_XLOCK(td->td_proc->p_fd);
1578 			filedesc_unlock = 1;
1579 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1580 		}
1581 
1582 		KQ_LOCK(kq);
1583 		if (kev->ident < kq->kq_knlistsize) {
1584 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1585 				if (kev->filter == kn->kn_filter)
1586 					break;
1587 		}
1588 	} else {
1589 		if ((kev->flags & EV_ADD) == EV_ADD) {
1590 			error = kqueue_expand(kq, fops, kev->ident, mflag);
1591 			if (error != 0)
1592 				goto done;
1593 		}
1594 
1595 		KQ_LOCK(kq);
1596 
1597 		/*
1598 		 * If possible, find an existing knote to use for this kevent.
1599 		 */
1600 		if (kev->filter == EVFILT_PROC &&
1601 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
1602 			/* This is an internal creation of a process tracking
1603 			 * note. Don't attempt to coalesce this with an
1604 			 * existing note.
1605 			 */
1606 			;
1607 		} else if (kq->kq_knhashmask != 0) {
1608 			struct klist *list;
1609 
1610 			list = &kq->kq_knhash[
1611 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1612 			SLIST_FOREACH(kn, list, kn_link)
1613 				if (kev->ident == kn->kn_id &&
1614 				    kev->filter == kn->kn_filter)
1615 					break;
1616 		}
1617 	}
1618 
1619 	/* knote is in the process of changing, wait for it to stabilize. */
1620 	if (kn != NULL && kn_in_flux(kn)) {
1621 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1622 		if (filedesc_unlock) {
1623 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1624 			filedesc_unlock = 0;
1625 		}
1626 		kq->kq_state |= KQ_FLUXWAIT;
1627 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1628 		if (fp != NULL) {
1629 			fdrop(fp, td);
1630 			fp = NULL;
1631 		}
1632 		goto findkn;
1633 	}
1634 
1635 	/*
1636 	 * kn now contains the matching knote, or NULL if no match
1637 	 */
1638 	if (kn == NULL) {
1639 		if (kev->flags & EV_ADD) {
1640 			kn = tkn;
1641 			tkn = NULL;
1642 			if (kn == NULL) {
1643 				KQ_UNLOCK(kq);
1644 				error = ENOMEM;
1645 				goto done;
1646 			}
1647 			kn->kn_fp = fp;
1648 			kn->kn_kq = kq;
1649 			kn->kn_fop = fops;
1650 			/*
1651 			 * apply reference counts to knote structure, and
1652 			 * do not release it at the end of this routine.
1653 			 */
1654 			fops = NULL;
1655 			fp = NULL;
1656 
1657 			kn->kn_sfflags = kev->fflags;
1658 			kn->kn_sdata = kev->data;
1659 			kev->fflags = 0;
1660 			kev->data = 0;
1661 			kn->kn_kevent = *kev;
1662 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1663 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1664 			kn->kn_status = KN_DETACHED;
1665 			if ((kev->flags & EV_DISABLE) != 0)
1666 				kn->kn_status |= KN_DISABLED;
1667 			kn_enter_flux(kn);
1668 
1669 			error = knote_attach(kn, kq);
1670 			KQ_UNLOCK(kq);
1671 			if (error != 0) {
1672 				tkn = kn;
1673 				goto done;
1674 			}
1675 
1676 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1677 				knote_drop_detached(kn, td);
1678 				goto done;
1679 			}
1680 			knl = kn_list_lock(kn);
1681 			goto done_ev_add;
1682 		} else {
1683 			/* No matching knote and the EV_ADD flag is not set. */
1684 			KQ_UNLOCK(kq);
1685 			error = ENOENT;
1686 			goto done;
1687 		}
1688 	}
1689 
1690 	if (kev->flags & EV_DELETE) {
1691 		kn_enter_flux(kn);
1692 		KQ_UNLOCK(kq);
1693 		knote_drop(kn, td);
1694 		goto done;
1695 	}
1696 
1697 	if (kev->flags & EV_FORCEONESHOT) {
1698 		kn->kn_flags |= EV_ONESHOT;
1699 		KNOTE_ACTIVATE(kn, 1);
1700 	}
1701 
1702 	if ((kev->flags & EV_ENABLE) != 0)
1703 		kn->kn_status &= ~KN_DISABLED;
1704 	else if ((kev->flags & EV_DISABLE) != 0)
1705 		kn->kn_status |= KN_DISABLED;
1706 
1707 	/*
1708 	 * The user may change some filter values after the initial EV_ADD,
1709 	 * but doing so will not reset any filter which has already been
1710 	 * triggered.
1711 	 */
1712 	kn->kn_status |= KN_SCAN;
1713 	kn_enter_flux(kn);
1714 	KQ_UNLOCK(kq);
1715 	knl = kn_list_lock(kn);
1716 	if ((kev->flags & EV_KEEPUDATA) == 0)
1717 		kn->kn_kevent.udata = kev->udata;
1718 	if (!fops->f_isfd && fops->f_touch != NULL) {
1719 		fops->f_touch(kn, kev, EVENT_REGISTER);
1720 	} else {
1721 		kn->kn_sfflags = kev->fflags;
1722 		kn->kn_sdata = kev->data;
1723 	}
1724 
1725 done_ev_add:
1726 	/*
1727 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1728 	 * the initial attach event decides that the event is "completed"
1729 	 * already, e.g., filt_procattach() is called on a zombie process.  It
1730 	 * will call filt_proc() which will remove it from the list, and NULL
1731 	 * kn_knlist.
1732 	 *
1733 	 * KN_DISABLED will be stable while the knote is in flux, so the
1734 	 * unlocked read will not race with an update.
1735 	 */
1736 	if ((kn->kn_status & KN_DISABLED) == 0)
1737 		event = kn->kn_fop->f_event(kn, 0);
1738 	else
1739 		event = 0;
1740 
1741 	KQ_LOCK(kq);
1742 	if (event)
1743 		kn->kn_status |= KN_ACTIVE;
1744 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1745 	    KN_ACTIVE)
1746 		knote_enqueue(kn);
1747 	kn->kn_status &= ~KN_SCAN;
1748 	kn_leave_flux(kn);
1749 	kn_list_unlock(knl);
1750 	KQ_UNLOCK_FLUX(kq);
1751 
1752 done:
1753 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1754 	if (filedesc_unlock)
1755 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1756 	if (fp != NULL)
1757 		fdrop(fp, td);
1758 	knote_free(tkn);
1759 	if (fops != NULL)
1760 		kqueue_fo_release(filt);
1761 	return (error);
1762 }
1763 
1764 static int
1765 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1766 {
1767 	int error;
1768 	struct kqueue *kq;
1769 
1770 	error = 0;
1771 
1772 	kq = fp->f_data;
1773 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1774 		return (EBADF);
1775 	*kqp = kq;
1776 	KQ_LOCK(kq);
1777 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1778 		KQ_UNLOCK(kq);
1779 		return (EBADF);
1780 	}
1781 	kq->kq_refcnt++;
1782 	KQ_UNLOCK(kq);
1783 
1784 	return error;
1785 }
1786 
1787 static void
1788 kqueue_release(struct kqueue *kq, int locked)
1789 {
1790 	if (locked)
1791 		KQ_OWNED(kq);
1792 	else
1793 		KQ_LOCK(kq);
1794 	kq->kq_refcnt--;
1795 	if (kq->kq_refcnt == 1)
1796 		wakeup(&kq->kq_refcnt);
1797 	if (!locked)
1798 		KQ_UNLOCK(kq);
1799 }
1800 
1801 static void
1802 ast_kqueue(struct thread *td, int tda __unused)
1803 {
1804 	taskqueue_quiesce(taskqueue_kqueue_ctx);
1805 }
1806 
1807 static void
1808 kqueue_schedtask(struct kqueue *kq)
1809 {
1810 	KQ_OWNED(kq);
1811 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1812 	    ("scheduling kqueue task while draining"));
1813 
1814 	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1815 		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1816 		kq->kq_state |= KQ_TASKSCHED;
1817 		ast_sched(curthread, TDA_KQUEUE);
1818 	}
1819 }
1820 
1821 /*
1822  * Expand the kq to make sure we have storage for fops/ident pair.
1823  *
1824  * Return 0 on success (or no work necessary), return errno on failure.
1825  */
1826 static int
1827 kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident,
1828     int mflag)
1829 {
1830 	struct klist *list, *tmp_knhash, *to_free;
1831 	u_long tmp_knhashmask;
1832 	int error, fd, size;
1833 
1834 	KQ_NOTOWNED(kq);
1835 
1836 	error = 0;
1837 	to_free = NULL;
1838 	if (fops->f_isfd) {
1839 		fd = ident;
1840 		if (kq->kq_knlistsize <= fd) {
1841 			size = kq->kq_knlistsize;
1842 			while (size <= fd)
1843 				size += KQEXTENT;
1844 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1845 			if (list == NULL)
1846 				return ENOMEM;
1847 			KQ_LOCK(kq);
1848 			if ((kq->kq_state & KQ_CLOSING) != 0) {
1849 				to_free = list;
1850 				error = EBADF;
1851 			} else if (kq->kq_knlistsize > fd) {
1852 				to_free = list;
1853 			} else {
1854 				if (kq->kq_knlist != NULL) {
1855 					bcopy(kq->kq_knlist, list,
1856 					    kq->kq_knlistsize * sizeof(*list));
1857 					to_free = kq->kq_knlist;
1858 					kq->kq_knlist = NULL;
1859 				}
1860 				bzero((caddr_t)list +
1861 				    kq->kq_knlistsize * sizeof(*list),
1862 				    (size - kq->kq_knlistsize) * sizeof(*list));
1863 				kq->kq_knlistsize = size;
1864 				kq->kq_knlist = list;
1865 			}
1866 			KQ_UNLOCK(kq);
1867 		}
1868 	} else {
1869 		if (kq->kq_knhashmask == 0) {
1870 			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1871 			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1872 			    HASH_WAITOK : HASH_NOWAIT);
1873 			if (tmp_knhash == NULL)
1874 				return (ENOMEM);
1875 			KQ_LOCK(kq);
1876 			if ((kq->kq_state & KQ_CLOSING) != 0) {
1877 				to_free = tmp_knhash;
1878 				error = EBADF;
1879 			} else if (kq->kq_knhashmask == 0) {
1880 				kq->kq_knhash = tmp_knhash;
1881 				kq->kq_knhashmask = tmp_knhashmask;
1882 			} else {
1883 				to_free = tmp_knhash;
1884 			}
1885 			KQ_UNLOCK(kq);
1886 		}
1887 	}
1888 	free(to_free, M_KQUEUE);
1889 
1890 	KQ_NOTOWNED(kq);
1891 	return (error);
1892 }
1893 
1894 static void
1895 kqueue_task(void *arg, int pending)
1896 {
1897 	struct kqueue *kq;
1898 	int haskqglobal;
1899 
1900 	haskqglobal = 0;
1901 	kq = arg;
1902 
1903 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1904 	KQ_LOCK(kq);
1905 
1906 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1907 
1908 	kq->kq_state &= ~KQ_TASKSCHED;
1909 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1910 		wakeup(&kq->kq_state);
1911 	}
1912 	KQ_UNLOCK(kq);
1913 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1914 }
1915 
1916 /*
1917  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1918  * We treat KN_MARKER knotes as if they are in flux.
1919  */
1920 static int
1921 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1922     const struct timespec *tsp, struct kevent *keva, struct thread *td)
1923 {
1924 	struct kevent *kevp;
1925 	struct knote *kn, *marker;
1926 	struct knlist *knl;
1927 	sbintime_t asbt, rsbt;
1928 	int count, error, haskqglobal, influx, nkev, touch;
1929 
1930 	count = maxevents;
1931 	nkev = 0;
1932 	error = 0;
1933 	haskqglobal = 0;
1934 
1935 	if (maxevents == 0)
1936 		goto done_nl;
1937 	if (maxevents < 0) {
1938 		error = EINVAL;
1939 		goto done_nl;
1940 	}
1941 
1942 	rsbt = 0;
1943 	if (tsp != NULL) {
1944 		if (!timespecvalid_interval(tsp)) {
1945 			error = EINVAL;
1946 			goto done_nl;
1947 		}
1948 		if (timespecisset(tsp)) {
1949 			if (tsp->tv_sec <= INT32_MAX) {
1950 				rsbt = tstosbt(*tsp);
1951 				if (TIMESEL(&asbt, rsbt))
1952 					asbt += tc_tick_sbt;
1953 				if (asbt <= SBT_MAX - rsbt)
1954 					asbt += rsbt;
1955 				else
1956 					asbt = 0;
1957 				rsbt >>= tc_precexp;
1958 			} else
1959 				asbt = 0;
1960 		} else
1961 			asbt = -1;
1962 	} else
1963 		asbt = 0;
1964 	marker = knote_alloc(M_WAITOK);
1965 	marker->kn_status = KN_MARKER;
1966 	KQ_LOCK(kq);
1967 
1968 retry:
1969 	kevp = keva;
1970 	if (kq->kq_count == 0) {
1971 		if (asbt == -1) {
1972 			error = EWOULDBLOCK;
1973 		} else {
1974 			kq->kq_state |= KQ_SLEEP;
1975 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1976 			    "kqread", asbt, rsbt, C_ABSOLUTE);
1977 		}
1978 		if (error == 0)
1979 			goto retry;
1980 		/* don't restart after signals... */
1981 		if (error == ERESTART)
1982 			error = EINTR;
1983 		else if (error == EWOULDBLOCK)
1984 			error = 0;
1985 		goto done;
1986 	}
1987 
1988 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1989 	influx = 0;
1990 	while (count) {
1991 		KQ_OWNED(kq);
1992 		kn = TAILQ_FIRST(&kq->kq_head);
1993 
1994 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1995 		    kn_in_flux(kn)) {
1996 			if (influx) {
1997 				influx = 0;
1998 				KQ_FLUX_WAKEUP(kq);
1999 			}
2000 			kq->kq_state |= KQ_FLUXWAIT;
2001 			error = msleep(kq, &kq->kq_lock, PSOCK,
2002 			    "kqflxwt", 0);
2003 			continue;
2004 		}
2005 
2006 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2007 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2008 			kn->kn_status &= ~KN_QUEUED;
2009 			kq->kq_count--;
2010 			continue;
2011 		}
2012 		if (kn == marker) {
2013 			KQ_FLUX_WAKEUP(kq);
2014 			if (count == maxevents)
2015 				goto retry;
2016 			goto done;
2017 		}
2018 		KASSERT(!kn_in_flux(kn),
2019 		    ("knote %p is unexpectedly in flux", kn));
2020 
2021 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
2022 			kn->kn_status &= ~KN_QUEUED;
2023 			kn_enter_flux(kn);
2024 			kq->kq_count--;
2025 			KQ_UNLOCK(kq);
2026 			/*
2027 			 * We don't need to lock the list since we've
2028 			 * marked it as in flux.
2029 			 */
2030 			knote_drop(kn, td);
2031 			KQ_LOCK(kq);
2032 			continue;
2033 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
2034 			kn->kn_status &= ~KN_QUEUED;
2035 			kn_enter_flux(kn);
2036 			kq->kq_count--;
2037 			KQ_UNLOCK(kq);
2038 			/*
2039 			 * We don't need to lock the list since we've
2040 			 * marked the knote as being in flux.
2041 			 */
2042 			*kevp = kn->kn_kevent;
2043 			knote_drop(kn, td);
2044 			KQ_LOCK(kq);
2045 			kn = NULL;
2046 		} else {
2047 			kn->kn_status |= KN_SCAN;
2048 			kn_enter_flux(kn);
2049 			KQ_UNLOCK(kq);
2050 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
2051 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
2052 			knl = kn_list_lock(kn);
2053 			if (kn->kn_fop->f_event(kn, 0) == 0) {
2054 				KQ_LOCK(kq);
2055 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2056 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
2057 				    KN_SCAN);
2058 				kn_leave_flux(kn);
2059 				kq->kq_count--;
2060 				kn_list_unlock(knl);
2061 				influx = 1;
2062 				continue;
2063 			}
2064 			touch = (!kn->kn_fop->f_isfd &&
2065 			    kn->kn_fop->f_touch != NULL);
2066 			if (touch)
2067 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
2068 			else
2069 				*kevp = kn->kn_kevent;
2070 			KQ_LOCK(kq);
2071 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2072 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
2073 				/*
2074 				 * Manually clear knotes that weren't
2075 				 * 'touch'ed.
2076 				 */
2077 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
2078 					kn->kn_data = 0;
2079 					kn->kn_fflags = 0;
2080 				}
2081 				if (kn->kn_flags & EV_DISPATCH)
2082 					kn->kn_status |= KN_DISABLED;
2083 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
2084 				kq->kq_count--;
2085 			} else
2086 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2087 
2088 			kn->kn_status &= ~KN_SCAN;
2089 			kn_leave_flux(kn);
2090 			kn_list_unlock(knl);
2091 			influx = 1;
2092 		}
2093 
2094 		/* we are returning a copy to the user */
2095 		kevp++;
2096 		nkev++;
2097 		count--;
2098 
2099 		if (nkev == KQ_NEVENTS) {
2100 			influx = 0;
2101 			KQ_UNLOCK_FLUX(kq);
2102 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2103 			nkev = 0;
2104 			kevp = keva;
2105 			KQ_LOCK(kq);
2106 			if (error)
2107 				break;
2108 		}
2109 	}
2110 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2111 done:
2112 	KQ_OWNED(kq);
2113 	KQ_UNLOCK_FLUX(kq);
2114 	knote_free(marker);
2115 done_nl:
2116 	KQ_NOTOWNED(kq);
2117 	if (nkev != 0)
2118 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2119 	td->td_retval[0] = maxevents - count;
2120 	return (error);
2121 }
2122 
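/*
 * Illustrative userland sketch (not part of kern_event.c): a kevent(2) call
 * that exercises kqueue_scan() above.  The timeout is converted there to an
 * absolute sbintime for msleep_sbt(), and triggered events are copied out
 * through the kevent_copyops.  The descriptor "fd" and the function name are
 * hypothetical; only the documented kqueue(2)/kevent(2) interfaces are used.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>

static void
watch_fd_example(int fd)
{
	struct kevent change, event;
	struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };
	int kq, n;

	kq = kqueue();
	if (kq == -1)
		err(1, "kqueue");

	/* EV_CLEAR makes kqueue_scan() zero kn_data/kn_fflags after a report. */
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &change, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	/* Block for at most one second; a return of 0 means the timeout expired. */
	n = kevent(kq, NULL, 0, &event, 1, &timeout);
	if (n == -1)
		err(1, "kevent wait");
}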
2123 /*ARGSUSED*/
2124 static int
2125 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
2126 	struct ucred *active_cred, struct thread *td)
2127 {
2128 	/*
2129 	 * Enabling sigio causes two major problems:
2130 	 * 1) infinite recursion:
2131 	 * Synopsis: kevent is being used to track signals and has FIOASYNC
2132 	 * set.  On receipt of a signal this will cause a kqueue to recurse
2133 	 * into itself over and over.  Sending the sigio causes the kqueue
2134 	 * to become ready, which in turn posts sigio again, forever.
2135 	 * Solution: this can be solved by setting a flag in the kqueue that
2136 	 * we have a SIGIO in progress.
2137 	 * 2) locking problems:
2138 	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
2139 	 * us above the proc and pgrp locks.
2140 	 * Solution: Post a signal using an async mechanism, being sure to
2141 	 * record a generation count in the delivery so that we do not deliver
2142 	 * a signal to the wrong process.
2143 	 *
2144 	 * Note, these two mechanisms are somewhat mutually exclusive!
2145 	 */
2146 #if 0
2147 	struct kqueue *kq;
2148 
2149 	kq = fp->f_data;
2150 	switch (cmd) {
2151 	case FIOASYNC:
2152 		if (*(int *)data) {
2153 			kq->kq_state |= KQ_ASYNC;
2154 		} else {
2155 			kq->kq_state &= ~KQ_ASYNC;
2156 		}
2157 		return (0);
2158 
2159 	case FIOSETOWN:
2160 		return (fsetown(*(int *)data, &kq->kq_sigio));
2161 
2162 	case FIOGETOWN:
2163 		*(int *)data = fgetown(&kq->kq_sigio);
2164 		return (0);
2165 	}
2166 #endif
2167 
2168 	return (ENOTTY);
2169 }
2170 
2171 /*ARGSUSED*/
2172 static int
2173 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
2174 	struct thread *td)
2175 {
2176 	struct kqueue *kq;
2177 	int revents = 0;
2178 	int error;
2179 
2180 	if ((error = kqueue_acquire(fp, &kq)))
2181 		return POLLERR;
2182 
2183 	KQ_LOCK(kq);
2184 	if (events & (POLLIN | POLLRDNORM)) {
2185 		if (kq->kq_count) {
2186 			revents |= events & (POLLIN | POLLRDNORM);
2187 		} else {
2188 			selrecord(td, &kq->kq_sel);
2189 			if (SEL_WAITING(&kq->kq_sel))
2190 				kq->kq_state |= KQ_SEL;
2191 		}
2192 	}
2193 	kqueue_release(kq, 1);
2194 	KQ_UNLOCK(kq);
2195 	return (revents);
2196 }
2197 
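/*
 * Illustrative userland sketch (not part of kern_event.c): because of
 * kqueue_poll() above, a kqueue descriptor is itself selectable -- it is
 * readable whenever kq_count is nonzero -- so one kqueue can monitor
 * another through EVFILT_READ.  The function name is hypothetical.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>

static int
nest_kqueues_example(void)
{
	struct kevent kev;
	int inner, outer;

	inner = kqueue();
	outer = kqueue();
	if (inner == -1 || outer == -1)
		err(1, "kqueue");

	/* "outer" will report "inner" readable once events are pending on it. */
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	return (outer);
}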
2198 /*ARGSUSED*/
2199 static int
2200 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
2201 {
2202 
2203 	bzero((void *)st, sizeof *st);
2204 	/*
2205 	 * We no longer return kq_count because the unlocked value is useless.
2206 	 * If you spent all this time getting the count, why not spend your
2207 	 * syscall better by calling kevent?
2208 	 *
2209 	 * XXX - This is needed for libc_r.
2210 	 */
2211 	st->st_mode = S_IFIFO;
2212 	return (0);
2213 }
2214 
2215 static void
2216 kqueue_drain(struct kqueue *kq, struct thread *td)
2217 {
2218 	struct knote *kn;
2219 	int i;
2220 
2221 	KQ_LOCK(kq);
2222 
2223 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
2224 	    ("kqueue already closing"));
2225 	kq->kq_state |= KQ_CLOSING;
2226 	if (kq->kq_refcnt > 1)
2227 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
2228 
2229 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
2230 
2231 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
2232 	    ("kqueue's knlist not empty"));
2233 
2234 	for (i = 0; i < kq->kq_knlistsize; i++) {
2235 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
2236 			if (kn_in_flux(kn)) {
2237 				kq->kq_state |= KQ_FLUXWAIT;
2238 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2239 				continue;
2240 			}
2241 			kn_enter_flux(kn);
2242 			KQ_UNLOCK(kq);
2243 			knote_drop(kn, td);
2244 			KQ_LOCK(kq);
2245 		}
2246 	}
2247 	if (kq->kq_knhashmask != 0) {
2248 		for (i = 0; i <= kq->kq_knhashmask; i++) {
2249 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2250 				if (kn_in_flux(kn)) {
2251 					kq->kq_state |= KQ_FLUXWAIT;
2252 					msleep(kq, &kq->kq_lock, PSOCK,
2253 					       "kqclo2", 0);
2254 					continue;
2255 				}
2256 				kn_enter_flux(kn);
2257 				KQ_UNLOCK(kq);
2258 				knote_drop(kn, td);
2259 				KQ_LOCK(kq);
2260 			}
2261 		}
2262 	}
2263 
2264 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2265 		kq->kq_state |= KQ_TASKDRAIN;
2266 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2267 	}
2268 
2269 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2270 		selwakeuppri(&kq->kq_sel, PSOCK);
2271 		if (!SEL_WAITING(&kq->kq_sel))
2272 			kq->kq_state &= ~KQ_SEL;
2273 	}
2274 
2275 	KQ_UNLOCK(kq);
2276 }
2277 
2278 static void
2279 kqueue_destroy(struct kqueue *kq)
2280 {
2281 
2282 	KASSERT(kq->kq_fdp == NULL,
2283 	    ("kqueue still attached to a file descriptor"));
2284 	seldrain(&kq->kq_sel);
2285 	knlist_destroy(&kq->kq_sel.si_note);
2286 	mtx_destroy(&kq->kq_lock);
2287 
2288 	if (kq->kq_knhash != NULL)
2289 		free(kq->kq_knhash, M_KQUEUE);
2290 	if (kq->kq_knlist != NULL)
2291 		free(kq->kq_knlist, M_KQUEUE);
2292 
2293 	funsetown(&kq->kq_sigio);
2294 }
2295 
2296 /*ARGSUSED*/
2297 static int
2298 kqueue_close(struct file *fp, struct thread *td)
2299 {
2300 	struct kqueue *kq = fp->f_data;
2301 	struct filedesc *fdp;
2302 	int error;
2303 	int filedesc_unlock;
2304 
2305 	if ((error = kqueue_acquire(fp, &kq)))
2306 		return error;
2307 	kqueue_drain(kq, td);
2308 
2309 	/*
2310 	 * We could be called due to knote_drop() doing fdrop(), itself
2311 	 * called from kqueue_register().  In that case the global lock
2312 	 * is owned, and the filedesc sx is taken beforehand so that the
2313 	 * sleepable lock is not acquired after a non-sleepable one.
2314 	 */
2315 	fdp = kq->kq_fdp;
2316 	kq->kq_fdp = NULL;
2317 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2318 		FILEDESC_XLOCK(fdp);
2319 		filedesc_unlock = 1;
2320 	} else
2321 		filedesc_unlock = 0;
2322 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2323 	if (filedesc_unlock)
2324 		FILEDESC_XUNLOCK(fdp);
2325 
2326 	kqueue_destroy(kq);
2327 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2328 	crfree(kq->kq_cred);
2329 	free(kq, M_KQUEUE);
2330 	fp->f_data = NULL;
2331 
2332 	return (0);
2333 }
2334 
2335 static int
2336 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2337 {
2338 	struct kqueue *kq = fp->f_data;
2339 
2340 	kif->kf_type = KF_TYPE_KQUEUE;
2341 	kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq;
2342 	kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count;
2343 	kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state;
2344 	return (0);
2345 }
2346 
2347 static void
2348 kqueue_wakeup(struct kqueue *kq)
2349 {
2350 	KQ_OWNED(kq);
2351 
2352 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2353 		kq->kq_state &= ~KQ_SLEEP;
2354 		wakeup(kq);
2355 	}
2356 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2357 		selwakeuppri(&kq->kq_sel, PSOCK);
2358 		if (!SEL_WAITING(&kq->kq_sel))
2359 			kq->kq_state &= ~KQ_SEL;
2360 	}
2361 	if (!knlist_empty(&kq->kq_sel.si_note))
2362 		kqueue_schedtask(kq);
2363 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2364 		pgsigio(&kq->kq_sigio, SIGIO, 0);
2365 	}
2366 }
2367 
2368 /*
2369  * Walk down a list of knotes, activating them if their event has triggered.
2370  *
2371  * There is a possibility to optimize in the case of one kq watching another.
2372  * Instead of scheduling a task to wake it up, you could pass enough state
2373  * down the chain to make up the parent kqueue.  Make this code functional
2374  * down the chain to wake up the parent kqueue.  Make this code functional
2375  */
2376 void
2377 knote(struct knlist *list, long hint, int lockflags)
2378 {
2379 	struct kqueue *kq;
2380 	struct knote *kn, *tkn;
2381 	int error;
2382 
2383 	if (list == NULL)
2384 		return;
2385 
2386 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2387 
2388 	if ((lockflags & KNF_LISTLOCKED) == 0)
2389 		list->kl_lock(list->kl_lockarg);
2390 
2391 	/*
2392 	 * If we unlock the list lock (and enter influx), we can
2393 	 * eliminate the kqueue scheduling, but this will introduce
2394 	 * four lock/unlock operations for each knote to test.  Also, a marker
2395 	 * would be needed to keep the iteration position, since filters
2396 	 * or other threads could remove events.
2397 	 */
2398 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2399 		kq = kn->kn_kq;
2400 		KQ_LOCK(kq);
2401 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2402 			/*
2403 			 * Do not process the influx notes, except for
2404 			 * the influx coming from the kq unlock in the
2405 			 * kqueue_scan().  In the latter case, we do
2406 			 * not interfere with the scan, since the code
2407 			 * fragment in kqueue_scan() locks the knlist,
2408 			 * and cannot proceed until we have finished.
2409 			 */
2410 			KQ_UNLOCK(kq);
2411 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2412 			kn_enter_flux(kn);
2413 			KQ_UNLOCK(kq);
2414 			error = kn->kn_fop->f_event(kn, hint);
2415 			KQ_LOCK(kq);
2416 			kn_leave_flux(kn);
2417 			if (error)
2418 				KNOTE_ACTIVATE(kn, 1);
2419 			KQ_UNLOCK_FLUX(kq);
2420 		} else {
2421 			if (kn->kn_fop->f_event(kn, hint))
2422 				KNOTE_ACTIVATE(kn, 1);
2423 			KQ_UNLOCK(kq);
2424 		}
2425 	}
2426 	if ((lockflags & KNF_LISTLOCKED) == 0)
2427 		list->kl_unlock(list->kl_lockarg);
2428 }
2429 
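/*
 * Illustrative kernel-side sketch (not part of kern_event.c): how a driver
 * typically feeds the knote() path above.  The "foo" softc, mutex, and hook
 * names are hypothetical; knlist_init_mtx() and KNOTE_LOCKED() are the
 * interfaces defined in this file and in <sys/event.h>, and <sys/selinfo.h>
 * and <sys/mutex.h> are already included at the top of this file.
 */
struct foo_softc {
	struct mtx	foo_mtx;	/* also serves as the knlist lock */
	struct selinfo	foo_rsel;	/* foo_rsel.si_note is the knlist */
};

static void
foo_init_events(struct foo_softc *sc)
{
	mtx_init(&sc->foo_mtx, "foo events", NULL, MTX_DEF);
	knlist_init_mtx(&sc->foo_rsel.si_note, &sc->foo_mtx);
}

static void
foo_data_arrived(struct foo_softc *sc)
{
	mtx_lock(&sc->foo_mtx);
	/*
	 * The knlist lock is held, so this expands to
	 * knote(&sc->foo_rsel.si_note, 0, KNF_LISTLOCKED).
	 */
	KNOTE_LOCKED(&sc->foo_rsel.si_note, 0);
	mtx_unlock(&sc->foo_mtx);
}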
2430 /*
2431  * add a knote to a knlist
2432  */
2433 void
2434 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2435 {
2436 
2437 	KNL_ASSERT_LOCK(knl, islocked);
2438 	KQ_NOTOWNED(kn->kn_kq);
2439 	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2440 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2441 	    ("knote %p was not detached", kn));
2442 	if (!islocked)
2443 		knl->kl_lock(knl->kl_lockarg);
2444 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2445 	if (!islocked)
2446 		knl->kl_unlock(knl->kl_lockarg);
2447 	KQ_LOCK(kn->kn_kq);
2448 	kn->kn_knlist = knl;
2449 	kn->kn_status &= ~KN_DETACHED;
2450 	KQ_UNLOCK(kn->kn_kq);
2451 }
2452 
2453 static void
2454 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2455     int kqislocked)
2456 {
2457 
2458 	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2459 	KNL_ASSERT_LOCK(knl, knlislocked);
2460 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2461 	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2462 	KASSERT((kn->kn_status & KN_DETACHED) == 0,
2463 	    ("knote %p was already detached", kn));
2464 	if (!knlislocked)
2465 		knl->kl_lock(knl->kl_lockarg);
2466 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2467 	kn->kn_knlist = NULL;
2468 	if (!knlislocked)
2469 		kn_list_unlock(knl);
2470 	if (!kqislocked)
2471 		KQ_LOCK(kn->kn_kq);
2472 	kn->kn_status |= KN_DETACHED;
2473 	if (!kqislocked)
2474 		KQ_UNLOCK(kn->kn_kq);
2475 }
2476 
2477 /*
2478  * remove knote from the specified knlist
2479  */
2480 void
2481 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2482 {
2483 
2484 	knlist_remove_kq(knl, kn, islocked, 0);
2485 }
2486 
2487 int
2488 knlist_empty(struct knlist *knl)
2489 {
2490 
2491 	KNL_ASSERT_LOCKED(knl);
2492 	return (SLIST_EMPTY(&knl->kl_list));
2493 }
2494 
2495 static struct mtx knlist_lock;
2496 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2497     MTX_DEF);
2498 static void knlist_mtx_lock(void *arg);
2499 static void knlist_mtx_unlock(void *arg);
2500 
2501 static void
2502 knlist_mtx_lock(void *arg)
2503 {
2504 
2505 	mtx_lock((struct mtx *)arg);
2506 }
2507 
2508 static void
2509 knlist_mtx_unlock(void *arg)
2510 {
2511 
2512 	mtx_unlock((struct mtx *)arg);
2513 }
2514 
2515 static void
2516 knlist_mtx_assert_lock(void *arg, int what)
2517 {
2518 
2519 	if (what == LA_LOCKED)
2520 		mtx_assert((struct mtx *)arg, MA_OWNED);
2521 	else
2522 		mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2523 }
2524 
2525 void
2526 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2527     void (*kl_unlock)(void *),
2528     void (*kl_assert_lock)(void *, int))
2529 {
2530 
2531 	if (lock == NULL)
2532 		knl->kl_lockarg = &knlist_lock;
2533 	else
2534 		knl->kl_lockarg = lock;
2535 
2536 	if (kl_lock == NULL)
2537 		knl->kl_lock = knlist_mtx_lock;
2538 	else
2539 		knl->kl_lock = kl_lock;
2540 	if (kl_unlock == NULL)
2541 		knl->kl_unlock = knlist_mtx_unlock;
2542 	else
2543 		knl->kl_unlock = kl_unlock;
2544 	if (kl_assert_lock == NULL)
2545 		knl->kl_assert_lock = knlist_mtx_assert_lock;
2546 	else
2547 		knl->kl_assert_lock = kl_assert_lock;
2548 
2549 	knl->kl_autodestroy = 0;
2550 	SLIST_INIT(&knl->kl_list);
2551 }
2552 
2553 void
2554 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2555 {
2556 
2557 	knlist_init(knl, lock, NULL, NULL, NULL);
2558 }
2559 
2560 struct knlist *
2561 knlist_alloc(struct mtx *lock)
2562 {
2563 	struct knlist *knl;
2564 
2565 	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2566 	knlist_init_mtx(knl, lock);
2567 	return (knl);
2568 }
2569 
2570 void
2571 knlist_destroy(struct knlist *knl)
2572 {
2573 
2574 	KASSERT(KNLIST_EMPTY(knl),
2575 	    ("destroying knlist %p with knotes on it", knl));
2576 }
2577 
2578 void
2579 knlist_detach(struct knlist *knl)
2580 {
2581 
2582 	KNL_ASSERT_LOCKED(knl);
2583 	knl->kl_autodestroy = 1;
2584 	if (knlist_empty(knl)) {
2585 		knlist_destroy(knl);
2586 		free(knl, M_KQUEUE);
2587 	}
2588 }
2589 
2590 /*
2591  * Even if we are locked, we may need to drop the lock to allow any influx
2592  * knotes time to "settle".
2593  */
2594 void
2595 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2596 {
2597 	struct knote *kn, *kn2;
2598 	struct kqueue *kq;
2599 
2600 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2601 	if (islocked)
2602 		KNL_ASSERT_LOCKED(knl);
2603 	else {
2604 		KNL_ASSERT_UNLOCKED(knl);
2605 again:		/* need to reacquire lock since we have dropped it */
2606 		knl->kl_lock(knl->kl_lockarg);
2607 	}
2608 
2609 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2610 		kq = kn->kn_kq;
2611 		KQ_LOCK(kq);
2612 		if (kn_in_flux(kn)) {
2613 			KQ_UNLOCK(kq);
2614 			continue;
2615 		}
2616 		knlist_remove_kq(knl, kn, 1, 1);
2617 		if (killkn) {
2618 			kn_enter_flux(kn);
2619 			KQ_UNLOCK(kq);
2620 			knote_drop_detached(kn, td);
2621 		} else {
2622 			/* Make sure cleared knotes disappear soon */
2623 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
2624 			KQ_UNLOCK(kq);
2625 		}
2626 		kq = NULL;
2627 	}
2628 
2629 	if (!SLIST_EMPTY(&knl->kl_list)) {
2630 		/* there are still in-flux knotes remaining */
2631 		kn = SLIST_FIRST(&knl->kl_list);
2632 		kq = kn->kn_kq;
2633 		KQ_LOCK(kq);
2634 		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2635 		knl->kl_unlock(knl->kl_lockarg);
2636 		kq->kq_state |= KQ_FLUXWAIT;
2637 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2638 		kq = NULL;
2639 		goto again;
2640 	}
2641 
2642 	if (islocked)
2643 		KNL_ASSERT_LOCKED(knl);
2644 	else {
2645 		knl->kl_unlock(knl->kl_lockarg);
2646 		KNL_ASSERT_UNLOCKED(knl);
2647 	}
2648 }
2649 
2650 /*
2651  * Remove all knotes referencing a specified fd.  Must be called with the
2652  * FILEDESC lock held; this prevents a race where a new fd comes along,
2653  * occupies the entry, and we attach a knote to that fd.
2654  */
2655 void
2656 knote_fdclose(struct thread *td, int fd)
2657 {
2658 	struct filedesc *fdp = td->td_proc->p_fd;
2659 	struct kqueue *kq;
2660 	struct knote *kn;
2661 	int influx;
2662 
2663 	FILEDESC_XLOCK_ASSERT(fdp);
2664 
2665 	/*
2666 	 * We shouldn't have to worry about new kevents appearing on fd
2667 	 * since filedesc is locked.
2668 	 */
2669 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2670 		KQ_LOCK(kq);
2671 
2672 again:
2673 		influx = 0;
2674 		while (kq->kq_knlistsize > fd &&
2675 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2676 			if (kn_in_flux(kn)) {
2677 				/* someone else might be waiting on our knote */
2678 				if (influx)
2679 					wakeup(kq);
2680 				kq->kq_state |= KQ_FLUXWAIT;
2681 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2682 				goto again;
2683 			}
2684 			kn_enter_flux(kn);
2685 			KQ_UNLOCK(kq);
2686 			influx = 1;
2687 			knote_drop(kn, td);
2688 			KQ_LOCK(kq);
2689 		}
2690 		KQ_UNLOCK_FLUX(kq);
2691 	}
2692 }
2693 
2694 static int
2695 knote_attach(struct knote *kn, struct kqueue *kq)
2696 {
2697 	struct klist *list;
2698 
2699 	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2700 	KQ_OWNED(kq);
2701 
2702 	if ((kq->kq_state & KQ_CLOSING) != 0)
2703 		return (EBADF);
2704 	if (kn->kn_fop->f_isfd) {
2705 		if (kn->kn_id >= kq->kq_knlistsize)
2706 			return (ENOMEM);
2707 		list = &kq->kq_knlist[kn->kn_id];
2708 	} else {
2709 		if (kq->kq_knhash == NULL)
2710 			return (ENOMEM);
2711 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2712 	}
2713 	SLIST_INSERT_HEAD(list, kn, kn_link);
2714 	return (0);
2715 }
2716 
2717 static void
2718 knote_drop(struct knote *kn, struct thread *td)
2719 {
2720 
2721 	if ((kn->kn_status & KN_DETACHED) == 0)
2722 		kn->kn_fop->f_detach(kn);
2723 	knote_drop_detached(kn, td);
2724 }
2725 
2726 static void
2727 knote_drop_detached(struct knote *kn, struct thread *td)
2728 {
2729 	struct kqueue *kq;
2730 	struct klist *list;
2731 
2732 	kq = kn->kn_kq;
2733 
2734 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2735 	    ("knote %p still attached", kn));
2736 	KQ_NOTOWNED(kq);
2737 
2738 	KQ_LOCK(kq);
2739 	for (;;) {
2740 		KASSERT(kn->kn_influx >= 1,
2741 		    ("knote_drop called on %p with influx %d",
2742 		    kn, kn->kn_influx));
2743 		if (kn->kn_influx == 1)
2744 			break;
2745 		kq->kq_state |= KQ_FLUXWAIT;
2746 		msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2747 	}
2748 
2749 	if (kn->kn_fop->f_isfd)
2750 		list = &kq->kq_knlist[kn->kn_id];
2751 	else
2752 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2753 
2754 	if (!SLIST_EMPTY(list))
2755 		SLIST_REMOVE(list, kn, knote, kn_link);
2756 	if (kn->kn_status & KN_QUEUED)
2757 		knote_dequeue(kn);
2758 	KQ_UNLOCK_FLUX(kq);
2759 
2760 	if (kn->kn_fop->f_isfd) {
2761 		fdrop(kn->kn_fp, td);
2762 		kn->kn_fp = NULL;
2763 	}
2764 	kqueue_fo_release(kn->kn_kevent.filter);
2765 	kn->kn_fop = NULL;
2766 	knote_free(kn);
2767 }
2768 
2769 static void
2770 knote_enqueue(struct knote *kn)
2771 {
2772 	struct kqueue *kq = kn->kn_kq;
2773 
2774 	KQ_OWNED(kn->kn_kq);
2775 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2776 
2777 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2778 	kn->kn_status |= KN_QUEUED;
2779 	kq->kq_count++;
2780 	kqueue_wakeup(kq);
2781 }
2782 
2783 static void
2784 knote_dequeue(struct knote *kn)
2785 {
2786 	struct kqueue *kq = kn->kn_kq;
2787 
2788 	KQ_OWNED(kn->kn_kq);
2789 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2790 
2791 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2792 	kn->kn_status &= ~KN_QUEUED;
2793 	kq->kq_count--;
2794 }
2795 
2796 static void
2797 knote_init(void)
2798 {
2799 
2800 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2801 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2802 	ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
2803 }
2804 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2805 
2806 static struct knote *
2807 knote_alloc(int mflag)
2808 {
2809 
2810 	return (uma_zalloc(knote_zone, mflag | M_ZERO));
2811 }
2812 
2813 static void
2814 knote_free(struct knote *kn)
2815 {
2816 
2817 	uma_zfree(knote_zone, kn);
2818 }
2819 
2820 /*
2821  * Register the kev w/ the kq specified by fd.
2822  */
2823 int
2824 kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
2825 {
2826 	struct kqueue *kq;
2827 	struct file *fp;
2828 	cap_rights_t rights;
2829 	int error;
2830 
2831 	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
2832 	    &fp);
2833 	if (error != 0)
2834 		return (error);
2835 	if ((error = kqueue_acquire(fp, &kq)) != 0)
2836 		goto noacquire;
2837 
2838 	error = kqueue_register(kq, kev, td, mflag);
2839 	kqueue_release(kq, 0);
2840 
2841 noacquire:
2842 	fdrop(fp, td);
2843 	return (error);
2844 }
2845 
2846 struct knote_status_export_bit {
2847 	int kn_status_bit;
2848 	int knt_status_bit;
2849 };
2850 
2851 #define	ST(name) \
2852     { .kn_status_bit = KN_##name, .knt_status_bit = KNOTE_STATUS_##name }
2853 static const struct knote_status_export_bit knote_status_export_bits[] = {
2854 	ST(ACTIVE),
2855 	ST(QUEUED),
2856 	ST(DISABLED),
2857 	ST(DETACHED),
2858 	ST(KQUEUE),
2859 };
2860 #undef ST
2861 
2862 static int
2863 knote_status_export(int kn_status)
2864 {
2865 	const struct knote_status_export_bit *b;
2866 	unsigned i;
2867 	int res;
2868 
2869 	res = 0;
2870 	for (i = 0; i < nitems(knote_status_export_bits); i++) {
2871 		b = &knote_status_export_bits[i];
2872 		if ((kn_status & b->kn_status_bit) != 0)
2873 			res |= b->knt_status_bit;
2874 	}
2875 	return (res);
2876 }
2877 
2878 static int
2879 kern_proc_kqueue_report_one(struct sbuf *s, struct proc *p,
2880     int kq_fd, struct kqueue *kq, struct knote *kn, bool compat32 __unused)
2881 {
2882 	struct kinfo_knote kin;
2883 #ifdef COMPAT_FREEBSD32
2884 	struct kinfo_knote32 kin32;
2885 #endif
2886 	int error;
2887 
2888 	if (kn->kn_status == KN_MARKER)
2889 		return (0);
2890 
2891 	memset(&kin, 0, sizeof(kin));
2892 	kin.knt_kq_fd = kq_fd;
2893 	memcpy(&kin.knt_event, &kn->kn_kevent, sizeof(struct kevent));
2894 	kin.knt_status = knote_status_export(kn->kn_status);
2895 	kn_enter_flux(kn);
2896 	KQ_UNLOCK_FLUX(kq);
2897 	if (kn->kn_fop->f_userdump != NULL)
2898 		(void)kn->kn_fop->f_userdump(p, kn, &kin);
2899 #ifdef COMPAT_FREEBSD32
2900 	if (compat32) {
2901 		freebsd32_kinfo_knote_to_32(&kin, &kin32);
2902 		error = sbuf_bcat(s, &kin32, sizeof(kin32));
2903 	} else
2904 #endif
2905 		error = sbuf_bcat(s, &kin, sizeof(kin));
2906 	KQ_LOCK(kq);
2907 	kn_leave_flux(kn);
2908 	return (error);
2909 }
2910 
2911 static int
2912 kern_proc_kqueue_report(struct sbuf *s, struct proc *p, int kq_fd,
2913     struct kqueue *kq, bool compat32)
2914 {
2915 	struct knote *kn;
2916 	int error, i;
2917 
2918 	error = 0;
2919 	KQ_LOCK(kq);
2920 	for (i = 0; i < kq->kq_knlistsize; i++) {
2921 		SLIST_FOREACH(kn, &kq->kq_knlist[i], kn_link) {
2922 			error = kern_proc_kqueue_report_one(s, p, kq_fd,
2923 			    kq, kn, compat32);
2924 			if (error != 0)
2925 				goto out;
2926 		}
2927 	}
2928 	if (kq->kq_knhashmask == 0)
2929 		goto out;
2930 	for (i = 0; i <= kq->kq_knhashmask; i++) {
2931 		SLIST_FOREACH(kn, &kq->kq_knhash[i], kn_link) {
2932 			error = kern_proc_kqueue_report_one(s, p, kq_fd,
2933 			    kq, kn, compat32);
2934 			if (error != 0)
2935 				goto out;
2936 		}
2937 	}
2938 out:
2939 	KQ_UNLOCK_FLUX(kq);
2940 	return (error);
2941 }
2942 
2943 struct kern_proc_kqueues_out1_cb_args {
2944 	struct sbuf *s;
2945 	bool compat32;
2946 };
2947 
2948 static int
2949 kern_proc_kqueues_out1_cb(struct proc *p, int fd, struct file *fp, void *arg)
2950 {
2951 	struct kqueue *kq;
2952 	struct kern_proc_kqueues_out1_cb_args *a;
2953 
2954 	if (fp->f_type != DTYPE_KQUEUE)
2955 		return (0);
2956 	a = arg;
2957 	kq = fp->f_data;
2958 	return (kern_proc_kqueue_report(a->s, p, fd, kq, a->compat32));
2959 }
2960 
2961 static int
2962 kern_proc_kqueues_out1(struct thread *td, struct proc *p, struct sbuf *s,
2963     bool compat32)
2964 {
2965 	struct kern_proc_kqueues_out1_cb_args a;
2966 
2967 	a.s = s;
2968 	a.compat32 = compat32;
2969 	return (fget_remote_foreach(td, p, kern_proc_kqueues_out1_cb, &a));
2970 }
2971 
2972 int
2973 kern_proc_kqueues_out(struct proc *p, struct sbuf *sb, size_t maxlen,
2974     bool compat32)
2975 {
2976 	struct sbuf *s, sm;
2977 	size_t sb_len;
2978 	int error;
2979 
2980 	if (maxlen == -1 || maxlen == 0)
2981 		sb_len = 128;
2982 	else
2983 		sb_len = maxlen;
2984 	s = sbuf_new(&sm, NULL, sb_len, maxlen == -1 ? SBUF_AUTOEXTEND :
2985 	    SBUF_FIXEDLEN);
2986 	error = kern_proc_kqueues_out1(curthread, p, s, compat32);
2987 	sbuf_finish(s);
2988 	if (error == 0) {
2989 		sbuf_bcat(sb, sbuf_data(s), MIN(sbuf_len(s), maxlen == -1 ?
2990 		    SIZE_T_MAX : maxlen));
2991 	}
2992 	sbuf_delete(s);
2993 	return (error);
2994 }
2995 
2996 static int
2997 sysctl_kern_proc_kqueue_one(struct thread *td, struct sbuf *s, struct proc *p,
2998     int kq_fd, bool compat32)
2999 {
3000 	struct file *fp;
3001 	struct kqueue *kq;
3002 	int error;
3003 
3004 	error = fget_remote(td, p, kq_fd, &fp);
3005 	if (error == 0) {
3006 		if (fp->f_type != DTYPE_KQUEUE) {
3007 			error = EINVAL;
3008 		} else {
3009 			kq = fp->f_data;
3010 			error = kern_proc_kqueue_report(s, p, kq_fd, kq,
3011 			    compat32);
3012 		}
3013 		fdrop(fp, td);
3014 	}
3015 	return (error);
3016 }
3017 
3018 static int
3019 sysctl_kern_proc_kqueue(SYSCTL_HANDLER_ARGS)
3020 {
3021 	struct thread *td;
3022 	struct proc *p;
3023 	struct sbuf *s, sm;
3024 	int error, error1, *name;
3025 	bool compat32;
3026 
3027 	name = (int *)arg1;
3028 	if ((u_int)arg2 > 2 || (u_int)arg2 == 0)
3029 		return (EINVAL);
3030 
3031 	error = pget((pid_t)name[0], PGET_HOLD | PGET_CANDEBUG, &p);
3032 	if (error != 0)
3033 		return (error);
3034 
3035 	td = curthread;
3036 #ifdef COMPAT_FREEBSD32
3037 	compat32 = SV_CURPROC_FLAG(SV_ILP32);
3038 #else
3039 	compat32 = false;
3040 #endif
3041 
3042 	s = sbuf_new_for_sysctl(&sm, NULL, 0, req);
3043 	if (s == NULL) {
3044 		error = ENOMEM;
3045 		goto out;
3046 	}
3047 	sbuf_clear_flags(s, SBUF_INCLUDENUL);
3048 
3049 	if ((u_int)arg2 == 1) {
3050 		error = kern_proc_kqueues_out1(td, p, s, compat32);
3051 	} else {
3052 		error = sysctl_kern_proc_kqueue_one(td, s, p,
3053 		    name[1] /* kq_fd */, compat32);
3054 	}
3055 
3056 	error1 = sbuf_finish(s);
3057 	if (error == 0)
3058 		error = error1;
3059 	sbuf_delete(s);
3060 
3061 out:
3062 	PRELE(p);
3063 	return (error);
3064 }
3065 
3066 static SYSCTL_NODE(_kern_proc, KERN_PROC_KQUEUE, kq,
3067     CTLFLAG_RD | CTLFLAG_MPSAFE,
3068     sysctl_kern_proc_kqueue, "KQueue events");
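/*
 * Illustrative userland sketch (not part of kern_event.c): reading the
 * kern.proc.kq node exported above.  Per the handler, the MIB is
 * {CTL_KERN, KERN_PROC, KERN_PROC_KQUEUE, pid}, with an optional fifth
 * element selecting a single kq fd, and the reply is an array of
 * struct kinfo_knote.  The header locations of KERN_PROC_KQUEUE and
 * struct kinfo_knote (<sys/sysctl.h> and <sys/user.h>) are assumptions.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/event.h>
#include <sys/user.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void
dump_proc_knotes(pid_t pid)
{
	struct kinfo_knote *kn;
	size_t i, len;
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_KQUEUE, (int)pid };

	/* First call sizes the buffer, second call fills it. */
	if (sysctl(mib, 4, NULL, &len, NULL, 0) == -1)
		return;
	kn = malloc(len);
	if (kn == NULL || sysctl(mib, 4, kn, &len, NULL, 0) == -1) {
		free(kn);
		return;
	}
	for (i = 0; i < len / sizeof(*kn); i++)
		printf("kq %d: filter %d ident %ju status %#x\n",
		    kn[i].knt_kq_fd, kn[i].knt_event.filter,
		    (uintmax_t)kn[i].knt_event.ident, kn[i].knt_status);
	free(kn);
}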
3069