1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
5 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
6 * Copyright (c) 2009 Apple, Inc.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 #include "opt_ktrace.h"
33 #include "opt_kqueue.h"
34
35 #ifdef COMPAT_FREEBSD11
36 #define _WANT_FREEBSD11_KEVENT
37 #endif
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/capsicum.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/lock.h>
45 #include <sys/mutex.h>
46 #include <sys/proc.h>
47 #include <sys/malloc.h>
48 #include <sys/unistd.h>
49 #include <sys/file.h>
50 #include <sys/filedesc.h>
51 #include <sys/filio.h>
52 #include <sys/fcntl.h>
53 #include <sys/kthread.h>
54 #include <sys/selinfo.h>
55 #include <sys/queue.h>
56 #include <sys/event.h>
57 #include <sys/eventvar.h>
58 #include <sys/poll.h>
59 #include <sys/protosw.h>
60 #include <sys/resourcevar.h>
61 #include <sys/sbuf.h>
62 #include <sys/sigio.h>
63 #include <sys/signalvar.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/stat.h>
67 #include <sys/sysctl.h>
68 #include <sys/sysent.h>
69 #include <sys/sysproto.h>
70 #include <sys/syscallsubr.h>
71 #include <sys/taskqueue.h>
72 #include <sys/uio.h>
73 #include <sys/user.h>
74 #ifdef KTRACE
75 #include <sys/ktrace.h>
76 #endif
77 #include <machine/atomic.h>
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32.h>
80 #include <compat/freebsd32/freebsd32_util.h>
81 #endif
82
83 #include <vm/uma.h>
84
85 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
86
87 /*
88 * This lock is used when multiple kq locks are required. It possibly
89 * should be made into a per-process lock.
90 */
91 static struct mtx kq_global;
92 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
93 #define KQ_GLOBAL_LOCK(lck, haslck) do { \
94 if (!haslck) \
95 mtx_lock(lck); \
96 haslck = 1; \
97 } while (0)
98 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
99 if (haslck) \
100 mtx_unlock(lck); \
101 haslck = 0; \
102 } while (0)
103
104 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
105
106 static int kevent_copyout(void *arg, struct kevent *kevp, int count);
107 static int kevent_copyin(void *arg, struct kevent *kevp, int count);
108 static int kqueue_register(struct kqueue *kq, struct kevent *kev,
109 struct thread *td, int mflag);
110 static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
111 static void kqueue_release(struct kqueue *kq, int locked);
112 static void kqueue_destroy(struct kqueue *kq);
113 static void kqueue_drain(struct kqueue *kq, struct thread *td);
114 static int kqueue_expand(struct kqueue *kq, const struct filterops *fops,
115 uintptr_t ident, int mflag);
116 static void kqueue_task(void *arg, int pending);
117 static int kqueue_scan(struct kqueue *kq, int maxevents,
118 struct kevent_copyops *k_ops,
119 const struct timespec *timeout,
120 struct kevent *keva, struct thread *td);
121 static void kqueue_wakeup(struct kqueue *kq);
122 static const struct filterops *kqueue_fo_find(int filt);
123 static void kqueue_fo_release(int filt);
124 struct g_kevent_args;
125 static int kern_kevent_generic(struct thread *td,
126 struct g_kevent_args *uap,
127 struct kevent_copyops *k_ops, const char *struct_name);
128
129 static fo_ioctl_t kqueue_ioctl;
130 static fo_poll_t kqueue_poll;
131 static fo_kqfilter_t kqueue_kqfilter;
132 static fo_stat_t kqueue_stat;
133 static fo_close_t kqueue_close;
134 static fo_fill_kinfo_t kqueue_fill_kinfo;
135
136 static const struct fileops kqueueops = {
137 .fo_read = invfo_rdwr,
138 .fo_write = invfo_rdwr,
139 .fo_truncate = invfo_truncate,
140 .fo_ioctl = kqueue_ioctl,
141 .fo_poll = kqueue_poll,
142 .fo_kqfilter = kqueue_kqfilter,
143 .fo_stat = kqueue_stat,
144 .fo_close = kqueue_close,
145 .fo_chmod = invfo_chmod,
146 .fo_chown = invfo_chown,
147 .fo_sendfile = invfo_sendfile,
148 .fo_cmp = file_kcmp_generic,
149 .fo_fill_kinfo = kqueue_fill_kinfo,
150 };
151
152 static int knote_attach(struct knote *kn, struct kqueue *kq);
153 static void knote_drop(struct knote *kn, struct thread *td);
154 static void knote_drop_detached(struct knote *kn, struct thread *td);
155 static void knote_enqueue(struct knote *kn);
156 static void knote_dequeue(struct knote *kn);
157 static void knote_init(void);
158 static struct knote *knote_alloc(int mflag);
159 static void knote_free(struct knote *kn);
160
161 static void filt_kqdetach(struct knote *kn);
162 static int filt_kqueue(struct knote *kn, long hint);
163 static int filt_procattach(struct knote *kn);
164 static void filt_procdetach(struct knote *kn);
165 static int filt_proc(struct knote *kn, long hint);
166 static int filt_fileattach(struct knote *kn);
167 static void filt_timerexpire(void *knx);
168 static void filt_timerexpire_l(struct knote *kn, bool proc_locked);
169 static int filt_timerattach(struct knote *kn);
170 static void filt_timerdetach(struct knote *kn);
171 static void filt_timerstart(struct knote *kn, sbintime_t to);
172 static void filt_timertouch(struct knote *kn, struct kevent *kev,
173 u_long type);
174 static int filt_timervalidate(struct knote *kn, sbintime_t *to);
175 static int filt_timer(struct knote *kn, long hint);
176 static int filt_userattach(struct knote *kn);
177 static void filt_userdetach(struct knote *kn);
178 static int filt_user(struct knote *kn, long hint);
179 static void filt_usertouch(struct knote *kn, struct kevent *kev,
180 u_long type);
181
182 static const struct filterops file_filtops = {
183 .f_isfd = 1,
184 .f_attach = filt_fileattach,
185 };
186 static const struct filterops kqread_filtops = {
187 .f_isfd = 1,
188 .f_detach = filt_kqdetach,
189 .f_event = filt_kqueue,
190 };
191 /* XXX - move to kern_proc.c? */
192 static const struct filterops proc_filtops = {
193 .f_isfd = 0,
194 .f_attach = filt_procattach,
195 .f_detach = filt_procdetach,
196 .f_event = filt_proc,
197 };
198 static const struct filterops timer_filtops = {
199 .f_isfd = 0,
200 .f_attach = filt_timerattach,
201 .f_detach = filt_timerdetach,
202 .f_event = filt_timer,
203 .f_touch = filt_timertouch,
204 };
205 static const struct filterops user_filtops = {
206 .f_attach = filt_userattach,
207 .f_detach = filt_userdetach,
208 .f_event = filt_user,
209 .f_touch = filt_usertouch,
210 };
211
212 static uma_zone_t knote_zone;
213 static unsigned int __exclusive_cache_line kq_ncallouts;
214 static unsigned int kq_calloutmax = 4 * 1024;
215 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
216 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
217
218 /* XXX - ensure not influx ? */
219 #define KNOTE_ACTIVATE(kn, islock) do { \
220 if ((islock)) \
221 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
222 else \
223 KQ_LOCK((kn)->kn_kq); \
224 (kn)->kn_status |= KN_ACTIVE; \
225 if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
226 knote_enqueue((kn)); \
227 if (!(islock)) \
228 KQ_UNLOCK((kn)->kn_kq); \
229 } while (0)
230 #define KQ_LOCK(kq) do { \
231 mtx_lock(&(kq)->kq_lock); \
232 } while (0)
233 #define KQ_FLUX_WAKEUP(kq) do { \
234 if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
235 (kq)->kq_state &= ~KQ_FLUXWAIT; \
236 wakeup((kq)); \
237 } \
238 } while (0)
239 #define KQ_UNLOCK_FLUX(kq) do { \
240 KQ_FLUX_WAKEUP(kq); \
241 mtx_unlock(&(kq)->kq_lock); \
242 } while (0)
243 #define KQ_UNLOCK(kq) do { \
244 mtx_unlock(&(kq)->kq_lock); \
245 } while (0)
246 #define KQ_OWNED(kq) do { \
247 mtx_assert(&(kq)->kq_lock, MA_OWNED); \
248 } while (0)
249 #define KQ_NOTOWNED(kq) do { \
250 mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
251 } while (0)
252
253 static struct knlist *
254 kn_list_lock(struct knote *kn)
255 {
256 struct knlist *knl;
257
258 knl = kn->kn_knlist;
259 if (knl != NULL)
260 knl->kl_lock(knl->kl_lockarg);
261 return (knl);
262 }
263
264 static void
265 kn_list_unlock(struct knlist *knl)
266 {
267 bool do_free;
268
269 if (knl == NULL)
270 return;
271 do_free = knl->kl_autodestroy && knlist_empty(knl);
272 knl->kl_unlock(knl->kl_lockarg);
273 if (do_free) {
274 knlist_destroy(knl);
275 free(knl, M_KQUEUE);
276 }
277 }
278
279 static bool
280 kn_in_flux(struct knote *kn)
281 {
282
283 return (kn->kn_influx > 0);
284 }
285
286 static void
287 kn_enter_flux(struct knote *kn)
288 {
289
290 KQ_OWNED(kn->kn_kq);
291 MPASS(kn->kn_influx < INT_MAX);
292 kn->kn_influx++;
293 }
294
295 static bool
296 kn_leave_flux(struct knote *kn)
297 {
298
299 KQ_OWNED(kn->kn_kq);
300 MPASS(kn->kn_influx > 0);
301 kn->kn_influx--;
302 return (kn->kn_influx == 0);
303 }
304
305 #define KNL_ASSERT_LOCK(knl, islocked) do { \
306 if (islocked) \
307 KNL_ASSERT_LOCKED(knl); \
308 else \
309 KNL_ASSERT_UNLOCKED(knl); \
310 } while (0)
311 #ifdef INVARIANTS
312 #define KNL_ASSERT_LOCKED(knl) do { \
313 knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \
314 } while (0)
315 #define KNL_ASSERT_UNLOCKED(knl) do { \
316 knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \
317 } while (0)
318 #else /* !INVARIANTS */
319 #define KNL_ASSERT_LOCKED(knl) do {} while (0)
320 #define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
321 #endif /* INVARIANTS */
322
323 #ifndef KN_HASHSIZE
324 #define KN_HASHSIZE 64 /* XXX should be tunable */
325 #endif
326
327 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
328
329 static int
330 filt_nullattach(struct knote *kn)
331 {
332
333 return (ENXIO);
334 };
335
336 static const struct filterops null_filtops = {
337 .f_isfd = 0,
338 .f_attach = filt_nullattach,
339 };
340
341 /* XXX - make SYSINIT to add these, and move into respective modules. */
342 extern const struct filterops sig_filtops;
343 extern const struct filterops fs_filtops;
344
345 /*
346 * Table for all system-defined filters.
347 */
348 static struct mtx filterops_lock;
349 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF);
350 static struct {
351 const struct filterops *for_fop;
352 int for_nolock;
353 int for_refcnt;
354 } sysfilt_ops[EVFILT_SYSCOUNT] = {
355 [~EVFILT_READ] = { &file_filtops, 1 },
356 [~EVFILT_WRITE] = { &file_filtops, 1 },
357 [~EVFILT_AIO] = { &null_filtops },
358 [~EVFILT_VNODE] = { &file_filtops, 1 },
359 [~EVFILT_PROC] = { &proc_filtops, 1 },
360 [~EVFILT_SIGNAL] = { &sig_filtops, 1 },
361 [~EVFILT_TIMER] = { &timer_filtops, 1 },
362 [~EVFILT_PROCDESC] = { &file_filtops, 1 },
363 [~EVFILT_FS] = { &fs_filtops, 1 },
364 [~EVFILT_LIO] = { &null_filtops },
365 [~EVFILT_USER] = { &user_filtops, 1 },
366 [~EVFILT_SENDFILE] = { &null_filtops },
367 [~EVFILT_EMPTY] = { &file_filtops, 1 },
368 };
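/*
 * The EVFILT_* constants are small negative numbers (EVFILT_READ is -1),
 * so ~EVFILT_READ == 0, ~EVFILT_WRITE == 1, and so on; the table is
 * indexed by the one's complement of the filter number.  The trailing 1
 * in most initializers sets for_nolock, letting kqueue_fo_find() and
 * kqueue_fo_release() skip filterops_lock for statically known filters.
 */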
369
370 /*
371 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
372 * method.
373 */
374 static int
375 filt_fileattach(struct knote *kn)
376 {
377
378 return (fo_kqfilter(kn->kn_fp, kn));
379 }
380
381 /*ARGSUSED*/
382 static int
383 kqueue_kqfilter(struct file *fp, struct knote *kn)
384 {
385 struct kqueue *kq = kn->kn_fp->f_data;
386
387 if (kn->kn_filter != EVFILT_READ)
388 return (EINVAL);
389
390 kn->kn_status |= KN_KQUEUE;
391 kn->kn_fop = &kqread_filtops;
392 knlist_add(&kq->kq_sel.si_note, kn, 0);
393
394 return (0);
395 }
396
397 static void
398 filt_kqdetach(struct knote *kn)
399 {
400 struct kqueue *kq = kn->kn_fp->f_data;
401
402 knlist_remove(&kq->kq_sel.si_note, kn, 0);
403 }
404
405 /*ARGSUSED*/
406 static int
407 filt_kqueue(struct knote *kn, long hint)
408 {
409 struct kqueue *kq = kn->kn_fp->f_data;
410
411 kn->kn_data = kq->kq_count;
412 return (kn->kn_data > 0);
413 }
414
415 /* XXX - move to kern_proc.c? */
416 static int
417 filt_procattach(struct knote *kn)
418 {
419 struct proc *p;
420 int error;
421 bool exiting, immediate;
422
423 exiting = immediate = false;
424 if (kn->kn_sfflags & NOTE_EXIT)
425 p = pfind_any(kn->kn_id);
426 else
427 p = pfind(kn->kn_id);
428 if (p == NULL)
429 return (ESRCH);
430 if (p->p_flag & P_WEXIT)
431 exiting = true;
432
433 if ((error = p_cansee(curthread, p))) {
434 PROC_UNLOCK(p);
435 return (error);
436 }
437
438 kn->kn_ptr.p_proc = p;
439 kn->kn_flags |= EV_CLEAR; /* automatically set */
440
441 /*
442 * Internal flag indicating registration done by kernel for the
443 * purposes of getting a NOTE_CHILD notification.
444 */
445 if (kn->kn_flags & EV_FLAG2) {
446 kn->kn_flags &= ~EV_FLAG2;
447 kn->kn_data = kn->kn_sdata; /* ppid */
448 kn->kn_fflags = NOTE_CHILD;
449 kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
450 immediate = true; /* Force immediate activation of child note. */
451 }
452 /*
453 * Internal flag indicating registration done by kernel (for other than
454 * NOTE_CHILD).
455 */
456 if (kn->kn_flags & EV_FLAG1) {
457 kn->kn_flags &= ~EV_FLAG1;
458 }
459
460 knlist_add(p->p_klist, kn, 1);
461
462 /*
463 * Immediately activate any child notes or, in the case of a zombie
464 * target process, exit notes. The latter is necessary to handle the
465 * case where the target process, e.g. a child, dies before the kevent
466 * is registered.
467 */
468 if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
469 KNOTE_ACTIVATE(kn, 0);
470
471 PROC_UNLOCK(p);
472
473 return (0);
474 }
475
476 /*
477 * The knote may be attached to a different process, which may exit,
478 * leaving nothing for the knote to be attached to. So when the process
479 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
480 * it will be deleted when read out. However, as part of the knote deletion,
481 * this routine is called, so a check is needed to avoid actually performing
482 * a detach, because the original process does not exist any more.
483 */
484 /* XXX - move to kern_proc.c? */
485 static void
486 filt_procdetach(struct knote *kn)
487 {
488
489 knlist_remove(kn->kn_knlist, kn, 0);
490 kn->kn_ptr.p_proc = NULL;
491 }
492
493 /* XXX - move to kern_proc.c? */
494 static int
495 filt_proc(struct knote *kn, long hint)
496 {
497 struct proc *p;
498 u_int event;
499
500 p = kn->kn_ptr.p_proc;
501 if (p == NULL) /* already activated, from attach filter */
502 return (0);
503
504 /* Mask off extra data. */
505 event = (u_int)hint & NOTE_PCTRLMASK;
506
507 /* If the user is interested in this event, record it. */
508 if (kn->kn_sfflags & event)
509 kn->kn_fflags |= event;
510
511 /* Process is gone, so flag the event as finished. */
512 if (event == NOTE_EXIT) {
513 kn->kn_flags |= EV_EOF | EV_ONESHOT;
514 kn->kn_ptr.p_proc = NULL;
515 if (kn->kn_fflags & NOTE_EXIT)
516 kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
517 if (kn->kn_fflags == 0)
518 kn->kn_flags |= EV_DROP;
519 return (1);
520 }
521
522 return (kn->kn_fflags != 0);
523 }
524
525 /*
526 * Called when the process forks. It mostly does the same as
527 * knote(), activating all knotes registered to be activated when the
528 * process forks. Additionally, for each knote attached to the
529 * parent, check whether the user wants to track the new process. If
530 * so, attach a new knote to the child, and immediately report an
531 * event with the child's pid.
532 */
533 void
534 knote_fork(struct knlist *list, int pid)
535 {
536 struct kqueue *kq;
537 struct knote *kn;
538 struct kevent kev;
539 int error;
540
541 MPASS(list != NULL);
542 KNL_ASSERT_LOCKED(list);
543 if (SLIST_EMPTY(&list->kl_list))
544 return;
545
546 memset(&kev, 0, sizeof(kev));
547 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
548 kq = kn->kn_kq;
549 KQ_LOCK(kq);
550 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
551 KQ_UNLOCK(kq);
552 continue;
553 }
554
555 /*
556 * As in knote(), activate the event.
557 */
558 if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
559 if (kn->kn_fop->f_event(kn, NOTE_FORK))
560 KNOTE_ACTIVATE(kn, 1);
561 KQ_UNLOCK(kq);
562 continue;
563 }
564
565 /*
566 * The NOTE_TRACK case. In addition to the activation
567 * of the event, we need to register new events to
568 * track the child. Drop the locks in preparation for
569 * the call to kqueue_register().
570 */
571 kn_enter_flux(kn);
572 KQ_UNLOCK(kq);
573 list->kl_unlock(list->kl_lockarg);
574
575 /*
576 * Activate existing knote and register tracking knotes with
577 * new process.
578 *
579 * First register a knote to get just the child notice. This
580 * must be a separate note from a potential NOTE_EXIT
581 * notification since both NOTE_CHILD and NOTE_EXIT are defined
582 * to use the data field (in conflicting ways).
583 */
584 kev.ident = pid;
585 kev.filter = kn->kn_filter;
586 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
587 EV_FLAG2;
588 kev.fflags = kn->kn_sfflags;
589 kev.data = kn->kn_id; /* parent */
590 kev.udata = kn->kn_kevent.udata;/* preserve udata */
591 error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
592 if (error)
593 kn->kn_fflags |= NOTE_TRACKERR;
594
595 /*
596 * Then register another knote to track other potential events
597 * from the new process.
598 */
599 kev.ident = pid;
600 kev.filter = kn->kn_filter;
601 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
602 kev.fflags = kn->kn_sfflags;
603 kev.data = kn->kn_id; /* parent */
604 kev.udata = kn->kn_kevent.udata;/* preserve udata */
605 error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
606 if (error)
607 kn->kn_fflags |= NOTE_TRACKERR;
608 if (kn->kn_fop->f_event(kn, NOTE_FORK))
609 KNOTE_ACTIVATE(kn, 0);
610 list->kl_lock(list->kl_lockarg);
611 KQ_LOCK(kq);
612 kn_leave_flux(kn);
613 KQ_UNLOCK_FLUX(kq);
614 }
615 }
616
617 /*
618 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
619 * interval timer support code.
620 */
621
622 #define NOTE_TIMER_PRECMASK \
623 (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
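/*
 * For illustration only (a userland sketch, not part of this file): a
 * periodic 500 ms timer would typically be registered as
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_MSECONDS, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * kn_sdata then carries 500 and kn_sfflags NOTE_MSECONDS, which
 * timer2sbintime() below converts to an sbintime_t period.
 */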
624
625 static sbintime_t
626 timer2sbintime(int64_t data, int flags)
627 {
628 int64_t secs;
629
630 /*
631 * Macros for converting to the fractional second portion of an
632 * sbintime_t using 64bit multiplication to improve precision.
633 */
634 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
635 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
636 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
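/*
 * The magic constants follow from sbintime_t keeping the fraction in its
 * low 32 bits: (1 << 63) / 500000000 == 2^64 / 10^9, so NS_TO_SBT(ns)
 * evaluates to (ns * 2^64 / 10^9) >> 32 == ns * 2^32 / 10^9, i.e. the
 * nanoseconds scaled into a 32-bit binary fraction of a second.  The
 * microsecond and millisecond macros follow the same pattern.
 */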
637 switch (flags & NOTE_TIMER_PRECMASK) {
638 case NOTE_SECONDS:
639 #ifdef __LP64__
640 if (data > (SBT_MAX / SBT_1S))
641 return (SBT_MAX);
642 #endif
643 return ((sbintime_t)data << 32);
644 case NOTE_MSECONDS: /* FALLTHROUGH */
645 case 0:
646 if (data >= 1000) {
647 secs = data / 1000;
648 #ifdef __LP64__
649 if (secs > (SBT_MAX / SBT_1S))
650 return (SBT_MAX);
651 #endif
652 return (secs << 32 | MS_TO_SBT(data % 1000));
653 }
654 return (MS_TO_SBT(data));
655 case NOTE_USECONDS:
656 if (data >= 1000000) {
657 secs = data / 1000000;
658 #ifdef __LP64__
659 if (secs > (SBT_MAX / SBT_1S))
660 return (SBT_MAX);
661 #endif
662 return (secs << 32 | US_TO_SBT(data % 1000000));
663 }
664 return (US_TO_SBT(data));
665 case NOTE_NSECONDS:
666 if (data >= 1000000000) {
667 secs = data / 1000000000;
668 #ifdef __LP64__
669 if (secs > (SBT_MAX / SBT_1S))
670 return (SBT_MAX);
671 #endif
672 return (secs << 32 | NS_TO_SBT(data % 1000000000));
673 }
674 return (NS_TO_SBT(data));
675 default:
676 break;
677 }
678 return (-1);
679 }
680
681 struct kq_timer_cb_data {
682 struct callout c;
683 struct proc *p;
684 struct knote *kn;
685 int cpuid;
686 int flags;
687 TAILQ_ENTRY(kq_timer_cb_data) link;
688 sbintime_t next; /* next timer event fires at */
689 sbintime_t to; /* precalculated timer period, 0 for abs */
690 };
691
692 #define KQ_TIMER_CB_ENQUEUED 0x01
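/*
 * KQ_TIMER_CB_ENQUEUED means the kq_timer_cb_data is linked on its
 * process' p_kqtim_stop list, waiting for kqtimer_proc_continue() to
 * either deliver the expired event or reschedule the callout once the
 * process resumes.
 */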
693
694 static void
695 kqtimer_sched_callout(struct kq_timer_cb_data *kc)
696 {
697 callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
698 kc->cpuid, C_ABSOLUTE);
699 }
700
701 void
702 kqtimer_proc_continue(struct proc *p)
703 {
704 struct kq_timer_cb_data *kc, *kc1;
705 struct bintime bt;
706 sbintime_t now;
707
708 PROC_LOCK_ASSERT(p, MA_OWNED);
709
710 getboottimebin(&bt);
711 now = bttosbt(bt);
712
713 TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
714 TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
715 kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
716 if (kc->next <= now)
717 filt_timerexpire_l(kc->kn, true);
718 else
719 kqtimer_sched_callout(kc);
720 }
721 }
722
723 static void
724 filt_timerexpire_l(struct knote *kn, bool proc_locked)
725 {
726 struct kq_timer_cb_data *kc;
727 struct proc *p;
728 uint64_t delta;
729 sbintime_t now;
730
731 kc = kn->kn_ptr.p_v;
732
733 if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
734 kn->kn_data++;
735 KNOTE_ACTIVATE(kn, 0);
736 return;
737 }
738
739 now = sbinuptime();
740 if (now >= kc->next) {
741 delta = (now - kc->next) / kc->to;
742 if (delta == 0)
743 delta = 1;
744 kn->kn_data += delta;
745 kc->next += delta * kc->to;
746 if (now >= kc->next) /* overflow */
747 kc->next = now + kc->to;
748 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
749 }
750
751 /*
752 * The initial check for a stopped kc->p is racy. It is fine to
753 * miss the setting of the stop flags; at worst we would schedule
754 * one more callout. On the other hand, it is not fine to skip
755 * scheduling when we missed the clearing of the flags, so we
756 * recheck them under the lock and observe a consistent state.
757 */
758 p = kc->p;
759 if (P_SHOULDSTOP(p) || P_KILLED(p)) {
760 if (!proc_locked)
761 PROC_LOCK(p);
762 if (P_SHOULDSTOP(p) || P_KILLED(p)) {
763 if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
764 kc->flags |= KQ_TIMER_CB_ENQUEUED;
765 TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
766 }
767 if (!proc_locked)
768 PROC_UNLOCK(p);
769 return;
770 }
771 if (!proc_locked)
772 PROC_UNLOCK(p);
773 }
774 kqtimer_sched_callout(kc);
775 }
776
777 static void
778 filt_timerexpire(void *knx)
779 {
780 filt_timerexpire_l(knx, false);
781 }
782
783 /*
784 * The data field contains the amount of time to sleep.
785 */
786 static int
787 filt_timervalidate(struct knote *kn, sbintime_t *to)
788 {
789 struct bintime bt;
790 sbintime_t sbt;
791
792 if (kn->kn_sdata < 0)
793 return (EINVAL);
794 if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
795 kn->kn_sdata = 1;
796 /*
797 * The only fflags values supported are the timer unit
798 * (precision) and the absolute time indicator.
799 */
800 if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
801 return (EINVAL);
802
803 *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
804 if (*to < 0)
805 return (EINVAL);
806 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
807 getboottimebin(&bt);
808 sbt = bttosbt(bt);
809 *to = MAX(0, *to - sbt);
810 }
811 return (0);
812 }
813
814 static int
815 filt_timerattach(struct knote *kn)
816 {
817 struct kq_timer_cb_data *kc;
818 sbintime_t to;
819 int error;
820
821 to = -1;
822 error = filt_timervalidate(kn, &to);
823 if (error != 0)
824 return (error);
825 KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 ||
826 (kn->kn_sfflags & NOTE_ABSTIME) != 0,
827 ("%s: periodic timer has a calculated zero timeout", __func__));
828 KASSERT(to >= 0,
829 ("%s: timer has a calculated negative timeout", __func__));
830
831 if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
832 atomic_subtract_int(&kq_ncallouts, 1);
833 return (ENOMEM);
834 }
835
836 if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
837 kn->kn_flags |= EV_CLEAR; /* automatically set */
838 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
839 kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
840 kc->kn = kn;
841 kc->p = curproc;
842 kc->cpuid = PCPU_GET(cpuid);
843 kc->flags = 0;
844 callout_init(&kc->c, 1);
845 filt_timerstart(kn, to);
846
847 return (0);
848 }
849
850 static void
851 filt_timerstart(struct knote *kn, sbintime_t to)
852 {
853 struct kq_timer_cb_data *kc;
854
855 kc = kn->kn_ptr.p_v;
856 if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
857 kc->next = to;
858 kc->to = 0;
859 } else {
860 kc->next = to + sbinuptime();
861 kc->to = to;
862 }
863 kqtimer_sched_callout(kc);
864 }
865
866 static void
867 filt_timerdetach(struct knote *kn)
868 {
869 struct kq_timer_cb_data *kc;
870 unsigned int old __unused;
871 bool pending;
872
873 kc = kn->kn_ptr.p_v;
874 do {
875 callout_drain(&kc->c);
876
877 /*
878 * kqtimer_proc_continue() might have rescheduled this callout.
879 * Double-check, using the process mutex as an interlock.
880 */
881 PROC_LOCK(kc->p);
882 if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
883 kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
884 TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
885 }
886 pending = callout_pending(&kc->c);
887 PROC_UNLOCK(kc->p);
888 } while (pending);
889 free(kc, M_KQUEUE);
890 old = atomic_fetchadd_int(&kq_ncallouts, -1);
891 KASSERT(old > 0, ("Number of callouts cannot become negative"));
892 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
893 }
894
895 static void
896 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
897 {
898 struct kq_timer_cb_data *kc;
899 struct kqueue *kq;
900 sbintime_t to;
901 int error;
902
903 switch (type) {
904 case EVENT_REGISTER:
905 /* Handle re-added timers that update data/fflags */
906 if (kev->flags & EV_ADD) {
907 kc = kn->kn_ptr.p_v;
908
909 /* Drain any existing callout. */
910 callout_drain(&kc->c);
911
912 /* Throw away any existing undelivered record
913 * of the timer expiration. This is done under
914 * the presumption that if a process is
915 * re-adding this timer with new parameters,
916 * it is no longer interested in what may have
917 * happened under the old parameters. If it is
918 * interested, it can wait for the expiration,
919 * delete the old timer definition, and then
920 * add the new one.
921 *
922 * This has to be done while the kq is locked:
923 * - if enqueued, dequeue
924 * - make it no longer active
925 * - clear the count of expiration events
926 */
927 kq = kn->kn_kq;
928 KQ_LOCK(kq);
929 if (kn->kn_status & KN_QUEUED)
930 knote_dequeue(kn);
931
932 kn->kn_status &= ~KN_ACTIVE;
933 kn->kn_data = 0;
934 KQ_UNLOCK(kq);
935
936 /* Reschedule timer based on new data/fflags */
937 kn->kn_sfflags = kev->fflags;
938 kn->kn_sdata = kev->data;
939 error = filt_timervalidate(kn, &to);
940 if (error != 0) {
941 kn->kn_flags |= EV_ERROR;
942 kn->kn_data = error;
943 } else
944 filt_timerstart(kn, to);
945 }
946 break;
947
948 case EVENT_PROCESS:
949 *kev = kn->kn_kevent;
950 if (kn->kn_flags & EV_CLEAR) {
951 kn->kn_data = 0;
952 kn->kn_fflags = 0;
953 }
954 break;
955
956 default:
957 panic("filt_timertouch() - invalid type (%ld)", type);
958 break;
959 }
960 }
961
962 static int
963 filt_timer(struct knote *kn, long hint)
964 {
965
966 return (kn->kn_data != 0);
967 }
968
969 static int
970 filt_userattach(struct knote *kn)
971 {
972
973 /*
974 * EVFILT_USER knotes are not attached to anything in the kernel.
975 */
976 kn->kn_hook = NULL;
977 if (kn->kn_fflags & NOTE_TRIGGER)
978 kn->kn_hookid = 1;
979 else
980 kn->kn_hookid = 0;
981 return (0);
982 }
983
984 static void
985 filt_userdetach(__unused struct knote *kn)
986 {
987
988 /*
989 * EVFILT_USER knotes are not attached to anything in the kernel.
990 */
991 }
992
993 static int
994 filt_user(struct knote *kn, __unused long hint)
995 {
996
997 return (kn->kn_hookid);
998 }
999
1000 static void
1001 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
1002 {
1003 u_int ffctrl;
1004
1005 switch (type) {
1006 case EVENT_REGISTER:
1007 if (kev->fflags & NOTE_TRIGGER)
1008 kn->kn_hookid = 1;
1009
1010 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1011 kev->fflags &= NOTE_FFLAGSMASK;
1012 switch (ffctrl) {
1013 case NOTE_FFNOP:
1014 break;
1015
1016 case NOTE_FFAND:
1017 kn->kn_sfflags &= kev->fflags;
1018 break;
1019
1020 case NOTE_FFOR:
1021 kn->kn_sfflags |= kev->fflags;
1022 break;
1023
1024 case NOTE_FFCOPY:
1025 kn->kn_sfflags = kev->fflags;
1026 break;
1027
1028 default:
1029 /* XXX Return error? */
1030 break;
1031 }
1032 kn->kn_sdata = kev->data;
1033 if (kev->flags & EV_CLEAR) {
1034 kn->kn_hookid = 0;
1035 kn->kn_data = 0;
1036 kn->kn_fflags = 0;
1037 }
1038 break;
1039
1040 case EVENT_PROCESS:
1041 *kev = kn->kn_kevent;
1042 kev->fflags = kn->kn_sfflags;
1043 kev->data = kn->kn_sdata;
1044 if (kn->kn_flags & EV_CLEAR) {
1045 kn->kn_hookid = 0;
1046 kn->kn_data = 0;
1047 kn->kn_fflags = 0;
1048 }
1049 break;
1050
1051 default:
1052 panic("filt_usertouch() - invalid type (%ld)", type);
1053 break;
1054 }
1055 }
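/*
 * For reference, the usual EVFILT_USER round trip from userland looks
 * roughly like this (a sketch, not part of this file):
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);		(register)
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);		(trigger)
 *
 * NOTE_TRIGGER sets kn_hookid in filt_usertouch() above, filt_user()
 * then reports the event, and EV_CLEAR resets kn_hookid when the event
 * is harvested.
 */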
1056
1057 int
1058 sys_kqueue(struct thread *td, struct kqueue_args *uap)
1059 {
1060
1061 return (kern_kqueue(td, 0, NULL));
1062 }
1063
1064 int
1065 sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
1066 {
1067 int flags;
1068
1069 if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
1070 return (EINVAL);
1071 flags = 0;
1072 if ((uap->flags & KQUEUE_CLOEXEC) != 0)
1073 flags |= O_CLOEXEC;
1074 return (kern_kqueue(td, flags, NULL));
1075 }
1076
1077 static void
1078 kqueue_init(struct kqueue *kq)
1079 {
1080
1081 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
1082 TAILQ_INIT(&kq->kq_head);
1083 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
1084 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
1085 }
1086
1087 int
1088 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
1089 {
1090 struct filedesc *fdp;
1091 struct kqueue *kq;
1092 struct file *fp;
1093 struct ucred *cred;
1094 int fd, error;
1095
1096 fdp = td->td_proc->p_fd;
1097 cred = td->td_ucred;
1098 if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
1099 return (ENOMEM);
1100
1101 error = falloc_caps(td, &fp, &fd, flags, fcaps);
1102 if (error != 0) {
1103 chgkqcnt(cred->cr_ruidinfo, -1, 0);
1104 return (error);
1105 }
1106
1107 /* An extra reference on `fp' has been held for us by falloc(). */
1108 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
1109 kqueue_init(kq);
1110 kq->kq_fdp = fdp;
1111 kq->kq_cred = crhold(cred);
1112
1113 FILEDESC_XLOCK(fdp);
1114 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1115 FILEDESC_XUNLOCK(fdp);
1116
1117 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1118 fdrop(fp, td);
1119
1120 td->td_retval[0] = fd;
1121 return (0);
1122 }
1123
1124 struct g_kevent_args {
1125 int fd;
1126 const void *changelist;
1127 int nchanges;
1128 void *eventlist;
1129 int nevents;
1130 const struct timespec *timeout;
1131 };
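/*
 * Argument block shared by the native kevent() entry point and the
 * compat variants (e.g. freebsd11_kevent() below); kern_kevent_generic()
 * is driven by a struct kevent_copyops that knows how to translate the
 * particular userland kevent layout.
 */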
1132
1133 int
1134 sys_kevent(struct thread *td, struct kevent_args *uap)
1135 {
1136 struct kevent_copyops k_ops = {
1137 .arg = uap,
1138 .k_copyout = kevent_copyout,
1139 .k_copyin = kevent_copyin,
1140 .kevent_size = sizeof(struct kevent),
1141 };
1142 struct g_kevent_args gk_args = {
1143 .fd = uap->fd,
1144 .changelist = uap->changelist,
1145 .nchanges = uap->nchanges,
1146 .eventlist = uap->eventlist,
1147 .nevents = uap->nevents,
1148 .timeout = uap->timeout,
1149 };
1150
1151 return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1152 }
1153
1154 static int
1155 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1156 struct kevent_copyops *k_ops, const char *struct_name)
1157 {
1158 struct timespec ts, *tsp;
1159 #ifdef KTRACE
1160 struct kevent *eventlist = uap->eventlist;
1161 #endif
1162 int error;
1163
1164 if (uap->timeout != NULL) {
1165 error = copyin(uap->timeout, &ts, sizeof(ts));
1166 if (error)
1167 return (error);
1168 tsp = &ts;
1169 } else
1170 tsp = NULL;
1171
1172 #ifdef KTRACE
1173 if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1174 ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1175 uap->nchanges, k_ops->kevent_size);
1176 #endif
1177
1178 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1179 k_ops, tsp);
1180
1181 #ifdef KTRACE
1182 if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1183 ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1184 td->td_retval[0], k_ops->kevent_size);
1185 #endif
1186
1187 return (error);
1188 }
1189
1190 /*
1191 * Copy 'count' items into the destination list pointed to by uap->eventlist.
1192 */
1193 static int
1194 kevent_copyout(void *arg, struct kevent *kevp, int count)
1195 {
1196 struct kevent_args *uap;
1197 int error;
1198
1199 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1200 uap = (struct kevent_args *)arg;
1201
1202 error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1203 if (error == 0)
1204 uap->eventlist += count;
1205 return (error);
1206 }
1207
1208 /*
1209 * Copy 'count' items from the list pointed to by uap->changelist.
1210 */
1211 static int
1212 kevent_copyin(void *arg, struct kevent *kevp, int count)
1213 {
1214 struct kevent_args *uap;
1215 int error;
1216
1217 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1218 uap = (struct kevent_args *)arg;
1219
1220 error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1221 if (error == 0)
1222 uap->changelist += count;
1223 return (error);
1224 }
1225
1226 #ifdef COMPAT_FREEBSD11
1227 static int
1228 kevent11_copyout(void *arg, struct kevent *kevp, int count)
1229 {
1230 struct freebsd11_kevent_args *uap;
1231 struct freebsd11_kevent kev11;
1232 int error, i;
1233
1234 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1235 uap = (struct freebsd11_kevent_args *)arg;
1236
1237 for (i = 0; i < count; i++) {
1238 kev11.ident = kevp->ident;
1239 kev11.filter = kevp->filter;
1240 kev11.flags = kevp->flags;
1241 kev11.fflags = kevp->fflags;
1242 kev11.data = kevp->data;
1243 kev11.udata = kevp->udata;
1244 error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1245 if (error != 0)
1246 break;
1247 uap->eventlist++;
1248 kevp++;
1249 }
1250 return (error);
1251 }
1252
1253 /*
1254 * Copy 'count' items from the list pointed to by uap->changelist.
1255 */
1256 static int
1257 kevent11_copyin(void *arg, struct kevent *kevp, int count)
1258 {
1259 struct freebsd11_kevent_args *uap;
1260 struct freebsd11_kevent kev11;
1261 int error, i;
1262
1263 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1264 uap = (struct freebsd11_kevent_args *)arg;
1265
1266 for (i = 0; i < count; i++) {
1267 error = copyin(uap->changelist, &kev11, sizeof(kev11));
1268 if (error != 0)
1269 break;
1270 kevp->ident = kev11.ident;
1271 kevp->filter = kev11.filter;
1272 kevp->flags = kev11.flags;
1273 kevp->fflags = kev11.fflags;
1274 kevp->data = (uintptr_t)kev11.data;
1275 kevp->udata = kev11.udata;
1276 bzero(&kevp->ext, sizeof(kevp->ext));
1277 uap->changelist++;
1278 kevp++;
1279 }
1280 return (error);
1281 }
1282
1283 int
1284 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1285 {
1286 struct kevent_copyops k_ops = {
1287 .arg = uap,
1288 .k_copyout = kevent11_copyout,
1289 .k_copyin = kevent11_copyin,
1290 .kevent_size = sizeof(struct freebsd11_kevent),
1291 };
1292 struct g_kevent_args gk_args = {
1293 .fd = uap->fd,
1294 .changelist = uap->changelist,
1295 .nchanges = uap->nchanges,
1296 .eventlist = uap->eventlist,
1297 .nevents = uap->nevents,
1298 .timeout = uap->timeout,
1299 };
1300
1301 return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent"));
1302 }
1303 #endif
1304
1305 int
1306 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1307 struct kevent_copyops *k_ops, const struct timespec *timeout)
1308 {
1309 cap_rights_t rights;
1310 struct file *fp;
1311 int error;
1312
1313 cap_rights_init_zero(&rights);
1314 if (nchanges > 0)
1315 cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
1316 if (nevents > 0)
1317 cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
1318 error = fget(td, fd, &rights, &fp);
1319 if (error != 0)
1320 return (error);
1321
1322 error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1323 fdrop(fp, td);
1324
1325 return (error);
1326 }
1327
1328 static int
1329 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1330 struct kevent_copyops *k_ops, const struct timespec *timeout)
1331 {
1332 struct kevent keva[KQ_NEVENTS];
1333 struct kevent *kevp, *changes;
1334 int i, n, nerrors, error;
1335
1336 if (nchanges < 0)
1337 return (EINVAL);
1338
1339 nerrors = 0;
1340 while (nchanges > 0) {
1341 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1342 error = k_ops->k_copyin(k_ops->arg, keva, n);
1343 if (error)
1344 return (error);
1345 changes = keva;
1346 for (i = 0; i < n; i++) {
1347 kevp = &changes[i];
1348 if (!kevp->filter)
1349 continue;
1350 kevp->flags &= ~EV_SYSFLAGS;
1351 error = kqueue_register(kq, kevp, td, M_WAITOK);
1352 if (error || (kevp->flags & EV_RECEIPT)) {
1353 if (nevents == 0)
1354 return (error);
1355 kevp->flags = EV_ERROR;
1356 kevp->data = error;
1357 (void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1358 nevents--;
1359 nerrors++;
1360 }
1361 }
1362 nchanges -= n;
1363 }
1364 if (nerrors) {
1365 td->td_retval[0] = nerrors;
1366 return (0);
1367 }
1368
1369 return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1370 }
1371
1372 int
1373 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
1374 struct kevent_copyops *k_ops, const struct timespec *timeout)
1375 {
1376 struct kqueue *kq;
1377 int error;
1378
1379 error = kqueue_acquire(fp, &kq);
1380 if (error != 0)
1381 return (error);
1382 error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
1383 kqueue_release(kq, 0);
1384 return (error);
1385 }
1386
1387 /*
1388 * Performs a kevent() call on a temporarily created kqueue. This can be
1389 * used to perform one-shot polling, similar to poll() and select().
1390 */
1391 int
1392 kern_kevent_anonymous(struct thread *td, int nevents,
1393 struct kevent_copyops *k_ops)
1394 {
1395 struct kqueue kq = {};
1396 int error;
1397
1398 kqueue_init(&kq);
1399 kq.kq_refcnt = 1;
1400 error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1401 kqueue_drain(&kq, td);
1402 kqueue_destroy(&kq);
1403 return (error);
1404 }
1405
1406 int
1407 kqueue_add_filteropts(int filt, const struct filterops *filtops)
1408 {
1409 int error;
1410
1411 error = 0;
1412 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1413 printf(
1414 "trying to add a filterop that is out of range: %d is beyond %d\n",
1415 ~filt, EVFILT_SYSCOUNT);
1416 return EINVAL;
1417 }
1418 mtx_lock(&filterops_lock);
1419 if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1420 sysfilt_ops[~filt].for_fop != NULL)
1421 error = EEXIST;
1422 else {
1423 sysfilt_ops[~filt].for_fop = filtops;
1424 sysfilt_ops[~filt].for_refcnt = 0;
1425 }
1426 mtx_unlock(&filterops_lock);
1427
1428 return (error);
1429 }
1430
1431 int
1432 kqueue_del_filteropts(int filt)
1433 {
1434 int error;
1435
1436 error = 0;
1437 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1438 return EINVAL;
1439
1440 mtx_lock(&filterops_lock);
1441 if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1442 sysfilt_ops[~filt].for_fop == NULL)
1443 error = EINVAL;
1444 else if (sysfilt_ops[~filt].for_refcnt != 0)
1445 error = EBUSY;
1446 else {
1447 sysfilt_ops[~filt].for_fop = &null_filtops;
1448 sysfilt_ops[~filt].for_refcnt = 0;
1449 }
1450 mtx_unlock(&filterops_lock);
1451
1452 return error;
1453 }
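/*
 * A dynamically registered filter would typically be hooked up as in the
 * following sketch (the filter number and ops names are hypothetical):
 *
 *	static const struct filterops foo_filtops = {
 *		.f_isfd = 0,
 *		.f_attach = filt_fooattach,
 *		.f_detach = filt_foodetach,
 *		.f_event = filt_foo,
 *	};
 *	error = kqueue_add_filteropts(EVFILT_FOO, &foo_filtops);
 *	...
 *	error = kqueue_del_filteropts(EVFILT_FOO);
 */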
1454
1455 static const struct filterops *
1456 kqueue_fo_find(int filt)
1457 {
1458
1459 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1460 return NULL;
1461
1462 if (sysfilt_ops[~filt].for_nolock)
1463 return sysfilt_ops[~filt].for_fop;
1464
1465 mtx_lock(&filterops_lock);
1466 sysfilt_ops[~filt].for_refcnt++;
1467 if (sysfilt_ops[~filt].for_fop == NULL)
1468 sysfilt_ops[~filt].for_fop = &null_filtops;
1469 mtx_unlock(&filterops_lock);
1470
1471 return sysfilt_ops[~filt].for_fop;
1472 }
1473
1474 static void
1475 kqueue_fo_release(int filt)
1476 {
1477
1478 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1479 return;
1480
1481 if (sysfilt_ops[~filt].for_nolock)
1482 return;
1483
1484 mtx_lock(&filterops_lock);
1485 KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1486 ("filter object refcount not valid on release"));
1487 sysfilt_ops[~filt].for_refcnt--;
1488 mtx_unlock(&filterops_lock);
1489 }
1490
1491 /*
1492 * A ref to kq (obtained via kqueue_acquire) must be held.
1493 */
1494 static int
1495 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1496 int mflag)
1497 {
1498 const struct filterops *fops;
1499 struct file *fp;
1500 struct knote *kn, *tkn;
1501 struct knlist *knl;
1502 int error, filt, event;
1503 int haskqglobal, filedesc_unlock;
1504
1505 if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1506 return (EINVAL);
1507
1508 fp = NULL;
1509 kn = NULL;
1510 knl = NULL;
1511 error = 0;
1512 haskqglobal = 0;
1513 filedesc_unlock = 0;
1514
1515 filt = kev->filter;
1516 fops = kqueue_fo_find(filt);
1517 if (fops == NULL)
1518 return EINVAL;
1519
1520 if (kev->flags & EV_ADD) {
1521 /* Reject an invalid flag pair early */
1522 if (kev->flags & EV_KEEPUDATA) {
1523 tkn = NULL;
1524 error = EINVAL;
1525 goto done;
1526 }
1527
1528 /*
1529 * Prevent waiting with locks held. Non-sleepable
1530 * allocation failures are handled in the loop below, but
1531 * only if the spare knote turns out to be actually required.
1532 */
1533 tkn = knote_alloc(mflag);
1534 } else {
1535 tkn = NULL;
1536 }
1537
1538 findkn:
1539 if (fops->f_isfd) {
1540 KASSERT(td != NULL, ("td is NULL"));
1541 if (kev->ident > INT_MAX)
1542 error = EBADF;
1543 else
1544 error = fget(td, kev->ident, &cap_event_rights, &fp);
1545 if (error)
1546 goto done;
1547
1548 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1549 kev->ident, M_NOWAIT) != 0) {
1550 /* try again */
1551 fdrop(fp, td);
1552 fp = NULL;
1553 error = kqueue_expand(kq, fops, kev->ident, mflag);
1554 if (error)
1555 goto done;
1556 goto findkn;
1557 }
1558
1559 if (fp->f_type == DTYPE_KQUEUE) {
1560 /*
1561 * If we add some intelligence about what we are doing,
1562 * we should be able to support events on ourselves.
1563 * We need to know when we are doing this to prevent
1564 * getting both the knlist lock and the kq lock since
1565 * they are the same thing.
1566 */
1567 if (fp->f_data == kq) {
1568 error = EINVAL;
1569 goto done;
1570 }
1571
1572 /*
1573 * Pre-lock the filedesc before the global
1574 * lock mutex, see the comment in
1575 * kqueue_close().
1576 */
1577 FILEDESC_XLOCK(td->td_proc->p_fd);
1578 filedesc_unlock = 1;
1579 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1580 }
1581
1582 KQ_LOCK(kq);
1583 if (kev->ident < kq->kq_knlistsize) {
1584 SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1585 if (kev->filter == kn->kn_filter)
1586 break;
1587 }
1588 } else {
1589 if ((kev->flags & EV_ADD) == EV_ADD) {
1590 error = kqueue_expand(kq, fops, kev->ident, mflag);
1591 if (error != 0)
1592 goto done;
1593 }
1594
1595 KQ_LOCK(kq);
1596
1597 /*
1598 * If possible, find an existing knote to use for this kevent.
1599 */
1600 if (kev->filter == EVFILT_PROC &&
1601 (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
1602 /* This is an internal creation of a process tracking
1603 * note. Don't attempt to coalesce this with an
1604 * existing note.
1605 */
1606 ;
1607 } else if (kq->kq_knhashmask != 0) {
1608 struct klist *list;
1609
1610 list = &kq->kq_knhash[
1611 KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1612 SLIST_FOREACH(kn, list, kn_link)
1613 if (kev->ident == kn->kn_id &&
1614 kev->filter == kn->kn_filter)
1615 break;
1616 }
1617 }
1618
1619 /* knote is in the process of changing, wait for it to stabilize. */
1620 if (kn != NULL && kn_in_flux(kn)) {
1621 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1622 if (filedesc_unlock) {
1623 FILEDESC_XUNLOCK(td->td_proc->p_fd);
1624 filedesc_unlock = 0;
1625 }
1626 kq->kq_state |= KQ_FLUXWAIT;
1627 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1628 if (fp != NULL) {
1629 fdrop(fp, td);
1630 fp = NULL;
1631 }
1632 goto findkn;
1633 }
1634
1635 /*
1636 * kn now contains the matching knote, or NULL if no match
1637 */
1638 if (kn == NULL) {
1639 if (kev->flags & EV_ADD) {
1640 kn = tkn;
1641 tkn = NULL;
1642 if (kn == NULL) {
1643 KQ_UNLOCK(kq);
1644 error = ENOMEM;
1645 goto done;
1646 }
1647 kn->kn_fp = fp;
1648 kn->kn_kq = kq;
1649 kn->kn_fop = fops;
1650 /*
1651 * apply reference counts to knote structure, and
1652 * do not release it at the end of this routine.
1653 */
1654 fops = NULL;
1655 fp = NULL;
1656
1657 kn->kn_sfflags = kev->fflags;
1658 kn->kn_sdata = kev->data;
1659 kev->fflags = 0;
1660 kev->data = 0;
1661 kn->kn_kevent = *kev;
1662 kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1663 EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1664 kn->kn_status = KN_DETACHED;
1665 if ((kev->flags & EV_DISABLE) != 0)
1666 kn->kn_status |= KN_DISABLED;
1667 kn_enter_flux(kn);
1668
1669 error = knote_attach(kn, kq);
1670 KQ_UNLOCK(kq);
1671 if (error != 0) {
1672 tkn = kn;
1673 goto done;
1674 }
1675
1676 if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1677 knote_drop_detached(kn, td);
1678 goto done;
1679 }
1680 knl = kn_list_lock(kn);
1681 goto done_ev_add;
1682 } else {
1683 /* No matching knote and the EV_ADD flag is not set. */
1684 KQ_UNLOCK(kq);
1685 error = ENOENT;
1686 goto done;
1687 }
1688 }
1689
1690 if (kev->flags & EV_DELETE) {
1691 kn_enter_flux(kn);
1692 KQ_UNLOCK(kq);
1693 knote_drop(kn, td);
1694 goto done;
1695 }
1696
1697 if (kev->flags & EV_FORCEONESHOT) {
1698 kn->kn_flags |= EV_ONESHOT;
1699 KNOTE_ACTIVATE(kn, 1);
1700 }
1701
1702 if ((kev->flags & EV_ENABLE) != 0)
1703 kn->kn_status &= ~KN_DISABLED;
1704 else if ((kev->flags & EV_DISABLE) != 0)
1705 kn->kn_status |= KN_DISABLED;
1706
1707 /*
1708 * The user may change some filter values after the initial EV_ADD,
1709 * but doing so will not reset any filter which has already been
1710 * triggered.
1711 */
1712 kn->kn_status |= KN_SCAN;
1713 kn_enter_flux(kn);
1714 KQ_UNLOCK(kq);
1715 knl = kn_list_lock(kn);
1716 if ((kev->flags & EV_KEEPUDATA) == 0)
1717 kn->kn_kevent.udata = kev->udata;
1718 if (!fops->f_isfd && fops->f_touch != NULL) {
1719 fops->f_touch(kn, kev, EVENT_REGISTER);
1720 } else {
1721 kn->kn_sfflags = kev->fflags;
1722 kn->kn_sdata = kev->data;
1723 }
1724
1725 done_ev_add:
1726 /*
1727 * We can get here with kn->kn_knlist == NULL. This can happen when
1728 * the initial attach event decides that the event is "completed"
1729 * already, e.g., filt_procattach() is called on a zombie process. It
1730 * will call filt_proc() which will remove it from the list, and NULL
1731 * kn_knlist.
1732 *
1733 * KN_DISABLED will be stable while the knote is in flux, so the
1734 * unlocked read will not race with an update.
1735 */
1736 if ((kn->kn_status & KN_DISABLED) == 0)
1737 event = kn->kn_fop->f_event(kn, 0);
1738 else
1739 event = 0;
1740
1741 KQ_LOCK(kq);
1742 if (event)
1743 kn->kn_status |= KN_ACTIVE;
1744 if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1745 KN_ACTIVE)
1746 knote_enqueue(kn);
1747 kn->kn_status &= ~KN_SCAN;
1748 kn_leave_flux(kn);
1749 kn_list_unlock(knl);
1750 KQ_UNLOCK_FLUX(kq);
1751
1752 done:
1753 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1754 if (filedesc_unlock)
1755 FILEDESC_XUNLOCK(td->td_proc->p_fd);
1756 if (fp != NULL)
1757 fdrop(fp, td);
1758 knote_free(tkn);
1759 if (fops != NULL)
1760 kqueue_fo_release(filt);
1761 return (error);
1762 }
1763
1764 static int
1765 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1766 {
1767 int error;
1768 struct kqueue *kq;
1769
1770 error = 0;
1771
1772 kq = fp->f_data;
1773 if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1774 return (EBADF);
1775 *kqp = kq;
1776 KQ_LOCK(kq);
1777 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1778 KQ_UNLOCK(kq);
1779 return (EBADF);
1780 }
1781 kq->kq_refcnt++;
1782 KQ_UNLOCK(kq);
1783
1784 return error;
1785 }
1786
1787 static void
1788 kqueue_release(struct kqueue *kq, int locked)
1789 {
1790 if (locked)
1791 KQ_OWNED(kq);
1792 else
1793 KQ_LOCK(kq);
1794 kq->kq_refcnt--;
1795 if (kq->kq_refcnt == 1)
1796 wakeup(&kq->kq_refcnt);
1797 if (!locked)
1798 KQ_UNLOCK(kq);
1799 }
1800
1801 static void
1802 ast_kqueue(struct thread *td, int tda __unused)
1803 {
1804 taskqueue_quiesce(taskqueue_kqueue_ctx);
1805 }
1806
1807 static void
1808 kqueue_schedtask(struct kqueue *kq)
1809 {
1810 KQ_OWNED(kq);
1811 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1812 ("scheduling kqueue task while draining"));
1813
1814 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1815 taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1816 kq->kq_state |= KQ_TASKSCHED;
1817 ast_sched(curthread, TDA_KQUEUE);
1818 }
1819 }
1820
1821 /*
1822 * Expand the kq to make sure we have storage for fops/ident pair.
1823 *
1824 * Return 0 on success (or no work necessary), return errno on failure.
1825 */
1826 static int
1827 kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident,
1828 int mflag)
1829 {
1830 struct klist *list, *tmp_knhash, *to_free;
1831 u_long tmp_knhashmask;
1832 int error, fd, size;
1833
1834 KQ_NOTOWNED(kq);
1835
1836 error = 0;
1837 to_free = NULL;
1838 if (fops->f_isfd) {
1839 fd = ident;
1840 if (kq->kq_knlistsize <= fd) {
1841 size = kq->kq_knlistsize;
1842 while (size <= fd)
1843 size += KQEXTENT;
1844 list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1845 if (list == NULL)
1846 return ENOMEM;
1847 KQ_LOCK(kq);
1848 if ((kq->kq_state & KQ_CLOSING) != 0) {
1849 to_free = list;
1850 error = EBADF;
1851 } else if (kq->kq_knlistsize > fd) {
1852 to_free = list;
1853 } else {
1854 if (kq->kq_knlist != NULL) {
1855 bcopy(kq->kq_knlist, list,
1856 kq->kq_knlistsize * sizeof(*list));
1857 to_free = kq->kq_knlist;
1858 kq->kq_knlist = NULL;
1859 }
1860 bzero((caddr_t)list +
1861 kq->kq_knlistsize * sizeof(*list),
1862 (size - kq->kq_knlistsize) * sizeof(*list));
1863 kq->kq_knlistsize = size;
1864 kq->kq_knlist = list;
1865 }
1866 KQ_UNLOCK(kq);
1867 }
1868 } else {
1869 if (kq->kq_knhashmask == 0) {
1870 tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1871 &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1872 HASH_WAITOK : HASH_NOWAIT);
1873 if (tmp_knhash == NULL)
1874 return (ENOMEM);
1875 KQ_LOCK(kq);
1876 if ((kq->kq_state & KQ_CLOSING) != 0) {
1877 to_free = tmp_knhash;
1878 error = EBADF;
1879 } else if (kq->kq_knhashmask == 0) {
1880 kq->kq_knhash = tmp_knhash;
1881 kq->kq_knhashmask = tmp_knhashmask;
1882 } else {
1883 to_free = tmp_knhash;
1884 }
1885 KQ_UNLOCK(kq);
1886 }
1887 }
1888 free(to_free, M_KQUEUE);
1889
1890 KQ_NOTOWNED(kq);
1891 return (error);
1892 }
1893
1894 static void
1895 kqueue_task(void *arg, int pending)
1896 {
1897 struct kqueue *kq;
1898 int haskqglobal;
1899
1900 haskqglobal = 0;
1901 kq = arg;
1902
1903 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1904 KQ_LOCK(kq);
1905
1906 KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1907
1908 kq->kq_state &= ~KQ_TASKSCHED;
1909 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1910 wakeup(&kq->kq_state);
1911 }
1912 KQ_UNLOCK(kq);
1913 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1914 }
1915
1916 /*
1917 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1918 * We treat KN_MARKER knotes as if they are in flux.
1919 */
1920 static int
1921 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1922 const struct timespec *tsp, struct kevent *keva, struct thread *td)
1923 {
1924 struct kevent *kevp;
1925 struct knote *kn, *marker;
1926 struct knlist *knl;
1927 sbintime_t asbt, rsbt;
1928 int count, error, haskqglobal, influx, nkev, touch;
1929
1930 count = maxevents;
1931 nkev = 0;
1932 error = 0;
1933 haskqglobal = 0;
1934
1935 if (maxevents == 0)
1936 goto done_nl;
1937 if (maxevents < 0) {
1938 error = EINVAL;
1939 goto done_nl;
1940 }
1941
1942 rsbt = 0;
1943 if (tsp != NULL) {
1944 if (!timespecvalid_interval(tsp)) {
1945 error = EINVAL;
1946 goto done_nl;
1947 }
1948 if (timespecisset(tsp)) {
1949 if (tsp->tv_sec <= INT32_MAX) {
1950 rsbt = tstosbt(*tsp);
1951 if (TIMESEL(&asbt, rsbt))
1952 asbt += tc_tick_sbt;
1953 if (asbt <= SBT_MAX - rsbt)
1954 asbt += rsbt;
1955 else
1956 asbt = 0;
1957 rsbt >>= tc_precexp;
1958 } else
1959 asbt = 0;
1960 } else
1961 asbt = -1;
1962 } else
1963 asbt = 0;
1964 marker = knote_alloc(M_WAITOK);
1965 marker->kn_status = KN_MARKER;
1966 KQ_LOCK(kq);
1967
1968 retry:
1969 kevp = keva;
1970 if (kq->kq_count == 0) {
1971 if (asbt == -1) {
1972 error = EWOULDBLOCK;
1973 } else {
1974 kq->kq_state |= KQ_SLEEP;
1975 error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1976 "kqread", asbt, rsbt, C_ABSOLUTE);
1977 }
1978 if (error == 0)
1979 goto retry;
1980 /* don't restart after signals... */
1981 if (error == ERESTART)
1982 error = EINTR;
1983 else if (error == EWOULDBLOCK)
1984 error = 0;
1985 goto done;
1986 }
1987
1988 TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1989 influx = 0;
1990 while (count) {
1991 KQ_OWNED(kq);
1992 kn = TAILQ_FIRST(&kq->kq_head);
1993
1994 if ((kn->kn_status == KN_MARKER && kn != marker) ||
1995 kn_in_flux(kn)) {
1996 if (influx) {
1997 influx = 0;
1998 KQ_FLUX_WAKEUP(kq);
1999 }
2000 kq->kq_state |= KQ_FLUXWAIT;
2001 error = msleep(kq, &kq->kq_lock, PSOCK,
2002 "kqflxwt", 0);
2003 continue;
2004 }
2005
2006 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2007 if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2008 kn->kn_status &= ~KN_QUEUED;
2009 kq->kq_count--;
2010 continue;
2011 }
2012 if (kn == marker) {
2013 KQ_FLUX_WAKEUP(kq);
2014 if (count == maxevents)
2015 goto retry;
2016 goto done;
2017 }
2018 KASSERT(!kn_in_flux(kn),
2019 ("knote %p is unexpectedly in flux", kn));
2020
2021 if ((kn->kn_flags & EV_DROP) == EV_DROP) {
2022 kn->kn_status &= ~KN_QUEUED;
2023 kn_enter_flux(kn);
2024 kq->kq_count--;
2025 KQ_UNLOCK(kq);
2026 /*
2027 * We don't need to lock the list since we've
2028 * marked it as in flux.
2029 */
2030 knote_drop(kn, td);
2031 KQ_LOCK(kq);
2032 continue;
2033 } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
2034 kn->kn_status &= ~KN_QUEUED;
2035 kn_enter_flux(kn);
2036 kq->kq_count--;
2037 KQ_UNLOCK(kq);
2038 /*
2039 * We don't need to lock the list since we've
2040 * marked the knote as being in flux.
2041 */
2042 *kevp = kn->kn_kevent;
2043 knote_drop(kn, td);
2044 KQ_LOCK(kq);
2045 kn = NULL;
2046 } else {
2047 kn->kn_status |= KN_SCAN;
2048 kn_enter_flux(kn);
2049 KQ_UNLOCK(kq);
2050 if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
2051 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
2052 knl = kn_list_lock(kn);
2053 if (kn->kn_fop->f_event(kn, 0) == 0) {
2054 KQ_LOCK(kq);
2055 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2056 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
2057 KN_SCAN);
2058 kn_leave_flux(kn);
2059 kq->kq_count--;
2060 kn_list_unlock(knl);
2061 influx = 1;
2062 continue;
2063 }
2064 touch = (!kn->kn_fop->f_isfd &&
2065 kn->kn_fop->f_touch != NULL);
2066 if (touch)
2067 kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
2068 else
2069 *kevp = kn->kn_kevent;
2070 KQ_LOCK(kq);
2071 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2072 if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
2073 /*
2074 * Manually clear knotes that weren't
2075 * 'touch'ed.
2076 */
2077 if (touch == 0 && kn->kn_flags & EV_CLEAR) {
2078 kn->kn_data = 0;
2079 kn->kn_fflags = 0;
2080 }
2081 if (kn->kn_flags & EV_DISPATCH)
2082 kn->kn_status |= KN_DISABLED;
2083 kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
2084 kq->kq_count--;
2085 } else
2086 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2087
2088 kn->kn_status &= ~KN_SCAN;
2089 kn_leave_flux(kn);
2090 kn_list_unlock(knl);
2091 influx = 1;
2092 }
2093
2094 /* we are returning a copy to the user */
2095 kevp++;
2096 nkev++;
2097 count--;
2098
2099 if (nkev == KQ_NEVENTS) {
2100 influx = 0;
2101 KQ_UNLOCK_FLUX(kq);
2102 error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2103 nkev = 0;
2104 kevp = keva;
2105 KQ_LOCK(kq);
2106 if (error)
2107 break;
2108 }
2109 }
2110 TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2111 done:
2112 KQ_OWNED(kq);
2113 KQ_UNLOCK_FLUX(kq);
2114 knote_free(marker);
2115 done_nl:
2116 KQ_NOTOWNED(kq);
2117 if (nkev != 0)
2118 error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2119 td->td_retval[0] = maxevents - count;
2120 return (error);
2121 }
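
/*
 * For reference, the scan above is what ultimately services a userland
 * kevent(2) call.  An illustrative sketch ("fd" and the one-second
 * timeout are assumptions, not taken from this file):
 *
 *	struct kevent ch, ev;
 *	struct timespec ts = { 1, 0 };
 *	int kq, n;
 *
 *	kq = kqueue();
 *	EV_SET(&ch, fd, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	n = kevent(kq, &ch, 1, &ev, 1, &ts);
 *
 * A NULL timeout blocks indefinitely and a zeroed timespec only polls;
 * these correspond to the asbt == 0 and asbt == -1 cases computed at the
 * top of kqueue_scan().  EV_CLEAR is handled by the manual reset of
 * kn_data/kn_fflags in the loop above.
 */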
2122
2123 /*ARGSUSED*/
2124 static int
2125 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
2126 struct ucred *active_cred, struct thread *td)
2127 {
2128 /*
2129 * Enabling sigio causes two major problems:
2130 * 1) infinite recursion:
2131 * Synopsis: kevent is being used to track signals and has FIOASYNC
2132 * set. On receipt of a signal this will cause a kqueue to recurse
2133 * into itself over and over. Sending the sigio causes the kqueue
2134 * to become ready, which in turn posts sigio again, forever.
2135 * Solution: set a flag in the kqueue noting that a SIGIO delivery is
2136 * in progress.
2137 * 2) locking problems:
2138 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
2139 * us above the proc and pgrp locks.
2140 * Solution: Post a signal using an async mechanism, being sure to
2141 * record a generation count in the delivery so that we do not deliver
2142 * a signal to the wrong process.
2143 *
2144 * Note, these two mechanisms are somewhat mutually exclusive!
2145 */
2146 #if 0
2147 struct kqueue *kq;
2148
2149 kq = fp->f_data;
2150 switch (cmd) {
2151 case FIOASYNC:
2152 if (*(int *)data) {
2153 kq->kq_state |= KQ_ASYNC;
2154 } else {
2155 kq->kq_state &= ~KQ_ASYNC;
2156 }
2157 return (0);
2158
2159 case FIOSETOWN:
2160 return (fsetown(*(int *)data, &kq->kq_sigio));
2161
2162 case FIOGETOWN:
2163 *(int *)data = fgetown(&kq->kq_sigio);
2164 return (0);
2165 }
2166 #endif
2167
2168 return (ENOTTY);
2169 }
2170
2171 /*ARGSUSED*/
2172 static int
2173 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
2174 struct thread *td)
2175 {
2176 struct kqueue *kq;
2177 int revents = 0;
2178 int error;
2179
2180 if ((error = kqueue_acquire(fp, &kq)))
2181 return POLLERR;
2182
2183 KQ_LOCK(kq);
2184 if (events & (POLLIN | POLLRDNORM)) {
2185 if (kq->kq_count) {
2186 revents |= events & (POLLIN | POLLRDNORM);
2187 } else {
2188 selrecord(td, &kq->kq_sel);
2189 if (SEL_WAITING(&kq->kq_sel))
2190 kq->kq_state |= KQ_SEL;
2191 }
2192 }
2193 kqueue_release(kq, 1);
2194 KQ_UNLOCK(kq);
2195 return (revents);
2196 }
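
/*
 * Because of the poll method above, a kqueue descriptor can itself be
 * passed to poll(2) or select(2).  A minimal userland sketch ("kq" is
 * assumed to be an existing kqueue descriptor):
 *
 *	struct pollfd pfd = { .fd = kq, .events = POLLIN };
 *	int n = poll(&pfd, 1, 1000);
 *
 * A positive return with POLLIN set means kq_count was non-zero at the
 * time of the check, so a kevent(2) call with a zero timeout can then be
 * used to collect the pending events.
 */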
2197
2198 /*ARGSUSED*/
2199 static int
2200 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
2201 {
2202
2203 bzero((void *)st, sizeof *st);
2204 /*
2205 * We no longer return kq_count because the unlocked value is useless.
2206 * If you spent all this time getting the count, why not spend your
2207 * syscall better by calling kevent?
2208 *
2209 * XXX - This is needed for libc_r.
2210 */
2211 st->st_mode = S_IFIFO;
2212 return (0);
2213 }
2214
2215 static void
2216 kqueue_drain(struct kqueue *kq, struct thread *td)
2217 {
2218 struct knote *kn;
2219 int i;
2220
2221 KQ_LOCK(kq);
2222
2223 KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
2224 ("kqueue already closing"));
2225 kq->kq_state |= KQ_CLOSING;
2226 if (kq->kq_refcnt > 1)
2227 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
2228
2229 KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
2230
2231 KASSERT(knlist_empty(&kq->kq_sel.si_note),
2232 ("kqueue's knlist not empty"));
2233
2234 for (i = 0; i < kq->kq_knlistsize; i++) {
2235 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
2236 if (kn_in_flux(kn)) {
2237 kq->kq_state |= KQ_FLUXWAIT;
2238 msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2239 continue;
2240 }
2241 kn_enter_flux(kn);
2242 KQ_UNLOCK(kq);
2243 knote_drop(kn, td);
2244 KQ_LOCK(kq);
2245 }
2246 }
2247 if (kq->kq_knhashmask != 0) {
2248 for (i = 0; i <= kq->kq_knhashmask; i++) {
2249 while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2250 if (kn_in_flux(kn)) {
2251 kq->kq_state |= KQ_FLUXWAIT;
2252 msleep(kq, &kq->kq_lock, PSOCK,
2253 "kqclo2", 0);
2254 continue;
2255 }
2256 kn_enter_flux(kn);
2257 KQ_UNLOCK(kq);
2258 knote_drop(kn, td);
2259 KQ_LOCK(kq);
2260 }
2261 }
2262 }
2263
2264 if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2265 kq->kq_state |= KQ_TASKDRAIN;
2266 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2267 }
2268
2269 if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2270 selwakeuppri(&kq->kq_sel, PSOCK);
2271 if (!SEL_WAITING(&kq->kq_sel))
2272 kq->kq_state &= ~KQ_SEL;
2273 }
2274
2275 KQ_UNLOCK(kq);
2276 }
2277
2278 static void
2279 kqueue_destroy(struct kqueue *kq)
2280 {
2281
2282 KASSERT(kq->kq_fdp == NULL,
2283 ("kqueue still attached to a file descriptor"));
2284 seldrain(&kq->kq_sel);
2285 knlist_destroy(&kq->kq_sel.si_note);
2286 mtx_destroy(&kq->kq_lock);
2287
2288 if (kq->kq_knhash != NULL)
2289 free(kq->kq_knhash, M_KQUEUE);
2290 if (kq->kq_knlist != NULL)
2291 free(kq->kq_knlist, M_KQUEUE);
2292
2293 funsetown(&kq->kq_sigio);
2294 }
2295
2296 /*ARGSUSED*/
2297 static int
2298 kqueue_close(struct file *fp, struct thread *td)
2299 {
2300 struct kqueue *kq = fp->f_data;
2301 struct filedesc *fdp;
2302 int error;
2303 int filedesc_unlock;
2304
2305 if ((error = kqueue_acquire(fp, &kq)))
2306 return error;
2307 kqueue_drain(kq, td);
2308
2309 /*
2310 * We could be called due to the knote_drop() doing fdrop(),
2311 * called from kqueue_register(). In this case the global
2312 * lock is owned, and the filedesc sx is locked beforehand, so that we
2313 * do not take the sleepable sx lock after the non-sleepable kq lock.
2314 */
2315 fdp = kq->kq_fdp;
2316 kq->kq_fdp = NULL;
2317 if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2318 FILEDESC_XLOCK(fdp);
2319 filedesc_unlock = 1;
2320 } else
2321 filedesc_unlock = 0;
2322 TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2323 if (filedesc_unlock)
2324 FILEDESC_XUNLOCK(fdp);
2325
2326 kqueue_destroy(kq);
2327 chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2328 crfree(kq->kq_cred);
2329 free(kq, M_KQUEUE);
2330 fp->f_data = NULL;
2331
2332 return (0);
2333 }
2334
2335 static int
2336 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2337 {
2338 struct kqueue *kq = fp->f_data;
2339
2340 kif->kf_type = KF_TYPE_KQUEUE;
2341 kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq;
2342 kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count;
2343 kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state;
2344 return (0);
2345 }
2346
2347 static void
2348 kqueue_wakeup(struct kqueue *kq)
2349 {
2350 KQ_OWNED(kq);
2351
2352 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2353 kq->kq_state &= ~KQ_SLEEP;
2354 wakeup(kq);
2355 }
2356 if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2357 selwakeuppri(&kq->kq_sel, PSOCK);
2358 if (!SEL_WAITING(&kq->kq_sel))
2359 kq->kq_state &= ~KQ_SEL;
2360 }
2361 if (!knlist_empty(&kq->kq_sel.si_note))
2362 kqueue_schedtask(kq);
2363 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2364 pgsigio(&kq->kq_sigio, SIGIO, 0);
2365 }
2366 }
2367
2368 /*
2369 * Walk down a list of knotes, activating them if their event has triggered.
2370 *
2371 * There is a possibility to optimize in the case of one kq watching another.
2372 * Instead of scheduling a task to wake it up, you could pass enough state
2373 * down the chain to make up the parent kqueue. Make this code functional
2374 * first.
2375 */
2376 void
2377 knote(struct knlist *list, long hint, int lockflags)
2378 {
2379 struct kqueue *kq;
2380 struct knote *kn, *tkn;
2381 int error;
2382
2383 if (list == NULL)
2384 return;
2385
2386 KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2387
2388 if ((lockflags & KNF_LISTLOCKED) == 0)
2389 list->kl_lock(list->kl_lockarg);
2390
2391 /*
2392 * If we unlock the list lock (and enter influx), we can
2393 * eliminate the kqueue scheduling, but this will introduce
2394 * four lock/unlock operations for each knote to test. Also, a marker
2395 * would be needed to keep the iteration position, since filters
2396 * or other threads could remove events.
2397 */
2398 SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2399 kq = kn->kn_kq;
2400 KQ_LOCK(kq);
2401 if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2402 /*
2403 * Do not process the influx notes, except for
2404 * the influx coming from the kq unlock in the
2405 * kqueue_scan(). In the latter case, we do
2406 * not interfere with the scan, since the code
2407 * fragment in kqueue_scan() locks the knlist,
2408 * and cannot proceed until we are finished.
2409 */
2410 KQ_UNLOCK(kq);
2411 } else if ((lockflags & KNF_NOKQLOCK) != 0) {
2412 kn_enter_flux(kn);
2413 KQ_UNLOCK(kq);
2414 error = kn->kn_fop->f_event(kn, hint);
2415 KQ_LOCK(kq);
2416 kn_leave_flux(kn);
2417 if (error)
2418 KNOTE_ACTIVATE(kn, 1);
2419 KQ_UNLOCK_FLUX(kq);
2420 } else {
2421 if (kn->kn_fop->f_event(kn, hint))
2422 KNOTE_ACTIVATE(kn, 1);
2423 KQ_UNLOCK(kq);
2424 }
2425 }
2426 if ((lockflags & KNF_LISTLOCKED) == 0)
2427 list->kl_unlock(list->kl_lockarg);
2428 }
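
/*
 * A typical producer reaches knote() through the KNOTE_LOCKED() or
 * KNOTE_UNLOCKED() wrappers once new data is available.  Hypothetical
 * driver sketch (the softc, its mutex and its selinfo are assumptions):
 *
 *	mtx_lock(&sc->sc_mtx);
 *	sc->sc_bytes += len;
 *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
 *	mtx_unlock(&sc->sc_mtx);
 *
 * KNOTE_LOCKED() passes KNF_LISTLOCKED because sc_mtx doubles as the
 * knlist lock (see knlist_init_mtx() below), so the list is not
 * re-locked here.
 */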
2429
2430 /*
2431 * add a knote to a knlist
2432 */
2433 void
2434 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2435 {
2436
2437 KNL_ASSERT_LOCK(knl, islocked);
2438 KQ_NOTOWNED(kn->kn_kq);
2439 KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2440 KASSERT((kn->kn_status & KN_DETACHED) != 0,
2441 ("knote %p was not detached", kn));
2442 if (!islocked)
2443 knl->kl_lock(knl->kl_lockarg);
2444 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2445 if (!islocked)
2446 knl->kl_unlock(knl->kl_lockarg);
2447 KQ_LOCK(kn->kn_kq);
2448 kn->kn_knlist = knl;
2449 kn->kn_status &= ~KN_DETACHED;
2450 KQ_UNLOCK(kn->kn_kq);
2451 }
2452
2453 static void
2454 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2455 int kqislocked)
2456 {
2457
2458 KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2459 KNL_ASSERT_LOCK(knl, knlislocked);
2460 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2461 KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2462 KASSERT((kn->kn_status & KN_DETACHED) == 0,
2463 ("knote %p was already detached", kn));
2464 if (!knlislocked)
2465 knl->kl_lock(knl->kl_lockarg);
2466 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2467 kn->kn_knlist = NULL;
2468 if (!knlislocked)
2469 kn_list_unlock(knl);
2470 if (!kqislocked)
2471 KQ_LOCK(kn->kn_kq);
2472 kn->kn_status |= KN_DETACHED;
2473 if (!kqislocked)
2474 KQ_UNLOCK(kn->kn_kq);
2475 }
2476
2477 /*
2478 * remove knote from the specified knlist
2479 */
2480 void
2481 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
2482 {
2483
2484 knlist_remove_kq(knl, kn, islocked, 0);
2485 }
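
/*
 * knlist_add() and knlist_remove() are normally called from a filter's
 * attach and detach paths.  Hypothetical cdev sketch (foo_softc,
 * foo_read_filtops and friends are assumptions, not part of this file):
 *
 *	static int
 *	foo_kqfilter(struct cdev *dev, struct knote *kn)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *
 *		kn->kn_hook = sc;
 *		kn->kn_fop = &foo_read_filtops;
 *		knlist_add(&sc->sc_rsel.si_note, kn, 0);
 *		return (0);
 *	}
 *
 *	static void
 *	filt_foordetach(struct knote *kn)
 *	{
 *		struct foo_softc *sc = kn->kn_hook;
 *
 *		knlist_remove(&sc->sc_rsel.si_note, kn, 0);
 *	}
 */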
2486
2487 int
2488 knlist_empty(struct knlist *knl)
2489 {
2490
2491 KNL_ASSERT_LOCKED(knl);
2492 return (SLIST_EMPTY(&knl->kl_list));
2493 }
2494
2495 static struct mtx knlist_lock;
2496 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2497 MTX_DEF);
2498 static void knlist_mtx_lock(void *arg);
2499 static void knlist_mtx_unlock(void *arg);
2500
2501 static void
2502 knlist_mtx_lock(void *arg)
2503 {
2504
2505 mtx_lock((struct mtx *)arg);
2506 }
2507
2508 static void
2509 knlist_mtx_unlock(void *arg)
2510 {
2511
2512 mtx_unlock((struct mtx *)arg);
2513 }
2514
2515 static void
2516 knlist_mtx_assert_lock(void *arg, int what)
2517 {
2518
2519 if (what == LA_LOCKED)
2520 mtx_assert((struct mtx *)arg, MA_OWNED);
2521 else
2522 mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2523 }
2524
2525 void
2526 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2527 void (*kl_unlock)(void *),
2528 void (*kl_assert_lock)(void *, int))
2529 {
2530
2531 if (lock == NULL)
2532 knl->kl_lockarg = &knlist_lock;
2533 else
2534 knl->kl_lockarg = lock;
2535
2536 if (kl_lock == NULL)
2537 knl->kl_lock = knlist_mtx_lock;
2538 else
2539 knl->kl_lock = kl_lock;
2540 if (kl_unlock == NULL)
2541 knl->kl_unlock = knlist_mtx_unlock;
2542 else
2543 knl->kl_unlock = kl_unlock;
2544 if (kl_assert_lock == NULL)
2545 knl->kl_assert_lock = knlist_mtx_assert_lock;
2546 else
2547 knl->kl_assert_lock = kl_assert_lock;
2548
2549 knl->kl_autodestroy = 0;
2550 SLIST_INIT(&knl->kl_list);
2551 }
2552
2553 void
2554 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2555 {
2556
2557 knlist_init(knl, lock, NULL, NULL, NULL);
2558 }
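
/*
 * Most consumers tie the knlist to the mutex that already protects the
 * object being watched.  Hypothetical attach-time sketch:
 *
 *	mtx_init(&sc->sc_mtx, "foo", NULL, MTX_DEF);
 *	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
 *
 * With this setup the producer may use KNOTE_LOCKED() while holding
 * sc_mtx, as in the sketch after knote() above.
 */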
2559
2560 struct knlist *
2561 knlist_alloc(struct mtx *lock)
2562 {
2563 struct knlist *knl;
2564
2565 knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2566 knlist_init_mtx(knl, lock);
2567 return (knl);
2568 }
2569
2570 void
2571 knlist_destroy(struct knlist *knl)
2572 {
2573
2574 KASSERT(KNLIST_EMPTY(knl),
2575 ("destroying knlist %p with knotes on it", knl));
2576 }
2577
2578 void
2579 knlist_detach(struct knlist *knl)
2580 {
2581
2582 KNL_ASSERT_LOCKED(knl);
2583 knl->kl_autodestroy = 1;
2584 if (knlist_empty(knl)) {
2585 knlist_destroy(knl);
2586 free(knl, M_KQUEUE);
2587 }
2588 }
2589
2590 /*
2591 * Even if we are locked, we may need to drop the lock to allow any influx
2592 * knotes time to "settle".
2593 */
2594 void
2595 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2596 {
2597 struct knote *kn, *kn2;
2598 struct kqueue *kq;
2599
2600 KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2601 if (islocked)
2602 KNL_ASSERT_LOCKED(knl);
2603 else {
2604 KNL_ASSERT_UNLOCKED(knl);
2605 again: /* need to reacquire lock since we have dropped it */
2606 knl->kl_lock(knl->kl_lockarg);
2607 }
2608
2609 SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2610 kq = kn->kn_kq;
2611 KQ_LOCK(kq);
2612 if (kn_in_flux(kn)) {
2613 KQ_UNLOCK(kq);
2614 continue;
2615 }
2616 knlist_remove_kq(knl, kn, 1, 1);
2617 if (killkn) {
2618 kn_enter_flux(kn);
2619 KQ_UNLOCK(kq);
2620 knote_drop_detached(kn, td);
2621 } else {
2622 /* Make sure cleared knotes disappear soon */
2623 kn->kn_flags |= EV_EOF | EV_ONESHOT;
2624 KQ_UNLOCK(kq);
2625 }
2626 kq = NULL;
2627 }
2628
2629 if (!SLIST_EMPTY(&knl->kl_list)) {
2630 /* there are still in flux knotes remaining */
2631 kn = SLIST_FIRST(&knl->kl_list);
2632 kq = kn->kn_kq;
2633 KQ_LOCK(kq);
2634 KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2635 knl->kl_unlock(knl->kl_lockarg);
2636 kq->kq_state |= KQ_FLUXWAIT;
2637 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2638 kq = NULL;
2639 goto again;
2640 }
2641
2642 if (islocked)
2643 KNL_ASSERT_LOCKED(knl);
2644 else {
2645 knl->kl_unlock(knl->kl_lockarg);
2646 KNL_ASSERT_UNLOCKED(knl);
2647 }
2648 }
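
/*
 * Teardown typically goes through the knlist_clear()/knlist_delete()
 * wrappers around knlist_cleardel().  Hypothetical detach-time sketch,
 * pairing with the attach sketch after knlist_init_mtx() above:
 *
 *	knlist_clear(&sc->sc_rsel.si_note, 0);
 *	seldrain(&sc->sc_rsel);
 *	knlist_destroy(&sc->sc_rsel.si_note);
 *	mtx_destroy(&sc->sc_mtx);
 *
 * knlist_clear() leaves surviving knotes flagged EV_EOF | EV_ONESHOT as
 * above, so any consumer still registered sees the object go away on its
 * next scan.
 */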
2649
2650 /*
2651 * Remove all knotes referencing a specified fd.  This must be called with
2652 * the FILEDESC lock held, which prevents a race where a new fd comes along
2653 * and occupies the entry just as we attach a knote to the fd.
2654 */
2655 void
2656 knote_fdclose(struct thread *td, int fd)
2657 {
2658 struct filedesc *fdp = td->td_proc->p_fd;
2659 struct kqueue *kq;
2660 struct knote *kn;
2661 int influx;
2662
2663 FILEDESC_XLOCK_ASSERT(fdp);
2664
2665 /*
2666 * We shouldn't have to worry about new kevents appearing on fd
2667 * since filedesc is locked.
2668 */
2669 TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2670 KQ_LOCK(kq);
2671
2672 again:
2673 influx = 0;
2674 while (kq->kq_knlistsize > fd &&
2675 (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2676 if (kn_in_flux(kn)) {
2677 /* someone else might be waiting on our knote */
2678 if (influx)
2679 wakeup(kq);
2680 kq->kq_state |= KQ_FLUXWAIT;
2681 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2682 goto again;
2683 }
2684 kn_enter_flux(kn);
2685 KQ_UNLOCK(kq);
2686 influx = 1;
2687 knote_drop(kn, td);
2688 KQ_LOCK(kq);
2689 }
2690 KQ_UNLOCK_FLUX(kq);
2691 }
2692 }
2693
2694 static int
2695 knote_attach(struct knote *kn, struct kqueue *kq)
2696 {
2697 struct klist *list;
2698
2699 KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2700 KQ_OWNED(kq);
2701
2702 if ((kq->kq_state & KQ_CLOSING) != 0)
2703 return (EBADF);
2704 if (kn->kn_fop->f_isfd) {
2705 if (kn->kn_id >= kq->kq_knlistsize)
2706 return (ENOMEM);
2707 list = &kq->kq_knlist[kn->kn_id];
2708 } else {
2709 if (kq->kq_knhash == NULL)
2710 return (ENOMEM);
2711 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2712 }
2713 SLIST_INSERT_HEAD(list, kn, kn_link);
2714 return (0);
2715 }
2716
2717 static void
2718 knote_drop(struct knote *kn, struct thread *td)
2719 {
2720
2721 if ((kn->kn_status & KN_DETACHED) == 0)
2722 kn->kn_fop->f_detach(kn);
2723 knote_drop_detached(kn, td);
2724 }
2725
2726 static void
2727 knote_drop_detached(struct knote *kn, struct thread *td)
2728 {
2729 struct kqueue *kq;
2730 struct klist *list;
2731
2732 kq = kn->kn_kq;
2733
2734 KASSERT((kn->kn_status & KN_DETACHED) != 0,
2735 ("knote %p still attached", kn));
2736 KQ_NOTOWNED(kq);
2737
2738 KQ_LOCK(kq);
2739 for (;;) {
2740 KASSERT(kn->kn_influx >= 1,
2741 ("knote_drop called on %p with influx %d",
2742 kn, kn->kn_influx));
2743 if (kn->kn_influx == 1)
2744 break;
2745 kq->kq_state |= KQ_FLUXWAIT;
2746 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2747 }
2748
2749 if (kn->kn_fop->f_isfd)
2750 list = &kq->kq_knlist[kn->kn_id];
2751 else
2752 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2753
2754 if (!SLIST_EMPTY(list))
2755 SLIST_REMOVE(list, kn, knote, kn_link);
2756 if (kn->kn_status & KN_QUEUED)
2757 knote_dequeue(kn);
2758 KQ_UNLOCK_FLUX(kq);
2759
2760 if (kn->kn_fop->f_isfd) {
2761 fdrop(kn->kn_fp, td);
2762 kn->kn_fp = NULL;
2763 }
2764 kqueue_fo_release(kn->kn_kevent.filter);
2765 kn->kn_fop = NULL;
2766 knote_free(kn);
2767 }
2768
2769 static void
2770 knote_enqueue(struct knote *kn)
2771 {
2772 struct kqueue *kq = kn->kn_kq;
2773
2774 KQ_OWNED(kn->kn_kq);
2775 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2776
2777 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2778 kn->kn_status |= KN_QUEUED;
2779 kq->kq_count++;
2780 kqueue_wakeup(kq);
2781 }
2782
2783 static void
2784 knote_dequeue(struct knote *kn)
2785 {
2786 struct kqueue *kq = kn->kn_kq;
2787
2788 KQ_OWNED(kn->kn_kq);
2789 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2790
2791 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2792 kn->kn_status &= ~KN_QUEUED;
2793 kq->kq_count--;
2794 }
2795
2796 static void
2797 knote_init(void)
2798 {
2799
2800 knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2801 NULL, NULL, UMA_ALIGN_PTR, 0);
2802 ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
2803 }
2804 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2805
2806 static struct knote *
2807 knote_alloc(int mflag)
2808 {
2809
2810 return (uma_zalloc(knote_zone, mflag | M_ZERO));
2811 }
2812
2813 static void
2814 knote_free(struct knote *kn)
2815 {
2816
2817 uma_zfree(knote_zone, kn);
2818 }
2819
2820 /*
2821 * Register the kev w/ the kq specified by fd.
2822 */
2823 int
2824 kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
2825 {
2826 struct kqueue *kq;
2827 struct file *fp;
2828 cap_rights_t rights;
2829 int error;
2830
2831 error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
2832 &fp);
2833 if (error != 0)
2834 return (error);
2835 if ((error = kqueue_acquire(fp, &kq)) != 0)
2836 goto noacquire;
2837
2838 error = kqueue_register(kq, kev, td, mflag);
2839 kqueue_release(kq, 0);
2840
2841 noacquire:
2842 fdrop(fp, td);
2843 return (error);
2844 }
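
/*
 * In-kernel producers (the AIO code, for instance) use this to attach an
 * event to a kqueue that userland identified only by file descriptor.
 * Hedged sketch; "ident", "udata" and the exact flag combination are
 * placeholders rather than what any particular caller passes:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, ident, EVFILT_AIO, EV_ADD | EV_ENABLE | EV_FLAG1,
 *	    0, 0, udata);
 *	error = kqfd_register(kqfd, &kev, td, M_WAITOK);
 */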
2845
2846 struct knote_status_export_bit {
2847 int kn_status_bit;
2848 int knt_status_bit;
2849 };
2850
2851 #define ST(name) \
2852 { .kn_status_bit = KN_##name, .knt_status_bit = KNOTE_STATUS_##name }
2853 static const struct knote_status_export_bit knote_status_export_bits[] = {
2854 ST(ACTIVE),
2855 ST(QUEUED),
2856 ST(DISABLED),
2857 ST(DETACHED),
2858 ST(KQUEUE),
2859 };
2860 #undef ST
2861
2862 static int
2863 knote_status_export(int kn_status)
2864 {
2865 const struct knote_status_export_bit *b;
2866 unsigned i;
2867 int res;
2868
2869 res = 0;
2870 for (i = 0; i < nitems(knote_status_export_bits); i++) {
2871 b = &knote_status_export_bits[i];
2872 if ((kn_status & b->kn_status_bit) != 0)
2873 res |= b->knt_status_bit;
2874 }
2875 return (res);
2876 }
2877
2878 static int
2879 kern_proc_kqueue_report_one(struct sbuf *s, struct proc *p,
2880 int kq_fd, struct kqueue *kq, struct knote *kn, bool compat32 __unused)
2881 {
2882 struct kinfo_knote kin;
2883 #ifdef COMPAT_FREEBSD32
2884 struct kinfo_knote32 kin32;
2885 #endif
2886 int error;
2887
2888 if (kn->kn_status == KN_MARKER)
2889 return (0);
2890
2891 memset(&kin, 0, sizeof(kin));
2892 kin.knt_kq_fd = kq_fd;
2893 memcpy(&kin.knt_event, &kn->kn_kevent, sizeof(struct kevent));
2894 kin.knt_status = knote_status_export(kn->kn_status);
2895 kn_enter_flux(kn);
2896 KQ_UNLOCK_FLUX(kq);
2897 if (kn->kn_fop->f_userdump != NULL)
2898 (void)kn->kn_fop->f_userdump(p, kn, &kin);
2899 #ifdef COMPAT_FREEBSD32
2900 if (compat32) {
2901 freebsd32_kinfo_knote_to_32(&kin, &kin32);
2902 error = sbuf_bcat(s, &kin32, sizeof(kin32));
2903 } else
2904 #endif
2905 error = sbuf_bcat(s, &kin, sizeof(kin));
2906 KQ_LOCK(kq);
2907 kn_leave_flux(kn);
2908 return (error);
2909 }
2910
2911 static int
2912 kern_proc_kqueue_report(struct sbuf *s, struct proc *p, int kq_fd,
2913 struct kqueue *kq, bool compat32)
2914 {
2915 struct knote *kn;
2916 int error, i;
2917
2918 error = 0;
2919 KQ_LOCK(kq);
2920 for (i = 0; i < kq->kq_knlistsize; i++) {
2921 SLIST_FOREACH(kn, &kq->kq_knlist[i], kn_link) {
2922 error = kern_proc_kqueue_report_one(s, p, kq_fd,
2923 kq, kn, compat32);
2924 if (error != 0)
2925 goto out;
2926 }
2927 }
2928 if (kq->kq_knhashmask == 0)
2929 goto out;
2930 for (i = 0; i <= kq->kq_knhashmask; i++) {
2931 SLIST_FOREACH(kn, &kq->kq_knhash[i], kn_link) {
2932 error = kern_proc_kqueue_report_one(s, p, kq_fd,
2933 kq, kn, compat32);
2934 if (error != 0)
2935 goto out;
2936 }
2937 }
2938 out:
2939 KQ_UNLOCK_FLUX(kq);
2940 return (error);
2941 }
2942
2943 struct kern_proc_kqueues_out1_cb_args {
2944 struct sbuf *s;
2945 bool compat32;
2946 };
2947
2948 static int
2949 kern_proc_kqueues_out1_cb(struct proc *p, int fd, struct file *fp, void *arg)
2950 {
2951 struct kqueue *kq;
2952 struct kern_proc_kqueues_out1_cb_args *a;
2953
2954 if (fp->f_type != DTYPE_KQUEUE)
2955 return (0);
2956 a = arg;
2957 kq = fp->f_data;
2958 return (kern_proc_kqueue_report(a->s, p, fd, kq, a->compat32));
2959 }
2960
2961 static int
2962 kern_proc_kqueues_out1(struct thread *td, struct proc *p, struct sbuf *s,
2963 bool compat32)
2964 {
2965 struct kern_proc_kqueues_out1_cb_args a;
2966
2967 a.s = s;
2968 a.compat32 = compat32;
2969 return (fget_remote_foreach(td, p, kern_proc_kqueues_out1_cb, &a));
2970 }
2971
2972 int
2973 kern_proc_kqueues_out(struct proc *p, struct sbuf *sb, size_t maxlen,
2974 bool compat32)
2975 {
2976 struct sbuf *s, sm;
2977 size_t sb_len;
2978 int error;
2979
2980 if (maxlen == -1 || maxlen == 0)
2981 sb_len = 128;
2982 else
2983 sb_len = maxlen;
2984 s = sbuf_new(&sm, NULL, sb_len, maxlen == -1 ? SBUF_AUTOEXTEND :
2985 SBUF_FIXEDLEN);
2986 error = kern_proc_kqueues_out1(curthread, p, s, compat32);
2987 sbuf_finish(s);
2988 if (error == 0) {
2989 sbuf_bcat(sb, sbuf_data(s), MIN(sbuf_len(s), maxlen == -1 ?
2990 SIZE_T_MAX : maxlen));
2991 }
2992 sbuf_delete(s);
2993 return (error);
2994 }
2995
2996 static int
2997 sysctl_kern_proc_kqueue_one(struct thread *td, struct sbuf *s, struct proc *p,
2998 int kq_fd, bool compat32)
2999 {
3000 struct file *fp;
3001 struct kqueue *kq;
3002 int error;
3003
3004 error = fget_remote(td, p, kq_fd, &fp);
3005 if (error == 0) {
3006 if (fp->f_type != DTYPE_KQUEUE) {
3007 error = EINVAL;
3008 } else {
3009 kq = fp->f_data;
3010 error = kern_proc_kqueue_report(s, p, kq_fd, kq,
3011 compat32);
3012 }
3013 fdrop(fp, td);
3014 }
3015 return (error);
3016 }
3017
3018 static int
3019 sysctl_kern_proc_kqueue(SYSCTL_HANDLER_ARGS)
3020 {
3021 struct thread *td;
3022 struct proc *p;
3023 struct sbuf *s, sm;
3024 int error, error1, *name;
3025 bool compat32;
3026
3027 name = (int *)arg1;
3028 if ((u_int)arg2 > 2 || (u_int)arg2 == 0)
3029 return (EINVAL);
3030
3031 error = pget((pid_t)name[0], PGET_HOLD | PGET_CANDEBUG, &p);
3032 if (error != 0)
3033 return (error);
3034
3035 td = curthread;
3036 #ifdef COMPAT_FREEBSD32
3037 compat32 = SV_CURPROC_FLAG(SV_ILP32);
3038 #else
3039 compat32 = false;
3040 #endif
3041
3042 s = sbuf_new_for_sysctl(&sm, NULL, 0, req);
3043 if (s == NULL) {
3044 error = ENOMEM;
3045 goto out;
3046 }
3047 sbuf_clear_flags(s, SBUF_INCLUDENUL);
3048
3049 if ((u_int)arg2 == 1) {
3050 error = kern_proc_kqueues_out1(td, p, s, compat32);
3051 } else {
3052 error = sysctl_kern_proc_kqueue_one(td, s, p,
3053 name[1] /* kq_fd */, compat32);
3054 }
3055
3056 error1 = sbuf_finish(s);
3057 if (error == 0)
3058 error = error1;
3059 sbuf_delete(s);
3060
3061 out:
3062 PRELE(p);
3063 return (error);
3064 }
3065
3066 static SYSCTL_NODE(_kern_proc, KERN_PROC_KQUEUE, kq,
3067 CTLFLAG_RD | CTLFLAG_MPSAFE,
3068 sysctl_kern_proc_kqueue, "KQueue events");
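
/*
 * Userland can inspect a process's registered knotes through this node.
 * Hedged sketch; the MIB layout below is inferred from the handler above
 * (pid plus an optional kqueue fd) and the reply is assumed to be an
 * array of struct kinfo_knote:
 *
 *	int mib[5] = { CTL_KERN, KERN_PROC, KERN_PROC_KQUEUE, pid, kq_fd };
 *	size_t len = 0;
 *	struct kinfo_knote *kni;
 *
 *	sysctl(mib, nitems(mib), NULL, &len, NULL, 0);
 *	kni = malloc(len);
 *	sysctl(mib, nitems(mib), kni, &len, NULL, 0);
 *
 * The buffer then holds len / sizeof(struct kinfo_knote) entries; drop
 * the trailing fd element (and pass four ints) to dump every kqueue in
 * the process, matching the arg2 == 1 case above.
 */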
3069