xref: /freebsd/sys/compat/linux/linux_event.c (revision f6313575401b3e97469df997e8b9d1a18fb485d0)
1 /*-
2  * Copyright (c) 2007 Roman Divacky
3  * Copyright (c) 2014 Dmitry Chagin
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/imgact.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/callout.h>
41 #include <sys/capsicum.h>
42 #include <sys/types.h>
43 #include <sys/user.h>
44 #include <sys/file.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/errno.h>
48 #include <sys/event.h>
49 #include <sys/poll.h>
50 #include <sys/proc.h>
51 #include <sys/selinfo.h>
52 #include <sys/sx.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/timespec.h>
55 
56 #ifdef COMPAT_LINUX32
57 #include <machine/../linux32/linux.h>
58 #include <machine/../linux32/linux32_proto.h>
59 #else
60 #include <machine/../linux/linux.h>
61 #include <machine/../linux/linux_proto.h>
62 #endif
63 
64 #include <compat/linux/linux_emul.h>
65 #include <compat/linux/linux_event.h>
66 #include <compat/linux/linux_file.h>
67 #include <compat/linux/linux_timer.h>
68 #include <compat/linux/linux_util.h>
69 
70 /*
71  * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
72  * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
73  * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
 * data verbatim. Therefore we allocate a 64-bit memory block to pass
75  * user supplied data for every file descriptor.
76  */
77 
/* Per-fd epoll user data, always 64 bits wide (see comment above). */
typedef uint64_t	epoll_udata_t;

/* Per-process vector of epoll user data, indexed by file descriptor. */
struct epoll_emuldata {
	uint32_t	fdc;		/* epoll udata max index */
	epoll_udata_t	udata[1];	/* epoll user data vector */
};

/* Default highest fd index reserved when an epoll fd is created. */
#define	EPOLL_DEF_SZ		16
/* Bytes needed for an emuldata whose vector covers indices 0..fdn. */
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))

/* Linux ABI layout of struct epoll_event; packed on amd64 to match it. */
struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Cap on maxevents so the copyout buffer size cannot overflow an int. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
99 
static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, struct file *epfp,
		    int fd, struct epoll_event *l_event, int *kev_flags,
		    struct kevent *kevent, int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_delete_event(struct thread *td, struct file *epfp,
		    int fd, int filter);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);

/* State threaded through the kevent changelist copyin callback. */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* kernel-resident changes to apply */
};

/* State threaded through the kevent eventlist copyout callback. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* user buffer write cursor */
	struct proc		*p;		/* process owning the udata vector */
	uint32_t		count;		/* events delivered so far */
	int			error;		/* first copyout error observed */
};
122 
/* eventfd */

/* The 64-bit counter value read from and written to an eventfd. */
typedef uint64_t	eventfd_t;

static fo_rdwr_t	eventfd_read;
static fo_rdwr_t	eventfd_write;
static fo_ioctl_t	eventfd_ioctl;
static fo_poll_t	eventfd_poll;
static fo_kqfilter_t	eventfd_kqfilter;
static fo_stat_t	eventfd_stat;
static fo_close_t	eventfd_close;
static fo_fill_kinfo_t	eventfd_fill_kinfo;

/* File operations backing a DTYPE_LINUXEFD descriptor. */
static struct fileops eventfdops = {
	.fo_read = eventfd_read,
	.fo_write = eventfd_write,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,
	.fo_poll = eventfd_poll,
	.fo_kqfilter = eventfd_kqfilter,
	.fo_stat = eventfd_stat,
	.fo_close = eventfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = eventfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_eventfddetach(struct knote *kn);
static int	filt_eventfdread(struct knote *kn, long hint);
static int	filt_eventfdwrite(struct knote *kn, long hint);

/* kqueue read/write filters attached to an eventfd's knlist. */
static struct filterops eventfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdread
};
static struct filterops eventfd_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdwrite
};
165 
/* timerfd */

/* The 64-bit expiration count read from a timerfd. */
typedef uint64_t	timerfd_t;

static fo_rdwr_t	timerfd_read;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

/* File operations backing a DTYPE_LINUXTFD descriptor (read-only). */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* kqueue read filter attached to a timerfd's knlist. */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

struct eventfd {
	eventfd_t	efd_count;	/* current counter value */
	uint32_t	efd_flags;	/* LINUX_O_NONBLOCK / LINUX_EFD_SEMAPHORE */
	struct selinfo	efd_sel;	/* poll/select/kevent notification */
	struct mtx	efd_lock;	/* protects all fields above */
};

struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* armed value and interval */
	struct callout	tfd_callout;	/* expiration timer */
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;	/* timer canceled (clock change) */
	struct selinfo	tfd_sel;	/* poll/select/kevent notification */
	struct mtx	tfd_lock;	/* protects all fields above */
};

static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
221 
222 
/*
 * Record the epoll user data for descriptor 'fd' in the per-process
 * emulation vector, allocating or growing the vector so that index
 * 'fd' is valid.  The vector is protected by the pemuldata lock.
 */
static void
epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
{
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct proc *p;

	p = td->td_proc;

	pem = pem_find(p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

	LINUX_PEM_XLOCK(pem);
	if (pem->epoll == NULL) {
		/* First use: allocate slots covering indices 0..fd. */
		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
		emd->fdc = fd;
		pem->epoll = emd;
	} else {
		emd = pem->epoll;
		if (fd > emd->fdc) {
			/* Grow the vector to reach the new highest fd. */
			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
			emd->fdc = fd;
			pem->epoll = emd;
		}
	}
	emd->udata[fd] = udata;
	LINUX_PEM_XUNLOCK(pem);
}
251 
252 static int
253 epoll_create_common(struct thread *td, int flags)
254 {
255 	int error;
256 
257 	error = kern_kqueue(td, flags, NULL);
258 	if (error != 0)
259 		return (error);
260 
261 	epoll_fd_install(td, EPOLL_DEF_SZ, 0);
262 
263 	return (0);
264 }
265 
266 int
267 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
268 {
269 
270 	/*
271 	 * args->size is unused. Linux just tests it
272 	 * and then forgets it as well.
273 	 */
274 	if (args->size <= 0)
275 		return (EINVAL);
276 
277 	return (epoll_create_common(td, 0));
278 }
279 
280 int
281 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
282 {
283 	int flags;
284 
285 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
286 		return (EINVAL);
287 
288 	flags = 0;
289 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
290 		flags |= O_CLOEXEC;
291 
292 	return (epoll_create_common(td, flags));
293 }
294 
/*
 * Structure converting function from epoll to kevent.
 * Fills in up to two kevents (EVFILT_READ/EVFILT_WRITE) at 'kevent'
 * and bumps *nkevents accordingly.  Returns EINVAL if the caller
 * requested epoll events we cannot express; the first such request
 * per process is also logged.
 */
static int
epoll_to_kevent(struct thread *td, struct file *epfp,
    int fd, struct epoll_event *l_event, int *kev_flags,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		*kev_flags |= EV_ONESHOT;
	if ((levents & LINUX_EPOLLET) != 0)
		*kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		*kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		*kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}

	/* Reject any epoll bits we have no kevent mapping for. */
	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

		/* Warn only once per process about unsupported flags. */
		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}
345 
346 /*
347  * Structure converting function from kevent to epoll. In a case
348  * this is called on error in registration we store the error in
349  * event->data and pick it up later in linux_epoll_ctl().
350  */
351 static void
352 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
353 {
354 
355 	if ((kevent->flags & EV_ERROR) != 0) {
356 		l_event->events = LINUX_EPOLLERR;
357 		return;
358 	}
359 
360 	switch (kevent->filter) {
361 	case EVFILT_READ:
362 		l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
363 		if ((kevent->flags & EV_EOF) != 0)
364 			l_event->events |= LINUX_EPOLLRDHUP;
365 	break;
366 	case EVFILT_WRITE:
367 		l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
368 	break;
369 	}
370 }
371 
/*
 * Copyout callback used by kevent. This converts kevent
 * events to epoll events and copies them back to the
 * userspace. This is also called on error on registering
 * of the filter.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct epoll_event *eep;
	int error, fd, i;

	args = (struct epoll_copyout_args*) arg;
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	pem = pem_find(args->p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
	/* Shared lock suffices: we only read the udata vector. */
	LINUX_PEM_SLOCK(pem);
	emd = pem->epoll;
	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));

	for (i = 0; i < count; i++) {
		kevent_to_epoll(&kevp[i], &eep[i]);

		/* Re-attach the 64-bit user data stored at registration. */
		fd = kevp[i].ident;
		KASSERT(fd <= emd->fdc, ("epoll user data vector"
						    " is too small.\n"));
		eep[i].data = emd->udata[fd];
	}
	LINUX_PEM_SUNLOCK(pem);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		/* Advance the user cursor; remember the running total. */
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		args->error = error;

	free(eep, M_EPOLL);
	return (error);
}
416 
417 /*
418  * Copyin callback used by kevent. This copies already
419  * converted filters from kernel memory to the kevent
420  * internal kernel memory. Hence the memcpy instead of
421  * copyin.
422  */
423 static int
424 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
425 {
426 	struct epoll_copyin_args *args;
427 
428 	args = (struct epoll_copyin_args*) arg;
429 
430 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
431 	args->changelist += count;
432 
433 	return (0);
434 }
435 
/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int kev_flags;
	int nchanges = 0;
	int error;

	/* CTL_DEL takes no event argument; everything else does. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	/* The epoll fd must actually be one of our kqueues. */
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	 /* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux disallows an epoll instance monitoring itself. */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/*
		 * We don't memorize which events were set for this FD
		 * on this level, so just delete all we could have set:
		 * EVFILT_READ and EVFILT_WRITE, ignoring any errors
		 */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		/* FALLTHROUGH */

	case LINUX_EPOLL_CTL_ADD:
			kev_flags = EV_ADD | EV_ENABLE;
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	/* Translate the epoll request into up to two kevent changes. */
	error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags,
	    kev, &nchanges);
	if (error != 0)
		goto leave0;

	/* Stash the 64-bit user data before registering the filters. */
	epoll_fd_install(td, args->fd, le.data);

	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}
525 
/*
 * Wait for a filter to be triggered on the epoll file descriptor.
 * 'timeout' is in milliseconds; -1 means wait forever.  If 'uset'
 * is non-NULL the signal mask is swapped for the duration of the
 * wait (epoll_pwait semantics) and restored on the way out.
 */
static int
linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
    int maxevents, int timeout, sigset_t *uset)
{
	struct epoll_copyout_args coargs;
	struct kevent_copyops k_ops = { &coargs,
					epoll_kev_copyout,
					NULL};
	struct timespec ts, *tsp;
	cap_rights_t rights;
	struct file *epfp;
	sigset_t omask;
	int error;

	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
		return (EINVAL);

	error = fget(td, epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}
	if (uset != NULL) {
		/* Install the caller's temporary signal mask. */
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &omask, 0);
		if (error != 0)
			goto leave1;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}


	coargs.leventlist = events;
	coargs.p = td->td_proc;
	coargs.count = 0;
	coargs.error = 0;

	if (timeout != -1) {
		if (timeout < 0) {
			error = EINVAL;
			goto leave0;
		}
		/* Convert from milliseconds to timespec. */
		ts.tv_sec = timeout / 1000;
		ts.tv_nsec = (timeout % 1000) * 1000000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
	if (error == 0 && coargs.error != 0)
		error = coargs.error;

	/*
	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
	 * Maybe we should translate that but I don't think it matters at all.
	 */
	if (error == 0)
		td->td_retval[0] = coargs.count;

leave0:
	/*
	 * NOTE(review): the restore below overwrites 'error'
	 * unconditionally, so a wait error (e.g. EINTR) can be lost
	 * if kern_sigprocmask succeeds — confirm intended.
	 */
	if (uset != NULL)
		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
		    NULL, 0);
leave1:
	fdrop(epfp, td);
	return (error);
}
608 
609 int
610 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
611 {
612 
613 	return (linux_epoll_wait_common(td, args->epfd, args->events,
614 	    args->maxevents, args->timeout, NULL));
615 }
616 
617 int
618 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
619 {
620 	sigset_t mask, *pmask;
621 	l_sigset_t lmask;
622 	int error;
623 
624 	if (args->mask != NULL) {
625 		if (args->sigsetsize != sizeof(l_sigset_t))
626 			return (EINVAL);
627 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
628 		if (error != 0)
629 			return (error);
630 		linux_to_bsd_sigset(&lmask, &mask);
631 		pmask = &mask;
632 	} else
633 		pmask = NULL;
634 	return (linux_epoll_wait_common(td, args->epfd, args->events,
635 	    args->maxevents, args->timeout, pmask));
636 }
637 
638 static int
639 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
640 {
641 	struct epoll_copyin_args ciargs;
642 	struct kevent kev;
643 	struct kevent_copyops k_ops = { &ciargs,
644 					NULL,
645 					epoll_kev_copyin};
646 	int error;
647 
648 	ciargs.changelist = &kev;
649 	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
650 
651 	error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL);
652 
653 	/*
654 	 * here we ignore ENONT, because we don't keep track of events here
655 	 */
656 	if (error == ENOENT)
657 		error = 0;
658 	return (error);
659 }
660 
661 static int
662 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
663 {
664 	int error1, error2;
665 
666 	error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ);
667 	error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
668 
669 	/* report any errors we got */
670 	return (error1 == 0 ? error2 : error1);
671 }
672 
673 static int
674 eventfd_create(struct thread *td, uint32_t initval, int flags)
675 {
676 	struct filedesc *fdp;
677 	struct eventfd *efd;
678 	struct file *fp;
679 	int fflags, fd, error;
680 
681 	fflags = 0;
682 	if ((flags & LINUX_O_CLOEXEC) != 0)
683 		fflags |= O_CLOEXEC;
684 
685 	fdp = td->td_proc->p_fd;
686 	error = falloc(td, &fp, &fd, fflags);
687 	if (error != 0)
688 		return (error);
689 
690 	efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
691 	efd->efd_flags = flags;
692 	efd->efd_count = initval;
693 	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
694 
695 	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
696 
697 	fflags = FREAD | FWRITE;
698 	if ((flags & LINUX_O_NONBLOCK) != 0)
699 		fflags |= FNONBLOCK;
700 
701 	finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
702 	fdrop(fp, td);
703 
704 	td->td_retval[0] = fd;
705 	return (error);
706 }
707 
708 int
709 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
710 {
711 
712 	return (eventfd_create(td, args->initval, 0));
713 }
714 
715 int
716 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
717 {
718 
719 	if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
720 		return (EINVAL);
721 
722 	return (eventfd_create(td, args->initval, args->flags));
723 }
724 
725 static int
726 eventfd_close(struct file *fp, struct thread *td)
727 {
728 	struct eventfd *efd;
729 
730 	efd = fp->f_data;
731 	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
732 		return (EBADF);
733 
734 	seldrain(&efd->efd_sel);
735 	knlist_destroy(&efd->efd_sel.si_note);
736 
737 	fp->f_ops = &badfileops;
738 	mtx_destroy(&efd->efd_lock);
739 	free(efd, M_EPOLL);
740 
741 	return (0);
742 }
743 
/*
 * Read the eventfd counter.  In semaphore mode one unit is consumed
 * per read; otherwise the whole counter is returned and reset.
 * Blocks (unless LINUX_O_NONBLOCK) while the counter is zero.
 */
static int
eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EBADF);

	/* The buffer must hold a full 64-bit counter value. */
	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&efd->efd_lock);
retry:
	if (efd->efd_count == 0) {
		if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			return (EAGAIN);
		}
		/* Sleep until a writer bumps the counter. */
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
			/* Semaphore mode: hand out a single unit. */
			count = 1;
			--efd->efd_count;
		} else {
			/* Normal mode: return and reset the counter. */
			count = efd->efd_count;
			efd->efd_count = 0;
		}
		/* The counter dropped: notify blocked writers/pollers. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
		mtx_unlock(&efd->efd_lock);
		error = uiomove(&count, sizeof(eventfd_t), uio);
	} else
		mtx_unlock(&efd->efd_lock);

	return (error);
}
789 
/*
 * Add a value to the eventfd counter.  Blocks (unless
 * LINUX_O_NONBLOCK) while the addition would push the counter past
 * UINT64_MAX - 1.  Writing UINT64_MAX itself is invalid.
 */
static int
eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EBADF);

	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = uiomove(&count, sizeof(eventfd_t), uio);
	if (error != 0)
		return (error);
	if (count == UINT64_MAX)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
retry:
	/* Would the addition exceed the UINT64_MAX - 1 ceiling? */
	if (UINT64_MAX - efd->efd_count <= count) {
		if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			/* Undo the uiomove: report no bytes written. */
			uio->uio_resid += sizeof(eventfd_t);
			return (EAGAIN);
		}
		/* Sleep until a reader makes room in the counter. */
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
		    PCATCH, "lefdwr", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		efd->efd_count += count;
		/* The counter rose: notify blocked readers/pollers. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
	}
	mtx_unlock(&efd->efd_lock);

	return (error);
}
835 
836 static int
837 eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
838     struct thread *td)
839 {
840 	struct eventfd *efd;
841 	int revents = 0;
842 
843 	efd = fp->f_data;
844 	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
845 		return (POLLERR);
846 
847 	mtx_lock(&efd->efd_lock);
848 	if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
849 		revents |= events & (POLLIN|POLLRDNORM);
850 	if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
851 		revents |= events & (POLLOUT|POLLWRNORM);
852 	if (revents == 0)
853 		selrecord(td, &efd->efd_sel);
854 	mtx_unlock(&efd->efd_lock);
855 
856 	return (revents);
857 }
858 
859 /*ARGSUSED*/
860 static int
861 eventfd_kqfilter(struct file *fp, struct knote *kn)
862 {
863 	struct eventfd *efd;
864 
865 	efd = fp->f_data;
866 	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
867 		return (EINVAL);
868 
869 	mtx_lock(&efd->efd_lock);
870 	switch (kn->kn_filter) {
871 	case EVFILT_READ:
872 		kn->kn_fop = &eventfd_rfiltops;
873 		break;
874 	case EVFILT_WRITE:
875 		kn->kn_fop = &eventfd_wfiltops;
876 		break;
877 	default:
878 		mtx_unlock(&efd->efd_lock);
879 		return (EINVAL);
880 	}
881 
882 	kn->kn_hook = efd;
883 	knlist_add(&efd->efd_sel.si_note, kn, 1);
884 	mtx_unlock(&efd->efd_lock);
885 
886 	return (0);
887 }
888 
889 static void
890 filt_eventfddetach(struct knote *kn)
891 {
892 	struct eventfd *efd = kn->kn_hook;
893 
894 	mtx_lock(&efd->efd_lock);
895 	knlist_remove(&efd->efd_sel.si_note, kn, 1);
896 	mtx_unlock(&efd->efd_lock);
897 }
898 
899 /*ARGSUSED*/
900 static int
901 filt_eventfdread(struct knote *kn, long hint)
902 {
903 	struct eventfd *efd = kn->kn_hook;
904 	int ret;
905 
906 	mtx_assert(&efd->efd_lock, MA_OWNED);
907 	ret = (efd->efd_count > 0);
908 
909 	return (ret);
910 }
911 
912 /*ARGSUSED*/
913 static int
914 filt_eventfdwrite(struct knote *kn, long hint)
915 {
916 	struct eventfd *efd = kn->kn_hook;
917 	int ret;
918 
919 	mtx_assert(&efd->efd_lock, MA_OWNED);
920 	ret = (UINT64_MAX - 1 > efd->efd_count);
921 
922 	return (ret);
923 }
924 
/*ARGSUSED*/
/*
 * ioctl(2) on an eventfd: FIONBIO toggles the non-blocking flag,
 * FIOASYNC is accepted but ignored; everything else fails.
 */
static int
eventfd_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	switch (cmd)
	{
	case FIONBIO:
		if (*(int *)data)
			efd->efd_flags |= LINUX_O_NONBLOCK;
		else
			efd->efd_flags &= ~LINUX_O_NONBLOCK;
		/* FALLTHROUGH */
	case FIOASYNC:
		return (0);
	default:
		return (ENXIO);
	}
}
949 
950 /*ARGSUSED*/
951 static int
952 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
953     struct thread *td)
954 {
955 
956 	return (ENXIO);
957 }
958 
959 /*ARGSUSED*/
960 static int
961 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
962 {
963 
964 	kif->kf_type = KF_TYPE_UNKNOWN;
965 	return (0);
966 }
967 
968 int
969 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
970 {
971 	struct filedesc *fdp;
972 	struct timerfd *tfd;
973 	struct file *fp;
974 	clockid_t clockid;
975 	int fflags, fd, error;
976 
977 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
978 		return (EINVAL);
979 
980 	error = linux_to_native_clockid(&clockid, args->clockid);
981 	if (error != 0)
982 		return (error);
983 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
984 		return (EINVAL);
985 
986 	fflags = 0;
987 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
988 		fflags |= O_CLOEXEC;
989 
990 	fdp = td->td_proc->p_fd;
991 	error = falloc(td, &fp, &fd, fflags);
992 	if (error != 0)
993 		return (error);
994 
995 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
996 	tfd->tfd_clockid = clockid;
997 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
998 
999 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
1000 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
1001 
1002 	fflags = FREAD;
1003 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
1004 		fflags |= FNONBLOCK;
1005 
1006 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
1007 	fdrop(fp, td);
1008 
1009 	td->td_retval[0] = fd;
1010 	return (error);
1011 }
1012 
/*
 * Tear down a timerfd: disarm the timer, stop the callout, wake
 * waiters and release the backing storage.
 */
static int
timerfd_close(struct file *fp, struct thread *td)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Disarm so the expire callback will not re-schedule itself. */
	timespecclear(&tfd->tfd_time.it_value);
	timespecclear(&tfd->tfd_time.it_interval);

	/* Wait for any in-flight expiration callback to finish. */
	mtx_lock(&tfd->tfd_lock);
	callout_drain(&tfd->tfd_callout);
	mtx_unlock(&tfd->tfd_lock);

	seldrain(&tfd->tfd_sel);
	knlist_destroy(&tfd->tfd_sel.si_note);

	fp->f_ops = &badfileops;
	mtx_destroy(&tfd->tfd_lock);
	free(tfd, M_EPOLL);

	return (0);
}
1038 
/*
 * Read the number of timer expirations since the previous read,
 * then reset the count.  Blocks (unless FNONBLOCK) while no
 * expiration is pending.  Returns ECANCELED if the timer was
 * canceled.
 */
static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct timerfd *tfd;
	timerfd_t count;
	int error;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* The buffer must hold a full 64-bit expiration count. */
	if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		/* Sleep until the callout reports an expiration. */
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		/* Consume the pending expirations atomically. */
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}
1081 
1082 static int
1083 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
1084     struct thread *td)
1085 {
1086 	struct timerfd *tfd;
1087 	int revents = 0;
1088 
1089 	tfd = fp->f_data;
1090 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1091 		return (POLLERR);
1092 
1093 	mtx_lock(&tfd->tfd_lock);
1094 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
1095 		revents |= events & (POLLIN|POLLRDNORM);
1096 	if (revents == 0)
1097 		selrecord(td, &tfd->tfd_sel);
1098 	mtx_unlock(&tfd->tfd_lock);
1099 
1100 	return (revents);
1101 }
1102 
1103 /*ARGSUSED*/
1104 static int
1105 timerfd_kqfilter(struct file *fp, struct knote *kn)
1106 {
1107 	struct timerfd *tfd;
1108 
1109 	tfd = fp->f_data;
1110 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1111 		return (EINVAL);
1112 
1113 	if (kn->kn_filter == EVFILT_READ)
1114 		kn->kn_fop = &timerfd_rfiltops;
1115 	else
1116 		return (EINVAL);
1117 
1118 	kn->kn_hook = tfd;
1119 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
1120 
1121 	return (0);
1122 }
1123 
1124 static void
1125 filt_timerfddetach(struct knote *kn)
1126 {
1127 	struct timerfd *tfd = kn->kn_hook;
1128 
1129 	mtx_lock(&tfd->tfd_lock);
1130 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
1131 	mtx_unlock(&tfd->tfd_lock);
1132 }
1133 
1134 /*ARGSUSED*/
1135 static int
1136 filt_timerfdread(struct knote *kn, long hint)
1137 {
1138 	struct timerfd *tfd = kn->kn_hook;
1139 
1140 	return (tfd->tfd_count > 0);
1141 }
1142 
1143 /*ARGSUSED*/
1144 static int
1145 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1146     struct thread *td)
1147 {
1148 
1149 	return (ENXIO);
1150 }
1151 
1152 /*ARGSUSED*/
1153 static int
1154 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
1155 {
1156 
1157 	kif->kf_type = KF_TYPE_UNKNOWN;
1158 	return (0);
1159 }
1160 
1161 static void
1162 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
1163 {
1164 
1165 	if (tfd->tfd_clockid == CLOCK_REALTIME)
1166 		getnanotime(ts);
1167 	else	/* CLOCK_MONOTONIC */
1168 		getnanouptime(ts);
1169 }
1170 
/*
 * Compute the timer's current value for timerfd_gettime():
 * the interval is copied through, and it_value is converted from
 * an absolute expiry time into the time remaining.  An armed timer
 * whose remaining time underflows is clamped to 1ns so it still
 * reads as armed.  Caller holds the timerfd lock.
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		/* In-place subtraction: it_value -= cts (old 2-arg form). */
		timespecsub(&ots->it_value, &cts);
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}
1188 
/*
 * timerfd_gettime(2): copy out the current setting (time remaining
 * until the next expiration, plus the interval) of a Linux timerfd.
 */
int
linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
{
	cap_rights_t rights;
	struct l_itimerspec lots;
	struct itimerspec ots;
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	/* Reject descriptors that are not Linux timerfds. */
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Snapshot the timer value under the timerfd lock. */
	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, &ots);
	mtx_unlock(&tfd->tfd_lock);

	/* Convert to the Linux layout and copy out to userland. */
	error = native_to_linux_itimerspec(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));

out:
	fdrop(fp, td);
	return (error);
}
1220 
/*
 * timerfd_settime(2): arm or disarm a Linux timerfd.  A zero
 * new_value.it_value disarms the timer; otherwise it is armed either
 * relative to the current clock or at an absolute deadline
 * (LINUX_TFD_TIMER_ABSTIME).  Optionally returns the previous setting.
 */
int
linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec nts, ots;
	struct timespec cts, ts;
	cap_rights_t rights;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	/* Only the documented settime flags are accepted. */
	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec(&nts, &lots);
	if (error != 0)
		return (error);

	error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	/* Reject descriptors that are not Linux timerfds. */
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* A zero it_value disarms the timer; the interval is then moot. */
	if (!timespecisset(&nts.it_value))
		timespecclear(&nts.it_interval);
	/* Capture the previous setting before overwriting it. */
	if (args->old_value != NULL)
		linux_timerfd_curval(tfd, &ots);

	tfd->tfd_time = nts;
	if (timespecisset(&nts.it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		/* "ts" becomes the relative delay until first expiration. */
		ts = nts.it_value;
		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			/* Relative: store the absolute deadline instead. */
			timespecadd(&tfd->tfd_time.it_value, &cts);
		} else {
			/* Absolute: convert the deadline to a delay. */
			timespecsub(&ts, &cts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		/* Disarm the timer. */
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

	if (args->old_value != NULL) {
		error = native_to_linux_itimerspec(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}

out:
	fdrop(fp, td);
	return (error);
}
1287 
/*
 * Callout handler: fires when the timerfd's deadline may have been
 * reached.  Increments the expiration count, wakes readers/pollers
 * and, for periodic timers, re-arms the callout for the next period.
 *
 * NOTE(review): KNOTE_LOCKED below implies the knlist lock (tfd_lock)
 * is already held here, presumably because the callout was set up with
 * callout_init_mtx(&tfd->tfd_lock) — confirm at the initialization site.
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		/* Deadline reached: compute the next absolute deadline. */
		if (timespecisset(&tfd->tfd_time.it_interval))
			timespecadd(&tfd->tfd_time.it_value,
				    &tfd->tfd_time.it_interval);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			/* Periodic: re-arm for the next expiration. */
			ts = tfd->tfd_time.it_value;
			timespecsub(&ts, &cts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
				linux_timerfd_expire, tfd);
		}
		/* Record the expiration and wake all waiters. */
		tfd->tfd_count++;
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/* Fired early (e.g. clock stepped); re-arm for the rest. */
		ts = tfd->tfd_time.it_value;
		timespecsub(&ts, &cts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}
1324