xref: /freebsd/sys/compat/linux/linux_event.c (revision cfd6422a5217410fbd66f7a7a8a64d9d85e61229)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_compat.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/imgact.h>
38 #include <sys/kernel.h>
39 #include <sys/limits.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/callout.h>
43 #include <sys/capsicum.h>
44 #include <sys/types.h>
45 #include <sys/user.h>
46 #include <sys/file.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/errno.h>
50 #include <sys/event.h>
51 #include <sys/poll.h>
52 #include <sys/proc.h>
53 #include <sys/selinfo.h>
54 #include <sys/specialfd.h>
55 #include <sys/sx.h>
56 #include <sys/syscallsubr.h>
57 #include <sys/timespec.h>
58 #include <sys/eventfd.h>
59 
60 #ifdef COMPAT_LINUX32
61 #include <machine/../linux32/linux.h>
62 #include <machine/../linux32/linux32_proto.h>
63 #else
64 #include <machine/../linux/linux.h>
65 #include <machine/../linux/linux_proto.h>
66 #endif
67 
68 #include <compat/linux/linux_emul.h>
69 #include <compat/linux/linux_event.h>
70 #include <compat/linux/linux_file.h>
71 #include <compat/linux/linux_timer.h>
72 #include <compat/linux/linux_util.h>
73 
/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32-bit architectures BSD 'struct kevent' only
 * has a 32-bit opaque pointer as the 'udata' field, so we can't pass
 * epoll-supplied data verbatim. Therefore we allocate a 64-bit memory block
 * to hold the user-supplied data for every file descriptor.
 */
81 
/* 64-bit user data value kept for each file descriptor. */
typedef uint64_t	epoll_udata_t;

/*
 * Table of epoll user data indexed by file descriptor. It is allocated
 * with EPOLL_SIZE(), so udata[] really holds fdc + 1 entries even though
 * it is declared with a single element.
 */
struct epoll_emuldata {
	uint32_t	fdc;		/* epoll udata max index */
	epoll_udata_t	udata[1];	/* epoll user data vector */
};
88 
/* Index passed to epoll_fd_install() when an epoll descriptor is created. */
#define	EPOLL_DEF_SZ		16
/* Bytes needed for a struct epoll_emuldata whose highest valid index is fdn. */
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
92 
/*
 * Event record copied in from and out to the Linux process.
 * The structure is packed on amd64 only - presumably to match the
 * layout the Linux ABI uses there (TODO confirm).
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* event bit mask */
	epoll_udata_t	data;		/* user data associated with the fd */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Upper bound on maxevents so the event array size stays within INT_MAX. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
103 
/* Forward declarations of the epoll helpers defined later in this file. */
static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
117 
/* Cursor consumed by epoll_kev_copyin(); advanced after every batch. */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next kernel-resident kevent to hand over */
};
121 
/* State shared with epoll_kev_copyout() while events are delivered. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next userspace slot to fill */
	struct proc		*p;		/* process whose udata vector is read */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error, 0 if none */
};
128 
/* timerfd */
/* Expiration counter type; timerfd_read() hands one such value to the reader. */
typedef uint64_t	timerfd_t;

/* File operation handlers for timerfd descriptors, defined below. */
static fo_rdwr_t	timerfd_read;
static fo_ioctl_t	timerfd_ioctl;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;
139 
/*
 * File operations table installed on timerfd files by
 * linux_timerfd_create(). Read, ioctl, poll, kqfilter, stat, close and
 * fill_kinfo are implemented in this file; the remaining slots use the
 * generic invfo_* handlers.
 */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};
155 
/* kqueue filter callbacks for timerfd, defined below. */
static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* EVFILT_READ filter operations attached by timerfd_kqfilter(). */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};
164 
/* Per-descriptor timerfd state, stored in the file's f_data. */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* next expiry (on tfd_clockid) and interval */
	struct callout	tfd_callout;	/* runs linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations not yet read */
	bool		tfd_canceled;	/* set when settime disarms the timer */
	struct selinfo	tfd_sel;	/* poll/kqueue waiters */
	struct mtx	tfd_lock;	/* mutex for the callout and knote list */
};
174 
/* Forward declarations of timerfd helpers defined later in this file. */
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
177 
178 static void
179 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
180 {
181 	struct linux_pemuldata *pem;
182 	struct epoll_emuldata *emd;
183 	struct proc *p;
184 
185 	p = td->td_proc;
186 
187 	pem = pem_find(p);
188 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
189 
190 	LINUX_PEM_XLOCK(pem);
191 	if (pem->epoll == NULL) {
192 		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
193 		emd->fdc = fd;
194 		pem->epoll = emd;
195 	} else {
196 		emd = pem->epoll;
197 		if (fd > emd->fdc) {
198 			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
199 			emd->fdc = fd;
200 			pem->epoll = emd;
201 		}
202 	}
203 	emd->udata[fd] = udata;
204 	LINUX_PEM_XUNLOCK(pem);
205 }
206 
207 static int
208 epoll_create_common(struct thread *td, int flags)
209 {
210 	int error;
211 
212 	error = kern_kqueue(td, flags, NULL);
213 	if (error != 0)
214 		return (error);
215 
216 	epoll_fd_install(td, EPOLL_DEF_SZ, 0);
217 
218 	return (0);
219 }
220 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * epoll_create(): the size argument is only validated (it must be
 * positive) and is otherwise ignored.
 */
int
linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
{

	if (args->size > 0)
		return (epoll_create_common(td, 0));

	return (EINVAL);
}
#endif
236 
237 int
238 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
239 {
240 	int flags;
241 
242 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
243 		return (EINVAL);
244 
245 	flags = 0;
246 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
247 		flags |= O_CLOEXEC;
248 
249 	return (epoll_create_common(td, flags));
250 }
251 
252 /* Structure converting function from epoll to kevent. */
253 static int
254 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
255     struct kevent *kevent, int *nkevents)
256 {
257 	uint32_t levents = l_event->events;
258 	struct linux_pemuldata *pem;
259 	struct proc *p;
260 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
261 
262 	/* flags related to how event is registered */
263 	if ((levents & LINUX_EPOLLONESHOT) != 0)
264 		kev_flags |= EV_DISPATCH;
265 	if ((levents & LINUX_EPOLLET) != 0)
266 		kev_flags |= EV_CLEAR;
267 	if ((levents & LINUX_EPOLLERR) != 0)
268 		kev_flags |= EV_ERROR;
269 	if ((levents & LINUX_EPOLLRDHUP) != 0)
270 		kev_flags |= EV_EOF;
271 
272 	/* flags related to what event is registered */
273 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
274 		EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0);
275 		++(*nkevents);
276 	}
277 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
278 		EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
279 		++(*nkevents);
280 	}
281 	/* zero event mask is legal */
282 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
283 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
284 		++(*nkevents);
285 	}
286 
287 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
288 		p = td->td_proc;
289 
290 		pem = pem_find(p);
291 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
292 		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
293 
294 		LINUX_PEM_XLOCK(pem);
295 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
296 			pem->flags |= LINUX_XUNSUP_EPOLL;
297 			LINUX_PEM_XUNLOCK(pem);
298 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
299 			    levents);
300 		} else
301 			LINUX_PEM_XUNLOCK(pem);
302 		return (EINVAL);
303 	}
304 
305 	return (0);
306 }
307 
308 /*
309  * Structure converting function from kevent to epoll. In a case
310  * this is called on error in registration we store the error in
311  * event->data and pick it up later in linux_epoll_ctl().
312  */
313 static void
314 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
315 {
316 
317 	if ((kevent->flags & EV_ERROR) != 0) {
318 		l_event->events = LINUX_EPOLLERR;
319 		return;
320 	}
321 
322 	/* XXX EPOLLPRI, EPOLLHUP */
323 	switch (kevent->filter) {
324 	case EVFILT_READ:
325 		l_event->events = LINUX_EPOLLIN;
326 		if ((kevent->flags & EV_EOF) != 0)
327 			l_event->events |= LINUX_EPOLLRDHUP;
328 	break;
329 	case EVFILT_WRITE:
330 		l_event->events = LINUX_EPOLLOUT;
331 	break;
332 	}
333 }
334 
335 /*
336  * Copyout callback used by kevent. This converts kevent
337  * events to epoll events and copies them back to the
338  * userspace. This is also called on error on registering
339  * of the filter.
340  */
341 static int
342 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
343 {
344 	struct epoll_copyout_args *args;
345 	struct linux_pemuldata *pem;
346 	struct epoll_emuldata *emd;
347 	struct epoll_event *eep;
348 	int error, fd, i;
349 
350 	args = (struct epoll_copyout_args*) arg;
351 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
352 
353 	pem = pem_find(args->p);
354 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
355 	LINUX_PEM_SLOCK(pem);
356 	emd = pem->epoll;
357 	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
358 
359 	for (i = 0; i < count; i++) {
360 		kevent_to_epoll(&kevp[i], &eep[i]);
361 
362 		fd = kevp[i].ident;
363 		KASSERT(fd <= emd->fdc, ("epoll user data vector"
364 						    " is too small.\n"));
365 		eep[i].data = emd->udata[fd];
366 	}
367 	LINUX_PEM_SUNLOCK(pem);
368 
369 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
370 	if (error == 0) {
371 		args->leventlist += count;
372 		args->count += count;
373 	} else if (args->error == 0)
374 		args->error = error;
375 
376 	free(eep, M_EPOLL);
377 	return (error);
378 }
379 
380 /*
381  * Copyin callback used by kevent. This copies already
382  * converted filters from kernel memory to the kevent
383  * internal kernel memory. Hence the memcpy instead of
384  * copyin.
385  */
386 static int
387 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
388 {
389 	struct epoll_copyin_args *args;
390 
391 	args = (struct epoll_copyin_args*) arg;
392 
393 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
394 	args->changelist += count;
395 
396 	return (0);
397 }
398 
399 /*
400  * Load epoll filter, convert it to kevent filter
401  * and load it into kevent subsystem.
402  */
403 int
404 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
405 {
406 	struct file *epfp, *fp;
407 	struct epoll_copyin_args ciargs;
408 	struct kevent kev[2];
409 	struct kevent_copyops k_ops = { &ciargs,
410 					NULL,
411 					epoll_kev_copyin};
412 	struct epoll_event le;
413 	cap_rights_t rights;
414 	int nchanges = 0;
415 	int error;
416 
417 	if (args->op != LINUX_EPOLL_CTL_DEL) {
418 		error = copyin(args->event, &le, sizeof(le));
419 		if (error != 0)
420 			return (error);
421 	}
422 
423 	error = fget(td, args->epfd,
424 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
425 	if (error != 0)
426 		return (error);
427 	if (epfp->f_type != DTYPE_KQUEUE) {
428 		error = EINVAL;
429 		goto leave1;
430 	}
431 
432 	 /* Protect user data vector from incorrectly supplied fd. */
433 	error = fget(td, args->fd,
434 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
435 	if (error != 0)
436 		goto leave1;
437 
438 	/* Linux disallows spying on himself */
439 	if (epfp == fp) {
440 		error = EINVAL;
441 		goto leave0;
442 	}
443 
444 	ciargs.changelist = kev;
445 
446 	if (args->op != LINUX_EPOLL_CTL_DEL) {
447 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
448 		if (error != 0)
449 			goto leave0;
450 	}
451 
452 	switch (args->op) {
453 	case LINUX_EPOLL_CTL_MOD:
454 		error = epoll_delete_all_events(td, epfp, args->fd);
455 		if (error != 0)
456 			goto leave0;
457 		break;
458 
459 	case LINUX_EPOLL_CTL_ADD:
460 		if (epoll_fd_registered(td, epfp, args->fd)) {
461 			error = EEXIST;
462 			goto leave0;
463 		}
464 		break;
465 
466 	case LINUX_EPOLL_CTL_DEL:
467 		/* CTL_DEL means unregister this fd with this epoll */
468 		error = epoll_delete_all_events(td, epfp, args->fd);
469 		goto leave0;
470 
471 	default:
472 		error = EINVAL;
473 		goto leave0;
474 	}
475 
476 	epoll_fd_install(td, args->fd, le.data);
477 
478 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
479 
480 leave0:
481 	fdrop(fp, td);
482 
483 leave1:
484 	fdrop(epfp, td);
485 	return (error);
486 }
487 
488 /*
489  * Wait for a filter to be triggered on the epoll file descriptor.
490  */
491 static int
492 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
493     int maxevents, int timeout, sigset_t *uset)
494 {
495 	struct epoll_copyout_args coargs;
496 	struct kevent_copyops k_ops = { &coargs,
497 					epoll_kev_copyout,
498 					NULL};
499 	struct timespec ts, *tsp;
500 	cap_rights_t rights;
501 	struct file *epfp;
502 	sigset_t omask;
503 	int error;
504 
505 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
506 		return (EINVAL);
507 
508 	error = fget(td, epfd,
509 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
510 	if (error != 0)
511 		return (error);
512 	if (epfp->f_type != DTYPE_KQUEUE) {
513 		error = EINVAL;
514 		goto leave;
515 	}
516 	if (uset != NULL) {
517 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
518 		    &omask, 0);
519 		if (error != 0)
520 			goto leave;
521 		td->td_pflags |= TDP_OLDMASK;
522 		/*
523 		 * Make sure that ast() is called on return to
524 		 * usermode and TDP_OLDMASK is cleared, restoring old
525 		 * sigmask.
526 		 */
527 		thread_lock(td);
528 		td->td_flags |= TDF_ASTPENDING;
529 		thread_unlock(td);
530 	}
531 
532 	coargs.leventlist = events;
533 	coargs.p = td->td_proc;
534 	coargs.count = 0;
535 	coargs.error = 0;
536 
537 	/*
538 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
539 	 * to block indefinitely. Real implementation does it if any negative
540 	 * timeout value is passed.
541 	 */
542 	if (timeout >= 0) {
543 		/* Convert from milliseconds to timespec. */
544 		ts.tv_sec = timeout / 1000;
545 		ts.tv_nsec = (timeout % 1000) * 1000000;
546 		tsp = &ts;
547 	} else {
548 		tsp = NULL;
549 	}
550 
551 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
552 	if (error == 0 && coargs.error != 0)
553 		error = coargs.error;
554 
555 	/*
556 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
557 	 * Maybe we should translate that but I don't think it matters at all.
558 	 */
559 	if (error == 0)
560 		td->td_retval[0] = coargs.count;
561 
562 	if (uset != NULL)
563 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
564 		    NULL, 0);
565 leave:
566 	fdrop(epfp, td);
567 	return (error);
568 }
569 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * epoll_wait(): same as epoll_pwait() without a temporary signal mask.
 */
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{
	int error;

	error = linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL);

	return (error);
}
#endif
579 
580 int
581 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
582 {
583 	sigset_t mask, *pmask;
584 	l_sigset_t lmask;
585 	int error;
586 
587 	if (args->mask != NULL) {
588 		if (args->sigsetsize != sizeof(l_sigset_t))
589 			return (EINVAL);
590 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
591 		if (error != 0)
592 			return (error);
593 		linux_to_bsd_sigset(&lmask, &mask);
594 		pmask = &mask;
595 	} else
596 		pmask = NULL;
597 	return (linux_epoll_wait_common(td, args->epfd, args->events,
598 	    args->maxevents, args->timeout, pmask));
599 }
600 
601 static int
602 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
603     unsigned int flags)
604 {
605 	struct epoll_copyin_args ciargs;
606 	struct kevent kev;
607 	struct kevent_copyops k_ops = { &ciargs,
608 					NULL,
609 					epoll_kev_copyin};
610 
611 	ciargs.changelist = &kev;
612 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
613 
614 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
615 }
616 
617 static int
618 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
619 {
620 	/*
621 	 * Set empty filter flags to avoid accidental modification of already
622 	 * registered events. In the case of event re-registration:
623 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
624 	 * 2. If event does exists, it's enabled/disabled state is preserved
625 	 *    but fflags, data and udata fields are overwritten. So we can not
626 	 *    set socket lowats and store user's context pointer in udata.
627 	 */
628 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
629 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
630 		return (1);
631 
632 	return (0);
633 }
634 
635 static int
636 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
637 {
638 	int error1, error2;
639 
640 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
641 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
642 
643 	/* return 0 if at least one result positive */
644 	return (error1 == 0 ? 0 : error2);
645 }
646 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * eventfd(): create an eventfd with the given initial value and no flags.
 */
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{
	struct specialfd_eventfd efd;
	int error;

	memset(&efd, 0, sizeof(efd));
	efd.initval = args->initval;

	error = kern_specialfd(td, SPECIALFD_EVENTFD, &efd);
	return (error);
}
#endif
658 
659 int
660 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
661 {
662 	struct specialfd_eventfd ae;
663 	int flags;
664 
665 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
666 	    LINUX_EFD_SEMAPHORE)) != 0)
667 		return (EINVAL);
668 	flags = 0;
669 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
670 		flags |= EFD_CLOEXEC;
671 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
672 		flags |= EFD_NONBLOCK;
673 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
674 		flags |= EFD_SEMAPHORE;
675 
676 	bzero(&ae, sizeof(ae));
677 	ae.flags = flags;
678 	ae.initval = args->initval;
679 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
680 }
681 
682 int
683 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
684 {
685 	struct filedesc *fdp;
686 	struct timerfd *tfd;
687 	struct file *fp;
688 	clockid_t clockid;
689 	int fflags, fd, error;
690 
691 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
692 		return (EINVAL);
693 
694 	error = linux_to_native_clockid(&clockid, args->clockid);
695 	if (error != 0)
696 		return (error);
697 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
698 		return (EINVAL);
699 
700 	fflags = 0;
701 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
702 		fflags |= O_CLOEXEC;
703 
704 	fdp = td->td_proc->p_fd;
705 	error = falloc(td, &fp, &fd, fflags);
706 	if (error != 0)
707 		return (error);
708 
709 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
710 	tfd->tfd_clockid = clockid;
711 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
712 
713 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
714 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
715 
716 	fflags = FREAD;
717 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
718 		fflags |= FNONBLOCK;
719 
720 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
721 	fdrop(fp, td);
722 
723 	td->td_retval[0] = fd;
724 	return (error);
725 }
726 
727 static int
728 timerfd_close(struct file *fp, struct thread *td)
729 {
730 	struct timerfd *tfd;
731 
732 	tfd = fp->f_data;
733 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
734 		return (EINVAL);
735 
736 	timespecclear(&tfd->tfd_time.it_value);
737 	timespecclear(&tfd->tfd_time.it_interval);
738 
739 	mtx_lock(&tfd->tfd_lock);
740 	callout_drain(&tfd->tfd_callout);
741 	mtx_unlock(&tfd->tfd_lock);
742 
743 	seldrain(&tfd->tfd_sel);
744 	knlist_destroy(&tfd->tfd_sel.si_note);
745 
746 	fp->f_ops = &badfileops;
747 	mtx_destroy(&tfd->tfd_lock);
748 	free(tfd, M_EPOLL);
749 
750 	return (0);
751 }
752 
753 static int
754 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
755     int flags, struct thread *td)
756 {
757 	struct timerfd *tfd;
758 	timerfd_t count;
759 	int error;
760 
761 	tfd = fp->f_data;
762 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
763 		return (EINVAL);
764 
765 	if (uio->uio_resid < sizeof(timerfd_t))
766 		return (EINVAL);
767 
768 	error = 0;
769 	mtx_lock(&tfd->tfd_lock);
770 retry:
771 	if (tfd->tfd_canceled) {
772 		tfd->tfd_count = 0;
773 		mtx_unlock(&tfd->tfd_lock);
774 		return (ECANCELED);
775 	}
776 	if (tfd->tfd_count == 0) {
777 		if ((fp->f_flag & FNONBLOCK) != 0) {
778 			mtx_unlock(&tfd->tfd_lock);
779 			return (EAGAIN);
780 		}
781 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
782 		if (error == 0)
783 			goto retry;
784 	}
785 	if (error == 0) {
786 		count = tfd->tfd_count;
787 		tfd->tfd_count = 0;
788 		mtx_unlock(&tfd->tfd_lock);
789 		error = uiomove(&count, sizeof(timerfd_t), uio);
790 	} else
791 		mtx_unlock(&tfd->tfd_lock);
792 
793 	return (error);
794 }
795 
796 static int
797 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
798     struct thread *td)
799 {
800 	struct timerfd *tfd;
801 	int revents = 0;
802 
803 	tfd = fp->f_data;
804 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
805 		return (POLLERR);
806 
807 	mtx_lock(&tfd->tfd_lock);
808 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
809 		revents |= events & (POLLIN|POLLRDNORM);
810 	if (revents == 0)
811 		selrecord(td, &tfd->tfd_sel);
812 	mtx_unlock(&tfd->tfd_lock);
813 
814 	return (revents);
815 }
816 
817 static int
818 timerfd_kqfilter(struct file *fp, struct knote *kn)
819 {
820 	struct timerfd *tfd;
821 
822 	tfd = fp->f_data;
823 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
824 		return (EINVAL);
825 
826 	if (kn->kn_filter == EVFILT_READ)
827 		kn->kn_fop = &timerfd_rfiltops;
828 	else
829 		return (EINVAL);
830 
831 	kn->kn_hook = tfd;
832 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
833 
834 	return (0);
835 }
836 
837 static void
838 filt_timerfddetach(struct knote *kn)
839 {
840 	struct timerfd *tfd = kn->kn_hook;
841 
842 	mtx_lock(&tfd->tfd_lock);
843 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
844 	mtx_unlock(&tfd->tfd_lock);
845 }
846 
847 static int
848 filt_timerfdread(struct knote *kn, long hint)
849 {
850 	struct timerfd *tfd = kn->kn_hook;
851 
852 	return (tfd->tfd_count > 0);
853 }
854 
855 static int
856 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
857     struct ucred *active_cred, struct thread *td)
858 {
859 
860 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
861 		return (EINVAL);
862 
863 	switch (cmd) {
864 	case FIONBIO:
865 	case FIOASYNC:
866 		return (0);
867 	}
868 
869 	return (ENOTTY);
870 }
871 
872 static int
873 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
874     struct thread *td)
875 {
876 
877 	return (ENXIO);
878 }
879 
880 static int
881 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
882 {
883 
884 	kif->kf_type = KF_TYPE_UNKNOWN;
885 	return (0);
886 }
887 
888 static void
889 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
890 {
891 
892 	if (tfd->tfd_clockid == CLOCK_REALTIME)
893 		getnanotime(ts);
894 	else	/* CLOCK_MONOTONIC */
895 		getnanouptime(ts);
896 }
897 
898 static void
899 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
900 {
901 	struct timespec cts;
902 
903 	linux_timerfd_clocktime(tfd, &cts);
904 	*ots = tfd->tfd_time;
905 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
906 		timespecsub(&ots->it_value, &cts, &ots->it_value);
907 		if (ots->it_value.tv_sec < 0 ||
908 		    (ots->it_value.tv_sec == 0 &&
909 		     ots->it_value.tv_nsec == 0)) {
910 			ots->it_value.tv_sec  = 0;
911 			ots->it_value.tv_nsec = 1;
912 		}
913 	}
914 }
915 
916 int
917 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
918 {
919 	struct l_itimerspec lots;
920 	struct itimerspec ots;
921 	struct timerfd *tfd;
922 	struct file *fp;
923 	int error;
924 
925 	error = fget(td, args->fd, &cap_read_rights, &fp);
926 	if (error != 0)
927 		return (error);
928 	tfd = fp->f_data;
929 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
930 		error = EINVAL;
931 		goto out;
932 	}
933 
934 	mtx_lock(&tfd->tfd_lock);
935 	linux_timerfd_curval(tfd, &ots);
936 	mtx_unlock(&tfd->tfd_lock);
937 
938 	error = native_to_linux_itimerspec(&lots, &ots);
939 	if (error == 0)
940 		error = copyout(&lots, args->old_value, sizeof(lots));
941 
942 out:
943 	fdrop(fp, td);
944 	return (error);
945 }
946 
947 int
948 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
949 {
950 	struct l_itimerspec lots;
951 	struct itimerspec nts, ots;
952 	struct timespec cts, ts;
953 	struct timerfd *tfd;
954 	struct timeval tv;
955 	struct file *fp;
956 	int error;
957 
958 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
959 		return (EINVAL);
960 
961 	error = copyin(args->new_value, &lots, sizeof(lots));
962 	if (error != 0)
963 		return (error);
964 	error = linux_to_native_itimerspec(&nts, &lots);
965 	if (error != 0)
966 		return (error);
967 
968 	error = fget(td, args->fd, &cap_write_rights, &fp);
969 	if (error != 0)
970 		return (error);
971 	tfd = fp->f_data;
972 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
973 		error = EINVAL;
974 		goto out;
975 	}
976 
977 	mtx_lock(&tfd->tfd_lock);
978 	if (!timespecisset(&nts.it_value))
979 		timespecclear(&nts.it_interval);
980 	if (args->old_value != NULL)
981 		linux_timerfd_curval(tfd, &ots);
982 
983 	tfd->tfd_time = nts;
984 	if (timespecisset(&nts.it_value)) {
985 		linux_timerfd_clocktime(tfd, &cts);
986 		ts = nts.it_value;
987 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
988 			timespecadd(&tfd->tfd_time.it_value, &cts,
989 				&tfd->tfd_time.it_value);
990 		} else {
991 			timespecsub(&ts, &cts, &ts);
992 		}
993 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
994 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
995 			linux_timerfd_expire, tfd);
996 		tfd->tfd_canceled = false;
997 	} else {
998 		tfd->tfd_canceled = true;
999 		callout_stop(&tfd->tfd_callout);
1000 	}
1001 	mtx_unlock(&tfd->tfd_lock);
1002 
1003 	if (args->old_value != NULL) {
1004 		error = native_to_linux_itimerspec(&lots, &ots);
1005 		if (error == 0)
1006 			error = copyout(&lots, args->old_value, sizeof(lots));
1007 	}
1008 
1009 out:
1010 	fdrop(fp, td);
1011 	return (error);
1012 }
1013 
1014 static void
1015 linux_timerfd_expire(void *arg)
1016 {
1017 	struct timespec cts, ts;
1018 	struct timeval tv;
1019 	struct timerfd *tfd;
1020 
1021 	tfd = (struct timerfd *)arg;
1022 
1023 	linux_timerfd_clocktime(tfd, &cts);
1024 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1025 		if (timespecisset(&tfd->tfd_time.it_interval))
1026 			timespecadd(&tfd->tfd_time.it_value,
1027 				    &tfd->tfd_time.it_interval,
1028 				    &tfd->tfd_time.it_value);
1029 		else
1030 			/* single shot timer */
1031 			timespecclear(&tfd->tfd_time.it_value);
1032 		if (timespecisset(&tfd->tfd_time.it_value)) {
1033 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1034 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
1035 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1036 				linux_timerfd_expire, tfd);
1037 		}
1038 		tfd->tfd_count++;
1039 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1040 		selwakeup(&tfd->tfd_sel);
1041 		wakeup(&tfd->tfd_count);
1042 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
1043 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1044 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
1045 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1046 		    linux_timerfd_expire, tfd);
1047 	}
1048 }
1049