xref: /freebsd/sys/compat/linux/linux_event.c (revision 690b7ea081790eef2c890f63a4fe7e195cf51df0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/imgact.h>
37 #include <sys/kernel.h>
38 #include <sys/limits.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/callout.h>
42 #include <sys/capsicum.h>
43 #include <sys/types.h>
44 #include <sys/user.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/errno.h>
49 #include <sys/event.h>
50 #include <sys/poll.h>
51 #include <sys/proc.h>
52 #include <sys/selinfo.h>
53 #include <sys/specialfd.h>
54 #include <sys/sx.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/timespec.h>
57 #include <sys/eventfd.h>
58 
59 #ifdef COMPAT_LINUX32
60 #include <machine/../linux32/linux.h>
61 #include <machine/../linux32/linux32_proto.h>
62 #else
63 #include <machine/../linux/linux.h>
64 #include <machine/../linux/linux_proto.h>
65 #endif
66 
67 #include <compat/linux/linux_emul.h>
68 #include <compat/linux/linux_event.h>
69 #include <compat/linux/linux_file.h>
70 #include <compat/linux/linux_signal.h>
71 #include <compat/linux/linux_timer.h>
72 #include <compat/linux/linux_util.h>
73 
/* 64-bit user data value carried alongside every epoll event. */
typedef uint64_t	epoll_udata_t;

/*
 * Epoll event as exchanged with the Linux process through copyin() and
 * copyout().  The structure is packed on amd64 only.
 * NOTE(review): presumably this mirrors the layout Linux userland uses on
 * each architecture -- confirm against the Linux headers.
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* event bits */
	epoll_udata_t	data;		/* user data, kept in kevent ext[0] */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/*
 * Upper bound for the maxevents argument checked in linux_epoll_wait_ts():
 * the byte size of the event array must not exceed INT_MAX.
 */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
86 
/* Forward declarations of the epoll helpers defined later in this file. */
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
99 
/*
 * Cookie passed to epoll_kev_copyin(): a cursor into an array of kevents
 * that already lives in kernel memory.
 */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next kevent to hand over */
};
103 
/* Cookie passed to epoll_kev_copyout() while delivering events. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next free userspace slot */
	struct proc		*p;		/* set to the calling process */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout() failure */
};
110 
/* timerfd */
/* Type of the expiration counter returned to userspace by timerfd_read(). */
typedef uint64_t	timerfd_t;

/* File operations implemented for timerfd descriptors (see timerfdops). */
static fo_rdwr_t	timerfd_read;
static fo_ioctl_t	timerfd_ioctl;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;
121 
/*
 * File operations vector installed on timerfd descriptors by
 * linux_timerfd_create().  Writing, truncating, chmod, chown and sendfile
 * are routed to the generic invfo_* handlers.
 */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};
137 
static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* Filter operations attached to EVFILT_READ knotes by timerfd_kqfilter(). */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};
146 
/* Per-descriptor timerfd state, stored in the file's f_data. */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* it_value: absolute expiry time */
	struct callout	tfd_callout;	/* runs linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations not yet read */
	bool		tfd_canceled;	/* set when disarmed by settime */
	struct selinfo	tfd_sel;	/* poll/kqueue waiters */
	struct mtx	tfd_lock;	/* also used by callout and knlist */
};
156 
/* Forward declarations of timerfd helpers defined later in this file. */
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
159 
160 static int
161 epoll_create_common(struct thread *td, int flags)
162 {
163 
164 	return (kern_kqueue(td, flags, NULL));
165 }
166 
167 #ifdef LINUX_LEGACY_SYSCALLS
168 int
169 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
170 {
171 
172 	/*
173 	 * args->size is unused. Linux just tests it
174 	 * and then forgets it as well.
175 	 */
176 	if (args->size <= 0)
177 		return (EINVAL);
178 
179 	return (epoll_create_common(td, 0));
180 }
181 #endif
182 
183 int
184 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
185 {
186 	int flags;
187 
188 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
189 		return (EINVAL);
190 
191 	flags = 0;
192 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
193 		flags |= O_CLOEXEC;
194 
195 	return (epoll_create_common(td, flags));
196 }
197 
198 /* Structure converting function from epoll to kevent. */
199 static int
200 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
201     struct kevent *kevent, int *nkevents)
202 {
203 	uint32_t levents = l_event->events;
204 	struct linux_pemuldata *pem;
205 	struct proc *p;
206 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
207 
208 	/* flags related to how event is registered */
209 	if ((levents & LINUX_EPOLLONESHOT) != 0)
210 		kev_flags |= EV_DISPATCH;
211 	if ((levents & LINUX_EPOLLET) != 0)
212 		kev_flags |= EV_CLEAR;
213 	if ((levents & LINUX_EPOLLERR) != 0)
214 		kev_flags |= EV_ERROR;
215 	if ((levents & LINUX_EPOLLRDHUP) != 0)
216 		kev_flags |= EV_EOF;
217 
218 	/* flags related to what event is registered */
219 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
220 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
221 		kevent->ext[0] = l_event->data;
222 		++kevent;
223 		++(*nkevents);
224 	}
225 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
226 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
227 		kevent->ext[0] = l_event->data;
228 		++kevent;
229 		++(*nkevents);
230 	}
231 	/* zero event mask is legal */
232 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
233 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
234 		++(*nkevents);
235 	}
236 
237 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
238 		p = td->td_proc;
239 
240 		pem = pem_find(p);
241 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
242 
243 		LINUX_PEM_XLOCK(pem);
244 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
245 			pem->flags |= LINUX_XUNSUP_EPOLL;
246 			LINUX_PEM_XUNLOCK(pem);
247 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
248 			    levents);
249 		} else
250 			LINUX_PEM_XUNLOCK(pem);
251 		return (EINVAL);
252 	}
253 
254 	return (0);
255 }
256 
257 /*
258  * Structure converting function from kevent to epoll. In a case
259  * this is called on error in registration we store the error in
260  * event->data and pick it up later in linux_epoll_ctl().
261  */
262 static void
263 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
264 {
265 
266 	l_event->data = kevent->ext[0];
267 
268 	if ((kevent->flags & EV_ERROR) != 0) {
269 		l_event->events = LINUX_EPOLLERR;
270 		return;
271 	}
272 
273 	/* XXX EPOLLPRI, EPOLLHUP */
274 	switch (kevent->filter) {
275 	case EVFILT_READ:
276 		l_event->events = LINUX_EPOLLIN;
277 		if ((kevent->flags & EV_EOF) != 0)
278 			l_event->events |= LINUX_EPOLLRDHUP;
279 	break;
280 	case EVFILT_WRITE:
281 		l_event->events = LINUX_EPOLLOUT;
282 	break;
283 	}
284 }
285 
286 /*
287  * Copyout callback used by kevent. This converts kevent
288  * events to epoll events and copies them back to the
289  * userspace. This is also called on error on registering
290  * of the filter.
291  */
292 static int
293 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
294 {
295 	struct epoll_copyout_args *args;
296 	struct epoll_event *eep;
297 	int error, i;
298 
299 	args = (struct epoll_copyout_args*) arg;
300 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
301 
302 	for (i = 0; i < count; i++)
303 		kevent_to_epoll(&kevp[i], &eep[i]);
304 
305 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
306 	if (error == 0) {
307 		args->leventlist += count;
308 		args->count += count;
309 	} else if (args->error == 0)
310 		args->error = error;
311 
312 	free(eep, M_EPOLL);
313 	return (error);
314 }
315 
316 /*
317  * Copyin callback used by kevent. This copies already
318  * converted filters from kernel memory to the kevent
319  * internal kernel memory. Hence the memcpy instead of
320  * copyin.
321  */
322 static int
323 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
324 {
325 	struct epoll_copyin_args *args;
326 
327 	args = (struct epoll_copyin_args*) arg;
328 
329 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
330 	args->changelist += count;
331 
332 	return (0);
333 }
334 
335 /*
336  * Load epoll filter, convert it to kevent filter
337  * and load it into kevent subsystem.
338  */
339 int
340 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
341 {
342 	struct file *epfp, *fp;
343 	struct epoll_copyin_args ciargs;
344 	struct kevent kev[2];
345 	struct kevent_copyops k_ops = { &ciargs,
346 					NULL,
347 					epoll_kev_copyin};
348 	struct epoll_event le;
349 	cap_rights_t rights;
350 	int nchanges = 0;
351 	int error;
352 
353 	if (args->op != LINUX_EPOLL_CTL_DEL) {
354 		error = copyin(args->event, &le, sizeof(le));
355 		if (error != 0)
356 			return (error);
357 	}
358 
359 	error = fget(td, args->epfd,
360 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
361 	if (error != 0)
362 		return (error);
363 	if (epfp->f_type != DTYPE_KQUEUE) {
364 		error = EINVAL;
365 		goto leave1;
366 	}
367 
368 	 /* Protect user data vector from incorrectly supplied fd. */
369 	error = fget(td, args->fd,
370 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
371 	if (error != 0)
372 		goto leave1;
373 
374 	/* Linux disallows spying on himself */
375 	if (epfp == fp) {
376 		error = EINVAL;
377 		goto leave0;
378 	}
379 
380 	ciargs.changelist = kev;
381 
382 	if (args->op != LINUX_EPOLL_CTL_DEL) {
383 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
384 		if (error != 0)
385 			goto leave0;
386 	}
387 
388 	switch (args->op) {
389 	case LINUX_EPOLL_CTL_MOD:
390 		error = epoll_delete_all_events(td, epfp, args->fd);
391 		if (error != 0)
392 			goto leave0;
393 		break;
394 
395 	case LINUX_EPOLL_CTL_ADD:
396 		if (epoll_fd_registered(td, epfp, args->fd)) {
397 			error = EEXIST;
398 			goto leave0;
399 		}
400 		break;
401 
402 	case LINUX_EPOLL_CTL_DEL:
403 		/* CTL_DEL means unregister this fd with this epoll */
404 		error = epoll_delete_all_events(td, epfp, args->fd);
405 		goto leave0;
406 
407 	default:
408 		error = EINVAL;
409 		goto leave0;
410 	}
411 
412 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
413 
414 leave0:
415 	fdrop(fp, td);
416 
417 leave1:
418 	fdrop(epfp, td);
419 	return (error);
420 }
421 
422 /*
423  * Wait for a filter to be triggered on the epoll file descriptor.
424  */
425 
426 static int
427 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
428     int maxevents, struct timespec *tsp, sigset_t *uset)
429 {
430 	struct epoll_copyout_args coargs;
431 	struct kevent_copyops k_ops = { &coargs,
432 					epoll_kev_copyout,
433 					NULL};
434 	cap_rights_t rights;
435 	struct file *epfp;
436 	sigset_t omask;
437 	int error;
438 
439 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
440 		return (EINVAL);
441 
442 	error = fget(td, epfd,
443 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
444 	if (error != 0)
445 		return (error);
446 	if (epfp->f_type != DTYPE_KQUEUE) {
447 		error = EINVAL;
448 		goto leave;
449 	}
450 	if (uset != NULL) {
451 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
452 		    &omask, 0);
453 		if (error != 0)
454 			goto leave;
455 		td->td_pflags |= TDP_OLDMASK;
456 		/*
457 		 * Make sure that ast() is called on return to
458 		 * usermode and TDP_OLDMASK is cleared, restoring old
459 		 * sigmask.
460 		 */
461 		thread_lock(td);
462 		td->td_flags |= TDF_ASTPENDING;
463 		thread_unlock(td);
464 	}
465 
466 	coargs.leventlist = events;
467 	coargs.p = td->td_proc;
468 	coargs.count = 0;
469 	coargs.error = 0;
470 
471 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
472 	if (error == 0 && coargs.error != 0)
473 		error = coargs.error;
474 
475 	/*
476 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
477 	 * Maybe we should translate that but I don't think it matters at all.
478 	 */
479 	if (error == 0)
480 		td->td_retval[0] = coargs.count;
481 
482 	if (uset != NULL)
483 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
484 		    NULL, 0);
485 leave:
486 	fdrop(epfp, td);
487 	return (error);
488 }
489 
490 static int
491 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
492     int maxevents, int timeout, sigset_t *uset)
493 {
494 	struct timespec ts, *tsp;
495 
496 	/*
497 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
498 	 * to block indefinitely. Real implementation does it if any negative
499 	 * timeout value is passed.
500 	 */
501 	if (timeout >= 0) {
502 		/* Convert from milliseconds to timespec. */
503 		ts.tv_sec = timeout / 1000;
504 		ts.tv_nsec = (timeout % 1000) * 1000000;
505 		tsp = &ts;
506 	} else {
507 		tsp = NULL;
508 	}
509 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
510 
511 }
512 
513 #ifdef LINUX_LEGACY_SYSCALLS
514 int
515 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
516 {
517 
518 	return (linux_epoll_wait_common(td, args->epfd, args->events,
519 	    args->maxevents, args->timeout, NULL));
520 }
521 #endif
522 
523 int
524 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
525 {
526 	sigset_t mask, *pmask;
527 	int error;
528 
529 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
530 	    &mask, &pmask);
531 	if (error != 0)
532 		return (error);
533 
534 	return (linux_epoll_wait_common(td, args->epfd, args->events,
535 	    args->maxevents, args->timeout, pmask));
536 }
537 
538 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
539 int
540 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
541 {
542 	struct timespec ts, *tsa;
543 	struct l_timespec64 lts;
544 	sigset_t mask, *pmask;
545 	int error;
546 
547 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
548 	    &mask, &pmask);
549 	if (error != 0)
550 		return (error);
551 
552 	if (args->timeout) {
553 		if ((error = copyin(args->timeout, &lts, sizeof(lts))))
554 			return (error);
555 		error = linux_to_native_timespec64(&ts, &lts);
556 		if (error != 0)
557 			return (error);
558 		tsa = &ts;
559 	} else
560 		tsa = NULL;
561 
562 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
563 	    args->maxevents, tsa, pmask));
564 }
565 #else
566 int
567 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
568 {
569 	struct timespec ts, *tsa;
570 	struct l_timespec lts;
571 	sigset_t mask, *pmask;
572 	int error;
573 
574 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
575 	    &mask, &pmask);
576 	if (error != 0)
577 		return (error);
578 
579 	if (args->timeout) {
580 		if ((error = copyin(args->timeout, &lts, sizeof(lts))))
581 			return (error);
582 		error = linux_to_native_timespec(&ts, &lts);
583 		if (error != 0)
584 			return (error);
585 		tsa = &ts;
586 	} else
587 		tsa = NULL;
588 
589 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
590 	    args->maxevents, tsa, pmask));
591 }
592 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
593 
594 static int
595 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
596     unsigned int flags)
597 {
598 	struct epoll_copyin_args ciargs;
599 	struct kevent kev;
600 	struct kevent_copyops k_ops = { &ciargs,
601 					NULL,
602 					epoll_kev_copyin};
603 
604 	ciargs.changelist = &kev;
605 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
606 
607 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
608 }
609 
610 static int
611 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
612 {
613 	/*
614 	 * Set empty filter flags to avoid accidental modification of already
615 	 * registered events. In the case of event re-registration:
616 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
617 	 * 2. If event does exists, it's enabled/disabled state is preserved
618 	 *    but fflags, data and udata fields are overwritten. So we can not
619 	 *    set socket lowats and store user's context pointer in udata.
620 	 */
621 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
622 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
623 		return (1);
624 
625 	return (0);
626 }
627 
628 static int
629 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
630 {
631 	int error1, error2;
632 
633 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
634 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
635 
636 	/* return 0 if at least one result positive */
637 	return (error1 == 0 ? 0 : error2);
638 }
639 
640 #ifdef LINUX_LEGACY_SYSCALLS
641 int
642 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
643 {
644 	struct specialfd_eventfd ae;
645 
646 	bzero(&ae, sizeof(ae));
647 	ae.initval = args->initval;
648 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
649 }
650 #endif
651 
652 int
653 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
654 {
655 	struct specialfd_eventfd ae;
656 	int flags;
657 
658 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
659 	    LINUX_EFD_SEMAPHORE)) != 0)
660 		return (EINVAL);
661 	flags = 0;
662 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
663 		flags |= EFD_CLOEXEC;
664 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
665 		flags |= EFD_NONBLOCK;
666 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
667 		flags |= EFD_SEMAPHORE;
668 
669 	bzero(&ae, sizeof(ae));
670 	ae.flags = flags;
671 	ae.initval = args->initval;
672 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
673 }
674 
675 int
676 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
677 {
678 	struct timerfd *tfd;
679 	struct file *fp;
680 	clockid_t clockid;
681 	int fflags, fd, error;
682 
683 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
684 		return (EINVAL);
685 
686 	error = linux_to_native_clockid(&clockid, args->clockid);
687 	if (error != 0)
688 		return (error);
689 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
690 		return (EINVAL);
691 
692 	fflags = 0;
693 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
694 		fflags |= O_CLOEXEC;
695 
696 	error = falloc(td, &fp, &fd, fflags);
697 	if (error != 0)
698 		return (error);
699 
700 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
701 	tfd->tfd_clockid = clockid;
702 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
703 
704 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
705 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
706 
707 	fflags = FREAD;
708 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
709 		fflags |= FNONBLOCK;
710 
711 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
712 	fdrop(fp, td);
713 
714 	td->td_retval[0] = fd;
715 	return (error);
716 }
717 
718 static int
719 timerfd_close(struct file *fp, struct thread *td)
720 {
721 	struct timerfd *tfd;
722 
723 	tfd = fp->f_data;
724 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
725 		return (EINVAL);
726 
727 	timespecclear(&tfd->tfd_time.it_value);
728 	timespecclear(&tfd->tfd_time.it_interval);
729 
730 	callout_drain(&tfd->tfd_callout);
731 
732 	seldrain(&tfd->tfd_sel);
733 	knlist_destroy(&tfd->tfd_sel.si_note);
734 
735 	fp->f_ops = &badfileops;
736 	mtx_destroy(&tfd->tfd_lock);
737 	free(tfd, M_EPOLL);
738 
739 	return (0);
740 }
741 
742 static int
743 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
744     int flags, struct thread *td)
745 {
746 	struct timerfd *tfd;
747 	timerfd_t count;
748 	int error;
749 
750 	tfd = fp->f_data;
751 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
752 		return (EINVAL);
753 
754 	if (uio->uio_resid < sizeof(timerfd_t))
755 		return (EINVAL);
756 
757 	error = 0;
758 	mtx_lock(&tfd->tfd_lock);
759 retry:
760 	if (tfd->tfd_canceled) {
761 		tfd->tfd_count = 0;
762 		mtx_unlock(&tfd->tfd_lock);
763 		return (ECANCELED);
764 	}
765 	if (tfd->tfd_count == 0) {
766 		if ((fp->f_flag & FNONBLOCK) != 0) {
767 			mtx_unlock(&tfd->tfd_lock);
768 			return (EAGAIN);
769 		}
770 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
771 		if (error == 0)
772 			goto retry;
773 	}
774 	if (error == 0) {
775 		count = tfd->tfd_count;
776 		tfd->tfd_count = 0;
777 		mtx_unlock(&tfd->tfd_lock);
778 		error = uiomove(&count, sizeof(timerfd_t), uio);
779 	} else
780 		mtx_unlock(&tfd->tfd_lock);
781 
782 	return (error);
783 }
784 
785 static int
786 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
787     struct thread *td)
788 {
789 	struct timerfd *tfd;
790 	int revents = 0;
791 
792 	tfd = fp->f_data;
793 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
794 		return (POLLERR);
795 
796 	mtx_lock(&tfd->tfd_lock);
797 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
798 		revents |= events & (POLLIN|POLLRDNORM);
799 	if (revents == 0)
800 		selrecord(td, &tfd->tfd_sel);
801 	mtx_unlock(&tfd->tfd_lock);
802 
803 	return (revents);
804 }
805 
806 static int
807 timerfd_kqfilter(struct file *fp, struct knote *kn)
808 {
809 	struct timerfd *tfd;
810 
811 	tfd = fp->f_data;
812 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
813 		return (EINVAL);
814 
815 	if (kn->kn_filter == EVFILT_READ)
816 		kn->kn_fop = &timerfd_rfiltops;
817 	else
818 		return (EINVAL);
819 
820 	kn->kn_hook = tfd;
821 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
822 
823 	return (0);
824 }
825 
826 static void
827 filt_timerfddetach(struct knote *kn)
828 {
829 	struct timerfd *tfd = kn->kn_hook;
830 
831 	mtx_lock(&tfd->tfd_lock);
832 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
833 	mtx_unlock(&tfd->tfd_lock);
834 }
835 
836 static int
837 filt_timerfdread(struct knote *kn, long hint)
838 {
839 	struct timerfd *tfd = kn->kn_hook;
840 
841 	return (tfd->tfd_count > 0);
842 }
843 
844 static int
845 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
846     struct ucred *active_cred, struct thread *td)
847 {
848 
849 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
850 		return (EINVAL);
851 
852 	switch (cmd) {
853 	case FIONBIO:
854 	case FIOASYNC:
855 		return (0);
856 	}
857 
858 	return (ENOTTY);
859 }
860 
861 static int
862 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
863 {
864 
865 	return (ENXIO);
866 }
867 
868 static int
869 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
870 {
871 
872 	kif->kf_type = KF_TYPE_UNKNOWN;
873 	return (0);
874 }
875 
876 static void
877 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
878 {
879 
880 	if (tfd->tfd_clockid == CLOCK_REALTIME)
881 		getnanotime(ts);
882 	else	/* CLOCK_MONOTONIC */
883 		getnanouptime(ts);
884 }
885 
886 static void
887 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
888 {
889 	struct timespec cts;
890 
891 	linux_timerfd_clocktime(tfd, &cts);
892 	*ots = tfd->tfd_time;
893 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
894 		timespecsub(&ots->it_value, &cts, &ots->it_value);
895 		if (ots->it_value.tv_sec < 0 ||
896 		    (ots->it_value.tv_sec == 0 &&
897 		     ots->it_value.tv_nsec == 0)) {
898 			ots->it_value.tv_sec  = 0;
899 			ots->it_value.tv_nsec = 1;
900 		}
901 	}
902 }
903 
904 int
905 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
906 {
907 	struct l_itimerspec lots;
908 	struct itimerspec ots;
909 	struct timerfd *tfd;
910 	struct file *fp;
911 	int error;
912 
913 	error = fget(td, args->fd, &cap_read_rights, &fp);
914 	if (error != 0)
915 		return (error);
916 	tfd = fp->f_data;
917 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
918 		error = EINVAL;
919 		goto out;
920 	}
921 
922 	mtx_lock(&tfd->tfd_lock);
923 	linux_timerfd_curval(tfd, &ots);
924 	mtx_unlock(&tfd->tfd_lock);
925 
926 	error = native_to_linux_itimerspec(&lots, &ots);
927 	if (error == 0)
928 		error = copyout(&lots, args->old_value, sizeof(lots));
929 
930 out:
931 	fdrop(fp, td);
932 	return (error);
933 }
934 
935 int
936 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
937 {
938 	struct l_itimerspec lots;
939 	struct itimerspec nts, ots;
940 	struct timespec cts, ts;
941 	struct timerfd *tfd;
942 	struct timeval tv;
943 	struct file *fp;
944 	int error;
945 
946 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
947 		return (EINVAL);
948 
949 	error = copyin(args->new_value, &lots, sizeof(lots));
950 	if (error != 0)
951 		return (error);
952 	error = linux_to_native_itimerspec(&nts, &lots);
953 	if (error != 0)
954 		return (error);
955 
956 	error = fget(td, args->fd, &cap_write_rights, &fp);
957 	if (error != 0)
958 		return (error);
959 	tfd = fp->f_data;
960 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
961 		error = EINVAL;
962 		goto out;
963 	}
964 
965 	mtx_lock(&tfd->tfd_lock);
966 	if (!timespecisset(&nts.it_value))
967 		timespecclear(&nts.it_interval);
968 	if (args->old_value != NULL)
969 		linux_timerfd_curval(tfd, &ots);
970 
971 	tfd->tfd_time = nts;
972 	tfd->tfd_count = 0;
973 	if (timespecisset(&nts.it_value)) {
974 		linux_timerfd_clocktime(tfd, &cts);
975 		ts = nts.it_value;
976 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
977 			timespecadd(&tfd->tfd_time.it_value, &cts,
978 				&tfd->tfd_time.it_value);
979 		} else {
980 			timespecsub(&ts, &cts, &ts);
981 		}
982 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
983 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
984 			linux_timerfd_expire, tfd);
985 		tfd->tfd_canceled = false;
986 	} else {
987 		tfd->tfd_canceled = true;
988 		callout_stop(&tfd->tfd_callout);
989 	}
990 	mtx_unlock(&tfd->tfd_lock);
991 
992 	if (args->old_value != NULL) {
993 		error = native_to_linux_itimerspec(&lots, &ots);
994 		if (error == 0)
995 			error = copyout(&lots, args->old_value, sizeof(lots));
996 	}
997 
998 out:
999 	fdrop(fp, td);
1000 	return (error);
1001 }
1002 
1003 static void
1004 linux_timerfd_expire(void *arg)
1005 {
1006 	struct timespec cts, ts;
1007 	struct timeval tv;
1008 	struct timerfd *tfd;
1009 
1010 	tfd = (struct timerfd *)arg;
1011 
1012 	linux_timerfd_clocktime(tfd, &cts);
1013 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1014 		if (timespecisset(&tfd->tfd_time.it_interval))
1015 			timespecadd(&tfd->tfd_time.it_value,
1016 				    &tfd->tfd_time.it_interval,
1017 				    &tfd->tfd_time.it_value);
1018 		else
1019 			/* single shot timer */
1020 			timespecclear(&tfd->tfd_time.it_value);
1021 		if (timespecisset(&tfd->tfd_time.it_value)) {
1022 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1023 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
1024 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1025 				linux_timerfd_expire, tfd);
1026 		}
1027 		tfd->tfd_count++;
1028 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1029 		selwakeup(&tfd->tfd_sel);
1030 		wakeup(&tfd->tfd_count);
1031 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
1032 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1033 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
1034 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1035 		    linux_timerfd_expire, tfd);
1036 	}
1037 }
1038