xref: /freebsd/sys/compat/linux/linux_event.c (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/imgact.h>
37 #include <sys/kernel.h>
38 #include <sys/limits.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/callout.h>
42 #include <sys/capsicum.h>
43 #include <sys/types.h>
44 #include <sys/user.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/errno.h>
49 #include <sys/event.h>
50 #include <sys/poll.h>
51 #include <sys/proc.h>
52 #include <sys/selinfo.h>
53 #include <sys/specialfd.h>
54 #include <sys/sx.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/timespec.h>
57 #include <sys/eventfd.h>
58 
59 #ifdef COMPAT_LINUX32
60 #include <machine/../linux32/linux.h>
61 #include <machine/../linux32/linux32_proto.h>
62 #else
63 #include <machine/../linux/linux.h>
64 #include <machine/../linux/linux_proto.h>
65 #endif
66 
67 #include <compat/linux/linux_emul.h>
68 #include <compat/linux/linux_event.h>
69 #include <compat/linux/linux_file.h>
70 #include <compat/linux/linux_signal.h>
71 #include <compat/linux/linux_timer.h>
72 #include <compat/linux/linux_util.h>
73 
/* 64-bit user data word carried alongside each epoll event. */
typedef uint64_t	epoll_udata_t;

/*
 * Event record exchanged with userspace through copyin()/copyout():
 * a mask of LINUX_EPOLL* bits plus the caller's data word.
 * The structure is packed on amd64 only (presumably to match the
 * Linux ABI layout there -- TODO confirm against the Linux headers).
 */
struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Largest event count whose array size in bytes does not exceed INT_MAX. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
86 
87 static int	epoll_to_kevent(struct thread *td, int fd,
88 		    struct epoll_event *l_event, struct kevent *kevent,
89 		    int *nkevents);
90 static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
91 static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
92 static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
93 static int	epoll_register_kevent(struct thread *td, struct file *epfp,
94 		    int fd, int filter, unsigned int flags);
95 static int	epoll_fd_registered(struct thread *td, struct file *epfp,
96 		    int fd);
97 static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
98 		    int fd);
99 
/*
 * Cursor handed to epoll_kev_copyin(): points at the next kernel-resident
 * change that has not yet been fed to kevent.
 */
struct epoll_copyin_args {
	struct kevent	*changelist;
};

/* Bookkeeping handed to epoll_kev_copyout() while events are returned. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next free slot in the user buffer */
	struct proc		*p;		/* process of the calling thread */
	uint32_t		count;		/* events successfully copied out */
	int			error;		/* first copyout() failure, 0 if none */
};
110 
111 /* timerfd */
/* Expiration counter type; timerfd_read() transfers exactly one of these. */
typedef uint64_t	timerfd_t;

/* File operation handlers implemented further down in this file. */
static fo_rdwr_t	timerfd_read;
static fo_ioctl_t	timerfd_ioctl;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

/*
 * File operations table installed on timerfd descriptors by
 * linux_timerfd_create().  Write, truncate, chmod, chown and sendfile
 * are routed to the generic invfo_* handlers.
 */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};
137 
/* knote callbacks backing the timerfd read filter. */
static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* Filter installed by timerfd_kqfilter() on EVFILT_READ knotes. */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};
146 
/*
 * Per-descriptor timerfd state, allocated in linux_timerfd_create() and
 * released in timerfd_close().
 */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* it_value: expiry on tfd_clockid; it_interval: reload period */
	struct callout	tfd_callout;	/* runs linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations not yet consumed by read */
	bool		tfd_canceled;	/* set when settime disarms the timer; read then fails with ECANCELED */
	struct selinfo	tfd_sel;	/* poll/select waiters and knote list */
	struct mtx	tfd_lock;	/* also backs tfd_callout and the knote list */
};
156 
157 static void	linux_timerfd_expire(void *);
158 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
159 
160 static int
161 epoll_create_common(struct thread *td, int flags)
162 {
163 
164 	return (kern_kqueue(td, flags, NULL));
165 }
166 
167 #ifdef LINUX_LEGACY_SYSCALLS
168 int
169 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
170 {
171 
172 	/*
173 	 * args->size is unused. Linux just tests it
174 	 * and then forgets it as well.
175 	 */
176 	if (args->size <= 0)
177 		return (EINVAL);
178 
179 	return (epoll_create_common(td, 0));
180 }
181 #endif
182 
183 int
184 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
185 {
186 	int flags;
187 
188 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
189 		return (EINVAL);
190 
191 	flags = 0;
192 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
193 		flags |= O_CLOEXEC;
194 
195 	return (epoll_create_common(td, flags));
196 }
197 
198 /* Structure converting function from epoll to kevent. */
199 static int
200 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
201     struct kevent *kevent, int *nkevents)
202 {
203 	uint32_t levents = l_event->events;
204 	struct linux_pemuldata *pem;
205 	struct proc *p;
206 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
207 
208 	/* flags related to how event is registered */
209 	if ((levents & LINUX_EPOLLONESHOT) != 0)
210 		kev_flags |= EV_DISPATCH;
211 	if ((levents & LINUX_EPOLLET) != 0)
212 		kev_flags |= EV_CLEAR;
213 	if ((levents & LINUX_EPOLLERR) != 0)
214 		kev_flags |= EV_ERROR;
215 	if ((levents & LINUX_EPOLLRDHUP) != 0)
216 		kev_flags |= EV_EOF;
217 
218 	/* flags related to what event is registered */
219 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
220 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
221 		kevent->ext[0] = l_event->data;
222 		++kevent;
223 		++(*nkevents);
224 	}
225 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
226 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
227 		kevent->ext[0] = l_event->data;
228 		++kevent;
229 		++(*nkevents);
230 	}
231 	/* zero event mask is legal */
232 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
233 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
234 		++(*nkevents);
235 	}
236 
237 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
238 		p = td->td_proc;
239 
240 		pem = pem_find(p);
241 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
242 
243 		LINUX_PEM_XLOCK(pem);
244 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
245 			pem->flags |= LINUX_XUNSUP_EPOLL;
246 			LINUX_PEM_XUNLOCK(pem);
247 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
248 			    levents);
249 		} else
250 			LINUX_PEM_XUNLOCK(pem);
251 		return (EINVAL);
252 	}
253 
254 	return (0);
255 }
256 
257 /*
258  * Structure converting function from kevent to epoll. In a case
259  * this is called on error in registration we store the error in
260  * event->data and pick it up later in linux_epoll_ctl().
261  */
262 static void
263 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
264 {
265 
266 	l_event->data = kevent->ext[0];
267 
268 	if ((kevent->flags & EV_ERROR) != 0) {
269 		l_event->events = LINUX_EPOLLERR;
270 		return;
271 	}
272 
273 	/* XXX EPOLLPRI, EPOLLHUP */
274 	switch (kevent->filter) {
275 	case EVFILT_READ:
276 		l_event->events = LINUX_EPOLLIN;
277 		if ((kevent->flags & EV_EOF) != 0)
278 			l_event->events |= LINUX_EPOLLRDHUP;
279 	break;
280 	case EVFILT_WRITE:
281 		l_event->events = LINUX_EPOLLOUT;
282 	break;
283 	}
284 }
285 
286 /*
287  * Copyout callback used by kevent. This converts kevent
288  * events to epoll events and copies them back to the
289  * userspace. This is also called on error on registering
290  * of the filter.
291  */
292 static int
293 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
294 {
295 	struct epoll_copyout_args *args;
296 	struct epoll_event *eep;
297 	int error, i;
298 
299 	args = (struct epoll_copyout_args*) arg;
300 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
301 
302 	for (i = 0; i < count; i++)
303 		kevent_to_epoll(&kevp[i], &eep[i]);
304 
305 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
306 	if (error == 0) {
307 		args->leventlist += count;
308 		args->count += count;
309 	} else if (args->error == 0)
310 		args->error = error;
311 
312 	free(eep, M_EPOLL);
313 	return (error);
314 }
315 
316 /*
317  * Copyin callback used by kevent. This copies already
318  * converted filters from kernel memory to the kevent
319  * internal kernel memory. Hence the memcpy instead of
320  * copyin.
321  */
322 static int
323 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
324 {
325 	struct epoll_copyin_args *args;
326 
327 	args = (struct epoll_copyin_args*) arg;
328 
329 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
330 	args->changelist += count;
331 
332 	return (0);
333 }
334 
335 /*
336  * Load epoll filter, convert it to kevent filter
337  * and load it into kevent subsystem.
338  */
339 int
340 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
341 {
342 	struct file *epfp, *fp;
343 	struct epoll_copyin_args ciargs;
344 	struct kevent kev[2];
345 	struct kevent_copyops k_ops = { &ciargs,
346 					NULL,
347 					epoll_kev_copyin};
348 	struct epoll_event le;
349 	cap_rights_t rights;
350 	int nchanges = 0;
351 	int error;
352 
353 	if (args->op != LINUX_EPOLL_CTL_DEL) {
354 		error = copyin(args->event, &le, sizeof(le));
355 		if (error != 0)
356 			return (error);
357 	}
358 
359 	error = fget(td, args->epfd,
360 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
361 	if (error != 0)
362 		return (error);
363 	if (epfp->f_type != DTYPE_KQUEUE) {
364 		error = EINVAL;
365 		goto leave1;
366 	}
367 
368 	 /* Protect user data vector from incorrectly supplied fd. */
369 	error = fget(td, args->fd,
370 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
371 	if (error != 0)
372 		goto leave1;
373 
374 	/* Linux disallows spying on himself */
375 	if (epfp == fp) {
376 		error = EINVAL;
377 		goto leave0;
378 	}
379 
380 	ciargs.changelist = kev;
381 
382 	if (args->op != LINUX_EPOLL_CTL_DEL) {
383 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
384 		if (error != 0)
385 			goto leave0;
386 	}
387 
388 	switch (args->op) {
389 	case LINUX_EPOLL_CTL_MOD:
390 		error = epoll_delete_all_events(td, epfp, args->fd);
391 		if (error != 0)
392 			goto leave0;
393 		break;
394 
395 	case LINUX_EPOLL_CTL_ADD:
396 		if (epoll_fd_registered(td, epfp, args->fd)) {
397 			error = EEXIST;
398 			goto leave0;
399 		}
400 		break;
401 
402 	case LINUX_EPOLL_CTL_DEL:
403 		/* CTL_DEL means unregister this fd with this epoll */
404 		error = epoll_delete_all_events(td, epfp, args->fd);
405 		goto leave0;
406 
407 	default:
408 		error = EINVAL;
409 		goto leave0;
410 	}
411 
412 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
413 
414 leave0:
415 	fdrop(fp, td);
416 
417 leave1:
418 	fdrop(epfp, td);
419 	return (error);
420 }
421 
422 /*
423  * Wait for a filter to be triggered on the epoll file descriptor.
424  */
425 
426 static int
427 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
428     int maxevents, struct timespec *tsp, sigset_t *uset)
429 {
430 	struct epoll_copyout_args coargs;
431 	struct kevent_copyops k_ops = { &coargs,
432 					epoll_kev_copyout,
433 					NULL};
434 	cap_rights_t rights;
435 	struct file *epfp;
436 	sigset_t omask;
437 	int error;
438 
439 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
440 		return (EINVAL);
441 
442 	error = fget(td, epfd,
443 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
444 	if (error != 0)
445 		return (error);
446 	if (epfp->f_type != DTYPE_KQUEUE) {
447 		error = EINVAL;
448 		goto leave;
449 	}
450 	if (uset != NULL) {
451 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
452 		    &omask, 0);
453 		if (error != 0)
454 			goto leave;
455 		td->td_pflags |= TDP_OLDMASK;
456 		/*
457 		 * Make sure that ast() is called on return to
458 		 * usermode and TDP_OLDMASK is cleared, restoring old
459 		 * sigmask.
460 		 */
461 		thread_lock(td);
462 		td->td_flags |= TDF_ASTPENDING;
463 		thread_unlock(td);
464 	}
465 
466 	coargs.leventlist = events;
467 	coargs.p = td->td_proc;
468 	coargs.count = 0;
469 	coargs.error = 0;
470 
471 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
472 	if (error == 0 && coargs.error != 0)
473 		error = coargs.error;
474 
475 	/*
476 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
477 	 * Maybe we should translate that but I don't think it matters at all.
478 	 */
479 	if (error == 0)
480 		td->td_retval[0] = coargs.count;
481 
482 	if (uset != NULL)
483 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
484 		    NULL, 0);
485 leave:
486 	fdrop(epfp, td);
487 	return (error);
488 }
489 
490 static int
491 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
492     int maxevents, int timeout, sigset_t *uset)
493 {
494 	struct timespec ts, *tsp;
495 
496 	/*
497 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
498 	 * to block indefinitely. Real implementation does it if any negative
499 	 * timeout value is passed.
500 	 */
501 	if (timeout >= 0) {
502 		/* Convert from milliseconds to timespec. */
503 		ts.tv_sec = timeout / 1000;
504 		ts.tv_nsec = (timeout % 1000) * 1000000;
505 		tsp = &ts;
506 	} else {
507 		tsp = NULL;
508 	}
509 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
510 
511 }
512 
513 #ifdef LINUX_LEGACY_SYSCALLS
514 int
515 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
516 {
517 
518 	return (linux_epoll_wait_common(td, args->epfd, args->events,
519 	    args->maxevents, args->timeout, NULL));
520 }
521 #endif
522 
523 int
524 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
525 {
526 	sigset_t mask, *pmask;
527 	int error;
528 
529 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
530 	    &mask, &pmask);
531 	if (error != 0)
532 		return (error);
533 
534 	return (linux_epoll_wait_common(td, args->epfd, args->events,
535 	    args->maxevents, args->timeout, pmask));
536 }
537 
538 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
539 int
540 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
541 {
542 	struct timespec ts, *tsa;
543 	sigset_t mask, *pmask;
544 	int error;
545 
546 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
547 	    &mask, &pmask);
548 	if (error != 0)
549 		return (error);
550 
551 	if (args->timeout) {
552 		error = linux_get_timespec64(&ts, args->timeout);
553 		if (error != 0)
554 			return (error);
555 		tsa = &ts;
556 	} else
557 		tsa = NULL;
558 
559 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
560 	    args->maxevents, tsa, pmask));
561 }
562 #else
563 int
564 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
565 {
566 	struct timespec ts, *tsa;
567 	sigset_t mask, *pmask;
568 	int error;
569 
570 	error = linux_copyin_sigset(args->mask, sizeof(l_sigset_t),
571 	    &mask, &pmask);
572 	if (error != 0)
573 		return (error);
574 
575 	if (args->timeout) {
576 		error = linux_get_timespec(&ts, args->timeout);
577 		if (error != 0)
578 			return (error);
579 		tsa = &ts;
580 	} else
581 		tsa = NULL;
582 
583 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
584 	    args->maxevents, tsa, pmask));
585 }
586 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
587 
588 static int
589 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
590     unsigned int flags)
591 {
592 	struct epoll_copyin_args ciargs;
593 	struct kevent kev;
594 	struct kevent_copyops k_ops = { &ciargs,
595 					NULL,
596 					epoll_kev_copyin};
597 
598 	ciargs.changelist = &kev;
599 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
600 
601 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
602 }
603 
604 static int
605 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
606 {
607 	/*
608 	 * Set empty filter flags to avoid accidental modification of already
609 	 * registered events. In the case of event re-registration:
610 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
611 	 * 2. If event does exists, it's enabled/disabled state is preserved
612 	 *    but fflags, data and udata fields are overwritten. So we can not
613 	 *    set socket lowats and store user's context pointer in udata.
614 	 */
615 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
616 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
617 		return (1);
618 
619 	return (0);
620 }
621 
622 static int
623 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
624 {
625 	int error1, error2;
626 
627 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
628 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
629 
630 	/* return 0 if at least one result positive */
631 	return (error1 == 0 ? 0 : error2);
632 }
633 
634 #ifdef LINUX_LEGACY_SYSCALLS
635 int
636 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
637 {
638 	struct specialfd_eventfd ae;
639 
640 	bzero(&ae, sizeof(ae));
641 	ae.initval = args->initval;
642 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
643 }
644 #endif
645 
646 int
647 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
648 {
649 	struct specialfd_eventfd ae;
650 	int flags;
651 
652 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
653 	    LINUX_EFD_SEMAPHORE)) != 0)
654 		return (EINVAL);
655 	flags = 0;
656 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
657 		flags |= EFD_CLOEXEC;
658 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
659 		flags |= EFD_NONBLOCK;
660 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
661 		flags |= EFD_SEMAPHORE;
662 
663 	bzero(&ae, sizeof(ae));
664 	ae.flags = flags;
665 	ae.initval = args->initval;
666 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
667 }
668 
669 int
670 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
671 {
672 	struct timerfd *tfd;
673 	struct file *fp;
674 	clockid_t clockid;
675 	int fflags, fd, error;
676 
677 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
678 		return (EINVAL);
679 
680 	error = linux_to_native_clockid(&clockid, args->clockid);
681 	if (error != 0)
682 		return (error);
683 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
684 		return (EINVAL);
685 
686 	fflags = 0;
687 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
688 		fflags |= O_CLOEXEC;
689 
690 	error = falloc(td, &fp, &fd, fflags);
691 	if (error != 0)
692 		return (error);
693 
694 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
695 	tfd->tfd_clockid = clockid;
696 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
697 
698 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
699 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
700 
701 	fflags = FREAD;
702 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
703 		fflags |= FNONBLOCK;
704 
705 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
706 	fdrop(fp, td);
707 
708 	td->td_retval[0] = fd;
709 	return (error);
710 }
711 
712 static int
713 timerfd_close(struct file *fp, struct thread *td)
714 {
715 	struct timerfd *tfd;
716 
717 	tfd = fp->f_data;
718 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
719 		return (EINVAL);
720 
721 	timespecclear(&tfd->tfd_time.it_value);
722 	timespecclear(&tfd->tfd_time.it_interval);
723 
724 	callout_drain(&tfd->tfd_callout);
725 
726 	seldrain(&tfd->tfd_sel);
727 	knlist_destroy(&tfd->tfd_sel.si_note);
728 
729 	fp->f_ops = &badfileops;
730 	mtx_destroy(&tfd->tfd_lock);
731 	free(tfd, M_EPOLL);
732 
733 	return (0);
734 }
735 
736 static int
737 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
738     int flags, struct thread *td)
739 {
740 	struct timerfd *tfd;
741 	timerfd_t count;
742 	int error;
743 
744 	tfd = fp->f_data;
745 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
746 		return (EINVAL);
747 
748 	if (uio->uio_resid < sizeof(timerfd_t))
749 		return (EINVAL);
750 
751 	error = 0;
752 	mtx_lock(&tfd->tfd_lock);
753 retry:
754 	if (tfd->tfd_canceled) {
755 		tfd->tfd_count = 0;
756 		mtx_unlock(&tfd->tfd_lock);
757 		return (ECANCELED);
758 	}
759 	if (tfd->tfd_count == 0) {
760 		if ((fp->f_flag & FNONBLOCK) != 0) {
761 			mtx_unlock(&tfd->tfd_lock);
762 			return (EAGAIN);
763 		}
764 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
765 		if (error == 0)
766 			goto retry;
767 	}
768 	if (error == 0) {
769 		count = tfd->tfd_count;
770 		tfd->tfd_count = 0;
771 		mtx_unlock(&tfd->tfd_lock);
772 		error = uiomove(&count, sizeof(timerfd_t), uio);
773 	} else
774 		mtx_unlock(&tfd->tfd_lock);
775 
776 	return (error);
777 }
778 
779 static int
780 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
781     struct thread *td)
782 {
783 	struct timerfd *tfd;
784 	int revents = 0;
785 
786 	tfd = fp->f_data;
787 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
788 		return (POLLERR);
789 
790 	mtx_lock(&tfd->tfd_lock);
791 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
792 		revents |= events & (POLLIN|POLLRDNORM);
793 	if (revents == 0)
794 		selrecord(td, &tfd->tfd_sel);
795 	mtx_unlock(&tfd->tfd_lock);
796 
797 	return (revents);
798 }
799 
800 static int
801 timerfd_kqfilter(struct file *fp, struct knote *kn)
802 {
803 	struct timerfd *tfd;
804 
805 	tfd = fp->f_data;
806 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
807 		return (EINVAL);
808 
809 	if (kn->kn_filter == EVFILT_READ)
810 		kn->kn_fop = &timerfd_rfiltops;
811 	else
812 		return (EINVAL);
813 
814 	kn->kn_hook = tfd;
815 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
816 
817 	return (0);
818 }
819 
820 static void
821 filt_timerfddetach(struct knote *kn)
822 {
823 	struct timerfd *tfd = kn->kn_hook;
824 
825 	mtx_lock(&tfd->tfd_lock);
826 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
827 	mtx_unlock(&tfd->tfd_lock);
828 }
829 
830 static int
831 filt_timerfdread(struct knote *kn, long hint)
832 {
833 	struct timerfd *tfd = kn->kn_hook;
834 
835 	return (tfd->tfd_count > 0);
836 }
837 
838 static int
839 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
840     struct ucred *active_cred, struct thread *td)
841 {
842 
843 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
844 		return (EINVAL);
845 
846 	switch (cmd) {
847 	case FIONBIO:
848 	case FIOASYNC:
849 		return (0);
850 	}
851 
852 	return (ENOTTY);
853 }
854 
855 static int
856 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
857 {
858 
859 	return (ENXIO);
860 }
861 
862 static int
863 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
864 {
865 
866 	kif->kf_type = KF_TYPE_UNKNOWN;
867 	return (0);
868 }
869 
870 static void
871 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
872 {
873 
874 	if (tfd->tfd_clockid == CLOCK_REALTIME)
875 		getnanotime(ts);
876 	else	/* CLOCK_MONOTONIC */
877 		getnanouptime(ts);
878 }
879 
880 static void
881 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
882 {
883 	struct timespec cts;
884 
885 	linux_timerfd_clocktime(tfd, &cts);
886 	*ots = tfd->tfd_time;
887 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
888 		timespecsub(&ots->it_value, &cts, &ots->it_value);
889 		if (ots->it_value.tv_sec < 0 ||
890 		    (ots->it_value.tv_sec == 0 &&
891 		     ots->it_value.tv_nsec == 0)) {
892 			ots->it_value.tv_sec  = 0;
893 			ots->it_value.tv_nsec = 1;
894 		}
895 	}
896 }
897 
898 static int
899 linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
900 {
901 	struct timerfd *tfd;
902 	struct file *fp;
903 	int error;
904 
905 	error = fget(td, fd, &cap_read_rights, &fp);
906 	if (error != 0)
907 		return (error);
908 	tfd = fp->f_data;
909 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
910 		error = EINVAL;
911 		goto out;
912 	}
913 
914 	mtx_lock(&tfd->tfd_lock);
915 	linux_timerfd_curval(tfd, ots);
916 	mtx_unlock(&tfd->tfd_lock);
917 
918 out:
919 	fdrop(fp, td);
920 	return (error);
921 }
922 
923 int
924 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
925 {
926 	struct l_itimerspec lots;
927 	struct itimerspec ots;
928 	int error;
929 
930 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
931 	if (error != 0)
932 		return (error);
933 	error = native_to_linux_itimerspec(&lots, &ots);
934 	if (error == 0)
935 		error = copyout(&lots, args->old_value, sizeof(lots));
936 	return (error);
937 }
938 
939 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
940 int
941 linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
942 {
943 	struct l_itimerspec64 lots;
944 	struct itimerspec ots;
945 	int error;
946 
947 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
948 	if (error != 0)
949 		return (error);
950 	error = native_to_linux_itimerspec64(&lots, &ots);
951 	if (error == 0)
952 		error = copyout(&lots, args->old_value, sizeof(lots));
953 	return (error);
954 }
955 #endif
956 
957 static int
958 linux_timerfd_settime_common(struct thread *td, int fd, int flags,
959     struct itimerspec *nts, struct itimerspec *oval)
960 {
961 	struct timespec cts, ts;
962 	struct timerfd *tfd;
963 	struct timeval tv;
964 	struct file *fp;
965 	int error;
966 
967 	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
968 		return (EINVAL);
969 
970 	error = fget(td, fd, &cap_write_rights, &fp);
971 	if (error != 0)
972 		return (error);
973 	tfd = fp->f_data;
974 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
975 		error = EINVAL;
976 		goto out;
977 	}
978 
979 	mtx_lock(&tfd->tfd_lock);
980 	if (!timespecisset(&nts->it_value))
981 		timespecclear(&nts->it_interval);
982 	if (oval != NULL)
983 		linux_timerfd_curval(tfd, oval);
984 
985 	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
986 	tfd->tfd_count = 0;
987 	if (timespecisset(&nts->it_value)) {
988 		linux_timerfd_clocktime(tfd, &cts);
989 		ts = nts->it_value;
990 		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
991 			timespecadd(&tfd->tfd_time.it_value, &cts,
992 				&tfd->tfd_time.it_value);
993 		} else {
994 			timespecsub(&ts, &cts, &ts);
995 		}
996 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
997 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
998 			linux_timerfd_expire, tfd);
999 		tfd->tfd_canceled = false;
1000 	} else {
1001 		tfd->tfd_canceled = true;
1002 		callout_stop(&tfd->tfd_callout);
1003 	}
1004 	mtx_unlock(&tfd->tfd_lock);
1005 
1006 out:
1007 	fdrop(fp, td);
1008 	return (error);
1009 }
1010 
1011 int
1012 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1013 {
1014 	struct l_itimerspec lots;
1015 	struct itimerspec nts, ots, *pots;
1016 	int error;
1017 
1018 	error = copyin(args->new_value, &lots, sizeof(lots));
1019 	if (error != 0)
1020 		return (error);
1021 	error = linux_to_native_itimerspec(&nts, &lots);
1022 	if (error != 0)
1023 		return (error);
1024 	pots = (args->old_value != NULL ? &ots : NULL);
1025 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1026 	    &nts, pots);
1027 	if (error == 0 && args->old_value != NULL) {
1028 		error = native_to_linux_itimerspec(&lots, &ots);
1029 		if (error == 0)
1030 			error = copyout(&lots, args->old_value, sizeof(lots));
1031 	}
1032 	return (error);
1033 }
1034 
1035 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1036 int
1037 linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
1038 {
1039 	struct l_itimerspec64 lots;
1040 	struct itimerspec nts, ots, *pots;
1041 	int error;
1042 
1043 	error = copyin(args->new_value, &lots, sizeof(lots));
1044 	if (error != 0)
1045 		return (error);
1046 	error = linux_to_native_itimerspec64(&nts, &lots);
1047 	if (error != 0)
1048 		return (error);
1049 	pots = (args->old_value != NULL ? &ots : NULL);
1050 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1051 	    &nts, pots);
1052 	if (error == 0 && args->old_value != NULL) {
1053 		error = native_to_linux_itimerspec64(&lots, &ots);
1054 		if (error == 0)
1055 			error = copyout(&lots, args->old_value, sizeof(lots));
1056 	}
1057 	return (error);
1058 }
1059 #endif
1060 
1061 static void
1062 linux_timerfd_expire(void *arg)
1063 {
1064 	struct timespec cts, ts;
1065 	struct timeval tv;
1066 	struct timerfd *tfd;
1067 
1068 	tfd = (struct timerfd *)arg;
1069 
1070 	linux_timerfd_clocktime(tfd, &cts);
1071 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1072 		if (timespecisset(&tfd->tfd_time.it_interval))
1073 			timespecadd(&tfd->tfd_time.it_value,
1074 				    &tfd->tfd_time.it_interval,
1075 				    &tfd->tfd_time.it_value);
1076 		else
1077 			/* single shot timer */
1078 			timespecclear(&tfd->tfd_time.it_value);
1079 		if (timespecisset(&tfd->tfd_time.it_value)) {
1080 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1081 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
1082 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1083 				linux_timerfd_expire, tfd);
1084 		}
1085 		tfd->tfd_count++;
1086 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1087 		selwakeup(&tfd->tfd_sel);
1088 		wakeup(&tfd->tfd_count);
1089 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
1090 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1091 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
1092 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1093 		    linux_timerfd_expire, tfd);
1094 	}
1095 }
1096