xref: /freebsd/sys/compat/linux/linux_event.c (revision 093cf790569775b80662926efea6d9d3464bde94)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/imgact.h>
37 #include <sys/kernel.h>
38 #include <sys/limits.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/callout.h>
42 #include <sys/capsicum.h>
43 #include <sys/types.h>
44 #include <sys/user.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/errno.h>
49 #include <sys/event.h>
50 #include <sys/poll.h>
51 #include <sys/proc.h>
52 #include <sys/selinfo.h>
53 #include <sys/specialfd.h>
54 #include <sys/sx.h>
55 #include <sys/syscallsubr.h>
56 #include <sys/timespec.h>
57 #include <sys/eventfd.h>
58 
59 #ifdef COMPAT_LINUX32
60 #include <machine/../linux32/linux.h>
61 #include <machine/../linux32/linux32_proto.h>
62 #else
63 #include <machine/../linux/linux.h>
64 #include <machine/../linux/linux_proto.h>
65 #endif
66 
67 #include <compat/linux/linux_emul.h>
68 #include <compat/linux/linux_event.h>
69 #include <compat/linux/linux_file.h>
70 #include <compat/linux/linux_timer.h>
71 #include <compat/linux/linux_util.h>
72 
73 typedef uint64_t	epoll_udata_t;
74 
75 struct epoll_event {
76 	uint32_t	events;
77 	epoll_udata_t	data;
78 }
79 #if defined(__amd64__)
80 __attribute__((packed))
81 #endif
82 ;
83 
/* Largest maxevents value accepted by linux_epoll_wait_common(). */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))

/* Conversion between epoll_event records and kevents. */
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent,
		    struct epoll_event *l_event);

/* Copy callbacks handed to kern_kevent_fp(). */
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);

/* Helpers operating on a single descriptor registered with an epoll fd. */
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
98 
/*
 * Cursor consumed by epoll_kev_copyin(): points at the next already
 * converted kevent to hand over and is advanced on every call.
 */
struct epoll_copyin_args {
	struct kevent	*changelist;
};
102 
103 struct epoll_copyout_args {
104 	struct epoll_event	*leventlist;
105 	struct proc		*p;
106 	uint32_t		count;
107 	int			error;
108 };
109 
110 /* timerfd */
111 typedef uint64_t	timerfd_t;
112 
113 static fo_rdwr_t	timerfd_read;
114 static fo_ioctl_t	timerfd_ioctl;
115 static fo_poll_t	timerfd_poll;
116 static fo_kqfilter_t	timerfd_kqfilter;
117 static fo_stat_t	timerfd_stat;
118 static fo_close_t	timerfd_close;
119 static fo_fill_kinfo_t	timerfd_fill_kinfo;
120 
121 static struct fileops timerfdops = {
122 	.fo_read = timerfd_read,
123 	.fo_write = invfo_rdwr,
124 	.fo_truncate = invfo_truncate,
125 	.fo_ioctl = timerfd_ioctl,
126 	.fo_poll = timerfd_poll,
127 	.fo_kqfilter = timerfd_kqfilter,
128 	.fo_stat = timerfd_stat,
129 	.fo_close = timerfd_close,
130 	.fo_chmod = invfo_chmod,
131 	.fo_chown = invfo_chown,
132 	.fo_sendfile = invfo_sendfile,
133 	.fo_fill_kinfo = timerfd_fill_kinfo,
134 	.fo_flags = DFLAG_PASSABLE
135 };
136 
137 static void	filt_timerfddetach(struct knote *kn);
138 static int	filt_timerfdread(struct knote *kn, long hint);
139 
140 static struct filterops timerfd_rfiltops = {
141 	.f_isfd = 1,
142 	.f_detach = filt_timerfddetach,
143 	.f_event = filt_timerfdread
144 };
145 
146 struct timerfd {
147 	clockid_t	tfd_clockid;
148 	struct itimerspec tfd_time;
149 	struct callout	tfd_callout;
150 	timerfd_t	tfd_count;
151 	bool		tfd_canceled;
152 	struct selinfo	tfd_sel;
153 	struct mtx	tfd_lock;
154 };
155 
/* Callout handler and remaining-time helper, both defined below. */
static void	linux_timerfd_expire(void *arg);
static void	linux_timerfd_curval(struct timerfd *tfd,
		    struct itimerspec *ots);
158 
159 static int
160 epoll_create_common(struct thread *td, int flags)
161 {
162 
163 	return (kern_kqueue(td, flags, NULL));
164 }
165 
166 #ifdef LINUX_LEGACY_SYSCALLS
167 int
168 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
169 {
170 
171 	/*
172 	 * args->size is unused. Linux just tests it
173 	 * and then forgets it as well.
174 	 */
175 	if (args->size <= 0)
176 		return (EINVAL);
177 
178 	return (epoll_create_common(td, 0));
179 }
180 #endif
181 
182 int
183 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
184 {
185 	int flags;
186 
187 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
188 		return (EINVAL);
189 
190 	flags = 0;
191 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
192 		flags |= O_CLOEXEC;
193 
194 	return (epoll_create_common(td, flags));
195 }
196 
197 /* Structure converting function from epoll to kevent. */
198 static int
199 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
200     struct kevent *kevent, int *nkevents)
201 {
202 	uint32_t levents = l_event->events;
203 	struct linux_pemuldata *pem;
204 	struct proc *p;
205 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
206 
207 	/* flags related to how event is registered */
208 	if ((levents & LINUX_EPOLLONESHOT) != 0)
209 		kev_flags |= EV_DISPATCH;
210 	if ((levents & LINUX_EPOLLET) != 0)
211 		kev_flags |= EV_CLEAR;
212 	if ((levents & LINUX_EPOLLERR) != 0)
213 		kev_flags |= EV_ERROR;
214 	if ((levents & LINUX_EPOLLRDHUP) != 0)
215 		kev_flags |= EV_EOF;
216 
217 	/* flags related to what event is registered */
218 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
219 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
220 		kevent->ext[0] = l_event->data;
221 		++kevent;
222 		++(*nkevents);
223 	}
224 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
225 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
226 		kevent->ext[0] = l_event->data;
227 		++kevent;
228 		++(*nkevents);
229 	}
230 	/* zero event mask is legal */
231 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
232 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
233 		++(*nkevents);
234 	}
235 
236 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
237 		p = td->td_proc;
238 
239 		pem = pem_find(p);
240 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
241 
242 		LINUX_PEM_XLOCK(pem);
243 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
244 			pem->flags |= LINUX_XUNSUP_EPOLL;
245 			LINUX_PEM_XUNLOCK(pem);
246 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
247 			    levents);
248 		} else
249 			LINUX_PEM_XUNLOCK(pem);
250 		return (EINVAL);
251 	}
252 
253 	return (0);
254 }
255 
256 /*
257  * Structure converting function from kevent to epoll. In a case
258  * this is called on error in registration we store the error in
259  * event->data and pick it up later in linux_epoll_ctl().
260  */
261 static void
262 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
263 {
264 
265 	l_event->data = kevent->ext[0];
266 
267 	if ((kevent->flags & EV_ERROR) != 0) {
268 		l_event->events = LINUX_EPOLLERR;
269 		return;
270 	}
271 
272 	/* XXX EPOLLPRI, EPOLLHUP */
273 	switch (kevent->filter) {
274 	case EVFILT_READ:
275 		l_event->events = LINUX_EPOLLIN;
276 		if ((kevent->flags & EV_EOF) != 0)
277 			l_event->events |= LINUX_EPOLLRDHUP;
278 	break;
279 	case EVFILT_WRITE:
280 		l_event->events = LINUX_EPOLLOUT;
281 	break;
282 	}
283 }
284 
285 /*
286  * Copyout callback used by kevent. This converts kevent
287  * events to epoll events and copies them back to the
288  * userspace. This is also called on error on registering
289  * of the filter.
290  */
291 static int
292 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
293 {
294 	struct epoll_copyout_args *args;
295 	struct epoll_event *eep;
296 	int error, i;
297 
298 	args = (struct epoll_copyout_args*) arg;
299 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
300 
301 	for (i = 0; i < count; i++)
302 		kevent_to_epoll(&kevp[i], &eep[i]);
303 
304 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
305 	if (error == 0) {
306 		args->leventlist += count;
307 		args->count += count;
308 	} else if (args->error == 0)
309 		args->error = error;
310 
311 	free(eep, M_EPOLL);
312 	return (error);
313 }
314 
315 /*
316  * Copyin callback used by kevent. This copies already
317  * converted filters from kernel memory to the kevent
318  * internal kernel memory. Hence the memcpy instead of
319  * copyin.
320  */
321 static int
322 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
323 {
324 	struct epoll_copyin_args *args;
325 
326 	args = (struct epoll_copyin_args*) arg;
327 
328 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
329 	args->changelist += count;
330 
331 	return (0);
332 }
333 
334 /*
335  * Load epoll filter, convert it to kevent filter
336  * and load it into kevent subsystem.
337  */
338 int
339 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
340 {
341 	struct file *epfp, *fp;
342 	struct epoll_copyin_args ciargs;
343 	struct kevent kev[2];
344 	struct kevent_copyops k_ops = { &ciargs,
345 					NULL,
346 					epoll_kev_copyin};
347 	struct epoll_event le;
348 	cap_rights_t rights;
349 	int nchanges = 0;
350 	int error;
351 
352 	if (args->op != LINUX_EPOLL_CTL_DEL) {
353 		error = copyin(args->event, &le, sizeof(le));
354 		if (error != 0)
355 			return (error);
356 	}
357 
358 	error = fget(td, args->epfd,
359 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
360 	if (error != 0)
361 		return (error);
362 	if (epfp->f_type != DTYPE_KQUEUE) {
363 		error = EINVAL;
364 		goto leave1;
365 	}
366 
367 	 /* Protect user data vector from incorrectly supplied fd. */
368 	error = fget(td, args->fd,
369 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
370 	if (error != 0)
371 		goto leave1;
372 
373 	/* Linux disallows spying on himself */
374 	if (epfp == fp) {
375 		error = EINVAL;
376 		goto leave0;
377 	}
378 
379 	ciargs.changelist = kev;
380 
381 	if (args->op != LINUX_EPOLL_CTL_DEL) {
382 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
383 		if (error != 0)
384 			goto leave0;
385 	}
386 
387 	switch (args->op) {
388 	case LINUX_EPOLL_CTL_MOD:
389 		error = epoll_delete_all_events(td, epfp, args->fd);
390 		if (error != 0)
391 			goto leave0;
392 		break;
393 
394 	case LINUX_EPOLL_CTL_ADD:
395 		if (epoll_fd_registered(td, epfp, args->fd)) {
396 			error = EEXIST;
397 			goto leave0;
398 		}
399 		break;
400 
401 	case LINUX_EPOLL_CTL_DEL:
402 		/* CTL_DEL means unregister this fd with this epoll */
403 		error = epoll_delete_all_events(td, epfp, args->fd);
404 		goto leave0;
405 
406 	default:
407 		error = EINVAL;
408 		goto leave0;
409 	}
410 
411 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
412 
413 leave0:
414 	fdrop(fp, td);
415 
416 leave1:
417 	fdrop(epfp, td);
418 	return (error);
419 }
420 
421 /*
422  * Wait for a filter to be triggered on the epoll file descriptor.
423  */
424 static int
425 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
426     int maxevents, int timeout, sigset_t *uset)
427 {
428 	struct epoll_copyout_args coargs;
429 	struct kevent_copyops k_ops = { &coargs,
430 					epoll_kev_copyout,
431 					NULL};
432 	struct timespec ts, *tsp;
433 	cap_rights_t rights;
434 	struct file *epfp;
435 	sigset_t omask;
436 	int error;
437 
438 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
439 		return (EINVAL);
440 
441 	error = fget(td, epfd,
442 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
443 	if (error != 0)
444 		return (error);
445 	if (epfp->f_type != DTYPE_KQUEUE) {
446 		error = EINVAL;
447 		goto leave;
448 	}
449 	if (uset != NULL) {
450 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
451 		    &omask, 0);
452 		if (error != 0)
453 			goto leave;
454 		td->td_pflags |= TDP_OLDMASK;
455 		/*
456 		 * Make sure that ast() is called on return to
457 		 * usermode and TDP_OLDMASK is cleared, restoring old
458 		 * sigmask.
459 		 */
460 		thread_lock(td);
461 		td->td_flags |= TDF_ASTPENDING;
462 		thread_unlock(td);
463 	}
464 
465 	coargs.leventlist = events;
466 	coargs.p = td->td_proc;
467 	coargs.count = 0;
468 	coargs.error = 0;
469 
470 	/*
471 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
472 	 * to block indefinitely. Real implementation does it if any negative
473 	 * timeout value is passed.
474 	 */
475 	if (timeout >= 0) {
476 		/* Convert from milliseconds to timespec. */
477 		ts.tv_sec = timeout / 1000;
478 		ts.tv_nsec = (timeout % 1000) * 1000000;
479 		tsp = &ts;
480 	} else {
481 		tsp = NULL;
482 	}
483 
484 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
485 	if (error == 0 && coargs.error != 0)
486 		error = coargs.error;
487 
488 	/*
489 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
490 	 * Maybe we should translate that but I don't think it matters at all.
491 	 */
492 	if (error == 0)
493 		td->td_retval[0] = coargs.count;
494 
495 	if (uset != NULL)
496 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
497 		    NULL, 0);
498 leave:
499 	fdrop(epfp, td);
500 	return (error);
501 }
502 
503 #ifdef LINUX_LEGACY_SYSCALLS
504 int
505 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
506 {
507 
508 	return (linux_epoll_wait_common(td, args->epfd, args->events,
509 	    args->maxevents, args->timeout, NULL));
510 }
511 #endif
512 
513 int
514 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
515 {
516 	sigset_t mask, *pmask;
517 	l_sigset_t lmask;
518 	int error;
519 
520 	if (args->mask != NULL) {
521 		if (args->sigsetsize != sizeof(l_sigset_t))
522 			return (EINVAL);
523 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
524 		if (error != 0)
525 			return (error);
526 		linux_to_bsd_sigset(&lmask, &mask);
527 		pmask = &mask;
528 	} else
529 		pmask = NULL;
530 	return (linux_epoll_wait_common(td, args->epfd, args->events,
531 	    args->maxevents, args->timeout, pmask));
532 }
533 
534 static int
535 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
536     unsigned int flags)
537 {
538 	struct epoll_copyin_args ciargs;
539 	struct kevent kev;
540 	struct kevent_copyops k_ops = { &ciargs,
541 					NULL,
542 					epoll_kev_copyin};
543 
544 	ciargs.changelist = &kev;
545 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
546 
547 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
548 }
549 
550 static int
551 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
552 {
553 	/*
554 	 * Set empty filter flags to avoid accidental modification of already
555 	 * registered events. In the case of event re-registration:
556 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
557 	 * 2. If event does exists, it's enabled/disabled state is preserved
558 	 *    but fflags, data and udata fields are overwritten. So we can not
559 	 *    set socket lowats and store user's context pointer in udata.
560 	 */
561 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
562 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
563 		return (1);
564 
565 	return (0);
566 }
567 
568 static int
569 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
570 {
571 	int error1, error2;
572 
573 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
574 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
575 
576 	/* return 0 if at least one result positive */
577 	return (error1 == 0 ? 0 : error2);
578 }
579 
580 #ifdef LINUX_LEGACY_SYSCALLS
581 int
582 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
583 {
584 	struct specialfd_eventfd ae;
585 
586 	bzero(&ae, sizeof(ae));
587 	ae.initval = args->initval;
588 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
589 }
590 #endif
591 
592 int
593 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
594 {
595 	struct specialfd_eventfd ae;
596 	int flags;
597 
598 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
599 	    LINUX_EFD_SEMAPHORE)) != 0)
600 		return (EINVAL);
601 	flags = 0;
602 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
603 		flags |= EFD_CLOEXEC;
604 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
605 		flags |= EFD_NONBLOCK;
606 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
607 		flags |= EFD_SEMAPHORE;
608 
609 	bzero(&ae, sizeof(ae));
610 	ae.flags = flags;
611 	ae.initval = args->initval;
612 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
613 }
614 
615 int
616 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
617 {
618 	struct filedesc *fdp;
619 	struct timerfd *tfd;
620 	struct file *fp;
621 	clockid_t clockid;
622 	int fflags, fd, error;
623 
624 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
625 		return (EINVAL);
626 
627 	error = linux_to_native_clockid(&clockid, args->clockid);
628 	if (error != 0)
629 		return (error);
630 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
631 		return (EINVAL);
632 
633 	fflags = 0;
634 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
635 		fflags |= O_CLOEXEC;
636 
637 	fdp = td->td_proc->p_fd;
638 	error = falloc(td, &fp, &fd, fflags);
639 	if (error != 0)
640 		return (error);
641 
642 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
643 	tfd->tfd_clockid = clockid;
644 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
645 
646 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
647 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
648 
649 	fflags = FREAD;
650 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
651 		fflags |= FNONBLOCK;
652 
653 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
654 	fdrop(fp, td);
655 
656 	td->td_retval[0] = fd;
657 	return (error);
658 }
659 
660 static int
661 timerfd_close(struct file *fp, struct thread *td)
662 {
663 	struct timerfd *tfd;
664 
665 	tfd = fp->f_data;
666 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
667 		return (EINVAL);
668 
669 	timespecclear(&tfd->tfd_time.it_value);
670 	timespecclear(&tfd->tfd_time.it_interval);
671 
672 	callout_drain(&tfd->tfd_callout);
673 
674 	seldrain(&tfd->tfd_sel);
675 	knlist_destroy(&tfd->tfd_sel.si_note);
676 
677 	fp->f_ops = &badfileops;
678 	mtx_destroy(&tfd->tfd_lock);
679 	free(tfd, M_EPOLL);
680 
681 	return (0);
682 }
683 
684 static int
685 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
686     int flags, struct thread *td)
687 {
688 	struct timerfd *tfd;
689 	timerfd_t count;
690 	int error;
691 
692 	tfd = fp->f_data;
693 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
694 		return (EINVAL);
695 
696 	if (uio->uio_resid < sizeof(timerfd_t))
697 		return (EINVAL);
698 
699 	error = 0;
700 	mtx_lock(&tfd->tfd_lock);
701 retry:
702 	if (tfd->tfd_canceled) {
703 		tfd->tfd_count = 0;
704 		mtx_unlock(&tfd->tfd_lock);
705 		return (ECANCELED);
706 	}
707 	if (tfd->tfd_count == 0) {
708 		if ((fp->f_flag & FNONBLOCK) != 0) {
709 			mtx_unlock(&tfd->tfd_lock);
710 			return (EAGAIN);
711 		}
712 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
713 		if (error == 0)
714 			goto retry;
715 	}
716 	if (error == 0) {
717 		count = tfd->tfd_count;
718 		tfd->tfd_count = 0;
719 		mtx_unlock(&tfd->tfd_lock);
720 		error = uiomove(&count, sizeof(timerfd_t), uio);
721 	} else
722 		mtx_unlock(&tfd->tfd_lock);
723 
724 	return (error);
725 }
726 
727 static int
728 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
729     struct thread *td)
730 {
731 	struct timerfd *tfd;
732 	int revents = 0;
733 
734 	tfd = fp->f_data;
735 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
736 		return (POLLERR);
737 
738 	mtx_lock(&tfd->tfd_lock);
739 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
740 		revents |= events & (POLLIN|POLLRDNORM);
741 	if (revents == 0)
742 		selrecord(td, &tfd->tfd_sel);
743 	mtx_unlock(&tfd->tfd_lock);
744 
745 	return (revents);
746 }
747 
748 static int
749 timerfd_kqfilter(struct file *fp, struct knote *kn)
750 {
751 	struct timerfd *tfd;
752 
753 	tfd = fp->f_data;
754 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
755 		return (EINVAL);
756 
757 	if (kn->kn_filter == EVFILT_READ)
758 		kn->kn_fop = &timerfd_rfiltops;
759 	else
760 		return (EINVAL);
761 
762 	kn->kn_hook = tfd;
763 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
764 
765 	return (0);
766 }
767 
768 static void
769 filt_timerfddetach(struct knote *kn)
770 {
771 	struct timerfd *tfd = kn->kn_hook;
772 
773 	mtx_lock(&tfd->tfd_lock);
774 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
775 	mtx_unlock(&tfd->tfd_lock);
776 }
777 
778 static int
779 filt_timerfdread(struct knote *kn, long hint)
780 {
781 	struct timerfd *tfd = kn->kn_hook;
782 
783 	return (tfd->tfd_count > 0);
784 }
785 
786 static int
787 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
788     struct ucred *active_cred, struct thread *td)
789 {
790 
791 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
792 		return (EINVAL);
793 
794 	switch (cmd) {
795 	case FIONBIO:
796 	case FIOASYNC:
797 		return (0);
798 	}
799 
800 	return (ENOTTY);
801 }
802 
803 static int
804 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
805 {
806 
807 	return (ENXIO);
808 }
809 
810 static int
811 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
812 {
813 
814 	kif->kf_type = KF_TYPE_UNKNOWN;
815 	return (0);
816 }
817 
818 static void
819 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
820 {
821 
822 	if (tfd->tfd_clockid == CLOCK_REALTIME)
823 		getnanotime(ts);
824 	else	/* CLOCK_MONOTONIC */
825 		getnanouptime(ts);
826 }
827 
828 static void
829 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
830 {
831 	struct timespec cts;
832 
833 	linux_timerfd_clocktime(tfd, &cts);
834 	*ots = tfd->tfd_time;
835 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
836 		timespecsub(&ots->it_value, &cts, &ots->it_value);
837 		if (ots->it_value.tv_sec < 0 ||
838 		    (ots->it_value.tv_sec == 0 &&
839 		     ots->it_value.tv_nsec == 0)) {
840 			ots->it_value.tv_sec  = 0;
841 			ots->it_value.tv_nsec = 1;
842 		}
843 	}
844 }
845 
846 int
847 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
848 {
849 	struct l_itimerspec lots;
850 	struct itimerspec ots;
851 	struct timerfd *tfd;
852 	struct file *fp;
853 	int error;
854 
855 	error = fget(td, args->fd, &cap_read_rights, &fp);
856 	if (error != 0)
857 		return (error);
858 	tfd = fp->f_data;
859 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
860 		error = EINVAL;
861 		goto out;
862 	}
863 
864 	mtx_lock(&tfd->tfd_lock);
865 	linux_timerfd_curval(tfd, &ots);
866 	mtx_unlock(&tfd->tfd_lock);
867 
868 	error = native_to_linux_itimerspec(&lots, &ots);
869 	if (error == 0)
870 		error = copyout(&lots, args->old_value, sizeof(lots));
871 
872 out:
873 	fdrop(fp, td);
874 	return (error);
875 }
876 
877 int
878 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
879 {
880 	struct l_itimerspec lots;
881 	struct itimerspec nts, ots;
882 	struct timespec cts, ts;
883 	struct timerfd *tfd;
884 	struct timeval tv;
885 	struct file *fp;
886 	int error;
887 
888 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
889 		return (EINVAL);
890 
891 	error = copyin(args->new_value, &lots, sizeof(lots));
892 	if (error != 0)
893 		return (error);
894 	error = linux_to_native_itimerspec(&nts, &lots);
895 	if (error != 0)
896 		return (error);
897 
898 	error = fget(td, args->fd, &cap_write_rights, &fp);
899 	if (error != 0)
900 		return (error);
901 	tfd = fp->f_data;
902 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
903 		error = EINVAL;
904 		goto out;
905 	}
906 
907 	mtx_lock(&tfd->tfd_lock);
908 	if (!timespecisset(&nts.it_value))
909 		timespecclear(&nts.it_interval);
910 	if (args->old_value != NULL)
911 		linux_timerfd_curval(tfd, &ots);
912 
913 	tfd->tfd_time = nts;
914 	tfd->tfd_count = 0;
915 	if (timespecisset(&nts.it_value)) {
916 		linux_timerfd_clocktime(tfd, &cts);
917 		ts = nts.it_value;
918 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
919 			timespecadd(&tfd->tfd_time.it_value, &cts,
920 				&tfd->tfd_time.it_value);
921 		} else {
922 			timespecsub(&ts, &cts, &ts);
923 		}
924 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
925 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
926 			linux_timerfd_expire, tfd);
927 		tfd->tfd_canceled = false;
928 	} else {
929 		tfd->tfd_canceled = true;
930 		callout_stop(&tfd->tfd_callout);
931 	}
932 	mtx_unlock(&tfd->tfd_lock);
933 
934 	if (args->old_value != NULL) {
935 		error = native_to_linux_itimerspec(&lots, &ots);
936 		if (error == 0)
937 			error = copyout(&lots, args->old_value, sizeof(lots));
938 	}
939 
940 out:
941 	fdrop(fp, td);
942 	return (error);
943 }
944 
945 static void
946 linux_timerfd_expire(void *arg)
947 {
948 	struct timespec cts, ts;
949 	struct timeval tv;
950 	struct timerfd *tfd;
951 
952 	tfd = (struct timerfd *)arg;
953 
954 	linux_timerfd_clocktime(tfd, &cts);
955 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
956 		if (timespecisset(&tfd->tfd_time.it_interval))
957 			timespecadd(&tfd->tfd_time.it_value,
958 				    &tfd->tfd_time.it_interval,
959 				    &tfd->tfd_time.it_value);
960 		else
961 			/* single shot timer */
962 			timespecclear(&tfd->tfd_time.it_value);
963 		if (timespecisset(&tfd->tfd_time.it_value)) {
964 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
965 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
966 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
967 				linux_timerfd_expire, tfd);
968 		}
969 		tfd->tfd_count++;
970 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
971 		selwakeup(&tfd->tfd_sel);
972 		wakeup(&tfd->tfd_count);
973 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
974 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
975 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
976 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
977 		    linux_timerfd_expire, tfd);
978 	}
979 }
980