xref: /freebsd/sys/compat/linux/linux_event.c (revision 0c785f06020f3b02e34c97eb27fecd3af8eb2a7b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2007 Roman Divacky
5  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/callout.h>
31 #include <sys/capsicum.h>
32 #include <sys/errno.h>
33 #include <sys/event.h>
34 #include <sys/eventfd.h>
35 #include <sys/file.h>
36 #include <sys/filedesc.h>
37 #include <sys/filio.h>
38 #include <sys/limits.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/poll.h>
42 #include <sys/proc.h>
43 #include <sys/selinfo.h>
44 #include <sys/specialfd.h>
45 #include <sys/sx.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/timespec.h>
48 #include <sys/user.h>
49 
50 #ifdef COMPAT_LINUX32
51 #include <machine/../linux32/linux.h>
52 #include <machine/../linux32/linux32_proto.h>
53 #else
54 #include <machine/../linux/linux.h>
55 #include <machine/../linux/linux_proto.h>
56 #endif
57 
58 #include <compat/linux/linux_emul.h>
59 #include <compat/linux/linux_event.h>
60 #include <compat/linux/linux_file.h>
61 #include <compat/linux/linux_signal.h>
62 #include <compat/linux/linux_time.h>
63 #include <compat/linux/linux_util.h>
64 
/* Opaque 64-bit per-event user data, mirroring Linux's epoll_data. */
typedef uint64_t	epoll_udata_t;

/*
 * Kernel-side mirror of the Linux userland struct epoll_event.  This is
 * copied to/from userspace directly, so its layout must match Linux's.
 * On amd64 Linux declares the struct packed (no padding between the
 * 32-bit events word and the 64-bit data), hence the attribute below.
 */
struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Upper bound on epoll_wait() maxevents so the result size fits in an int. */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
77 
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);

/* Cursor state handed to epoll_kev_copyin() via kevent_copyops. */
struct epoll_copyin_args {
	struct kevent	*changelist;
};

/* Accumulator state handed to epoll_kev_copyout() via kevent_copyops. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* userspace buffer cursor */
	struct proc		*p;
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error seen */
};
101 
/* timerfd */
/* Expiration counter read from a timerfd; Linux defines it as 64 bits. */
typedef uint64_t	timerfd_t;

static fo_rdwr_t	timerfd_read;
static fo_ioctl_t	timerfd_ioctl;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

/* File operations for DTYPE_LINUXTFD descriptors; write is not supported. */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

/* EVFILT_READ filter ops; readable when expirations are pending. */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

/* Per-descriptor timerfd state; tfd_lock protects all mutable fields. */
struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* absolute next-fire time + interval */
	struct callout	tfd_callout;	/* fires linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;	/* timer disarmed; read returns ECANCELED */
	struct selinfo	tfd_sel;	/* poll/kevent notification state */
	struct mtx	tfd_lock;
};

static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
150 
151 static int
152 epoll_create_common(struct thread *td, int flags)
153 {
154 
155 	return (kern_kqueue(td, flags, NULL));
156 }
157 
158 #ifdef LINUX_LEGACY_SYSCALLS
159 int
160 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
161 {
162 
163 	/*
164 	 * args->size is unused. Linux just tests it
165 	 * and then forgets it as well.
166 	 */
167 	if (args->size <= 0)
168 		return (EINVAL);
169 
170 	return (epoll_create_common(td, 0));
171 }
172 #endif
173 
174 int
175 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
176 {
177 	int flags;
178 
179 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
180 		return (EINVAL);
181 
182 	flags = 0;
183 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
184 		flags |= O_CLOEXEC;
185 
186 	return (epoll_create_common(td, flags));
187 }
188 
/*
 * Convert a Linux epoll event into one or two native kevents.
 * Writes up to 2 entries at *kevent (EVFILT_READ and/or EVFILT_WRITE)
 * and bumps *nkevents by the number written; the caller must provide
 * room for both.  Returns EINVAL on unsupported epoll flag bits,
 * logging them once per process.
 */
static int
epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;
	unsigned short kev_flags = EV_ADD | EV_ENABLE;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		kev_flags |= EV_DISPATCH;
	if ((levents & LINUX_EPOLLET) != 0)
		kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
		/* Carry the opaque user data in ext[0]. */
		kevent->ext[0] = l_event->data;
		++kevent;
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
		kevent->ext[0] = l_event->data;
		++kevent;
		++(*nkevents);
	}
	/* zero event mask is legal */
	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
		/*
		 * Register a disabled read filter so the fd stays known to
		 * the kqueue.  NOTE(review): ext[0]/user data is not stored
		 * on this path — presumably acceptable since the event can
		 * never fire; confirm against later MOD behaviour.
		 */
		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
		++(*nkevents);
	}

	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

		/* Warn only once per process about unsupported flags. */
		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}
247 
248 /*
249  * Structure converting function from kevent to epoll. In a case
250  * this is called on error in registration we store the error in
251  * event->data and pick it up later in linux_epoll_ctl().
252  */
253 static void
254 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
255 {
256 
257 	l_event->data = kevent->ext[0];
258 
259 	if ((kevent->flags & EV_ERROR) != 0) {
260 		l_event->events = LINUX_EPOLLERR;
261 		return;
262 	}
263 
264 	/* XXX EPOLLPRI, EPOLLHUP */
265 	switch (kevent->filter) {
266 	case EVFILT_READ:
267 		l_event->events = LINUX_EPOLLIN;
268 		if ((kevent->flags & EV_EOF) != 0)
269 			l_event->events |= LINUX_EPOLLRDHUP;
270 	break;
271 	case EVFILT_WRITE:
272 		l_event->events = LINUX_EPOLLOUT;
273 	break;
274 	}
275 }
276 
/*
 * Copyout callback used by kevent. This converts kevent
 * events to epoll events and copies them back to the
 * userspace. This is also called on error on registering
 * of the filter.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct epoll_event *eep;
	int error, i;

	args = (struct epoll_copyout_args*) arg;
	/* Stage converted events in a kernel buffer, then copy out once. */
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	for (i = 0; i < count; i++)
		kevent_to_epoll(&kevp[i], &eep[i]);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		/* Advance the user buffer cursor and running total. */
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		/* Remember only the first error across multiple calls. */
		args->error = error;

	free(eep, M_EPOLL);
	return (error);
}
306 
307 /*
308  * Copyin callback used by kevent. This copies already
309  * converted filters from kernel memory to the kevent
310  * internal kernel memory. Hence the memcpy instead of
311  * copyin.
312  */
313 static int
314 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
315 {
316 	struct epoll_copyin_args *args;
317 
318 	args = (struct epoll_copyin_args*) arg;
319 
320 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
321 	args->changelist += count;
322 
323 	return (0);
324 }
325 
/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 *
 * Implements Linux epoll_ctl(2): ADD registers the fd, MOD re-registers
 * it (delete-then-add), DEL unregisters it.  Both the epoll fd and the
 * target fd are held across the operation; errors unwind via the
 * leave0/leave1 labels.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int nchanges = 0;
	int error;

	/* DEL takes no event argument; everything else copies one in. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	/* The epoll fd must actually be a kqueue-backed descriptor. */
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	/* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd,
		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux disallows spying on himself */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	/* Convert the epoll event to at most two kevent changes. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
		if (error != 0)
			goto leave0;
	}

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/* MOD is implemented as delete-all followed by re-add. */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		break;

	case LINUX_EPOLL_CTL_ADD:
		if (epoll_fd_registered(td, epfp, args->fd)) {
			error = EEXIST;
			goto leave0;
		}
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}
412 
/*
 * Wait for a filter to be triggered on the epoll file descriptor.
 *
 * Core of epoll_wait/epoll_pwait: optionally installs a temporary signal
 * mask (restored via TDP_OLDMASK on return to userspace), runs the kqueue
 * scan, and reports the number of events via td_retval[0].  tsp == NULL
 * blocks indefinitely.
 */
static int
linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
    int maxevents, struct timespec *tsp, sigset_t *uset)
{
	struct epoll_copyout_args coargs;
	struct kevent_copyops k_ops = { &coargs,
					epoll_kev_copyout,
					NULL};
	cap_rights_t rights;
	struct file *epfp;
	sigset_t omask;
	int error;

	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
		return (EINVAL);

	error = fget(td, epfd,
	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave;
	}
	if (uset != NULL) {
		/* Swap in the caller's signal mask for the duration. */
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &omask, 0);
		if (error != 0)
			goto leave;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		ast_sched(td, TDA_SIGSUSPEND);
	}

	coargs.leventlist = events;
	coargs.p = td->td_proc;
	coargs.count = 0;
	coargs.error = 0;

	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
	if (error == 0 && coargs.error != 0)
		error = coargs.error;

	/*
	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
	 * Maybe we should translate that but I don't think it matters at all.
	 */
	if (error == 0)
		td->td_retval[0] = coargs.count;

	/*
	 * NOTE(review): restoring the mask overwrites any earlier error
	 * (e.g. EINTR) with the kern_sigprocmask() result — confirm this
	 * clobbering is intentional; in practice SIG_SETMASK restore does
	 * not fail here.
	 */
	if (uset != NULL)
		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
		    NULL, 0);
leave:
	fdrop(epfp, td);
	return (error);
}
478 
479 static int
480 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
481     int maxevents, int timeout, sigset_t *uset)
482 {
483 	struct timespec ts, *tsp;
484 
485 	/*
486 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
487 	 * to block indefinitely. Real implementation does it if any negative
488 	 * timeout value is passed.
489 	 */
490 	if (timeout >= 0) {
491 		/* Convert from milliseconds to timespec. */
492 		ts.tv_sec = timeout / 1000;
493 		ts.tv_nsec = (timeout % 1000) * 1000000;
494 		tsp = &ts;
495 	} else {
496 		tsp = NULL;
497 	}
498 	return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
499 
500 }
501 
502 #ifdef LINUX_LEGACY_SYSCALLS
503 int
504 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
505 {
506 
507 	return (linux_epoll_wait_common(td, args->epfd, args->events,
508 	    args->maxevents, args->timeout, NULL));
509 }
510 #endif
511 
512 int
513 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
514 {
515 	sigset_t mask, *pmask;
516 	int error;
517 
518 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
519 	    &mask, &pmask);
520 	if (error != 0)
521 		return (error);
522 
523 	return (linux_epoll_wait_common(td, args->epfd, args->events,
524 	    args->maxevents, args->timeout, pmask));
525 }
526 
527 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
528 int
529 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
530 {
531 	struct timespec ts, *tsa;
532 	sigset_t mask, *pmask;
533 	int error;
534 
535 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
536 	    &mask, &pmask);
537 	if (error != 0)
538 		return (error);
539 
540 	if (args->timeout) {
541 		error = linux_get_timespec64(&ts, args->timeout);
542 		if (error != 0)
543 			return (error);
544 		tsa = &ts;
545 	} else
546 		tsa = NULL;
547 
548 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
549 	    args->maxevents, tsa, pmask));
550 }
551 #else
552 int
553 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
554 {
555 	struct timespec ts, *tsa;
556 	sigset_t mask, *pmask;
557 	int error;
558 
559 	error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
560 	    &mask, &pmask);
561 	if (error != 0)
562 		return (error);
563 
564 	if (args->timeout) {
565 		error = linux_get_timespec(&ts, args->timeout);
566 		if (error != 0)
567 			return (error);
568 		tsa = &ts;
569 	} else
570 		tsa = NULL;
571 
572 	return (linux_epoll_wait_ts(td, args->epfd, args->events,
573 	    args->maxevents, tsa, pmask));
574 }
575 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
576 
577 static int
578 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
579     unsigned int flags)
580 {
581 	struct epoll_copyin_args ciargs;
582 	struct kevent kev;
583 	struct kevent_copyops k_ops = { &ciargs,
584 					NULL,
585 					epoll_kev_copyin};
586 
587 	ciargs.changelist = &kev;
588 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
589 
590 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
591 }
592 
593 static int
594 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
595 {
596 	/*
597 	 * Set empty filter flags to avoid accidental modification of already
598 	 * registered events. In the case of event re-registration:
599 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
600 	 * 2. If event does exists, it's enabled/disabled state is preserved
601 	 *    but fflags, data and udata fields are overwritten. So we can not
602 	 *    set socket lowats and store user's context pointer in udata.
603 	 */
604 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
605 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
606 		return (1);
607 
608 	return (0);
609 }
610 
611 static int
612 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
613 {
614 	int error1, error2;
615 
616 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
617 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
618 
619 	/* return 0 if at least one result positive */
620 	return (error1 == 0 ? 0 : error2);
621 }
622 
623 #ifdef LINUX_LEGACY_SYSCALLS
624 int
625 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
626 {
627 	struct specialfd_eventfd ae;
628 
629 	bzero(&ae, sizeof(ae));
630 	ae.initval = args->initval;
631 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
632 }
633 #endif
634 
635 int
636 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
637 {
638 	struct specialfd_eventfd ae;
639 	int flags;
640 
641 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
642 	    LINUX_EFD_SEMAPHORE)) != 0)
643 		return (EINVAL);
644 	flags = 0;
645 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
646 		flags |= EFD_CLOEXEC;
647 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
648 		flags |= EFD_NONBLOCK;
649 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
650 		flags |= EFD_SEMAPHORE;
651 
652 	bzero(&ae, sizeof(ae));
653 	ae.flags = flags;
654 	ae.initval = args->initval;
655 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
656 }
657 
/*
 * Implement Linux timerfd_create(2): allocate a DTYPE_LINUXTFD file
 * backed by a struct timerfd and return its descriptor.  Only
 * CLOCK_REALTIME and CLOCK_MONOTONIC are supported.
 */
int
linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
{
	struct timerfd *tfd;
	struct file *fp;
	clockid_t clockid;
	int fflags, fd, error;

	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
		return (EINVAL);

	error = linux_to_native_clockid(&clockid, args->clockid);
	if (error != 0)
		return (error);
	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
		return (EINVAL);

	/* fflags is reused below; here it carries fd-table flags only. */
	fflags = 0;
	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
		fflags |= O_CLOEXEC;

	error = falloc(td, &fp, &fd, fflags);
	if (error != 0)
		return (error);

	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
	tfd->tfd_clockid = clockid;
	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);

	/* Callout and knote list are both protected by tfd_lock. */
	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);

	fflags = FREAD;
	if ((args->flags & LINUX_O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	/* finit() publishes the file; drop falloc()'s extra reference. */
	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
	fdrop(fp, td);

	td->td_retval[0] = fd;
	return (error);
}
700 
/*
 * Release a timerfd: stop the timer, wait out any in-flight callout,
 * tear down notification state and free the backing structure.
 */
static int
timerfd_close(struct file *fp, struct thread *td)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Disarm so linux_timerfd_expire() will not re-schedule itself. */
	timespecclear(&tfd->tfd_time.it_value);
	timespecclear(&tfd->tfd_time.it_interval);

	/* Must complete before tfd_lock/knlist are destroyed below. */
	callout_drain(&tfd->tfd_callout);

	seldrain(&tfd->tfd_sel);
	knlist_destroy(&tfd->tfd_sel.si_note);

	fp->f_ops = &badfileops;
	mtx_destroy(&tfd->tfd_lock);
	free(tfd, M_EPOLL);

	return (0);
}
724 
/*
 * read(2) on a timerfd: deliver the 64-bit count of expirations since
 * the last read, blocking (unless FNONBLOCK) while the count is zero.
 * Returns ECANCELED once the timer has been disarmed.
 */
static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct timerfd *tfd;
	timerfd_t count;
	int error;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* The caller must accept at least one full 64-bit counter. */
	if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		/* Sleep until linux_timerfd_expire() bumps the count. */
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		/* Snapshot and reset under the lock, uiomove() outside it. */
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}
767 
768 static int
769 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
770     struct thread *td)
771 {
772 	struct timerfd *tfd;
773 	int revents = 0;
774 
775 	tfd = fp->f_data;
776 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
777 		return (POLLERR);
778 
779 	mtx_lock(&tfd->tfd_lock);
780 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
781 		revents |= events & (POLLIN|POLLRDNORM);
782 	if (revents == 0)
783 		selrecord(td, &tfd->tfd_sel);
784 	mtx_unlock(&tfd->tfd_lock);
785 
786 	return (revents);
787 }
788 
789 static int
790 timerfd_kqfilter(struct file *fp, struct knote *kn)
791 {
792 	struct timerfd *tfd;
793 
794 	tfd = fp->f_data;
795 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
796 		return (EINVAL);
797 
798 	if (kn->kn_filter == EVFILT_READ)
799 		kn->kn_fop = &timerfd_rfiltops;
800 	else
801 		return (EINVAL);
802 
803 	kn->kn_hook = tfd;
804 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
805 
806 	return (0);
807 }
808 
809 static void
810 filt_timerfddetach(struct knote *kn)
811 {
812 	struct timerfd *tfd = kn->kn_hook;
813 
814 	mtx_lock(&tfd->tfd_lock);
815 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
816 	mtx_unlock(&tfd->tfd_lock);
817 }
818 
819 static int
820 filt_timerfdread(struct knote *kn, long hint)
821 {
822 	struct timerfd *tfd = kn->kn_hook;
823 
824 	return (tfd->tfd_count > 0);
825 }
826 
827 static int
828 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
829     struct ucred *active_cred, struct thread *td)
830 {
831 
832 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
833 		return (EINVAL);
834 
835 	switch (cmd) {
836 	case FIONBIO:
837 	case FIOASYNC:
838 		return (0);
839 	}
840 
841 	return (ENOTTY);
842 }
843 
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
{

	/* fstat(2) is not supported on a timerfd. */
	return (ENXIO);
}
850 
static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	/* No timerfd-specific kinfo is exported; report an unknown type. */
	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}
858 
859 static void
860 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
861 {
862 
863 	if (tfd->tfd_clockid == CLOCK_REALTIME)
864 		getnanotime(ts);
865 	else	/* CLOCK_MONOTONIC */
866 		getnanouptime(ts);
867 }
868 
/*
 * Compute the timer's current value (time remaining until the next
 * expiration) relative to now.  Caller holds tfd_lock.  An armed timer
 * whose deadline has already passed reports the minimal non-zero value
 * (1 ns) so it still appears armed, matching Linux semantics.
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	/* it_value of zero means the timer is disarmed; leave it as-is. */
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		timespecsub(&ots->it_value, &cts, &ots->it_value);
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}
886 
/*
 * Shared backend for timerfd_gettime{,64}: look up the timerfd by
 * descriptor and snapshot its current value under tfd_lock.
 */
static int
linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
{
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, ots);
	mtx_unlock(&tfd->tfd_lock);

out:
	fdrop(fp, td);
	return (error);
}
911 
912 int
913 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
914 {
915 	struct l_itimerspec lots;
916 	struct itimerspec ots;
917 	int error;
918 
919 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
920 	if (error != 0)
921 		return (error);
922 	error = native_to_linux_itimerspec(&lots, &ots);
923 	if (error == 0)
924 		error = copyout(&lots, args->old_value, sizeof(lots));
925 	return (error);
926 }
927 
928 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
929 int
930 linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
931 {
932 	struct l_itimerspec64 lots;
933 	struct itimerspec ots;
934 	int error;
935 
936 	error = linux_timerfd_gettime_common(td, args->fd, &ots);
937 	if (error != 0)
938 		return (error);
939 	error = native_to_linux_itimerspec64(&lots, &ots);
940 	if (error == 0)
941 		error = copyout(&lots, args->old_value, sizeof(lots));
942 	return (error);
943 }
944 #endif
945 
/*
 * Shared backend for timerfd_settime{,64}: arm or disarm the timer.
 * nts is the new setting (may be modified: interval is cleared when
 * it_value is zero); if oval is non-NULL the previous value is returned
 * through it.  The internal tfd_time.it_value is kept as an absolute
 * time on the timerfd's clock.
 */
static int
linux_timerfd_settime_common(struct thread *td, int fd, int flags,
    struct itimerspec *nts, struct itimerspec *oval)
{
	struct timespec cts, ts;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = fget(td, fd, &cap_write_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* A zero it_value disarms the timer; the interval is meaningless. */
	if (!timespecisset(&nts->it_value))
		timespecclear(&nts->it_interval);
	/* Capture the old value before overwriting tfd_time below. */
	if (oval != NULL)
		linux_timerfd_curval(tfd, oval);

	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
	tfd->tfd_count = 0;
	if (timespecisset(&nts->it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		ts = nts->it_value;
		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			/* Relative: store as absolute deadline. */
			timespecadd(&tfd->tfd_time.it_value, &cts,
				&tfd->tfd_time.it_value);
		} else {
			/* Absolute: compute the relative delay for callout. */
			timespecsub(&ts, &cts, &ts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		/* Disarm: pending readers will observe ECANCELED. */
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

out:
	fdrop(fp, td);
	return (error);
}
999 
1000 int
1001 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1002 {
1003 	struct l_itimerspec lots;
1004 	struct itimerspec nts, ots, *pots;
1005 	int error;
1006 
1007 	error = copyin(args->new_value, &lots, sizeof(lots));
1008 	if (error != 0)
1009 		return (error);
1010 	error = linux_to_native_itimerspec(&nts, &lots);
1011 	if (error != 0)
1012 		return (error);
1013 	pots = (args->old_value != NULL ? &ots : NULL);
1014 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1015 	    &nts, pots);
1016 	if (error == 0 && args->old_value != NULL) {
1017 		error = native_to_linux_itimerspec(&lots, &ots);
1018 		if (error == 0)
1019 			error = copyout(&lots, args->old_value, sizeof(lots));
1020 	}
1021 	return (error);
1022 }
1023 
1024 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1025 int
1026 linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
1027 {
1028 	struct l_itimerspec64 lots;
1029 	struct itimerspec nts, ots, *pots;
1030 	int error;
1031 
1032 	error = copyin(args->new_value, &lots, sizeof(lots));
1033 	if (error != 0)
1034 		return (error);
1035 	error = linux_to_native_itimerspec64(&nts, &lots);
1036 	if (error != 0)
1037 		return (error);
1038 	pots = (args->old_value != NULL ? &ots : NULL);
1039 	error = linux_timerfd_settime_common(td, args->fd, args->flags,
1040 	    &nts, pots);
1041 	if (error == 0 && args->old_value != NULL) {
1042 		error = native_to_linux_itimerspec64(&lots, &ots);
1043 		if (error == 0)
1044 			error = copyout(&lots, args->old_value, sizeof(lots));
1045 	}
1046 	return (error);
1047 }
1048 #endif
1049 
/*
 * Callout handler: runs with tfd_lock held (callout_init_mtx).  If the
 * deadline has been reached, account one expiration, wake readers and
 * pollers, and re-arm for the next interval; otherwise (early wakeup,
 * e.g. after clock adjustment) simply re-arm for the remaining time.
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		if (timespecisset(&tfd->tfd_time.it_interval))
			/* Periodic: advance the absolute deadline. */
			timespecadd(&tfd->tfd_time.it_value,
				    &tfd->tfd_time.it_interval,
				    &tfd->tfd_time.it_value);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
				linux_timerfd_expire, tfd);
		}
		tfd->tfd_count++;
		/* Notify kevent, poll/select and blocked readers. */
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/* Fired early: re-arm for the time still remaining. */
		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}
1085