1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2007 Roman Divacky 5 * Copyright (c) 2014 Dmitry Chagin 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/capsicum.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/errno.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/specialfd.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/timespec.h>
#include <sys/eventfd.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif

#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_event.h>
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
 * has a 32 bit opaque pointer as its 'udata' field. So we can't pass epoll
 * supplied data verbatim. Therefore we allocate a 64-bit memory block to pass
 * user supplied data for every file descriptor.
80 */ 81 82 typedef uint64_t epoll_udata_t; 83 84 struct epoll_emuldata { 85 uint32_t fdc; /* epoll udata max index */ 86 epoll_udata_t udata[1]; /* epoll user data vector */ 87 }; 88 89 #define EPOLL_DEF_SZ 16 90 #define EPOLL_SIZE(fdn) \ 91 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 92 93 struct epoll_event { 94 uint32_t events; 95 epoll_udata_t data; 96 } 97 #if defined(__amd64__) 98 __attribute__((packed)) 99 #endif 100 ; 101 102 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 103 104 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 105 static int epoll_to_kevent(struct thread *td, int fd, 106 struct epoll_event *l_event, struct kevent *kevent, 107 int *nkevents); 108 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 109 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 110 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 111 static int epoll_register_kevent(struct thread *td, struct file *epfp, 112 int fd, int filter, unsigned int flags); 113 static int epoll_fd_registered(struct thread *td, struct file *epfp, 114 int fd); 115 static int epoll_delete_all_events(struct thread *td, struct file *epfp, 116 int fd); 117 118 struct epoll_copyin_args { 119 struct kevent *changelist; 120 }; 121 122 struct epoll_copyout_args { 123 struct epoll_event *leventlist; 124 struct proc *p; 125 uint32_t count; 126 int error; 127 }; 128 129 /* timerfd */ 130 typedef uint64_t timerfd_t; 131 132 static fo_rdwr_t timerfd_read; 133 static fo_ioctl_t timerfd_ioctl; 134 static fo_poll_t timerfd_poll; 135 static fo_kqfilter_t timerfd_kqfilter; 136 static fo_stat_t timerfd_stat; 137 static fo_close_t timerfd_close; 138 static fo_fill_kinfo_t timerfd_fill_kinfo; 139 140 static struct fileops timerfdops = { 141 .fo_read = timerfd_read, 142 .fo_write = invfo_rdwr, 143 .fo_truncate = invfo_truncate, 144 .fo_ioctl = timerfd_ioctl, 145 .fo_poll = 
timerfd_poll, 146 .fo_kqfilter = timerfd_kqfilter, 147 .fo_stat = timerfd_stat, 148 .fo_close = timerfd_close, 149 .fo_chmod = invfo_chmod, 150 .fo_chown = invfo_chown, 151 .fo_sendfile = invfo_sendfile, 152 .fo_fill_kinfo = timerfd_fill_kinfo, 153 .fo_flags = DFLAG_PASSABLE 154 }; 155 156 static void filt_timerfddetach(struct knote *kn); 157 static int filt_timerfdread(struct knote *kn, long hint); 158 159 static struct filterops timerfd_rfiltops = { 160 .f_isfd = 1, 161 .f_detach = filt_timerfddetach, 162 .f_event = filt_timerfdread 163 }; 164 165 struct timerfd { 166 clockid_t tfd_clockid; 167 struct itimerspec tfd_time; 168 struct callout tfd_callout; 169 timerfd_t tfd_count; 170 bool tfd_canceled; 171 struct selinfo tfd_sel; 172 struct mtx tfd_lock; 173 }; 174 175 static void linux_timerfd_expire(void *); 176 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 177 178 static void 179 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 180 { 181 struct linux_pemuldata *pem; 182 struct epoll_emuldata *emd; 183 struct proc *p; 184 185 p = td->td_proc; 186 187 pem = pem_find(p); 188 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 189 190 LINUX_PEM_XLOCK(pem); 191 if (pem->epoll == NULL) { 192 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 193 emd->fdc = fd; 194 pem->epoll = emd; 195 } else { 196 emd = pem->epoll; 197 if (fd > emd->fdc) { 198 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 199 emd->fdc = fd; 200 pem->epoll = emd; 201 } 202 } 203 emd->udata[fd] = udata; 204 LINUX_PEM_XUNLOCK(pem); 205 } 206 207 static int 208 epoll_create_common(struct thread *td, int flags) 209 { 210 int error; 211 212 error = kern_kqueue(td, flags, NULL); 213 if (error != 0) 214 return (error); 215 216 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 217 218 return (0); 219 } 220 221 #ifdef LINUX_LEGACY_SYSCALLS 222 int 223 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 224 { 225 226 /* 227 * args->size is 
unused. Linux just tests it 228 * and then forgets it as well. 229 */ 230 if (args->size <= 0) 231 return (EINVAL); 232 233 return (epoll_create_common(td, 0)); 234 } 235 #endif 236 237 int 238 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 239 { 240 int flags; 241 242 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 243 return (EINVAL); 244 245 flags = 0; 246 if ((args->flags & LINUX_O_CLOEXEC) != 0) 247 flags |= O_CLOEXEC; 248 249 return (epoll_create_common(td, flags)); 250 } 251 252 /* Structure converting function from epoll to kevent. */ 253 static int 254 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, 255 struct kevent *kevent, int *nkevents) 256 { 257 uint32_t levents = l_event->events; 258 struct linux_pemuldata *pem; 259 struct proc *p; 260 unsigned short kev_flags = EV_ADD | EV_ENABLE; 261 262 /* flags related to how event is registered */ 263 if ((levents & LINUX_EPOLLONESHOT) != 0) 264 kev_flags |= EV_DISPATCH; 265 if ((levents & LINUX_EPOLLET) != 0) 266 kev_flags |= EV_CLEAR; 267 if ((levents & LINUX_EPOLLERR) != 0) 268 kev_flags |= EV_ERROR; 269 if ((levents & LINUX_EPOLLRDHUP) != 0) 270 kev_flags |= EV_EOF; 271 272 /* flags related to what event is registered */ 273 if ((levents & LINUX_EPOLL_EVRD) != 0) { 274 EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0); 275 ++(*nkevents); 276 } 277 if ((levents & LINUX_EPOLL_EVWR) != 0) { 278 EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0); 279 ++(*nkevents); 280 } 281 /* zero event mask is legal */ 282 if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) { 283 EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0); 284 ++(*nkevents); 285 } 286 287 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 288 p = td->td_proc; 289 290 pem = pem_find(p); 291 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 292 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 293 294 LINUX_PEM_XLOCK(pem); 295 if ((pem->flags & 
LINUX_XUNSUP_EPOLL) == 0) { 296 pem->flags |= LINUX_XUNSUP_EPOLL; 297 LINUX_PEM_XUNLOCK(pem); 298 linux_msg(td, "epoll_ctl unsupported flags: 0x%x", 299 levents); 300 } else 301 LINUX_PEM_XUNLOCK(pem); 302 return (EINVAL); 303 } 304 305 return (0); 306 } 307 308 /* 309 * Structure converting function from kevent to epoll. In a case 310 * this is called on error in registration we store the error in 311 * event->data and pick it up later in linux_epoll_ctl(). 312 */ 313 static void 314 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 315 { 316 317 if ((kevent->flags & EV_ERROR) != 0) { 318 l_event->events = LINUX_EPOLLERR; 319 return; 320 } 321 322 /* XXX EPOLLPRI, EPOLLHUP */ 323 switch (kevent->filter) { 324 case EVFILT_READ: 325 l_event->events = LINUX_EPOLLIN; 326 if ((kevent->flags & EV_EOF) != 0) 327 l_event->events |= LINUX_EPOLLRDHUP; 328 break; 329 case EVFILT_WRITE: 330 l_event->events = LINUX_EPOLLOUT; 331 break; 332 } 333 } 334 335 /* 336 * Copyout callback used by kevent. This converts kevent 337 * events to epoll events and copies them back to the 338 * userspace. This is also called on error on registering 339 * of the filter. 
340 */ 341 static int 342 epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 343 { 344 struct epoll_copyout_args *args; 345 struct linux_pemuldata *pem; 346 struct epoll_emuldata *emd; 347 struct epoll_event *eep; 348 int error, fd, i; 349 350 args = (struct epoll_copyout_args*) arg; 351 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 352 353 pem = pem_find(args->p); 354 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 355 LINUX_PEM_SLOCK(pem); 356 emd = pem->epoll; 357 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 358 359 for (i = 0; i < count; i++) { 360 kevent_to_epoll(&kevp[i], &eep[i]); 361 362 fd = kevp[i].ident; 363 KASSERT(fd <= emd->fdc, ("epoll user data vector" 364 " is too small.\n")); 365 eep[i].data = emd->udata[fd]; 366 } 367 LINUX_PEM_SUNLOCK(pem); 368 369 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 370 if (error == 0) { 371 args->leventlist += count; 372 args->count += count; 373 } else if (args->error == 0) 374 args->error = error; 375 376 free(eep, M_EPOLL); 377 return (error); 378 } 379 380 /* 381 * Copyin callback used by kevent. This copies already 382 * converted filters from kernel memory to the kevent 383 * internal kernel memory. Hence the memcpy instead of 384 * copyin. 385 */ 386 static int 387 epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 388 { 389 struct epoll_copyin_args *args; 390 391 args = (struct epoll_copyin_args*) arg; 392 393 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 394 args->changelist += count; 395 396 return (0); 397 } 398 399 /* 400 * Load epoll filter, convert it to kevent filter 401 * and load it into kevent subsystem. 
402 */ 403 int 404 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 405 { 406 struct file *epfp, *fp; 407 struct epoll_copyin_args ciargs; 408 struct kevent kev[2]; 409 struct kevent_copyops k_ops = { &ciargs, 410 NULL, 411 epoll_kev_copyin}; 412 struct epoll_event le; 413 cap_rights_t rights; 414 int nchanges = 0; 415 int error; 416 417 if (args->op != LINUX_EPOLL_CTL_DEL) { 418 error = copyin(args->event, &le, sizeof(le)); 419 if (error != 0) 420 return (error); 421 } 422 423 error = fget(td, args->epfd, 424 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 425 if (error != 0) 426 return (error); 427 if (epfp->f_type != DTYPE_KQUEUE) { 428 error = EINVAL; 429 goto leave1; 430 } 431 432 /* Protect user data vector from incorrectly supplied fd. */ 433 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 434 if (error != 0) 435 goto leave1; 436 437 /* Linux disallows spying on himself */ 438 if (epfp == fp) { 439 error = EINVAL; 440 goto leave0; 441 } 442 443 ciargs.changelist = kev; 444 445 if (args->op != LINUX_EPOLL_CTL_DEL) { 446 error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); 447 if (error != 0) 448 goto leave0; 449 } 450 451 switch (args->op) { 452 case LINUX_EPOLL_CTL_MOD: 453 error = epoll_delete_all_events(td, epfp, args->fd); 454 if (error != 0) 455 goto leave0; 456 break; 457 458 case LINUX_EPOLL_CTL_ADD: 459 if (epoll_fd_registered(td, epfp, args->fd)) { 460 error = EEXIST; 461 goto leave0; 462 } 463 break; 464 465 case LINUX_EPOLL_CTL_DEL: 466 /* CTL_DEL means unregister this fd with this epoll */ 467 error = epoll_delete_all_events(td, epfp, args->fd); 468 goto leave0; 469 470 default: 471 error = EINVAL; 472 goto leave0; 473 } 474 475 epoll_fd_install(td, args->fd, le.data); 476 477 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 478 479 leave0: 480 fdrop(fp, td); 481 482 leave1: 483 fdrop(epfp, td); 484 return (error); 485 } 486 487 /* 488 * Wait for a filter to be triggered on 
the epoll file descriptor. 489 */ 490 static int 491 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 492 int maxevents, int timeout, sigset_t *uset) 493 { 494 struct epoll_copyout_args coargs; 495 struct kevent_copyops k_ops = { &coargs, 496 epoll_kev_copyout, 497 NULL}; 498 struct timespec ts, *tsp; 499 cap_rights_t rights; 500 struct file *epfp; 501 sigset_t omask; 502 int error; 503 504 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 505 return (EINVAL); 506 507 error = fget(td, epfd, 508 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 509 if (error != 0) 510 return (error); 511 if (epfp->f_type != DTYPE_KQUEUE) { 512 error = EINVAL; 513 goto leave; 514 } 515 if (uset != NULL) { 516 error = kern_sigprocmask(td, SIG_SETMASK, uset, 517 &omask, 0); 518 if (error != 0) 519 goto leave; 520 td->td_pflags |= TDP_OLDMASK; 521 /* 522 * Make sure that ast() is called on return to 523 * usermode and TDP_OLDMASK is cleared, restoring old 524 * sigmask. 525 */ 526 thread_lock(td); 527 td->td_flags |= TDF_ASTPENDING; 528 thread_unlock(td); 529 } 530 531 coargs.leventlist = events; 532 coargs.p = td->td_proc; 533 coargs.count = 0; 534 coargs.error = 0; 535 536 /* 537 * Linux epoll_wait(2) man page states that timeout of -1 causes caller 538 * to block indefinitely. Real implementation does it if any negative 539 * timeout value is passed. 540 */ 541 if (timeout >= 0) { 542 /* Convert from milliseconds to timespec. */ 543 ts.tv_sec = timeout / 1000; 544 ts.tv_nsec = (timeout % 1000) * 1000000; 545 tsp = &ts; 546 } else { 547 tsp = NULL; 548 } 549 550 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 551 if (error == 0 && coargs.error != 0) 552 error = coargs.error; 553 554 /* 555 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 556 * Maybe we should translate that but I don't think it matters at all. 
557 */ 558 if (error == 0) 559 td->td_retval[0] = coargs.count; 560 561 if (uset != NULL) 562 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 563 NULL, 0); 564 leave: 565 fdrop(epfp, td); 566 return (error); 567 } 568 569 #ifdef LINUX_LEGACY_SYSCALLS 570 int 571 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 572 { 573 574 return (linux_epoll_wait_common(td, args->epfd, args->events, 575 args->maxevents, args->timeout, NULL)); 576 } 577 #endif 578 579 int 580 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 581 { 582 sigset_t mask, *pmask; 583 l_sigset_t lmask; 584 int error; 585 586 if (args->mask != NULL) { 587 if (args->sigsetsize != sizeof(l_sigset_t)) 588 return (EINVAL); 589 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 590 if (error != 0) 591 return (error); 592 linux_to_bsd_sigset(&lmask, &mask); 593 pmask = &mask; 594 } else 595 pmask = NULL; 596 return (linux_epoll_wait_common(td, args->epfd, args->events, 597 args->maxevents, args->timeout, pmask)); 598 } 599 600 static int 601 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, 602 unsigned int flags) 603 { 604 struct epoll_copyin_args ciargs; 605 struct kevent kev; 606 struct kevent_copyops k_ops = { &ciargs, 607 NULL, 608 epoll_kev_copyin}; 609 610 ciargs.changelist = &kev; 611 EV_SET(&kev, fd, filter, flags, 0, 0, 0); 612 613 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 614 } 615 616 static int 617 epoll_fd_registered(struct thread *td, struct file *epfp, int fd) 618 { 619 /* 620 * Set empty filter flags to avoid accidental modification of already 621 * registered events. In the case of event re-registration: 622 * 1. If event does not exists kevent() does nothing and returns ENOENT 623 * 2. If event does exists, it's enabled/disabled state is preserved 624 * but fflags, data and udata fields are overwritten. So we can not 625 * set socket lowats and store user's context pointer in udata. 
626 */ 627 if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || 628 epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) 629 return (1); 630 631 return (0); 632 } 633 634 static int 635 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 636 { 637 int error1, error2; 638 639 error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); 640 error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); 641 642 /* return 0 if at least one result positive */ 643 return (error1 == 0 ? 0 : error2); 644 } 645 646 #ifdef LINUX_LEGACY_SYSCALLS 647 int 648 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 649 { 650 struct specialfd_eventfd ae; 651 652 bzero(&ae, sizeof(ae)); 653 ae.initval = args->initval; 654 return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); 655 } 656 #endif 657 658 int 659 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 660 { 661 struct specialfd_eventfd ae; 662 int flags; 663 664 if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK | 665 LINUX_EFD_SEMAPHORE)) != 0) 666 return (EINVAL); 667 flags = 0; 668 if ((args->flags & LINUX_O_CLOEXEC) != 0) 669 flags |= EFD_CLOEXEC; 670 if ((args->flags & LINUX_O_NONBLOCK) != 0) 671 flags |= EFD_NONBLOCK; 672 if ((args->flags & LINUX_EFD_SEMAPHORE) != 0) 673 flags |= EFD_SEMAPHORE; 674 675 bzero(&ae, sizeof(ae)); 676 ae.flags = flags; 677 ae.initval = args->initval; 678 return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); 679 } 680 681 int 682 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 683 { 684 struct filedesc *fdp; 685 struct timerfd *tfd; 686 struct file *fp; 687 clockid_t clockid; 688 int fflags, fd, error; 689 690 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 691 return (EINVAL); 692 693 error = linux_to_native_clockid(&clockid, args->clockid); 694 if (error != 0) 695 return (error); 696 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 697 return (EINVAL); 
698 699 fflags = 0; 700 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 701 fflags |= O_CLOEXEC; 702 703 fdp = td->td_proc->p_fd; 704 error = falloc(td, &fp, &fd, fflags); 705 if (error != 0) 706 return (error); 707 708 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 709 tfd->tfd_clockid = clockid; 710 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 711 712 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 713 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 714 715 fflags = FREAD; 716 if ((args->flags & LINUX_O_NONBLOCK) != 0) 717 fflags |= FNONBLOCK; 718 719 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 720 fdrop(fp, td); 721 722 td->td_retval[0] = fd; 723 return (error); 724 } 725 726 static int 727 timerfd_close(struct file *fp, struct thread *td) 728 { 729 struct timerfd *tfd; 730 731 tfd = fp->f_data; 732 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 733 return (EINVAL); 734 735 timespecclear(&tfd->tfd_time.it_value); 736 timespecclear(&tfd->tfd_time.it_interval); 737 738 mtx_lock(&tfd->tfd_lock); 739 callout_drain(&tfd->tfd_callout); 740 mtx_unlock(&tfd->tfd_lock); 741 742 seldrain(&tfd->tfd_sel); 743 knlist_destroy(&tfd->tfd_sel.si_note); 744 745 fp->f_ops = &badfileops; 746 mtx_destroy(&tfd->tfd_lock); 747 free(tfd, M_EPOLL); 748 749 return (0); 750 } 751 752 static int 753 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 754 int flags, struct thread *td) 755 { 756 struct timerfd *tfd; 757 timerfd_t count; 758 int error; 759 760 tfd = fp->f_data; 761 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 762 return (EINVAL); 763 764 if (uio->uio_resid < sizeof(timerfd_t)) 765 return (EINVAL); 766 767 error = 0; 768 mtx_lock(&tfd->tfd_lock); 769 retry: 770 if (tfd->tfd_canceled) { 771 tfd->tfd_count = 0; 772 mtx_unlock(&tfd->tfd_lock); 773 return (ECANCELED); 774 } 775 if (tfd->tfd_count == 0) { 776 if ((fp->f_flag & FNONBLOCK) != 0) { 777 mtx_unlock(&tfd->tfd_lock); 778 return (EAGAIN); 779 } 780 error = 
mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 781 if (error == 0) 782 goto retry; 783 } 784 if (error == 0) { 785 count = tfd->tfd_count; 786 tfd->tfd_count = 0; 787 mtx_unlock(&tfd->tfd_lock); 788 error = uiomove(&count, sizeof(timerfd_t), uio); 789 } else 790 mtx_unlock(&tfd->tfd_lock); 791 792 return (error); 793 } 794 795 static int 796 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 797 struct thread *td) 798 { 799 struct timerfd *tfd; 800 int revents = 0; 801 802 tfd = fp->f_data; 803 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 804 return (POLLERR); 805 806 mtx_lock(&tfd->tfd_lock); 807 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 808 revents |= events & (POLLIN|POLLRDNORM); 809 if (revents == 0) 810 selrecord(td, &tfd->tfd_sel); 811 mtx_unlock(&tfd->tfd_lock); 812 813 return (revents); 814 } 815 816 static int 817 timerfd_kqfilter(struct file *fp, struct knote *kn) 818 { 819 struct timerfd *tfd; 820 821 tfd = fp->f_data; 822 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 823 return (EINVAL); 824 825 if (kn->kn_filter == EVFILT_READ) 826 kn->kn_fop = &timerfd_rfiltops; 827 else 828 return (EINVAL); 829 830 kn->kn_hook = tfd; 831 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 832 833 return (0); 834 } 835 836 static void 837 filt_timerfddetach(struct knote *kn) 838 { 839 struct timerfd *tfd = kn->kn_hook; 840 841 mtx_lock(&tfd->tfd_lock); 842 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 843 mtx_unlock(&tfd->tfd_lock); 844 } 845 846 static int 847 filt_timerfdread(struct knote *kn, long hint) 848 { 849 struct timerfd *tfd = kn->kn_hook; 850 851 return (tfd->tfd_count > 0); 852 } 853 854 static int 855 timerfd_ioctl(struct file *fp, u_long cmd, void *data, 856 struct ucred *active_cred, struct thread *td) 857 { 858 859 if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD) 860 return (EINVAL); 861 862 switch (cmd) { 863 case FIONBIO: 864 case FIOASYNC: 865 return (0); 866 } 867 868 return (ENOTTY); 869 } 
870 871 static int 872 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 873 struct thread *td) 874 { 875 876 return (ENXIO); 877 } 878 879 static int 880 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 881 { 882 883 kif->kf_type = KF_TYPE_UNKNOWN; 884 return (0); 885 } 886 887 static void 888 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 889 { 890 891 if (tfd->tfd_clockid == CLOCK_REALTIME) 892 getnanotime(ts); 893 else /* CLOCK_MONOTONIC */ 894 getnanouptime(ts); 895 } 896 897 static void 898 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 899 { 900 struct timespec cts; 901 902 linux_timerfd_clocktime(tfd, &cts); 903 *ots = tfd->tfd_time; 904 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 905 timespecsub(&ots->it_value, &cts, &ots->it_value); 906 if (ots->it_value.tv_sec < 0 || 907 (ots->it_value.tv_sec == 0 && 908 ots->it_value.tv_nsec == 0)) { 909 ots->it_value.tv_sec = 0; 910 ots->it_value.tv_nsec = 1; 911 } 912 } 913 } 914 915 int 916 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 917 { 918 struct l_itimerspec lots; 919 struct itimerspec ots; 920 struct timerfd *tfd; 921 struct file *fp; 922 int error; 923 924 error = fget(td, args->fd, &cap_read_rights, &fp); 925 if (error != 0) 926 return (error); 927 tfd = fp->f_data; 928 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 929 error = EINVAL; 930 goto out; 931 } 932 933 mtx_lock(&tfd->tfd_lock); 934 linux_timerfd_curval(tfd, &ots); 935 mtx_unlock(&tfd->tfd_lock); 936 937 error = native_to_linux_itimerspec(&lots, &ots); 938 if (error == 0) 939 error = copyout(&lots, args->old_value, sizeof(lots)); 940 941 out: 942 fdrop(fp, td); 943 return (error); 944 } 945 946 int 947 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 948 { 949 struct l_itimerspec lots; 950 struct itimerspec nts, ots; 951 struct timespec cts, ts; 952 struct timerfd 
*tfd; 953 struct timeval tv; 954 struct file *fp; 955 int error; 956 957 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 958 return (EINVAL); 959 960 error = copyin(args->new_value, &lots, sizeof(lots)); 961 if (error != 0) 962 return (error); 963 error = linux_to_native_itimerspec(&nts, &lots); 964 if (error != 0) 965 return (error); 966 967 error = fget(td, args->fd, &cap_write_rights, &fp); 968 if (error != 0) 969 return (error); 970 tfd = fp->f_data; 971 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 972 error = EINVAL; 973 goto out; 974 } 975 976 mtx_lock(&tfd->tfd_lock); 977 if (!timespecisset(&nts.it_value)) 978 timespecclear(&nts.it_interval); 979 if (args->old_value != NULL) 980 linux_timerfd_curval(tfd, &ots); 981 982 tfd->tfd_time = nts; 983 if (timespecisset(&nts.it_value)) { 984 linux_timerfd_clocktime(tfd, &cts); 985 ts = nts.it_value; 986 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 987 timespecadd(&tfd->tfd_time.it_value, &cts, 988 &tfd->tfd_time.it_value); 989 } else { 990 timespecsub(&ts, &cts, &ts); 991 } 992 TIMESPEC_TO_TIMEVAL(&tv, &ts); 993 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 994 linux_timerfd_expire, tfd); 995 tfd->tfd_canceled = false; 996 } else { 997 tfd->tfd_canceled = true; 998 callout_stop(&tfd->tfd_callout); 999 } 1000 mtx_unlock(&tfd->tfd_lock); 1001 1002 if (args->old_value != NULL) { 1003 error = native_to_linux_itimerspec(&lots, &ots); 1004 if (error == 0) 1005 error = copyout(&lots, args->old_value, sizeof(lots)); 1006 } 1007 1008 out: 1009 fdrop(fp, td); 1010 return (error); 1011 } 1012 1013 static void 1014 linux_timerfd_expire(void *arg) 1015 { 1016 struct timespec cts, ts; 1017 struct timeval tv; 1018 struct timerfd *tfd; 1019 1020 tfd = (struct timerfd *)arg; 1021 1022 linux_timerfd_clocktime(tfd, &cts); 1023 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1024 if (timespecisset(&tfd->tfd_time.it_interval)) 1025 timespecadd(&tfd->tfd_time.it_value, 1026 &tfd->tfd_time.it_interval, 1027 
&tfd->tfd_time.it_value); 1028 else 1029 /* single shot timer */ 1030 timespecclear(&tfd->tfd_time.it_value); 1031 if (timespecisset(&tfd->tfd_time.it_value)) { 1032 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1033 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1034 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1035 linux_timerfd_expire, tfd); 1036 } 1037 tfd->tfd_count++; 1038 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1039 selwakeup(&tfd->tfd_sel); 1040 wakeup(&tfd->tfd_count); 1041 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1042 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1043 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1044 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1045 linux_timerfd_expire, tfd); 1046 } 1047 } 1048