1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2007 Roman Divacky 5 * Copyright (c) 2014 Dmitry Chagin 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_compat.h" 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/imgact.h> 38 #include <sys/kernel.h> 39 #include <sys/limits.h> 40 #include <sys/lock.h> 41 #include <sys/mutex.h> 42 #include <sys/callout.h> 43 #include <sys/capsicum.h> 44 #include <sys/types.h> 45 #include <sys/user.h> 46 #include <sys/file.h> 47 #include <sys/filedesc.h> 48 #include <sys/filio.h> 49 #include <sys/errno.h> 50 #include <sys/event.h> 51 #include <sys/poll.h> 52 #include <sys/proc.h> 53 #include <sys/selinfo.h> 54 #include <sys/specialfd.h> 55 #include <sys/sx.h> 56 #include <sys/syscallsubr.h> 57 #include <sys/timespec.h> 58 #include <sys/eventfd.h> 59 60 #ifdef COMPAT_LINUX32 61 #include <machine/../linux32/linux.h> 62 #include <machine/../linux32/linux32_proto.h> 63 #else 64 #include <machine/../linux/linux.h> 65 #include <machine/../linux/linux_proto.h> 66 #endif 67 68 #include <compat/linux/linux_emul.h> 69 #include <compat/linux/linux_event.h> 70 #include <compat/linux/linux_file.h> 71 #include <compat/linux/linux_timer.h> 72 #include <compat/linux/linux_util.h> 73 74 /* 75 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 76 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 77 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 78 * data verbatuim. Therefore we allocate 64-bit memory block to pass 79 * user supplied data for every file descriptor. 80 */ 81 82 typedef uint64_t epoll_udata_t; 83 84 struct epoll_emuldata { 85 uint32_t fdc; /* epoll udata max index */ 86 epoll_udata_t udata[1]; /* epoll user data vector */ 87 }; 88 89 #define EPOLL_DEF_SZ 16 90 #define EPOLL_SIZE(fdn) \ 91 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 92 93 struct epoll_event { 94 uint32_t events; 95 epoll_udata_t data; 96 } 97 #if defined(__amd64__) 98 __attribute__((packed)) 99 #endif 100 ; 101 102 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 103 104 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 105 static int epoll_to_kevent(struct thread *td, int fd, 106 struct epoll_event *l_event, struct kevent *kevent, 107 int *nkevents); 108 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 109 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 110 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 111 static int epoll_register_kevent(struct thread *td, struct file *epfp, 112 int fd, int filter, unsigned int flags); 113 static int epoll_fd_registered(struct thread *td, struct file *epfp, 114 int fd); 115 static int epoll_delete_all_events(struct thread *td, struct file *epfp, 116 int fd); 117 118 struct epoll_copyin_args { 119 struct kevent *changelist; 120 }; 121 122 struct epoll_copyout_args { 123 struct epoll_event *leventlist; 124 struct proc *p; 125 uint32_t count; 126 int error; 127 }; 128 129 /* timerfd */ 130 typedef uint64_t timerfd_t; 131 132 static fo_rdwr_t timerfd_read; 133 static fo_ioctl_t timerfd_ioctl; 134 static fo_poll_t timerfd_poll; 135 static fo_kqfilter_t timerfd_kqfilter; 136 static fo_stat_t timerfd_stat; 137 static fo_close_t timerfd_close; 138 static fo_fill_kinfo_t timerfd_fill_kinfo; 139 140 static struct fileops timerfdops = { 141 .fo_read = timerfd_read, 142 .fo_write = invfo_rdwr, 143 .fo_truncate = invfo_truncate, 144 .fo_ioctl = timerfd_ioctl, 145 .fo_poll = timerfd_poll, 146 .fo_kqfilter = timerfd_kqfilter, 147 .fo_stat = timerfd_stat, 148 .fo_close = timerfd_close, 149 .fo_chmod = invfo_chmod, 150 .fo_chown = invfo_chown, 151 .fo_sendfile = invfo_sendfile, 152 .fo_fill_kinfo = timerfd_fill_kinfo, 153 .fo_flags = DFLAG_PASSABLE 154 }; 155 156 static void filt_timerfddetach(struct knote *kn); 157 static int filt_timerfdread(struct knote *kn, long hint); 158 159 static struct filterops timerfd_rfiltops = { 160 .f_isfd = 1, 161 .f_detach = filt_timerfddetach, 162 .f_event = filt_timerfdread 163 }; 164 165 struct timerfd { 166 clockid_t tfd_clockid; 167 struct itimerspec tfd_time; 168 struct callout tfd_callout; 169 timerfd_t tfd_count; 170 bool tfd_canceled; 171 struct selinfo tfd_sel; 172 struct mtx tfd_lock; 173 }; 174 175 static void linux_timerfd_expire(void *); 176 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 177 178 static void 179 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 180 { 181 struct linux_pemuldata *pem; 182 struct epoll_emuldata *emd; 183 struct proc *p; 184 185 p = td->td_proc; 186 187 pem = pem_find(p); 188 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 189 190 LINUX_PEM_XLOCK(pem); 191 if (pem->epoll == NULL) { 192 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 193 emd->fdc = fd; 194 pem->epoll = emd; 195 } else { 196 emd = pem->epoll; 197 if (fd > emd->fdc) { 198 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 199 emd->fdc = fd; 200 pem->epoll = emd; 201 } 202 } 203 emd->udata[fd] = udata; 204 LINUX_PEM_XUNLOCK(pem); 205 } 206 207 static int 208 epoll_create_common(struct thread *td, int flags) 209 { 210 int error; 211 212 error = kern_kqueue(td, flags, NULL); 213 if (error != 0) 214 return (error); 215 216 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 217 218 return (0); 219 } 220 221 #ifdef LINUX_LEGACY_SYSCALLS 222 int 223 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 224 { 225 226 /* 227 * args->size is unused. Linux just tests it 228 * and then forgets it as well. 229 */ 230 if (args->size <= 0) 231 return (EINVAL); 232 233 return (epoll_create_common(td, 0)); 234 } 235 #endif 236 237 int 238 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 239 { 240 int flags; 241 242 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 243 return (EINVAL); 244 245 flags = 0; 246 if ((args->flags & LINUX_O_CLOEXEC) != 0) 247 flags |= O_CLOEXEC; 248 249 return (epoll_create_common(td, flags)); 250 } 251 252 /* Structure converting function from epoll to kevent. */ 253 static int 254 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, 255 struct kevent *kevent, int *nkevents) 256 { 257 uint32_t levents = l_event->events; 258 struct linux_pemuldata *pem; 259 struct proc *p; 260 unsigned short kev_flags = EV_ADD | EV_ENABLE; 261 262 /* flags related to how event is registered */ 263 if ((levents & LINUX_EPOLLONESHOT) != 0) 264 kev_flags |= EV_DISPATCH; 265 if ((levents & LINUX_EPOLLET) != 0) 266 kev_flags |= EV_CLEAR; 267 if ((levents & LINUX_EPOLLERR) != 0) 268 kev_flags |= EV_ERROR; 269 if ((levents & LINUX_EPOLLRDHUP) != 0) 270 kev_flags |= EV_EOF; 271 272 /* flags related to what event is registered */ 273 if ((levents & LINUX_EPOLL_EVRD) != 0) { 274 EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0); 275 ++(*nkevents); 276 } 277 if ((levents & LINUX_EPOLL_EVWR) != 0) { 278 EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0); 279 ++(*nkevents); 280 } 281 /* zero event mask is legal */ 282 if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) { 283 EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0); 284 ++(*nkevents); 285 } 286 287 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 288 p = td->td_proc; 289 290 pem = pem_find(p); 291 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 292 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 293 294 LINUX_PEM_XLOCK(pem); 295 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 296 pem->flags |= LINUX_XUNSUP_EPOLL; 297 LINUX_PEM_XUNLOCK(pem); 298 linux_msg(td, "epoll_ctl unsupported flags: 0x%x", 299 levents); 300 } else 301 LINUX_PEM_XUNLOCK(pem); 302 return (EINVAL); 303 } 304 305 return (0); 306 } 307 308 /* 309 * Structure converting function from kevent to epoll. In a case 310 * this is called on error in registration we store the error in 311 * event->data and pick it up later in linux_epoll_ctl(). 312 */ 313 static void 314 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 315 { 316 317 if ((kevent->flags & EV_ERROR) != 0) { 318 l_event->events = LINUX_EPOLLERR; 319 return; 320 } 321 322 /* XXX EPOLLPRI, EPOLLHUP */ 323 switch (kevent->filter) { 324 case EVFILT_READ: 325 l_event->events = LINUX_EPOLLIN; 326 if ((kevent->flags & EV_EOF) != 0) 327 l_event->events |= LINUX_EPOLLRDHUP; 328 break; 329 case EVFILT_WRITE: 330 l_event->events = LINUX_EPOLLOUT; 331 break; 332 } 333 } 334 335 /* 336 * Copyout callback used by kevent. This converts kevent 337 * events to epoll events and copies them back to the 338 * userspace. This is also called on error on registering 339 * of the filter. 340 */ 341 static int 342 epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 343 { 344 struct epoll_copyout_args *args; 345 struct linux_pemuldata *pem; 346 struct epoll_emuldata *emd; 347 struct epoll_event *eep; 348 int error, fd, i; 349 350 args = (struct epoll_copyout_args*) arg; 351 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 352 353 pem = pem_find(args->p); 354 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 355 LINUX_PEM_SLOCK(pem); 356 emd = pem->epoll; 357 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 358 359 for (i = 0; i < count; i++) { 360 kevent_to_epoll(&kevp[i], &eep[i]); 361 362 fd = kevp[i].ident; 363 KASSERT(fd <= emd->fdc, ("epoll user data vector" 364 " is too small.\n")); 365 eep[i].data = emd->udata[fd]; 366 } 367 LINUX_PEM_SUNLOCK(pem); 368 369 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 370 if (error == 0) { 371 args->leventlist += count; 372 args->count += count; 373 } else if (args->error == 0) 374 args->error = error; 375 376 free(eep, M_EPOLL); 377 return (error); 378 } 379 380 /* 381 * Copyin callback used by kevent. This copies already 382 * converted filters from kernel memory to the kevent 383 * internal kernel memory. Hence the memcpy instead of 384 * copyin. 385 */ 386 static int 387 epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 388 { 389 struct epoll_copyin_args *args; 390 391 args = (struct epoll_copyin_args*) arg; 392 393 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 394 args->changelist += count; 395 396 return (0); 397 } 398 399 /* 400 * Load epoll filter, convert it to kevent filter 401 * and load it into kevent subsystem. 402 */ 403 int 404 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 405 { 406 struct file *epfp, *fp; 407 struct epoll_copyin_args ciargs; 408 struct kevent kev[2]; 409 struct kevent_copyops k_ops = { &ciargs, 410 NULL, 411 epoll_kev_copyin}; 412 struct epoll_event le; 413 cap_rights_t rights; 414 int nchanges = 0; 415 int error; 416 417 if (args->op != LINUX_EPOLL_CTL_DEL) { 418 error = copyin(args->event, &le, sizeof(le)); 419 if (error != 0) 420 return (error); 421 } 422 423 error = fget(td, args->epfd, 424 cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp); 425 if (error != 0) 426 return (error); 427 if (epfp->f_type != DTYPE_KQUEUE) { 428 error = EINVAL; 429 goto leave1; 430 } 431 432 /* Protect user data vector from incorrectly supplied fd. */ 433 error = fget(td, args->fd, 434 cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp); 435 if (error != 0) 436 goto leave1; 437 438 /* Linux disallows spying on himself */ 439 if (epfp == fp) { 440 error = EINVAL; 441 goto leave0; 442 } 443 444 ciargs.changelist = kev; 445 446 if (args->op != LINUX_EPOLL_CTL_DEL) { 447 error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); 448 if (error != 0) 449 goto leave0; 450 } 451 452 switch (args->op) { 453 case LINUX_EPOLL_CTL_MOD: 454 error = epoll_delete_all_events(td, epfp, args->fd); 455 if (error != 0) 456 goto leave0; 457 break; 458 459 case LINUX_EPOLL_CTL_ADD: 460 if (epoll_fd_registered(td, epfp, args->fd)) { 461 error = EEXIST; 462 goto leave0; 463 } 464 break; 465 466 case LINUX_EPOLL_CTL_DEL: 467 /* CTL_DEL means unregister this fd with this epoll */ 468 error = epoll_delete_all_events(td, epfp, args->fd); 469 goto leave0; 470 471 default: 472 error = EINVAL; 473 goto leave0; 474 } 475 476 epoll_fd_install(td, args->fd, le.data); 477 478 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 479 480 leave0: 481 fdrop(fp, td); 482 483 leave1: 484 fdrop(epfp, td); 485 return (error); 486 } 487 488 /* 489 * Wait for a filter to be triggered on the epoll file descriptor. 490 */ 491 static int 492 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 493 int maxevents, int timeout, sigset_t *uset) 494 { 495 struct epoll_copyout_args coargs; 496 struct kevent_copyops k_ops = { &coargs, 497 epoll_kev_copyout, 498 NULL}; 499 struct timespec ts, *tsp; 500 cap_rights_t rights; 501 struct file *epfp; 502 sigset_t omask; 503 int error; 504 505 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 506 return (EINVAL); 507 508 error = fget(td, epfd, 509 cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp); 510 if (error != 0) 511 return (error); 512 if (epfp->f_type != DTYPE_KQUEUE) { 513 error = EINVAL; 514 goto leave; 515 } 516 if (uset != NULL) { 517 error = kern_sigprocmask(td, SIG_SETMASK, uset, 518 &omask, 0); 519 if (error != 0) 520 goto leave; 521 td->td_pflags |= TDP_OLDMASK; 522 /* 523 * Make sure that ast() is called on return to 524 * usermode and TDP_OLDMASK is cleared, restoring old 525 * sigmask. 526 */ 527 thread_lock(td); 528 td->td_flags |= TDF_ASTPENDING; 529 thread_unlock(td); 530 } 531 532 coargs.leventlist = events; 533 coargs.p = td->td_proc; 534 coargs.count = 0; 535 coargs.error = 0; 536 537 /* 538 * Linux epoll_wait(2) man page states that timeout of -1 causes caller 539 * to block indefinitely. Real implementation does it if any negative 540 * timeout value is passed. 541 */ 542 if (timeout >= 0) { 543 /* Convert from milliseconds to timespec. */ 544 ts.tv_sec = timeout / 1000; 545 ts.tv_nsec = (timeout % 1000) * 1000000; 546 tsp = &ts; 547 } else { 548 tsp = NULL; 549 } 550 551 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 552 if (error == 0 && coargs.error != 0) 553 error = coargs.error; 554 555 /* 556 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 557 * Maybe we should translate that but I don't think it matters at all. 558 */ 559 if (error == 0) 560 td->td_retval[0] = coargs.count; 561 562 if (uset != NULL) 563 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 564 NULL, 0); 565 leave: 566 fdrop(epfp, td); 567 return (error); 568 } 569 570 #ifdef LINUX_LEGACY_SYSCALLS 571 int 572 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 573 { 574 575 return (linux_epoll_wait_common(td, args->epfd, args->events, 576 args->maxevents, args->timeout, NULL)); 577 } 578 #endif 579 580 int 581 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 582 { 583 sigset_t mask, *pmask; 584 l_sigset_t lmask; 585 int error; 586 587 if (args->mask != NULL) { 588 if (args->sigsetsize != sizeof(l_sigset_t)) 589 return (EINVAL); 590 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 591 if (error != 0) 592 return (error); 593 linux_to_bsd_sigset(&lmask, &mask); 594 pmask = &mask; 595 } else 596 pmask = NULL; 597 return (linux_epoll_wait_common(td, args->epfd, args->events, 598 args->maxevents, args->timeout, pmask)); 599 } 600 601 static int 602 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, 603 unsigned int flags) 604 { 605 struct epoll_copyin_args ciargs; 606 struct kevent kev; 607 struct kevent_copyops k_ops = { &ciargs, 608 NULL, 609 epoll_kev_copyin}; 610 611 ciargs.changelist = &kev; 612 EV_SET(&kev, fd, filter, flags, 0, 0, 0); 613 614 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 615 } 616 617 static int 618 epoll_fd_registered(struct thread *td, struct file *epfp, int fd) 619 { 620 /* 621 * Set empty filter flags to avoid accidental modification of already 622 * registered events. In the case of event re-registration: 623 * 1. If event does not exists kevent() does nothing and returns ENOENT 624 * 2. If event does exists, it's enabled/disabled state is preserved 625 * but fflags, data and udata fields are overwritten. So we can not 626 * set socket lowats and store user's context pointer in udata. 627 */ 628 if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || 629 epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) 630 return (1); 631 632 return (0); 633 } 634 635 static int 636 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 637 { 638 int error1, error2; 639 640 error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); 641 error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); 642 643 /* return 0 if at least one result positive */ 644 return (error1 == 0 ? 0 : error2); 645 } 646 647 #ifdef LINUX_LEGACY_SYSCALLS 648 int 649 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 650 { 651 struct specialfd_eventfd ae; 652 653 bzero(&ae, sizeof(ae)); 654 ae.initval = args->initval; 655 return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); 656 } 657 #endif 658 659 int 660 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 661 { 662 struct specialfd_eventfd ae; 663 int flags; 664 665 if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK | 666 LINUX_EFD_SEMAPHORE)) != 0) 667 return (EINVAL); 668 flags = 0; 669 if ((args->flags & LINUX_O_CLOEXEC) != 0) 670 flags |= EFD_CLOEXEC; 671 if ((args->flags & LINUX_O_NONBLOCK) != 0) 672 flags |= EFD_NONBLOCK; 673 if ((args->flags & LINUX_EFD_SEMAPHORE) != 0) 674 flags |= EFD_SEMAPHORE; 675 676 bzero(&ae, sizeof(ae)); 677 ae.flags = flags; 678 ae.initval = args->initval; 679 return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); 680 } 681 682 int 683 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 684 { 685 struct filedesc *fdp; 686 struct timerfd *tfd; 687 struct file *fp; 688 clockid_t clockid; 689 int fflags, fd, error; 690 691 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 692 return (EINVAL); 693 694 error = linux_to_native_clockid(&clockid, args->clockid); 695 if (error != 0) 696 return (error); 697 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 698 return (EINVAL); 699 700 fflags = 0; 701 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 702 fflags |= O_CLOEXEC; 703 704 fdp = td->td_proc->p_fd; 705 error = falloc(td, &fp, &fd, fflags); 706 if (error != 0) 707 return (error); 708 709 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 710 tfd->tfd_clockid = clockid; 711 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 712 713 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 714 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 715 716 fflags = FREAD; 717 if ((args->flags & LINUX_O_NONBLOCK) != 0) 718 fflags |= FNONBLOCK; 719 720 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 721 fdrop(fp, td); 722 723 td->td_retval[0] = fd; 724 return (error); 725 } 726 727 static int 728 timerfd_close(struct file *fp, struct thread *td) 729 { 730 struct timerfd *tfd; 731 732 tfd = fp->f_data; 733 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 734 return (EINVAL); 735 736 timespecclear(&tfd->tfd_time.it_value); 737 timespecclear(&tfd->tfd_time.it_interval); 738 739 mtx_lock(&tfd->tfd_lock); 740 callout_drain(&tfd->tfd_callout); 741 mtx_unlock(&tfd->tfd_lock); 742 743 seldrain(&tfd->tfd_sel); 744 knlist_destroy(&tfd->tfd_sel.si_note); 745 746 fp->f_ops = &badfileops; 747 mtx_destroy(&tfd->tfd_lock); 748 free(tfd, M_EPOLL); 749 750 return (0); 751 } 752 753 static int 754 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 755 int flags, struct thread *td) 756 { 757 struct timerfd *tfd; 758 timerfd_t count; 759 int error; 760 761 tfd = fp->f_data; 762 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 763 return (EINVAL); 764 765 if (uio->uio_resid < sizeof(timerfd_t)) 766 return (EINVAL); 767 768 error = 0; 769 mtx_lock(&tfd->tfd_lock); 770 retry: 771 if (tfd->tfd_canceled) { 772 tfd->tfd_count = 0; 773 mtx_unlock(&tfd->tfd_lock); 774 return (ECANCELED); 775 } 776 if (tfd->tfd_count == 0) { 777 if ((fp->f_flag & FNONBLOCK) != 0) { 778 mtx_unlock(&tfd->tfd_lock); 779 return (EAGAIN); 780 } 781 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 782 if (error == 0) 783 goto retry; 784 } 785 if (error == 0) { 786 count = tfd->tfd_count; 787 tfd->tfd_count = 0; 788 mtx_unlock(&tfd->tfd_lock); 789 error = uiomove(&count, sizeof(timerfd_t), uio); 790 } else 791 mtx_unlock(&tfd->tfd_lock); 792 793 return (error); 794 } 795 796 static int 797 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 798 struct thread *td) 799 { 800 struct timerfd *tfd; 801 int revents = 0; 802 803 tfd = fp->f_data; 804 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 805 return (POLLERR); 806 807 mtx_lock(&tfd->tfd_lock); 808 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 809 revents |= events & (POLLIN|POLLRDNORM); 810 if (revents == 0) 811 selrecord(td, &tfd->tfd_sel); 812 mtx_unlock(&tfd->tfd_lock); 813 814 return (revents); 815 } 816 817 static int 818 timerfd_kqfilter(struct file *fp, struct knote *kn) 819 { 820 struct timerfd *tfd; 821 822 tfd = fp->f_data; 823 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 824 return (EINVAL); 825 826 if (kn->kn_filter == EVFILT_READ) 827 kn->kn_fop = &timerfd_rfiltops; 828 else 829 return (EINVAL); 830 831 kn->kn_hook = tfd; 832 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 833 834 return (0); 835 } 836 837 static void 838 filt_timerfddetach(struct knote *kn) 839 { 840 struct timerfd *tfd = kn->kn_hook; 841 842 mtx_lock(&tfd->tfd_lock); 843 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 844 mtx_unlock(&tfd->tfd_lock); 845 } 846 847 static int 848 filt_timerfdread(struct knote *kn, long hint) 849 { 850 struct timerfd *tfd = kn->kn_hook; 851 852 return (tfd->tfd_count > 0); 853 } 854 855 static int 856 timerfd_ioctl(struct file *fp, u_long cmd, void *data, 857 struct ucred *active_cred, struct thread *td) 858 { 859 860 if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD) 861 return (EINVAL); 862 863 switch (cmd) { 864 case FIONBIO: 865 case FIOASYNC: 866 return (0); 867 } 868 869 return (ENOTTY); 870 } 871 872 static int 873 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 874 struct thread *td) 875 { 876 877 return (ENXIO); 878 } 879 880 static int 881 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 882 { 883 884 kif->kf_type = KF_TYPE_UNKNOWN; 885 return (0); 886 } 887 888 static void 889 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 890 { 891 892 if (tfd->tfd_clockid == CLOCK_REALTIME) 893 getnanotime(ts); 894 else /* CLOCK_MONOTONIC */ 895 getnanouptime(ts); 896 } 897 898 static void 899 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 900 { 901 struct timespec cts; 902 903 linux_timerfd_clocktime(tfd, &cts); 904 *ots = tfd->tfd_time; 905 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 906 timespecsub(&ots->it_value, &cts, &ots->it_value); 907 if (ots->it_value.tv_sec < 0 || 908 (ots->it_value.tv_sec == 0 && 909 ots->it_value.tv_nsec == 0)) { 910 ots->it_value.tv_sec = 0; 911 ots->it_value.tv_nsec = 1; 912 } 913 } 914 } 915 916 int 917 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 918 { 919 struct l_itimerspec lots; 920 struct itimerspec ots; 921 struct timerfd *tfd; 922 struct file *fp; 923 int error; 924 925 error = fget(td, args->fd, &cap_read_rights, &fp); 926 if (error != 0) 927 return (error); 928 tfd = fp->f_data; 929 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 930 error = EINVAL; 931 goto out; 932 } 933 934 mtx_lock(&tfd->tfd_lock); 935 linux_timerfd_curval(tfd, &ots); 936 mtx_unlock(&tfd->tfd_lock); 937 938 error = native_to_linux_itimerspec(&lots, &ots); 939 if (error == 0) 940 error = copyout(&lots, args->old_value, sizeof(lots)); 941 942 out: 943 fdrop(fp, td); 944 return (error); 945 } 946 947 int 948 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 949 { 950 struct l_itimerspec lots; 951 struct itimerspec nts, ots; 952 struct timespec cts, ts; 953 struct timerfd *tfd; 954 struct timeval tv; 955 struct file *fp; 956 int error; 957 958 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 959 return (EINVAL); 960 961 error = copyin(args->new_value, &lots, sizeof(lots)); 962 if (error != 0) 963 return (error); 964 error = linux_to_native_itimerspec(&nts, &lots); 965 if (error != 0) 966 return (error); 967 968 error = fget(td, args->fd, &cap_write_rights, &fp); 969 if (error != 0) 970 return (error); 971 tfd = fp->f_data; 972 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 973 error = EINVAL; 974 goto out; 975 } 976 977 mtx_lock(&tfd->tfd_lock); 978 if (!timespecisset(&nts.it_value)) 979 timespecclear(&nts.it_interval); 980 if (args->old_value != NULL) 981 linux_timerfd_curval(tfd, &ots); 982 983 tfd->tfd_time = nts; 984 if (timespecisset(&nts.it_value)) { 985 linux_timerfd_clocktime(tfd, &cts); 986 ts = nts.it_value; 987 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 988 timespecadd(&tfd->tfd_time.it_value, &cts, 989 &tfd->tfd_time.it_value); 990 } else { 991 timespecsub(&ts, &cts, &ts); 992 } 993 TIMESPEC_TO_TIMEVAL(&tv, &ts); 994 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 995 linux_timerfd_expire, tfd); 996 tfd->tfd_canceled = false; 997 } else { 998 tfd->tfd_canceled = true; 999 callout_stop(&tfd->tfd_callout); 1000 } 1001 mtx_unlock(&tfd->tfd_lock); 1002 1003 if (args->old_value != NULL) { 1004 error = native_to_linux_itimerspec(&lots, &ots); 1005 if (error == 0) 1006 error = copyout(&lots, args->old_value, sizeof(lots)); 1007 } 1008 1009 out: 1010 fdrop(fp, td); 1011 return (error); 1012 } 1013 1014 static void 1015 linux_timerfd_expire(void *arg) 1016 { 1017 struct timespec cts, ts; 1018 struct timeval tv; 1019 struct timerfd *tfd; 1020 1021 tfd = (struct timerfd *)arg; 1022 1023 linux_timerfd_clocktime(tfd, &cts); 1024 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1025 if (timespecisset(&tfd->tfd_time.it_interval)) 1026 timespecadd(&tfd->tfd_time.it_value, 1027 &tfd->tfd_time.it_interval, 1028 &tfd->tfd_time.it_value); 1029 else 1030 /* single shot timer */ 1031 timespecclear(&tfd->tfd_time.it_value); 1032 if (timespecisset(&tfd->tfd_time.it_value)) { 1033 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1034 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1035 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1036 linux_timerfd_expire, tfd); 1037 } 1038 tfd->tfd_count++; 1039 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1040 selwakeup(&tfd->tfd_sel); 1041 wakeup(&tfd->tfd_count); 1042 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1043 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1044 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1045 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1046 linux_timerfd_expire, tfd); 1047 } 1048 } 1049