1 /*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/capsicum.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/errno.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/timespec.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif

#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_event.h>
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
 * data verbatim. Therefore we allocate 64-bit memory block to pass
 * user supplied data for every file descriptor.
 */

typedef uint64_t	epoll_udata_t;

/*
 * Per-process store of the 64-bit epoll user data, indexed by file
 * descriptor number; grown on demand by epoll_fd_install().
 */
struct epoll_emuldata {
	uint32_t fdc;			/* epoll udata max index */
	epoll_udata_t udata[1];		/* epoll user data vector */
};

#define	EPOLL_DEF_SZ		16
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))

/* Linux ABI layout of struct epoll_event; packed on amd64 only. */
struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))

static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, struct file *epfp,
		    int fd, struct epoll_event *l_event, int *kev_flags,
		    struct kevent *kevent, int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_delete_event(struct thread *td, struct file *epfp,
		    int fd, int filter);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);

/* Argument block for the kevent copyin callback. */
struct epoll_copyin_args {
	struct kevent	*changelist;
};

/* Argument block for the kevent copyout callback. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* user result array cursor */
	struct proc		*p;
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error seen */
};

/* eventfd */
typedef uint64_t	eventfd_t;

static fo_rdwr_t	eventfd_read;
static fo_rdwr_t	eventfd_write;
static fo_ioctl_t	eventfd_ioctl;
static fo_poll_t	eventfd_poll;
static fo_kqfilter_t	eventfd_kqfilter;
static fo_stat_t	eventfd_stat;
static fo_close_t	eventfd_close;
static fo_fill_kinfo_t	eventfd_fill_kinfo;

static struct fileops eventfdops = {
	.fo_read = eventfd_read,
	.fo_write = eventfd_write,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,
	.fo_poll = eventfd_poll,
	.fo_kqfilter = eventfd_kqfilter,
	.fo_stat = eventfd_stat,
	.fo_close = eventfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = eventfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_eventfddetach(struct knote *kn);
static int	filt_eventfdread(struct knote *kn, long hint);
static int	filt_eventfdwrite(struct knote *kn, long hint);

static struct filterops eventfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdread
};
static struct filterops eventfd_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdwrite
};

/* timerfd */
typedef uint64_t	timerfd_t;

static fo_rdwr_t	timerfd_read;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

struct eventfd {
	eventfd_t	efd_count;	/* the 64-bit counter */
	uint32_t	efd_flags;	/* Linux flags (O_NONBLOCK etc.) */
	struct selinfo	efd_sel;
	struct mtx	efd_lock;	/* protects all of the above */
};

struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* armed expiry/interval */
	struct callout	tfd_callout;
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;
	struct selinfo	tfd_sel;
	struct mtx	tfd_lock;	/* protects all of the above */
};

static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);


/*
 * Remember the epoll user data supplied for descriptor 'fd', growing the
 * per-process vector as needed.  M_WAITOK allocations cannot fail.
 */
static void
epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
{
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct proc *p;

	p = td->td_proc;

	pem = pem_find(p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

	LINUX_PEM_XLOCK(pem);
	if (pem->epoll == NULL) {
		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
		emd->fdc = fd;
		pem->epoll = emd;
	} else {
		emd = pem->epoll;
		if (fd > emd->fdc) {
			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
			emd->fdc = fd;
			pem->epoll = emd;
		}
	}
	emd->udata[fd] = udata;
	LINUX_PEM_XUNLOCK(pem);
}

/*
 * Create the backing kqueue and pre-size the udata vector to
 * EPOLL_DEF_SZ entries.
 */
static int
epoll_create_common(struct thread *td, int flags)
{
	int error;

	error = kern_kqueue(td, flags, NULL);
	if (error != 0)
		return (error);

	epoll_fd_install(td, EPOLL_DEF_SZ, 0);

	return (0);
}

int
linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
{

	/*
	 * args->size is unused. Linux just tests it
	 * and then forgets it as well.
	 */
	if (args->size <= 0)
		return (EINVAL);

	return (epoll_create_common(td, 0));
}

int
linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
{
	int flags;

	/* Only EPOLL_CLOEXEC (== LINUX_O_CLOEXEC) is a valid flag. */
	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
		return (EINVAL);

	flags = 0;
	if ((args->flags & LINUX_O_CLOEXEC) != 0)
		flags |= O_CLOEXEC;

	return (epoll_create_common(td, flags));
}

/* Structure converting function from epoll to kevent.
 */
static int
epoll_to_kevent(struct thread *td, struct file *epfp,
    int fd, struct epoll_event *l_event, int *kev_flags,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		*kev_flags |= EV_ONESHOT;
	if ((levents & LINUX_EPOLLET) != 0)
		*kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		*kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		*kev_flags |= EV_EOF;

	/*
	 * Flags related to what event is registered: one kevent per
	 * requested direction, so a read+write registration consumes
	 * two slots of the caller-supplied kevent array.
	 */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
		++(*nkevents);
	}

	/*
	 * Unsupported epoll flags: warn once per process (flag guarded
	 * under the pem lock), then reject the registration.
	 */
	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}

/*
 * Structure converting function from kevent to epoll. In a case
 * this is called on error in registration we store the error in
 * event->data and pick it up later in linux_epoll_ctl().
 */
static void
kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
{

	if ((kevent->flags & EV_ERROR) != 0) {
		l_event->events = LINUX_EPOLLERR;
		return;
	}

	switch (kevent->filter) {
	case EVFILT_READ:
		l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
		if ((kevent->flags & EV_EOF) != 0)
			l_event->events |= LINUX_EPOLLRDHUP;
		break;
	case EVFILT_WRITE:
		l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
		break;
	}
}

/*
 * Copyout callback used by kevent. This converts kevent
 * events to epoll events and copies them back to the
 * userspace. This is also called on error on registering
 * of the filter.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct epoll_event *eep;
	int error, fd, i;

	args = (struct epoll_copyout_args*) arg;
	/* Staging buffer: filled under the pem lock, copied out after. */
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	pem = pem_find(args->p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
	LINUX_PEM_SLOCK(pem);
	emd = pem->epoll;
	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));

	for (i = 0; i < count; i++) {
		kevent_to_epoll(&kevp[i], &eep[i]);

		/* Re-attach the udata recorded at registration time. */
		fd = kevp[i].ident;
		KASSERT(fd <= emd->fdc, ("epoll user data vector"
						    " is too small.\n"));
		eep[i].data = emd->udata[fd];
	}
	LINUX_PEM_SUNLOCK(pem);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		args->error = error;	/* remember only the first error */

	free(eep, M_EPOLL);
	return (error);
}

/*
 * Copyin callback used by kevent. This copies already
 * converted filters from kernel memory to the kevent
 * internal kernel memory.
 Hence the memcpy instead of
 * copyin.
 */
static int
epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyin_args *args;

	args = (struct epoll_copyin_args*) arg;

	memcpy(kevp, args->changelist, count * sizeof(*kevp));
	args->changelist += count;

	return (0);
}

/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int kev_flags;
	int nchanges = 0;
	int error;

	/* CTL_DEL ignores the event argument; Linux allows it to be NULL. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	/* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux rejects adding the epoll fd to itself. */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/*
		 * We don't memorize which events were set for this FD
		 * on this level, so just delete all we could have set:
		 * EVFILT_READ and EVFILT_WRITE.  ENOENT is already
		 * filtered out by epoll_delete_event(); any other
		 * failure aborts the modification.
		 */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		/* FALLTHROUGH */

	case LINUX_EPOLL_CTL_ADD:
		kev_flags = EV_ADD | EV_ENABLE;
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags,
	    kev, &nchanges);
	if (error != 0)
		goto leave0;

	/* Record the udata before registering, so copyout can find it. */
	epoll_fd_install(td, args->fd, le.data);

	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}

/*
 * Wait for a filter to be triggered on the epoll file descriptor.
528 */ 529 static int 530 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 531 int maxevents, int timeout, sigset_t *uset) 532 { 533 struct epoll_copyout_args coargs; 534 struct kevent_copyops k_ops = { &coargs, 535 epoll_kev_copyout, 536 NULL}; 537 struct timespec ts, *tsp; 538 cap_rights_t rights; 539 struct file *epfp; 540 sigset_t omask; 541 int error; 542 543 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 544 return (EINVAL); 545 546 error = fget(td, epfd, 547 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 548 if (error != 0) 549 return (error); 550 if (epfp->f_type != DTYPE_KQUEUE) { 551 error = EINVAL; 552 goto leave1; 553 } 554 if (uset != NULL) { 555 error = kern_sigprocmask(td, SIG_SETMASK, uset, 556 &omask, 0); 557 if (error != 0) 558 goto leave1; 559 td->td_pflags |= TDP_OLDMASK; 560 /* 561 * Make sure that ast() is called on return to 562 * usermode and TDP_OLDMASK is cleared, restoring old 563 * sigmask. 564 */ 565 thread_lock(td); 566 td->td_flags |= TDF_ASTPENDING; 567 thread_unlock(td); 568 } 569 570 571 coargs.leventlist = events; 572 coargs.p = td->td_proc; 573 coargs.count = 0; 574 coargs.error = 0; 575 576 if (timeout != -1) { 577 if (timeout < 0) { 578 error = EINVAL; 579 goto leave0; 580 } 581 /* Convert from milliseconds to timespec. */ 582 ts.tv_sec = timeout / 1000; 583 ts.tv_nsec = (timeout % 1000) * 1000000; 584 tsp = &ts; 585 } else { 586 tsp = NULL; 587 } 588 589 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 590 if (error == 0 && coargs.error != 0) 591 error = coargs.error; 592 593 /* 594 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 595 * Maybe we should translate that but I don't think it matters at all. 
596 */ 597 if (error == 0) 598 td->td_retval[0] = coargs.count; 599 600 leave0: 601 if (uset != NULL) 602 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 603 NULL, 0); 604 leave1: 605 fdrop(epfp, td); 606 return (error); 607 } 608 609 int 610 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 611 { 612 613 return (linux_epoll_wait_common(td, args->epfd, args->events, 614 args->maxevents, args->timeout, NULL)); 615 } 616 617 int 618 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 619 { 620 sigset_t mask, *pmask; 621 l_sigset_t lmask; 622 int error; 623 624 if (args->mask != NULL) { 625 if (args->sigsetsize != sizeof(l_sigset_t)) 626 return (EINVAL); 627 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 628 if (error != 0) 629 return (error); 630 linux_to_bsd_sigset(&lmask, &mask); 631 pmask = &mask; 632 } else 633 pmask = NULL; 634 return (linux_epoll_wait_common(td, args->epfd, args->events, 635 args->maxevents, args->timeout, pmask)); 636 } 637 638 static int 639 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 640 { 641 struct epoll_copyin_args ciargs; 642 struct kevent kev; 643 struct kevent_copyops k_ops = { &ciargs, 644 NULL, 645 epoll_kev_copyin}; 646 int error; 647 648 ciargs.changelist = &kev; 649 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 650 651 error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL); 652 653 /* 654 * here we ignore ENONT, because we don't keep track of events here 655 */ 656 if (error == ENOENT) 657 error = 0; 658 return (error); 659 } 660 661 static int 662 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 663 { 664 int error1, error2; 665 666 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 667 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 668 669 /* report any errors we got */ 670 return (error1 == 0 ? 
error2 : error1); 671 } 672 673 static int 674 eventfd_create(struct thread *td, uint32_t initval, int flags) 675 { 676 struct filedesc *fdp; 677 struct eventfd *efd; 678 struct file *fp; 679 int fflags, fd, error; 680 681 fflags = 0; 682 if ((flags & LINUX_O_CLOEXEC) != 0) 683 fflags |= O_CLOEXEC; 684 685 fdp = td->td_proc->p_fd; 686 error = falloc(td, &fp, &fd, fflags); 687 if (error != 0) 688 return (error); 689 690 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 691 efd->efd_flags = flags; 692 efd->efd_count = initval; 693 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 694 695 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 696 697 fflags = FREAD | FWRITE; 698 if ((flags & LINUX_O_NONBLOCK) != 0) 699 fflags |= FNONBLOCK; 700 701 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 702 fdrop(fp, td); 703 704 td->td_retval[0] = fd; 705 return (error); 706 } 707 708 int 709 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 710 { 711 712 return (eventfd_create(td, args->initval, 0)); 713 } 714 715 int 716 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 717 { 718 719 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 720 return (EINVAL); 721 722 return (eventfd_create(td, args->initval, args->flags)); 723 } 724 725 static int 726 eventfd_close(struct file *fp, struct thread *td) 727 { 728 struct eventfd *efd; 729 730 efd = fp->f_data; 731 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 732 return (EBADF); 733 734 seldrain(&efd->efd_sel); 735 knlist_destroy(&efd->efd_sel.si_note); 736 737 fp->f_ops = &badfileops; 738 mtx_destroy(&efd->efd_lock); 739 free(efd, M_EPOLL); 740 741 return (0); 742 } 743 744 static int 745 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 746 int flags, struct thread *td) 747 { 748 struct eventfd *efd; 749 eventfd_t count; 750 int error; 751 752 efd = fp->f_data; 753 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 754 return 
(EBADF); 755 756 if (uio->uio_resid < sizeof(eventfd_t)) 757 return (EINVAL); 758 759 error = 0; 760 mtx_lock(&efd->efd_lock); 761 retry: 762 if (efd->efd_count == 0) { 763 if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) { 764 mtx_unlock(&efd->efd_lock); 765 return (EAGAIN); 766 } 767 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 768 if (error == 0) 769 goto retry; 770 } 771 if (error == 0) { 772 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 773 count = 1; 774 --efd->efd_count; 775 } else { 776 count = efd->efd_count; 777 efd->efd_count = 0; 778 } 779 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 780 selwakeup(&efd->efd_sel); 781 wakeup(&efd->efd_count); 782 mtx_unlock(&efd->efd_lock); 783 error = uiomove(&count, sizeof(eventfd_t), uio); 784 } else 785 mtx_unlock(&efd->efd_lock); 786 787 return (error); 788 } 789 790 static int 791 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 792 int flags, struct thread *td) 793 { 794 struct eventfd *efd; 795 eventfd_t count; 796 int error; 797 798 efd = fp->f_data; 799 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 800 return (EBADF); 801 802 if (uio->uio_resid < sizeof(eventfd_t)) 803 return (EINVAL); 804 805 error = uiomove(&count, sizeof(eventfd_t), uio); 806 if (error != 0) 807 return (error); 808 if (count == UINT64_MAX) 809 return (EINVAL); 810 811 mtx_lock(&efd->efd_lock); 812 retry: 813 if (UINT64_MAX - efd->efd_count <= count) { 814 if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) { 815 mtx_unlock(&efd->efd_lock); 816 /* Do not not return the number of bytes written */ 817 uio->uio_resid += sizeof(eventfd_t); 818 return (EAGAIN); 819 } 820 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 821 PCATCH, "lefdwr", 0); 822 if (error == 0) 823 goto retry; 824 } 825 if (error == 0) { 826 efd->efd_count += count; 827 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 828 selwakeup(&efd->efd_sel); 829 wakeup(&efd->efd_count); 830 } 831 mtx_unlock(&efd->efd_lock); 832 833 return (error); 
834 } 835 836 static int 837 eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 838 struct thread *td) 839 { 840 struct eventfd *efd; 841 int revents = 0; 842 843 efd = fp->f_data; 844 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 845 return (POLLERR); 846 847 mtx_lock(&efd->efd_lock); 848 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 849 revents |= events & (POLLIN|POLLRDNORM); 850 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 851 revents |= events & (POLLOUT|POLLWRNORM); 852 if (revents == 0) 853 selrecord(td, &efd->efd_sel); 854 mtx_unlock(&efd->efd_lock); 855 856 return (revents); 857 } 858 859 /*ARGSUSED*/ 860 static int 861 eventfd_kqfilter(struct file *fp, struct knote *kn) 862 { 863 struct eventfd *efd; 864 865 efd = fp->f_data; 866 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 867 return (EINVAL); 868 869 mtx_lock(&efd->efd_lock); 870 switch (kn->kn_filter) { 871 case EVFILT_READ: 872 kn->kn_fop = &eventfd_rfiltops; 873 break; 874 case EVFILT_WRITE: 875 kn->kn_fop = &eventfd_wfiltops; 876 break; 877 default: 878 mtx_unlock(&efd->efd_lock); 879 return (EINVAL); 880 } 881 882 kn->kn_hook = efd; 883 knlist_add(&efd->efd_sel.si_note, kn, 1); 884 mtx_unlock(&efd->efd_lock); 885 886 return (0); 887 } 888 889 static void 890 filt_eventfddetach(struct knote *kn) 891 { 892 struct eventfd *efd = kn->kn_hook; 893 894 mtx_lock(&efd->efd_lock); 895 knlist_remove(&efd->efd_sel.si_note, kn, 1); 896 mtx_unlock(&efd->efd_lock); 897 } 898 899 /*ARGSUSED*/ 900 static int 901 filt_eventfdread(struct knote *kn, long hint) 902 { 903 struct eventfd *efd = kn->kn_hook; 904 int ret; 905 906 mtx_assert(&efd->efd_lock, MA_OWNED); 907 ret = (efd->efd_count > 0); 908 909 return (ret); 910 } 911 912 /*ARGSUSED*/ 913 static int 914 filt_eventfdwrite(struct knote *kn, long hint) 915 { 916 struct eventfd *efd = kn->kn_hook; 917 int ret; 918 919 mtx_assert(&efd->efd_lock, MA_OWNED); 920 ret = (UINT64_MAX - 1 > efd->efd_count); 
921 922 return (ret); 923 } 924 925 /*ARGSUSED*/ 926 static int 927 eventfd_ioctl(struct file *fp, u_long cmd, void *data, 928 struct ucred *active_cred, struct thread *td) 929 { 930 struct eventfd *efd; 931 932 efd = fp->f_data; 933 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 934 return (EINVAL); 935 936 switch (cmd) 937 { 938 case FIONBIO: 939 if (*(int *)data) 940 efd->efd_flags |= LINUX_O_NONBLOCK; 941 else 942 efd->efd_flags &= ~LINUX_O_NONBLOCK; 943 case FIOASYNC: 944 return (0); 945 default: 946 return (ENXIO); 947 } 948 } 949 950 /*ARGSUSED*/ 951 static int 952 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 953 struct thread *td) 954 { 955 956 return (ENXIO); 957 } 958 959 /*ARGSUSED*/ 960 static int 961 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 962 { 963 964 kif->kf_type = KF_TYPE_UNKNOWN; 965 return (0); 966 } 967 968 int 969 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 970 { 971 struct filedesc *fdp; 972 struct timerfd *tfd; 973 struct file *fp; 974 clockid_t clockid; 975 int fflags, fd, error; 976 977 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 978 return (EINVAL); 979 980 error = linux_to_native_clockid(&clockid, args->clockid); 981 if (error != 0) 982 return (error); 983 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 984 return (EINVAL); 985 986 fflags = 0; 987 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 988 fflags |= O_CLOEXEC; 989 990 fdp = td->td_proc->p_fd; 991 error = falloc(td, &fp, &fd, fflags); 992 if (error != 0) 993 return (error); 994 995 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 996 tfd->tfd_clockid = clockid; 997 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 998 999 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 1000 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 1001 1002 fflags = FREAD; 1003 if ((args->flags & LINUX_O_NONBLOCK) != 0) 1004 fflags |= FNONBLOCK; 1005 1006 
finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 1007 fdrop(fp, td); 1008 1009 td->td_retval[0] = fd; 1010 return (error); 1011 } 1012 1013 static int 1014 timerfd_close(struct file *fp, struct thread *td) 1015 { 1016 struct timerfd *tfd; 1017 1018 tfd = fp->f_data; 1019 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1020 return (EINVAL); 1021 1022 timespecclear(&tfd->tfd_time.it_value); 1023 timespecclear(&tfd->tfd_time.it_interval); 1024 1025 mtx_lock(&tfd->tfd_lock); 1026 callout_drain(&tfd->tfd_callout); 1027 mtx_unlock(&tfd->tfd_lock); 1028 1029 seldrain(&tfd->tfd_sel); 1030 knlist_destroy(&tfd->tfd_sel.si_note); 1031 1032 fp->f_ops = &badfileops; 1033 mtx_destroy(&tfd->tfd_lock); 1034 free(tfd, M_EPOLL); 1035 1036 return (0); 1037 } 1038 1039 static int 1040 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 1041 int flags, struct thread *td) 1042 { 1043 struct timerfd *tfd; 1044 timerfd_t count; 1045 int error; 1046 1047 tfd = fp->f_data; 1048 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1049 return (EINVAL); 1050 1051 if (uio->uio_resid < sizeof(timerfd_t)) 1052 return (EINVAL); 1053 1054 error = 0; 1055 mtx_lock(&tfd->tfd_lock); 1056 retry: 1057 if (tfd->tfd_canceled) { 1058 tfd->tfd_count = 0; 1059 mtx_unlock(&tfd->tfd_lock); 1060 return (ECANCELED); 1061 } 1062 if (tfd->tfd_count == 0) { 1063 if ((fp->f_flag & FNONBLOCK) != 0) { 1064 mtx_unlock(&tfd->tfd_lock); 1065 return (EAGAIN); 1066 } 1067 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 1068 if (error == 0) 1069 goto retry; 1070 } 1071 if (error == 0) { 1072 count = tfd->tfd_count; 1073 tfd->tfd_count = 0; 1074 mtx_unlock(&tfd->tfd_lock); 1075 error = uiomove(&count, sizeof(timerfd_t), uio); 1076 } else 1077 mtx_unlock(&tfd->tfd_lock); 1078 1079 return (error); 1080 } 1081 1082 static int 1083 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 1084 struct thread *td) 1085 { 1086 struct timerfd *tfd; 1087 int revents = 0; 
1088 1089 tfd = fp->f_data; 1090 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1091 return (POLLERR); 1092 1093 mtx_lock(&tfd->tfd_lock); 1094 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 1095 revents |= events & (POLLIN|POLLRDNORM); 1096 if (revents == 0) 1097 selrecord(td, &tfd->tfd_sel); 1098 mtx_unlock(&tfd->tfd_lock); 1099 1100 return (revents); 1101 } 1102 1103 /*ARGSUSED*/ 1104 static int 1105 timerfd_kqfilter(struct file *fp, struct knote *kn) 1106 { 1107 struct timerfd *tfd; 1108 1109 tfd = fp->f_data; 1110 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1111 return (EINVAL); 1112 1113 if (kn->kn_filter == EVFILT_READ) 1114 kn->kn_fop = &timerfd_rfiltops; 1115 else 1116 return (EINVAL); 1117 1118 kn->kn_hook = tfd; 1119 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 1120 1121 return (0); 1122 } 1123 1124 static void 1125 filt_timerfddetach(struct knote *kn) 1126 { 1127 struct timerfd *tfd = kn->kn_hook; 1128 1129 mtx_lock(&tfd->tfd_lock); 1130 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 1131 mtx_unlock(&tfd->tfd_lock); 1132 } 1133 1134 /*ARGSUSED*/ 1135 static int 1136 filt_timerfdread(struct knote *kn, long hint) 1137 { 1138 struct timerfd *tfd = kn->kn_hook; 1139 1140 return (tfd->tfd_count > 0); 1141 } 1142 1143 /*ARGSUSED*/ 1144 static int 1145 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1146 struct thread *td) 1147 { 1148 1149 return (ENXIO); 1150 } 1151 1152 /*ARGSUSED*/ 1153 static int 1154 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1155 { 1156 1157 kif->kf_type = KF_TYPE_UNKNOWN; 1158 return (0); 1159 } 1160 1161 static void 1162 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 1163 { 1164 1165 if (tfd->tfd_clockid == CLOCK_REALTIME) 1166 getnanotime(ts); 1167 else /* CLOCK_MONOTONIC */ 1168 getnanouptime(ts); 1169 } 1170 1171 static void 1172 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 1173 { 1174 struct timespec cts; 1175 1176 
linux_timerfd_clocktime(tfd, &cts); 1177 *ots = tfd->tfd_time; 1178 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 1179 timespecsub(&ots->it_value, &cts); 1180 if (ots->it_value.tv_sec < 0 || 1181 (ots->it_value.tv_sec == 0 && 1182 ots->it_value.tv_nsec == 0)) { 1183 ots->it_value.tv_sec = 0; 1184 ots->it_value.tv_nsec = 1; 1185 } 1186 } 1187 } 1188 1189 int 1190 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 1191 { 1192 cap_rights_t rights; 1193 struct l_itimerspec lots; 1194 struct itimerspec ots; 1195 struct timerfd *tfd; 1196 struct file *fp; 1197 int error; 1198 1199 error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp); 1200 if (error != 0) 1201 return (error); 1202 tfd = fp->f_data; 1203 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1204 error = EINVAL; 1205 goto out; 1206 } 1207 1208 mtx_lock(&tfd->tfd_lock); 1209 linux_timerfd_curval(tfd, &ots); 1210 mtx_unlock(&tfd->tfd_lock); 1211 1212 error = native_to_linux_itimerspec(&lots, &ots); 1213 if (error == 0) 1214 error = copyout(&lots, args->old_value, sizeof(lots)); 1215 1216 out: 1217 fdrop(fp, td); 1218 return (error); 1219 } 1220 1221 int 1222 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 1223 { 1224 struct l_itimerspec lots; 1225 struct itimerspec nts, ots; 1226 struct timespec cts, ts; 1227 cap_rights_t rights; 1228 struct timerfd *tfd; 1229 struct timeval tv; 1230 struct file *fp; 1231 int error; 1232 1233 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 1234 return (EINVAL); 1235 1236 error = copyin(args->new_value, &lots, sizeof(lots)); 1237 if (error != 0) 1238 return (error); 1239 error = linux_to_native_itimerspec(&nts, &lots); 1240 if (error != 0) 1241 return (error); 1242 1243 error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp); 1244 if (error != 0) 1245 return (error); 1246 tfd = fp->f_data; 1247 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1248 error = EINVAL; 
1249 goto out; 1250 } 1251 1252 mtx_lock(&tfd->tfd_lock); 1253 if (!timespecisset(&nts.it_value)) 1254 timespecclear(&nts.it_interval); 1255 if (args->old_value != NULL) 1256 linux_timerfd_curval(tfd, &ots); 1257 1258 tfd->tfd_time = nts; 1259 if (timespecisset(&nts.it_value)) { 1260 linux_timerfd_clocktime(tfd, &cts); 1261 ts = nts.it_value; 1262 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 1263 timespecadd(&tfd->tfd_time.it_value, &cts); 1264 } else { 1265 timespecsub(&ts, &cts); 1266 } 1267 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1268 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1269 linux_timerfd_expire, tfd); 1270 tfd->tfd_canceled = false; 1271 } else { 1272 tfd->tfd_canceled = true; 1273 callout_stop(&tfd->tfd_callout); 1274 } 1275 mtx_unlock(&tfd->tfd_lock); 1276 1277 if (args->old_value != NULL) { 1278 error = native_to_linux_itimerspec(&lots, &ots); 1279 if (error == 0) 1280 error = copyout(&lots, args->old_value, sizeof(lots)); 1281 } 1282 1283 out: 1284 fdrop(fp, td); 1285 return (error); 1286 } 1287 1288 static void 1289 linux_timerfd_expire(void *arg) 1290 { 1291 struct timespec cts, ts; 1292 struct timeval tv; 1293 struct timerfd *tfd; 1294 1295 tfd = (struct timerfd *)arg; 1296 1297 linux_timerfd_clocktime(tfd, &cts); 1298 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1299 if (timespecisset(&tfd->tfd_time.it_interval)) 1300 timespecadd(&tfd->tfd_time.it_value, 1301 &tfd->tfd_time.it_interval); 1302 else 1303 /* single shot timer */ 1304 timespecclear(&tfd->tfd_time.it_value); 1305 if (timespecisset(&tfd->tfd_time.it_value)) { 1306 ts = tfd->tfd_time.it_value; 1307 timespecsub(&ts, &cts); 1308 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1309 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1310 linux_timerfd_expire, tfd); 1311 } 1312 tfd->tfd_count++; 1313 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1314 selwakeup(&tfd->tfd_sel); 1315 wakeup(&tfd->tfd_count); 1316 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1317 ts = tfd->tfd_time.it_value; 1318 
timespecsub(&ts, &cts); 1319 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1320 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1321 linux_timerfd_expire, tfd); 1322 } 1323 } 1324