1 /*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_compat.h" 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/imgact.h> 36 #include <sys/kernel.h> 37 #include <sys/limits.h> 38 #include <sys/lock.h> 39 #include <sys/mutex.h> 40 #include <sys/callout.h> 41 #include <sys/capsicum.h> 42 #include <sys/types.h> 43 #include <sys/user.h> 44 #include <sys/file.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/errno.h> 48 #include <sys/event.h> 49 #include <sys/poll.h> 50 #include <sys/proc.h> 51 #include <sys/selinfo.h> 52 #include <sys/sx.h> 53 #include <sys/syscallsubr.h> 54 #include <sys/timespec.h> 55 56 #ifdef COMPAT_LINUX32 57 #include <machine/../linux32/linux.h> 58 #include <machine/../linux32/linux32_proto.h> 59 #else 60 #include <machine/../linux/linux.h> 61 #include <machine/../linux/linux_proto.h> 62 #endif 63 64 #include <compat/linux/linux_emul.h> 65 #include <compat/linux/linux_event.h> 66 #include <compat/linux/linux_file.h> 67 #include <compat/linux/linux_timer.h> 68 #include <compat/linux/linux_util.h> 69 70 /* 71 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 72 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 73 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 74 * data verbatuim. Therefore we allocate 64-bit memory block to pass 75 * user supplied data for every file descriptor. 
76 */ 77 78 typedef uint64_t epoll_udata_t; 79 80 struct epoll_emuldata { 81 uint32_t fdc; /* epoll udata max index */ 82 epoll_udata_t udata[1]; /* epoll user data vector */ 83 }; 84 85 #define EPOLL_DEF_SZ 16 86 #define EPOLL_SIZE(fdn) \ 87 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 88 89 struct epoll_event { 90 uint32_t events; 91 epoll_udata_t data; 92 } 93 #if defined(__amd64__) 94 __attribute__((packed)) 95 #endif 96 ; 97 98 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 99 100 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 101 static int epoll_to_kevent(struct thread *td, struct file *epfp, 102 int fd, struct epoll_event *l_event, int *kev_flags, 103 struct kevent *kevent, int *nkevents); 104 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 105 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 106 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 107 static int epoll_delete_event(struct thread *td, struct file *epfp, 108 int fd, int filter); 109 static int epoll_delete_all_events(struct thread *td, struct file *epfp, 110 int fd); 111 112 struct epoll_copyin_args { 113 struct kevent *changelist; 114 }; 115 116 struct epoll_copyout_args { 117 struct epoll_event *leventlist; 118 struct proc *p; 119 uint32_t count; 120 int error; 121 }; 122 123 /* eventfd */ 124 typedef uint64_t eventfd_t; 125 126 static fo_rdwr_t eventfd_read; 127 static fo_rdwr_t eventfd_write; 128 static fo_ioctl_t eventfd_ioctl; 129 static fo_poll_t eventfd_poll; 130 static fo_kqfilter_t eventfd_kqfilter; 131 static fo_stat_t eventfd_stat; 132 static fo_close_t eventfd_close; 133 static fo_fill_kinfo_t eventfd_fill_kinfo; 134 135 static struct fileops eventfdops = { 136 .fo_read = eventfd_read, 137 .fo_write = eventfd_write, 138 .fo_truncate = invfo_truncate, 139 .fo_ioctl = eventfd_ioctl, 140 .fo_poll = eventfd_poll, 141 .fo_kqfilter = 
eventfd_kqfilter, 142 .fo_stat = eventfd_stat, 143 .fo_close = eventfd_close, 144 .fo_chmod = invfo_chmod, 145 .fo_chown = invfo_chown, 146 .fo_sendfile = invfo_sendfile, 147 .fo_fill_kinfo = eventfd_fill_kinfo, 148 .fo_flags = DFLAG_PASSABLE 149 }; 150 151 static void filt_eventfddetach(struct knote *kn); 152 static int filt_eventfdread(struct knote *kn, long hint); 153 static int filt_eventfdwrite(struct knote *kn, long hint); 154 155 static struct filterops eventfd_rfiltops = { 156 .f_isfd = 1, 157 .f_detach = filt_eventfddetach, 158 .f_event = filt_eventfdread 159 }; 160 static struct filterops eventfd_wfiltops = { 161 .f_isfd = 1, 162 .f_detach = filt_eventfddetach, 163 .f_event = filt_eventfdwrite 164 }; 165 166 /* timerfd */ 167 typedef uint64_t timerfd_t; 168 169 static fo_rdwr_t timerfd_read; 170 static fo_poll_t timerfd_poll; 171 static fo_kqfilter_t timerfd_kqfilter; 172 static fo_stat_t timerfd_stat; 173 static fo_close_t timerfd_close; 174 static fo_fill_kinfo_t timerfd_fill_kinfo; 175 176 static struct fileops timerfdops = { 177 .fo_read = timerfd_read, 178 .fo_write = invfo_rdwr, 179 .fo_truncate = invfo_truncate, 180 .fo_ioctl = eventfd_ioctl, 181 .fo_poll = timerfd_poll, 182 .fo_kqfilter = timerfd_kqfilter, 183 .fo_stat = timerfd_stat, 184 .fo_close = timerfd_close, 185 .fo_chmod = invfo_chmod, 186 .fo_chown = invfo_chown, 187 .fo_sendfile = invfo_sendfile, 188 .fo_fill_kinfo = timerfd_fill_kinfo, 189 .fo_flags = DFLAG_PASSABLE 190 }; 191 192 static void filt_timerfddetach(struct knote *kn); 193 static int filt_timerfdread(struct knote *kn, long hint); 194 195 static struct filterops timerfd_rfiltops = { 196 .f_isfd = 1, 197 .f_detach = filt_timerfddetach, 198 .f_event = filt_timerfdread 199 }; 200 201 struct eventfd { 202 eventfd_t efd_count; 203 uint32_t efd_flags; 204 struct selinfo efd_sel; 205 struct mtx efd_lock; 206 }; 207 208 struct timerfd { 209 clockid_t tfd_clockid; 210 struct itimerspec tfd_time; 211 struct callout tfd_callout; 212 
timerfd_t tfd_count; 213 bool tfd_canceled; 214 struct selinfo tfd_sel; 215 struct mtx tfd_lock; 216 }; 217 218 static int eventfd_create(struct thread *td, uint32_t initval, int flags); 219 static void linux_timerfd_expire(void *); 220 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 221 222 223 static void 224 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 225 { 226 struct linux_pemuldata *pem; 227 struct epoll_emuldata *emd; 228 struct proc *p; 229 230 p = td->td_proc; 231 232 pem = pem_find(p); 233 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 234 235 LINUX_PEM_XLOCK(pem); 236 if (pem->epoll == NULL) { 237 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 238 emd->fdc = fd; 239 pem->epoll = emd; 240 } else { 241 emd = pem->epoll; 242 if (fd > emd->fdc) { 243 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 244 emd->fdc = fd; 245 pem->epoll = emd; 246 } 247 } 248 emd->udata[fd] = udata; 249 LINUX_PEM_XUNLOCK(pem); 250 } 251 252 static int 253 epoll_create_common(struct thread *td, int flags) 254 { 255 int error; 256 257 error = kern_kqueue(td, flags, NULL); 258 if (error != 0) 259 return (error); 260 261 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 262 263 return (0); 264 } 265 266 int 267 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 268 { 269 270 /* 271 * args->size is unused. Linux just tests it 272 * and then forgets it as well. 273 */ 274 if (args->size <= 0) 275 return (EINVAL); 276 277 return (epoll_create_common(td, 0)); 278 } 279 280 int 281 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 282 { 283 int flags; 284 285 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 286 return (EINVAL); 287 288 flags = 0; 289 if ((args->flags & LINUX_O_CLOEXEC) != 0) 290 flags |= O_CLOEXEC; 291 292 return (epoll_create_common(td, flags)); 293 } 294 295 /* Structure converting function from epoll to kevent. 
*/
/*
 * Fills up to two entries of 'kevent' (read, then write) for 'fd',
 * bumping *nkevents for each one, and ORs the registration flags
 * derived from the epoll mask into *kev_flags.  Returns EINVAL when
 * the mask carries bits outside LINUX_EPOLL_EVSUP; the first such
 * request in a process is also logged through linux_msg().
 */
static int
epoll_to_kevent(struct thread *td, struct file *epfp,
    int fd, struct epoll_event *l_event, int *kev_flags,
    struct kevent *kevent, int *nkevents)
{
	struct linux_pemuldata *pem;
	uint32_t mask;
	int how;
	bool first_report;

	mask = l_event->events;

	/* flags related to how event is registered */
	how = *kev_flags;
	if ((mask & LINUX_EPOLLONESHOT) != 0)
		how |= EV_ONESHOT;
	if ((mask & LINUX_EPOLLET) != 0)
		how |= EV_CLEAR;
	if ((mask & LINUX_EPOLLERR) != 0)
		how |= EV_ERROR;
	if ((mask & LINUX_EPOLLRDHUP) != 0)
		how |= EV_EOF;
	*kev_flags = how;

	/* flags related to what event is registered */
	if ((mask & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent, fd, EVFILT_READ, how, 0, 0, 0);
		kevent++;
		++(*nkevents);
	}
	if ((mask & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent, fd, EVFILT_WRITE, how, 0, 0, 0);
		kevent++;
		++(*nkevents);
	}

	/* Nothing unsupported was requested: done. */
	if ((mask & ~(LINUX_EPOLL_EVSUP)) == 0)
		return (0);

	pem = pem_find(td->td_proc);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
	KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

	/* Complain once per process; the message is emitted unlocked. */
	LINUX_PEM_XLOCK(pem);
	first_report = (pem->flags & LINUX_XUNSUP_EPOLL) == 0;
	if (first_report)
		pem->flags |= LINUX_XUNSUP_EPOLL;
	LINUX_PEM_XUNLOCK(pem);
	if (first_report)
		linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
		    mask);

	return (EINVAL);
}

/*
 * Structure converting function from kevent to epoll. In a case
 * this is called on error in registration we store the error in
 * event->data and pick it up later in linux_epoll_ctl().
350 */ 351 static void 352 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 353 { 354 355 if ((kevent->flags & EV_ERROR) != 0) { 356 l_event->events = LINUX_EPOLLERR; 357 return; 358 } 359 360 /* XXX EPOLLPRI, EPOLLHUP */ 361 switch (kevent->filter) { 362 case EVFILT_READ: 363 l_event->events = LINUX_EPOLLIN; 364 if ((kevent->flags & EV_EOF) != 0) 365 l_event->events |= LINUX_EPOLLRDHUP; 366 break; 367 case EVFILT_WRITE: 368 l_event->events = LINUX_EPOLLOUT; 369 break; 370 } 371 } 372 373 /* 374 * Copyout callback used by kevent. This converts kevent 375 * events to epoll events and copies them back to the 376 * userspace. This is also called on error on registering 377 * of the filter. 378 */ 379 static int 380 epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 381 { 382 struct epoll_copyout_args *args; 383 struct linux_pemuldata *pem; 384 struct epoll_emuldata *emd; 385 struct epoll_event *eep; 386 int error, fd, i; 387 388 args = (struct epoll_copyout_args*) arg; 389 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 390 391 pem = pem_find(args->p); 392 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 393 LINUX_PEM_SLOCK(pem); 394 emd = pem->epoll; 395 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 396 397 for (i = 0; i < count; i++) { 398 kevent_to_epoll(&kevp[i], &eep[i]); 399 400 fd = kevp[i].ident; 401 KASSERT(fd <= emd->fdc, ("epoll user data vector" 402 " is too small.\n")); 403 eep[i].data = emd->udata[fd]; 404 } 405 LINUX_PEM_SUNLOCK(pem); 406 407 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 408 if (error == 0) { 409 args->leventlist += count; 410 args->count += count; 411 } else if (args->error == 0) 412 args->error = error; 413 414 free(eep, M_EPOLL); 415 return (error); 416 } 417 418 /* 419 * Copyin callback used by kevent. This copies already 420 * converted filters from kernel memory to the kevent 421 * internal kernel memory. 
Hence the memcpy instead of 422 * copyin. 423 */ 424 static int 425 epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 426 { 427 struct epoll_copyin_args *args; 428 429 args = (struct epoll_copyin_args*) arg; 430 431 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 432 args->changelist += count; 433 434 return (0); 435 } 436 437 /* 438 * Load epoll filter, convert it to kevent filter 439 * and load it into kevent subsystem. 440 */ 441 int 442 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 443 { 444 struct file *epfp, *fp; 445 struct epoll_copyin_args ciargs; 446 struct kevent kev[2]; 447 struct kevent_copyops k_ops = { &ciargs, 448 NULL, 449 epoll_kev_copyin}; 450 struct epoll_event le; 451 cap_rights_t rights; 452 int kev_flags; 453 int nchanges = 0; 454 int error; 455 456 if (args->op != LINUX_EPOLL_CTL_DEL) { 457 error = copyin(args->event, &le, sizeof(le)); 458 if (error != 0) 459 return (error); 460 } 461 462 error = fget(td, args->epfd, 463 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 464 if (error != 0) 465 return (error); 466 if (epfp->f_type != DTYPE_KQUEUE) { 467 error = EINVAL; 468 goto leave1; 469 } 470 471 /* Protect user data vector from incorrectly supplied fd. 
*/ 472 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 473 if (error != 0) 474 goto leave1; 475 476 /* Linux disallows spying on himself */ 477 if (epfp == fp) { 478 error = EINVAL; 479 goto leave0; 480 } 481 482 ciargs.changelist = kev; 483 484 if (args->op != LINUX_EPOLL_CTL_DEL) { 485 kev_flags = EV_ADD | EV_ENABLE; 486 error = epoll_to_kevent(td, epfp, args->fd, &le, 487 &kev_flags, kev, &nchanges); 488 if (error != 0) 489 goto leave0; 490 } 491 492 switch (args->op) { 493 case LINUX_EPOLL_CTL_MOD: 494 error = epoll_delete_all_events(td, epfp, args->fd); 495 if (error != 0) 496 goto leave0; 497 break; 498 499 case LINUX_EPOLL_CTL_ADD: 500 /* 501 * kqueue_register() return ENOENT if event does not exists 502 * and the EV_ADD flag is not set. 503 */ 504 kev[0].flags &= ~EV_ADD; 505 error = kqfd_register(args->epfd, &kev[0], td, 1); 506 if (error != ENOENT) { 507 error = EEXIST; 508 goto leave0; 509 } 510 error = 0; 511 kev[0].flags |= EV_ADD; 512 break; 513 514 case LINUX_EPOLL_CTL_DEL: 515 /* CTL_DEL means unregister this fd with this epoll */ 516 error = epoll_delete_all_events(td, epfp, args->fd); 517 goto leave0; 518 519 default: 520 error = EINVAL; 521 goto leave0; 522 } 523 524 epoll_fd_install(td, args->fd, le.data); 525 526 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 527 528 leave0: 529 fdrop(fp, td); 530 531 leave1: 532 fdrop(epfp, td); 533 return (error); 534 } 535 536 /* 537 * Wait for a filter to be triggered on the epoll file descriptor. 
538 */ 539 static int 540 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 541 int maxevents, int timeout, sigset_t *uset) 542 { 543 struct epoll_copyout_args coargs; 544 struct kevent_copyops k_ops = { &coargs, 545 epoll_kev_copyout, 546 NULL}; 547 struct timespec ts, *tsp; 548 cap_rights_t rights; 549 struct file *epfp; 550 sigset_t omask; 551 int error; 552 553 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 554 return (EINVAL); 555 556 error = fget(td, epfd, 557 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 558 if (error != 0) 559 return (error); 560 if (epfp->f_type != DTYPE_KQUEUE) { 561 error = EINVAL; 562 goto leave1; 563 } 564 if (uset != NULL) { 565 error = kern_sigprocmask(td, SIG_SETMASK, uset, 566 &omask, 0); 567 if (error != 0) 568 goto leave1; 569 td->td_pflags |= TDP_OLDMASK; 570 /* 571 * Make sure that ast() is called on return to 572 * usermode and TDP_OLDMASK is cleared, restoring old 573 * sigmask. 574 */ 575 thread_lock(td); 576 td->td_flags |= TDF_ASTPENDING; 577 thread_unlock(td); 578 } 579 580 581 coargs.leventlist = events; 582 coargs.p = td->td_proc; 583 coargs.count = 0; 584 coargs.error = 0; 585 586 if (timeout != -1) { 587 if (timeout < 0) { 588 error = EINVAL; 589 goto leave0; 590 } 591 /* Convert from milliseconds to timespec. */ 592 ts.tv_sec = timeout / 1000; 593 ts.tv_nsec = (timeout % 1000) * 1000000; 594 tsp = &ts; 595 } else { 596 tsp = NULL; 597 } 598 599 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 600 if (error == 0 && coargs.error != 0) 601 error = coargs.error; 602 603 /* 604 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 605 * Maybe we should translate that but I don't think it matters at all. 
606 */ 607 if (error == 0) 608 td->td_retval[0] = coargs.count; 609 610 leave0: 611 if (uset != NULL) 612 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 613 NULL, 0); 614 leave1: 615 fdrop(epfp, td); 616 return (error); 617 } 618 619 int 620 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 621 { 622 623 return (linux_epoll_wait_common(td, args->epfd, args->events, 624 args->maxevents, args->timeout, NULL)); 625 } 626 627 int 628 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 629 { 630 sigset_t mask, *pmask; 631 l_sigset_t lmask; 632 int error; 633 634 if (args->mask != NULL) { 635 if (args->sigsetsize != sizeof(l_sigset_t)) 636 return (EINVAL); 637 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 638 if (error != 0) 639 return (error); 640 linux_to_bsd_sigset(&lmask, &mask); 641 pmask = &mask; 642 } else 643 pmask = NULL; 644 return (linux_epoll_wait_common(td, args->epfd, args->events, 645 args->maxevents, args->timeout, pmask)); 646 } 647 648 static int 649 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 650 { 651 struct epoll_copyin_args ciargs; 652 struct kevent kev; 653 struct kevent_copyops k_ops = { &ciargs, 654 NULL, 655 epoll_kev_copyin}; 656 657 ciargs.changelist = &kev; 658 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 659 660 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 661 } 662 663 static int 664 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 665 { 666 int error1, error2; 667 668 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 669 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 670 671 /* return 0 if at least one result positive */ 672 return (error1 == 0 ? 
0 : error2); 673 } 674 675 static int 676 eventfd_create(struct thread *td, uint32_t initval, int flags) 677 { 678 struct filedesc *fdp; 679 struct eventfd *efd; 680 struct file *fp; 681 int fflags, fd, error; 682 683 fflags = 0; 684 if ((flags & LINUX_O_CLOEXEC) != 0) 685 fflags |= O_CLOEXEC; 686 687 fdp = td->td_proc->p_fd; 688 error = falloc(td, &fp, &fd, fflags); 689 if (error != 0) 690 return (error); 691 692 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 693 efd->efd_flags = flags; 694 efd->efd_count = initval; 695 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 696 697 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 698 699 fflags = FREAD | FWRITE; 700 if ((flags & LINUX_O_NONBLOCK) != 0) 701 fflags |= FNONBLOCK; 702 703 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 704 fdrop(fp, td); 705 706 td->td_retval[0] = fd; 707 return (error); 708 } 709 710 int 711 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 712 { 713 714 return (eventfd_create(td, args->initval, 0)); 715 } 716 717 int 718 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 719 { 720 721 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 722 return (EINVAL); 723 724 return (eventfd_create(td, args->initval, args->flags)); 725 } 726 727 static int 728 eventfd_close(struct file *fp, struct thread *td) 729 { 730 struct eventfd *efd; 731 732 efd = fp->f_data; 733 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 734 return (EINVAL); 735 736 seldrain(&efd->efd_sel); 737 knlist_destroy(&efd->efd_sel.si_note); 738 739 fp->f_ops = &badfileops; 740 mtx_destroy(&efd->efd_lock); 741 free(efd, M_EPOLL); 742 743 return (0); 744 } 745 746 static int 747 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 748 int flags, struct thread *td) 749 { 750 struct eventfd *efd; 751 eventfd_t count; 752 int error; 753 754 efd = fp->f_data; 755 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 756 return 
(EINVAL); 757 758 if (uio->uio_resid < sizeof(eventfd_t)) 759 return (EINVAL); 760 761 error = 0; 762 mtx_lock(&efd->efd_lock); 763 retry: 764 if (efd->efd_count == 0) { 765 if ((fp->f_flag & FNONBLOCK) != 0) { 766 mtx_unlock(&efd->efd_lock); 767 return (EAGAIN); 768 } 769 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 770 if (error == 0) 771 goto retry; 772 } 773 if (error == 0) { 774 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 775 count = 1; 776 --efd->efd_count; 777 } else { 778 count = efd->efd_count; 779 efd->efd_count = 0; 780 } 781 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 782 selwakeup(&efd->efd_sel); 783 wakeup(&efd->efd_count); 784 mtx_unlock(&efd->efd_lock); 785 error = uiomove(&count, sizeof(eventfd_t), uio); 786 } else 787 mtx_unlock(&efd->efd_lock); 788 789 return (error); 790 } 791 792 static int 793 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 794 int flags, struct thread *td) 795 { 796 struct eventfd *efd; 797 eventfd_t count; 798 int error; 799 800 efd = fp->f_data; 801 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 802 return (EINVAL); 803 804 if (uio->uio_resid < sizeof(eventfd_t)) 805 return (EINVAL); 806 807 error = uiomove(&count, sizeof(eventfd_t), uio); 808 if (error != 0) 809 return (error); 810 if (count == UINT64_MAX) 811 return (EINVAL); 812 813 mtx_lock(&efd->efd_lock); 814 retry: 815 if (UINT64_MAX - efd->efd_count <= count) { 816 if ((fp->f_flag & FNONBLOCK) != 0) { 817 mtx_unlock(&efd->efd_lock); 818 /* Do not not return the number of bytes written */ 819 uio->uio_resid += sizeof(eventfd_t); 820 return (EAGAIN); 821 } 822 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 823 PCATCH, "lefdwr", 0); 824 if (error == 0) 825 goto retry; 826 } 827 if (error == 0) { 828 efd->efd_count += count; 829 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 830 selwakeup(&efd->efd_sel); 831 wakeup(&efd->efd_count); 832 } 833 mtx_unlock(&efd->efd_lock); 834 835 return (error); 836 } 837 838 static 
int 839 eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 840 struct thread *td) 841 { 842 struct eventfd *efd; 843 int revents = 0; 844 845 efd = fp->f_data; 846 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 847 return (POLLERR); 848 849 mtx_lock(&efd->efd_lock); 850 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 851 revents |= events & (POLLIN|POLLRDNORM); 852 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 853 revents |= events & (POLLOUT|POLLWRNORM); 854 if (revents == 0) 855 selrecord(td, &efd->efd_sel); 856 mtx_unlock(&efd->efd_lock); 857 858 return (revents); 859 } 860 861 /*ARGSUSED*/ 862 static int 863 eventfd_kqfilter(struct file *fp, struct knote *kn) 864 { 865 struct eventfd *efd; 866 867 efd = fp->f_data; 868 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 869 return (EINVAL); 870 871 mtx_lock(&efd->efd_lock); 872 switch (kn->kn_filter) { 873 case EVFILT_READ: 874 kn->kn_fop = &eventfd_rfiltops; 875 break; 876 case EVFILT_WRITE: 877 kn->kn_fop = &eventfd_wfiltops; 878 break; 879 default: 880 mtx_unlock(&efd->efd_lock); 881 return (EINVAL); 882 } 883 884 kn->kn_hook = efd; 885 knlist_add(&efd->efd_sel.si_note, kn, 1); 886 mtx_unlock(&efd->efd_lock); 887 888 return (0); 889 } 890 891 static void 892 filt_eventfddetach(struct knote *kn) 893 { 894 struct eventfd *efd = kn->kn_hook; 895 896 mtx_lock(&efd->efd_lock); 897 knlist_remove(&efd->efd_sel.si_note, kn, 1); 898 mtx_unlock(&efd->efd_lock); 899 } 900 901 /*ARGSUSED*/ 902 static int 903 filt_eventfdread(struct knote *kn, long hint) 904 { 905 struct eventfd *efd = kn->kn_hook; 906 int ret; 907 908 mtx_assert(&efd->efd_lock, MA_OWNED); 909 ret = (efd->efd_count > 0); 910 911 return (ret); 912 } 913 914 /*ARGSUSED*/ 915 static int 916 filt_eventfdwrite(struct knote *kn, long hint) 917 { 918 struct eventfd *efd = kn->kn_hook; 919 int ret; 920 921 mtx_assert(&efd->efd_lock, MA_OWNED); 922 ret = (UINT64_MAX - 1 > efd->efd_count); 923 924 return (ret); 
925 } 926 927 /*ARGSUSED*/ 928 static int 929 eventfd_ioctl(struct file *fp, u_long cmd, void *data, 930 struct ucred *active_cred, struct thread *td) 931 { 932 933 if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && 934 fp->f_type != DTYPE_LINUXTFD)) 935 return (EINVAL); 936 937 switch (cmd) 938 { 939 case FIONBIO: 940 if ((*(int *)data)) 941 atomic_set_int(&fp->f_flag, FNONBLOCK); 942 else 943 atomic_clear_int(&fp->f_flag, FNONBLOCK); 944 case FIOASYNC: 945 return (0); 946 default: 947 return (ENXIO); 948 } 949 } 950 951 /*ARGSUSED*/ 952 static int 953 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 954 struct thread *td) 955 { 956 957 return (ENXIO); 958 } 959 960 /*ARGSUSED*/ 961 static int 962 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 963 { 964 965 kif->kf_type = KF_TYPE_UNKNOWN; 966 return (0); 967 } 968 969 int 970 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 971 { 972 struct filedesc *fdp; 973 struct timerfd *tfd; 974 struct file *fp; 975 clockid_t clockid; 976 int fflags, fd, error; 977 978 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 979 return (EINVAL); 980 981 error = linux_to_native_clockid(&clockid, args->clockid); 982 if (error != 0) 983 return (error); 984 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 985 return (EINVAL); 986 987 fflags = 0; 988 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 989 fflags |= O_CLOEXEC; 990 991 fdp = td->td_proc->p_fd; 992 error = falloc(td, &fp, &fd, fflags); 993 if (error != 0) 994 return (error); 995 996 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 997 tfd->tfd_clockid = clockid; 998 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 999 1000 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 1001 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 1002 1003 fflags = FREAD; 1004 if ((args->flags & LINUX_O_NONBLOCK) != 0) 1005 fflags |= FNONBLOCK; 1006 1007 finit(fp, fflags, 
DTYPE_LINUXTFD, tfd, &timerfdops); 1008 fdrop(fp, td); 1009 1010 td->td_retval[0] = fd; 1011 return (error); 1012 } 1013 1014 static int 1015 timerfd_close(struct file *fp, struct thread *td) 1016 { 1017 struct timerfd *tfd; 1018 1019 tfd = fp->f_data; 1020 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1021 return (EINVAL); 1022 1023 timespecclear(&tfd->tfd_time.it_value); 1024 timespecclear(&tfd->tfd_time.it_interval); 1025 1026 mtx_lock(&tfd->tfd_lock); 1027 callout_drain(&tfd->tfd_callout); 1028 mtx_unlock(&tfd->tfd_lock); 1029 1030 seldrain(&tfd->tfd_sel); 1031 knlist_destroy(&tfd->tfd_sel.si_note); 1032 1033 fp->f_ops = &badfileops; 1034 mtx_destroy(&tfd->tfd_lock); 1035 free(tfd, M_EPOLL); 1036 1037 return (0); 1038 } 1039 1040 static int 1041 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 1042 int flags, struct thread *td) 1043 { 1044 struct timerfd *tfd; 1045 timerfd_t count; 1046 int error; 1047 1048 tfd = fp->f_data; 1049 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1050 return (EINVAL); 1051 1052 if (uio->uio_resid < sizeof(timerfd_t)) 1053 return (EINVAL); 1054 1055 error = 0; 1056 mtx_lock(&tfd->tfd_lock); 1057 retry: 1058 if (tfd->tfd_canceled) { 1059 tfd->tfd_count = 0; 1060 mtx_unlock(&tfd->tfd_lock); 1061 return (ECANCELED); 1062 } 1063 if (tfd->tfd_count == 0) { 1064 if ((fp->f_flag & FNONBLOCK) != 0) { 1065 mtx_unlock(&tfd->tfd_lock); 1066 return (EAGAIN); 1067 } 1068 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 1069 if (error == 0) 1070 goto retry; 1071 } 1072 if (error == 0) { 1073 count = tfd->tfd_count; 1074 tfd->tfd_count = 0; 1075 mtx_unlock(&tfd->tfd_lock); 1076 error = uiomove(&count, sizeof(timerfd_t), uio); 1077 } else 1078 mtx_unlock(&tfd->tfd_lock); 1079 1080 return (error); 1081 } 1082 1083 static int 1084 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 1085 struct thread *td) 1086 { 1087 struct timerfd *tfd; 1088 int revents = 0; 1089 1090 tfd = 
fp->f_data; 1091 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1092 return (POLLERR); 1093 1094 mtx_lock(&tfd->tfd_lock); 1095 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 1096 revents |= events & (POLLIN|POLLRDNORM); 1097 if (revents == 0) 1098 selrecord(td, &tfd->tfd_sel); 1099 mtx_unlock(&tfd->tfd_lock); 1100 1101 return (revents); 1102 } 1103 1104 /*ARGSUSED*/ 1105 static int 1106 timerfd_kqfilter(struct file *fp, struct knote *kn) 1107 { 1108 struct timerfd *tfd; 1109 1110 tfd = fp->f_data; 1111 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1112 return (EINVAL); 1113 1114 if (kn->kn_filter == EVFILT_READ) 1115 kn->kn_fop = &timerfd_rfiltops; 1116 else 1117 return (EINVAL); 1118 1119 kn->kn_hook = tfd; 1120 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 1121 1122 return (0); 1123 } 1124 1125 static void 1126 filt_timerfddetach(struct knote *kn) 1127 { 1128 struct timerfd *tfd = kn->kn_hook; 1129 1130 mtx_lock(&tfd->tfd_lock); 1131 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 1132 mtx_unlock(&tfd->tfd_lock); 1133 } 1134 1135 /*ARGSUSED*/ 1136 static int 1137 filt_timerfdread(struct knote *kn, long hint) 1138 { 1139 struct timerfd *tfd = kn->kn_hook; 1140 1141 return (tfd->tfd_count > 0); 1142 } 1143 1144 /*ARGSUSED*/ 1145 static int 1146 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1147 struct thread *td) 1148 { 1149 1150 return (ENXIO); 1151 } 1152 1153 /*ARGSUSED*/ 1154 static int 1155 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1156 { 1157 1158 kif->kf_type = KF_TYPE_UNKNOWN; 1159 return (0); 1160 } 1161 1162 static void 1163 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 1164 { 1165 1166 if (tfd->tfd_clockid == CLOCK_REALTIME) 1167 getnanotime(ts); 1168 else /* CLOCK_MONOTONIC */ 1169 getnanouptime(ts); 1170 } 1171 1172 static void 1173 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 1174 { 1175 struct timespec cts; 1176 1177 
linux_timerfd_clocktime(tfd, &cts); 1178 *ots = tfd->tfd_time; 1179 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 1180 timespecsub(&ots->it_value, &cts); 1181 if (ots->it_value.tv_sec < 0 || 1182 (ots->it_value.tv_sec == 0 && 1183 ots->it_value.tv_nsec == 0)) { 1184 ots->it_value.tv_sec = 0; 1185 ots->it_value.tv_nsec = 1; 1186 } 1187 } 1188 } 1189 1190 int 1191 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 1192 { 1193 struct l_itimerspec lots; 1194 struct itimerspec ots; 1195 struct timerfd *tfd; 1196 struct file *fp; 1197 int error; 1198 1199 error = fget(td, args->fd, &cap_read_rights, &fp); 1200 if (error != 0) 1201 return (error); 1202 tfd = fp->f_data; 1203 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1204 error = EINVAL; 1205 goto out; 1206 } 1207 1208 mtx_lock(&tfd->tfd_lock); 1209 linux_timerfd_curval(tfd, &ots); 1210 mtx_unlock(&tfd->tfd_lock); 1211 1212 error = native_to_linux_itimerspec(&lots, &ots); 1213 if (error == 0) 1214 error = copyout(&lots, args->old_value, sizeof(lots)); 1215 1216 out: 1217 fdrop(fp, td); 1218 return (error); 1219 } 1220 1221 int 1222 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 1223 { 1224 struct l_itimerspec lots; 1225 struct itimerspec nts, ots; 1226 struct timespec cts, ts; 1227 struct timerfd *tfd; 1228 struct timeval tv; 1229 struct file *fp; 1230 int error; 1231 1232 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 1233 return (EINVAL); 1234 1235 error = copyin(args->new_value, &lots, sizeof(lots)); 1236 if (error != 0) 1237 return (error); 1238 error = linux_to_native_itimerspec(&nts, &lots); 1239 if (error != 0) 1240 return (error); 1241 1242 error = fget(td, args->fd, &cap_write_rights, &fp); 1243 if (error != 0) 1244 return (error); 1245 tfd = fp->f_data; 1246 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1247 error = EINVAL; 1248 goto out; 1249 } 1250 1251 mtx_lock(&tfd->tfd_lock); 1252 if 
(!timespecisset(&nts.it_value)) 1253 timespecclear(&nts.it_interval); 1254 if (args->old_value != NULL) 1255 linux_timerfd_curval(tfd, &ots); 1256 1257 tfd->tfd_time = nts; 1258 if (timespecisset(&nts.it_value)) { 1259 linux_timerfd_clocktime(tfd, &cts); 1260 ts = nts.it_value; 1261 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 1262 timespecadd(&tfd->tfd_time.it_value, &cts); 1263 } else { 1264 timespecsub(&ts, &cts); 1265 } 1266 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1267 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1268 linux_timerfd_expire, tfd); 1269 tfd->tfd_canceled = false; 1270 } else { 1271 tfd->tfd_canceled = true; 1272 callout_stop(&tfd->tfd_callout); 1273 } 1274 mtx_unlock(&tfd->tfd_lock); 1275 1276 if (args->old_value != NULL) { 1277 error = native_to_linux_itimerspec(&lots, &ots); 1278 if (error == 0) 1279 error = copyout(&lots, args->old_value, sizeof(lots)); 1280 } 1281 1282 out: 1283 fdrop(fp, td); 1284 return (error); 1285 } 1286 1287 static void 1288 linux_timerfd_expire(void *arg) 1289 { 1290 struct timespec cts, ts; 1291 struct timeval tv; 1292 struct timerfd *tfd; 1293 1294 tfd = (struct timerfd *)arg; 1295 1296 linux_timerfd_clocktime(tfd, &cts); 1297 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1298 if (timespecisset(&tfd->tfd_time.it_interval)) 1299 timespecadd(&tfd->tfd_time.it_value, 1300 &tfd->tfd_time.it_interval); 1301 else 1302 /* single shot timer */ 1303 timespecclear(&tfd->tfd_time.it_value); 1304 if (timespecisset(&tfd->tfd_time.it_value)) { 1305 ts = tfd->tfd_time.it_value; 1306 timespecsub(&ts, &cts); 1307 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1308 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1309 linux_timerfd_expire, tfd); 1310 } 1311 tfd->tfd_count++; 1312 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1313 selwakeup(&tfd->tfd_sel); 1314 wakeup(&tfd->tfd_count); 1315 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1316 ts = tfd->tfd_time.it_value; 1317 timespecsub(&ts, &cts); 1318 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1319 
callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1320 linux_timerfd_expire, tfd); 1321 } 1322 } 1323