1 /*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_compat.h" 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/imgact.h> 36 #include <sys/kernel.h> 37 #include <sys/limits.h> 38 #include <sys/lock.h> 39 #include <sys/mutex.h> 40 #include <sys/callout.h> 41 #include <sys/capsicum.h> 42 #include <sys/types.h> 43 #include <sys/user.h> 44 #include <sys/file.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/errno.h> 48 #include <sys/event.h> 49 #include <sys/poll.h> 50 #include <sys/proc.h> 51 #include <sys/selinfo.h> 52 #include <sys/sx.h> 53 #include <sys/syscallsubr.h> 54 #include <sys/timespec.h> 55 56 #ifdef COMPAT_LINUX32 57 #include <machine/../linux32/linux.h> 58 #include <machine/../linux32/linux32_proto.h> 59 #else 60 #include <machine/../linux/linux.h> 61 #include <machine/../linux/linux_proto.h> 62 #endif 63 64 #include <compat/linux/linux_emul.h> 65 #include <compat/linux/linux_event.h> 66 #include <compat/linux/linux_file.h> 67 #include <compat/linux/linux_timer.h> 68 #include <compat/linux/linux_util.h> 69 70 /* 71 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 72 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 73 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 74 * data verbatuim. Therefore we allocate 64-bit memory block to pass 75 * user supplied data for every file descriptor. 76 */ 77 78 typedef uint64_t epoll_udata_t; 79 80 struct epoll_emuldata { 81 uint32_t fdc; /* epoll udata max index */ 82 epoll_udata_t udata[1]; /* epoll user data vector */ 83 }; 84 85 #define EPOLL_DEF_SZ 16 86 #define EPOLL_SIZE(fdn) \ 87 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 88 89 struct epoll_event { 90 uint32_t events; 91 epoll_udata_t data; 92 } 93 #if defined(__amd64__) 94 __attribute__((packed)) 95 #endif 96 ; 97 98 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 99 100 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 101 static int epoll_to_kevent(struct thread *td, struct file *epfp, 102 int fd, struct epoll_event *l_event, int *kev_flags, 103 struct kevent *kevent, int *nkevents); 104 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 105 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 106 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 107 static int epoll_delete_event(struct thread *td, struct file *epfp, 108 int fd, int filter); 109 static int epoll_delete_all_events(struct thread *td, struct file *epfp, 110 int fd); 111 112 struct epoll_copyin_args { 113 struct kevent *changelist; 114 }; 115 116 struct epoll_copyout_args { 117 struct epoll_event *leventlist; 118 struct proc *p; 119 uint32_t count; 120 int error; 121 }; 122 123 /* eventfd */ 124 typedef uint64_t eventfd_t; 125 126 static fo_rdwr_t eventfd_read; 127 static fo_rdwr_t eventfd_write; 128 static fo_ioctl_t eventfd_ioctl; 129 static fo_poll_t eventfd_poll; 130 static fo_kqfilter_t eventfd_kqfilter; 131 static fo_stat_t eventfd_stat; 132 static fo_close_t eventfd_close; 133 static fo_fill_kinfo_t eventfd_fill_kinfo; 134 135 static struct fileops eventfdops = { 136 .fo_read = eventfd_read, 137 .fo_write = eventfd_write, 138 .fo_truncate = invfo_truncate, 139 .fo_ioctl = eventfd_ioctl, 140 .fo_poll = eventfd_poll, 141 .fo_kqfilter = eventfd_kqfilter, 142 .fo_stat = eventfd_stat, 143 .fo_close = eventfd_close, 144 .fo_chmod = invfo_chmod, 145 .fo_chown = invfo_chown, 146 .fo_sendfile = invfo_sendfile, 147 .fo_fill_kinfo = eventfd_fill_kinfo, 148 .fo_flags = DFLAG_PASSABLE 149 }; 150 151 static void filt_eventfddetach(struct knote *kn); 152 static int filt_eventfdread(struct knote *kn, long hint); 153 static int filt_eventfdwrite(struct knote *kn, long hint); 154 155 static struct filterops eventfd_rfiltops = { 156 .f_isfd = 1, 157 .f_detach = filt_eventfddetach, 158 .f_event = filt_eventfdread 159 }; 160 static struct filterops eventfd_wfiltops = { 161 .f_isfd = 1, 162 .f_detach = filt_eventfddetach, 163 .f_event = filt_eventfdwrite 164 }; 165 166 /* timerfd */ 167 typedef uint64_t timerfd_t; 168 169 static fo_rdwr_t timerfd_read; 170 static fo_poll_t timerfd_poll; 171 static fo_kqfilter_t timerfd_kqfilter; 172 static fo_stat_t timerfd_stat; 173 static fo_close_t timerfd_close; 174 static fo_fill_kinfo_t timerfd_fill_kinfo; 175 176 static struct fileops timerfdops = { 177 .fo_read = timerfd_read, 178 .fo_write = invfo_rdwr, 179 .fo_truncate = invfo_truncate, 180 .fo_ioctl = eventfd_ioctl, 181 .fo_poll = timerfd_poll, 182 .fo_kqfilter = timerfd_kqfilter, 183 .fo_stat = timerfd_stat, 184 .fo_close = timerfd_close, 185 .fo_chmod = invfo_chmod, 186 .fo_chown = invfo_chown, 187 .fo_sendfile = invfo_sendfile, 188 .fo_fill_kinfo = timerfd_fill_kinfo, 189 .fo_flags = DFLAG_PASSABLE 190 }; 191 192 static void filt_timerfddetach(struct knote *kn); 193 static int filt_timerfdread(struct knote *kn, long hint); 194 195 static struct filterops timerfd_rfiltops = { 196 .f_isfd = 1, 197 .f_detach = filt_timerfddetach, 198 .f_event = filt_timerfdread 199 }; 200 201 struct eventfd { 202 eventfd_t efd_count; 203 uint32_t efd_flags; 204 struct selinfo efd_sel; 205 struct mtx efd_lock; 206 }; 207 208 struct timerfd { 209 clockid_t tfd_clockid; 210 struct itimerspec tfd_time; 211 struct callout tfd_callout; 212 timerfd_t tfd_count; 213 bool tfd_canceled; 214 struct selinfo tfd_sel; 215 struct mtx tfd_lock; 216 }; 217 218 static int eventfd_create(struct thread *td, uint32_t initval, int flags); 219 static void linux_timerfd_expire(void *); 220 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 221 222 223 static void 224 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 225 { 226 struct linux_pemuldata *pem; 227 struct epoll_emuldata *emd; 228 struct proc *p; 229 230 p = td->td_proc; 231 232 pem = pem_find(p); 233 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 234 235 LINUX_PEM_XLOCK(pem); 236 if (pem->epoll == NULL) { 237 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 238 emd->fdc = fd; 239 pem->epoll = emd; 240 } else { 241 emd = pem->epoll; 242 if (fd > emd->fdc) { 243 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 244 emd->fdc = fd; 245 pem->epoll = emd; 246 } 247 } 248 emd->udata[fd] = udata; 249 LINUX_PEM_XUNLOCK(pem); 250 } 251 252 static int 253 epoll_create_common(struct thread *td, int flags) 254 { 255 int error; 256 257 error = kern_kqueue(td, flags, NULL); 258 if (error != 0) 259 return (error); 260 261 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 262 263 return (0); 264 } 265 266 #ifdef LINUX_LEGACY_SYSCALLS 267 int 268 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 269 { 270 271 /* 272 * args->size is unused. Linux just tests it 273 * and then forgets it as well. 274 */ 275 if (args->size <= 0) 276 return (EINVAL); 277 278 return (epoll_create_common(td, 0)); 279 } 280 #endif 281 282 int 283 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 284 { 285 int flags; 286 287 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 288 return (EINVAL); 289 290 flags = 0; 291 if ((args->flags & LINUX_O_CLOEXEC) != 0) 292 flags |= O_CLOEXEC; 293 294 return (epoll_create_common(td, flags)); 295 } 296 297 /* Structure converting function from epoll to kevent. */ 298 static int 299 epoll_to_kevent(struct thread *td, struct file *epfp, 300 int fd, struct epoll_event *l_event, int *kev_flags, 301 struct kevent *kevent, int *nkevents) 302 { 303 uint32_t levents = l_event->events; 304 struct linux_pemuldata *pem; 305 struct proc *p; 306 307 /* flags related to how event is registered */ 308 if ((levents & LINUX_EPOLLONESHOT) != 0) 309 *kev_flags |= EV_ONESHOT; 310 if ((levents & LINUX_EPOLLET) != 0) 311 *kev_flags |= EV_CLEAR; 312 if ((levents & LINUX_EPOLLERR) != 0) 313 *kev_flags |= EV_ERROR; 314 if ((levents & LINUX_EPOLLRDHUP) != 0) 315 *kev_flags |= EV_EOF; 316 317 /* flags related to what event is registered */ 318 if ((levents & LINUX_EPOLL_EVRD) != 0) { 319 EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); 320 ++(*nkevents); 321 } 322 if ((levents & LINUX_EPOLL_EVWR) != 0) { 323 EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); 324 ++(*nkevents); 325 } 326 327 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 328 p = td->td_proc; 329 330 pem = pem_find(p); 331 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 332 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 333 334 LINUX_PEM_XLOCK(pem); 335 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 336 pem->flags |= LINUX_XUNSUP_EPOLL; 337 LINUX_PEM_XUNLOCK(pem); 338 linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", 339 levents); 340 } else 341 LINUX_PEM_XUNLOCK(pem); 342 return (EINVAL); 343 } 344 345 return (0); 346 } 347 348 /* 349 * Structure converting function from kevent to epoll. In a case 350 * this is called on error in registration we store the error in 351 * event->data and pick it up later in linux_epoll_ctl(). 352 */ 353 static void 354 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 355 { 356 357 if ((kevent->flags & EV_ERROR) != 0) { 358 l_event->events = LINUX_EPOLLERR; 359 return; 360 } 361 362 /* XXX EPOLLPRI, EPOLLHUP */ 363 switch (kevent->filter) { 364 case EVFILT_READ: 365 l_event->events = LINUX_EPOLLIN; 366 if ((kevent->flags & EV_EOF) != 0) 367 l_event->events |= LINUX_EPOLLRDHUP; 368 break; 369 case EVFILT_WRITE: 370 l_event->events = LINUX_EPOLLOUT; 371 break; 372 } 373 } 374 375 /* 376 * Copyout callback used by kevent. This converts kevent 377 * events to epoll events and copies them back to the 378 * userspace. This is also called on error on registering 379 * of the filter. 380 */ 381 static int 382 epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 383 { 384 struct epoll_copyout_args *args; 385 struct linux_pemuldata *pem; 386 struct epoll_emuldata *emd; 387 struct epoll_event *eep; 388 int error, fd, i; 389 390 args = (struct epoll_copyout_args*) arg; 391 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 392 393 pem = pem_find(args->p); 394 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 395 LINUX_PEM_SLOCK(pem); 396 emd = pem->epoll; 397 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 398 399 for (i = 0; i < count; i++) { 400 kevent_to_epoll(&kevp[i], &eep[i]); 401 402 fd = kevp[i].ident; 403 KASSERT(fd <= emd->fdc, ("epoll user data vector" 404 " is too small.\n")); 405 eep[i].data = emd->udata[fd]; 406 } 407 LINUX_PEM_SUNLOCK(pem); 408 409 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 410 if (error == 0) { 411 args->leventlist += count; 412 args->count += count; 413 } else if (args->error == 0) 414 args->error = error; 415 416 free(eep, M_EPOLL); 417 return (error); 418 } 419 420 /* 421 * Copyin callback used by kevent. This copies already 422 * converted filters from kernel memory to the kevent 423 * internal kernel memory. Hence the memcpy instead of 424 * copyin. 425 */ 426 static int 427 epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 428 { 429 struct epoll_copyin_args *args; 430 431 args = (struct epoll_copyin_args*) arg; 432 433 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 434 args->changelist += count; 435 436 return (0); 437 } 438 439 /* 440 * Load epoll filter, convert it to kevent filter 441 * and load it into kevent subsystem. 442 */ 443 int 444 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 445 { 446 struct file *epfp, *fp; 447 struct epoll_copyin_args ciargs; 448 struct kevent kev[2]; 449 struct kevent_copyops k_ops = { &ciargs, 450 NULL, 451 epoll_kev_copyin}; 452 struct epoll_event le; 453 cap_rights_t rights; 454 int kev_flags; 455 int nchanges = 0; 456 int error; 457 458 if (args->op != LINUX_EPOLL_CTL_DEL) { 459 error = copyin(args->event, &le, sizeof(le)); 460 if (error != 0) 461 return (error); 462 } 463 464 error = fget(td, args->epfd, 465 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 466 if (error != 0) 467 return (error); 468 if (epfp->f_type != DTYPE_KQUEUE) { 469 error = EINVAL; 470 goto leave1; 471 } 472 473 /* Protect user data vector from incorrectly supplied fd. */ 474 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 475 if (error != 0) 476 goto leave1; 477 478 /* Linux disallows spying on himself */ 479 if (epfp == fp) { 480 error = EINVAL; 481 goto leave0; 482 } 483 484 ciargs.changelist = kev; 485 486 if (args->op != LINUX_EPOLL_CTL_DEL) { 487 kev_flags = EV_ADD | EV_ENABLE; 488 error = epoll_to_kevent(td, epfp, args->fd, &le, 489 &kev_flags, kev, &nchanges); 490 if (error != 0) 491 goto leave0; 492 } 493 494 switch (args->op) { 495 case LINUX_EPOLL_CTL_MOD: 496 error = epoll_delete_all_events(td, epfp, args->fd); 497 if (error != 0) 498 goto leave0; 499 break; 500 501 case LINUX_EPOLL_CTL_ADD: 502 /* 503 * kqueue_register() return ENOENT if event does not exists 504 * and the EV_ADD flag is not set. 505 */ 506 kev[0].flags &= ~EV_ADD; 507 error = kqfd_register(args->epfd, &kev[0], td, 1); 508 if (error != ENOENT) { 509 error = EEXIST; 510 goto leave0; 511 } 512 error = 0; 513 kev[0].flags |= EV_ADD; 514 break; 515 516 case LINUX_EPOLL_CTL_DEL: 517 /* CTL_DEL means unregister this fd with this epoll */ 518 error = epoll_delete_all_events(td, epfp, args->fd); 519 goto leave0; 520 521 default: 522 error = EINVAL; 523 goto leave0; 524 } 525 526 epoll_fd_install(td, args->fd, le.data); 527 528 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 529 530 leave0: 531 fdrop(fp, td); 532 533 leave1: 534 fdrop(epfp, td); 535 return (error); 536 } 537 538 /* 539 * Wait for a filter to be triggered on the epoll file descriptor. 540 */ 541 static int 542 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 543 int maxevents, int timeout, sigset_t *uset) 544 { 545 struct epoll_copyout_args coargs; 546 struct kevent_copyops k_ops = { &coargs, 547 epoll_kev_copyout, 548 NULL}; 549 struct timespec ts, *tsp; 550 cap_rights_t rights; 551 struct file *epfp; 552 sigset_t omask; 553 int error; 554 555 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 556 return (EINVAL); 557 558 error = fget(td, epfd, 559 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 560 if (error != 0) 561 return (error); 562 if (epfp->f_type != DTYPE_KQUEUE) { 563 error = EINVAL; 564 goto leave1; 565 } 566 if (uset != NULL) { 567 error = kern_sigprocmask(td, SIG_SETMASK, uset, 568 &omask, 0); 569 if (error != 0) 570 goto leave1; 571 td->td_pflags |= TDP_OLDMASK; 572 /* 573 * Make sure that ast() is called on return to 574 * usermode and TDP_OLDMASK is cleared, restoring old 575 * sigmask. 576 */ 577 thread_lock(td); 578 td->td_flags |= TDF_ASTPENDING; 579 thread_unlock(td); 580 } 581 582 583 coargs.leventlist = events; 584 coargs.p = td->td_proc; 585 coargs.count = 0; 586 coargs.error = 0; 587 588 if (timeout != -1) { 589 if (timeout < 0) { 590 error = EINVAL; 591 goto leave0; 592 } 593 /* Convert from milliseconds to timespec. */ 594 ts.tv_sec = timeout / 1000; 595 ts.tv_nsec = (timeout % 1000) * 1000000; 596 tsp = &ts; 597 } else { 598 tsp = NULL; 599 } 600 601 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 602 if (error == 0 && coargs.error != 0) 603 error = coargs.error; 604 605 /* 606 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 607 * Maybe we should translate that but I don't think it matters at all. 608 */ 609 if (error == 0) 610 td->td_retval[0] = coargs.count; 611 612 leave0: 613 if (uset != NULL) 614 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 615 NULL, 0); 616 leave1: 617 fdrop(epfp, td); 618 return (error); 619 } 620 621 #ifdef LINUX_LEGACY_SYSCALLS 622 int 623 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 624 { 625 626 return (linux_epoll_wait_common(td, args->epfd, args->events, 627 args->maxevents, args->timeout, NULL)); 628 } 629 #endif 630 631 int 632 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 633 { 634 sigset_t mask, *pmask; 635 l_sigset_t lmask; 636 int error; 637 638 if (args->mask != NULL) { 639 if (args->sigsetsize != sizeof(l_sigset_t)) 640 return (EINVAL); 641 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 642 if (error != 0) 643 return (error); 644 linux_to_bsd_sigset(&lmask, &mask); 645 pmask = &mask; 646 } else 647 pmask = NULL; 648 return (linux_epoll_wait_common(td, args->epfd, args->events, 649 args->maxevents, args->timeout, pmask)); 650 } 651 652 static int 653 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 654 { 655 struct epoll_copyin_args ciargs; 656 struct kevent kev; 657 struct kevent_copyops k_ops = { &ciargs, 658 NULL, 659 epoll_kev_copyin}; 660 661 ciargs.changelist = &kev; 662 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 663 664 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 665 } 666 667 static int 668 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 669 { 670 int error1, error2; 671 672 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 673 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 674 675 /* return 0 if at least one result positive */ 676 return (error1 == 0 ? 0 : error2); 677 } 678 679 static int 680 eventfd_create(struct thread *td, uint32_t initval, int flags) 681 { 682 struct filedesc *fdp; 683 struct eventfd *efd; 684 struct file *fp; 685 int fflags, fd, error; 686 687 fflags = 0; 688 if ((flags & LINUX_O_CLOEXEC) != 0) 689 fflags |= O_CLOEXEC; 690 691 fdp = td->td_proc->p_fd; 692 error = falloc(td, &fp, &fd, fflags); 693 if (error != 0) 694 return (error); 695 696 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 697 efd->efd_flags = flags; 698 efd->efd_count = initval; 699 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 700 701 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 702 703 fflags = FREAD | FWRITE; 704 if ((flags & LINUX_O_NONBLOCK) != 0) 705 fflags |= FNONBLOCK; 706 707 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 708 fdrop(fp, td); 709 710 td->td_retval[0] = fd; 711 return (error); 712 } 713 714 #ifdef LINUX_LEGACY_SYSCALLS 715 int 716 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 717 { 718 719 return (eventfd_create(td, args->initval, 0)); 720 } 721 #endif 722 723 int 724 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 725 { 726 727 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 728 return (EINVAL); 729 730 return (eventfd_create(td, args->initval, args->flags)); 731 } 732 733 static int 734 eventfd_close(struct file *fp, struct thread *td) 735 { 736 struct eventfd *efd; 737 738 efd = fp->f_data; 739 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 740 return (EINVAL); 741 742 seldrain(&efd->efd_sel); 743 knlist_destroy(&efd->efd_sel.si_note); 744 745 fp->f_ops = &badfileops; 746 mtx_destroy(&efd->efd_lock); 747 free(efd, M_EPOLL); 748 749 return (0); 750 } 751 752 static int 753 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 754 int flags, struct thread *td) 755 { 756 struct eventfd *efd; 757 eventfd_t count; 758 int error; 759 760 efd = fp->f_data; 761 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 762 return (EINVAL); 763 764 if (uio->uio_resid < sizeof(eventfd_t)) 765 return (EINVAL); 766 767 error = 0; 768 mtx_lock(&efd->efd_lock); 769 retry: 770 if (efd->efd_count == 0) { 771 if ((fp->f_flag & FNONBLOCK) != 0) { 772 mtx_unlock(&efd->efd_lock); 773 return (EAGAIN); 774 } 775 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 776 if (error == 0) 777 goto retry; 778 } 779 if (error == 0) { 780 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 781 count = 1; 782 --efd->efd_count; 783 } else { 784 count = efd->efd_count; 785 efd->efd_count = 0; 786 } 787 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 788 selwakeup(&efd->efd_sel); 789 wakeup(&efd->efd_count); 790 mtx_unlock(&efd->efd_lock); 791 error = uiomove(&count, sizeof(eventfd_t), uio); 792 } else 793 mtx_unlock(&efd->efd_lock); 794 795 return (error); 796 } 797 798 static int 799 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 800 int flags, struct thread *td) 801 { 802 struct eventfd *efd; 803 eventfd_t count; 804 int error; 805 806 efd = fp->f_data; 807 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 808 return (EINVAL); 809 810 if (uio->uio_resid < sizeof(eventfd_t)) 811 return (EINVAL); 812 813 error = uiomove(&count, sizeof(eventfd_t), uio); 814 if (error != 0) 815 return (error); 816 if (count == UINT64_MAX) 817 return (EINVAL); 818 819 mtx_lock(&efd->efd_lock); 820 retry: 821 if (UINT64_MAX - efd->efd_count <= count) { 822 if ((fp->f_flag & FNONBLOCK) != 0) { 823 mtx_unlock(&efd->efd_lock); 824 /* Do not not return the number of bytes written */ 825 uio->uio_resid += sizeof(eventfd_t); 826 return (EAGAIN); 827 } 828 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 829 PCATCH, "lefdwr", 0); 830 if (error == 0) 831 goto retry; 832 } 833 if (error == 0) { 834 efd->efd_count += count; 835 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 836 selwakeup(&efd->efd_sel); 837 wakeup(&efd->efd_count); 838 } 839 mtx_unlock(&efd->efd_lock); 840 841 return (error); 842 } 843 844 static int 845 eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 846 struct thread *td) 847 { 848 struct eventfd *efd; 849 int revents = 0; 850 851 efd = fp->f_data; 852 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 853 return (POLLERR); 854 855 mtx_lock(&efd->efd_lock); 856 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 857 revents |= events & (POLLIN|POLLRDNORM); 858 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 859 revents |= events & (POLLOUT|POLLWRNORM); 860 if (revents == 0) 861 selrecord(td, &efd->efd_sel); 862 mtx_unlock(&efd->efd_lock); 863 864 return (revents); 865 } 866 867 /*ARGSUSED*/ 868 static int 869 eventfd_kqfilter(struct file *fp, struct knote *kn) 870 { 871 struct eventfd *efd; 872 873 efd = fp->f_data; 874 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 875 return (EINVAL); 876 877 mtx_lock(&efd->efd_lock); 878 switch (kn->kn_filter) { 879 case EVFILT_READ: 880 kn->kn_fop = &eventfd_rfiltops; 881 break; 882 case EVFILT_WRITE: 883 kn->kn_fop = &eventfd_wfiltops; 884 break; 885 default: 886 mtx_unlock(&efd->efd_lock); 887 return (EINVAL); 888 } 889 890 kn->kn_hook = efd; 891 knlist_add(&efd->efd_sel.si_note, kn, 1); 892 mtx_unlock(&efd->efd_lock); 893 894 return (0); 895 } 896 897 static void 898 filt_eventfddetach(struct knote *kn) 899 { 900 struct eventfd *efd = kn->kn_hook; 901 902 mtx_lock(&efd->efd_lock); 903 knlist_remove(&efd->efd_sel.si_note, kn, 1); 904 mtx_unlock(&efd->efd_lock); 905 } 906 907 /*ARGSUSED*/ 908 static int 909 filt_eventfdread(struct knote *kn, long hint) 910 { 911 struct eventfd *efd = kn->kn_hook; 912 int ret; 913 914 mtx_assert(&efd->efd_lock, MA_OWNED); 915 ret = (efd->efd_count > 0); 916 917 return (ret); 918 } 919 920 /*ARGSUSED*/ 921 static int 922 filt_eventfdwrite(struct knote *kn, long hint) 923 { 924 struct eventfd *efd = kn->kn_hook; 925 int ret; 926 927 mtx_assert(&efd->efd_lock, MA_OWNED); 928 ret = (UINT64_MAX - 1 > efd->efd_count); 929 930 return (ret); 931 } 932 933 /*ARGSUSED*/ 934 static int 935 eventfd_ioctl(struct file *fp, u_long cmd, void *data, 936 struct ucred *active_cred, struct thread *td) 937 { 938 939 if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && 940 fp->f_type != DTYPE_LINUXTFD)) 941 return (EINVAL); 942 943 switch (cmd) 944 { 945 case FIONBIO: 946 if ((*(int *)data)) 947 atomic_set_int(&fp->f_flag, FNONBLOCK); 948 else 949 atomic_clear_int(&fp->f_flag, FNONBLOCK); 950 case FIOASYNC: 951 return (0); 952 default: 953 return (ENXIO); 954 } 955 } 956 957 /*ARGSUSED*/ 958 static int 959 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 960 struct thread *td) 961 { 962 963 return (ENXIO); 964 } 965 966 /*ARGSUSED*/ 967 static int 968 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 969 { 970 971 kif->kf_type = KF_TYPE_UNKNOWN; 972 return (0); 973 } 974 975 int 976 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 977 { 978 struct filedesc *fdp; 979 struct timerfd *tfd; 980 struct file *fp; 981 clockid_t clockid; 982 int fflags, fd, error; 983 984 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 985 return (EINVAL); 986 987 error = linux_to_native_clockid(&clockid, args->clockid); 988 if (error != 0) 989 return (error); 990 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 991 return (EINVAL); 992 993 fflags = 0; 994 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 995 fflags |= O_CLOEXEC; 996 997 fdp = td->td_proc->p_fd; 998 error = falloc(td, &fp, &fd, fflags); 999 if (error != 0) 1000 return (error); 1001 1002 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 1003 tfd->tfd_clockid = clockid; 1004 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 1005 1006 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 1007 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 1008 1009 fflags = FREAD; 1010 if ((args->flags & LINUX_O_NONBLOCK) != 0) 1011 fflags |= FNONBLOCK; 1012 1013 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 1014 fdrop(fp, td); 1015 1016 td->td_retval[0] = fd; 1017 return (error); 1018 } 1019 1020 static int 1021 timerfd_close(struct file *fp, struct thread *td) 1022 { 1023 struct timerfd *tfd; 1024 1025 tfd = fp->f_data; 1026 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1027 return (EINVAL); 1028 1029 timespecclear(&tfd->tfd_time.it_value); 1030 timespecclear(&tfd->tfd_time.it_interval); 1031 1032 mtx_lock(&tfd->tfd_lock); 1033 callout_drain(&tfd->tfd_callout); 1034 mtx_unlock(&tfd->tfd_lock); 1035 1036 seldrain(&tfd->tfd_sel); 1037 knlist_destroy(&tfd->tfd_sel.si_note); 1038 1039 fp->f_ops = &badfileops; 1040 mtx_destroy(&tfd->tfd_lock); 1041 free(tfd, M_EPOLL); 1042 1043 return (0); 1044 } 1045 1046 static int 1047 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 1048 int flags, struct thread *td) 1049 { 1050 struct timerfd *tfd; 1051 timerfd_t count; 1052 int error; 1053 1054 tfd = fp->f_data; 1055 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1056 return (EINVAL); 1057 1058 if (uio->uio_resid < sizeof(timerfd_t)) 1059 return (EINVAL); 1060 1061 error = 0; 1062 mtx_lock(&tfd->tfd_lock); 1063 retry: 1064 if (tfd->tfd_canceled) { 1065 tfd->tfd_count = 0; 1066 mtx_unlock(&tfd->tfd_lock); 1067 return (ECANCELED); 1068 } 1069 if (tfd->tfd_count == 0) { 1070 if ((fp->f_flag & FNONBLOCK) != 0) { 1071 mtx_unlock(&tfd->tfd_lock); 1072 return (EAGAIN); 1073 } 1074 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 1075 if (error == 0) 1076 goto retry; 1077 } 1078 if (error == 0) { 1079 count = tfd->tfd_count; 1080 tfd->tfd_count = 0; 1081 mtx_unlock(&tfd->tfd_lock); 1082 error = uiomove(&count, sizeof(timerfd_t), uio); 1083 } else 1084 mtx_unlock(&tfd->tfd_lock); 1085 1086 return (error); 1087 } 1088 1089 static int 1090 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 1091 struct thread *td) 1092 { 1093 struct timerfd *tfd; 1094 int revents = 0; 1095 1096 tfd = fp->f_data; 1097 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1098 return (POLLERR); 1099 1100 mtx_lock(&tfd->tfd_lock); 1101 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 1102 revents |= events & (POLLIN|POLLRDNORM); 1103 if (revents == 0) 1104 selrecord(td, &tfd->tfd_sel); 1105 mtx_unlock(&tfd->tfd_lock); 1106 1107 return (revents); 1108 } 1109 1110 /*ARGSUSED*/ 1111 static int 1112 timerfd_kqfilter(struct file *fp, struct knote *kn) 1113 { 1114 struct timerfd *tfd; 1115 1116 tfd = fp->f_data; 1117 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1118 return (EINVAL); 1119 1120 if (kn->kn_filter == EVFILT_READ) 1121 kn->kn_fop = &timerfd_rfiltops; 1122 else 1123 return (EINVAL); 1124 1125 kn->kn_hook = tfd; 1126 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 1127 1128 return (0); 1129 } 1130 1131 static void 1132 filt_timerfddetach(struct knote *kn) 1133 { 1134 struct timerfd *tfd = kn->kn_hook; 1135 1136 mtx_lock(&tfd->tfd_lock); 1137 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 1138 mtx_unlock(&tfd->tfd_lock); 1139 } 1140 1141 /*ARGSUSED*/ 1142 static int 1143 filt_timerfdread(struct knote *kn, long hint) 1144 { 1145 struct timerfd *tfd = kn->kn_hook; 1146 1147 return (tfd->tfd_count > 0); 1148 } 1149 1150 /*ARGSUSED*/ 1151 static int 1152 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1153 struct thread *td) 1154 { 1155 1156 return (ENXIO); 1157 } 1158 1159 /*ARGSUSED*/ 1160 static int 1161 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1162 { 1163 1164 kif->kf_type = KF_TYPE_UNKNOWN; 1165 return (0); 1166 } 1167 1168 static void 1169 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 1170 { 1171 1172 if (tfd->tfd_clockid == CLOCK_REALTIME) 1173 getnanotime(ts); 1174 else /* CLOCK_MONOTONIC */ 1175 getnanouptime(ts); 1176 } 1177 1178 static void 1179 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 1180 { 1181 struct timespec cts; 1182 1183 linux_timerfd_clocktime(tfd, &cts); 1184 *ots = tfd->tfd_time; 1185 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 1186 timespecsub(&ots->it_value, &cts); 1187 if (ots->it_value.tv_sec < 0 || 1188 (ots->it_value.tv_sec == 0 && 1189 ots->it_value.tv_nsec == 0)) { 1190 ots->it_value.tv_sec = 0; 1191 ots->it_value.tv_nsec = 1; 1192 } 1193 } 1194 } 1195 1196 int 1197 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 1198 { 1199 struct l_itimerspec lots; 1200 struct itimerspec ots; 1201 struct timerfd *tfd; 1202 struct file *fp; 1203 int error; 1204 1205 error = fget(td, args->fd, &cap_read_rights, &fp); 1206 if (error != 0) 1207 return (error); 1208 tfd = fp->f_data; 1209 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1210 error = EINVAL; 1211 goto out; 1212 } 1213 1214 mtx_lock(&tfd->tfd_lock); 1215 linux_timerfd_curval(tfd, &ots); 1216 mtx_unlock(&tfd->tfd_lock); 1217 1218 error = native_to_linux_itimerspec(&lots, &ots); 1219 if (error == 0) 1220 error = copyout(&lots, args->old_value, sizeof(lots)); 1221 1222 out: 1223 fdrop(fp, td); 1224 return (error); 1225 } 1226 1227 int 1228 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 1229 { 1230 struct l_itimerspec lots; 1231 struct itimerspec nts, ots; 1232 struct timespec cts, ts; 1233 struct timerfd *tfd; 1234 struct timeval tv; 1235 struct file *fp; 1236 int error; 1237 1238 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 1239 return (EINVAL); 1240 1241 error = copyin(args->new_value, &lots, sizeof(lots)); 1242 if (error != 0) 1243 return (error); 1244 error = linux_to_native_itimerspec(&nts, &lots); 1245 if (error != 0) 1246 return (error); 1247 1248 error = fget(td, args->fd, &cap_write_rights, &fp); 1249 if (error != 0) 1250 return (error); 1251 tfd = fp->f_data; 1252 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1253 error = EINVAL; 1254 goto out; 1255 } 1256 1257 mtx_lock(&tfd->tfd_lock); 1258 if (!timespecisset(&nts.it_value)) 1259 timespecclear(&nts.it_interval); 1260 if (args->old_value != NULL) 1261 linux_timerfd_curval(tfd, &ots); 1262 1263 tfd->tfd_time = nts; 1264 if (timespecisset(&nts.it_value)) { 1265 linux_timerfd_clocktime(tfd, &cts); 1266 ts = nts.it_value; 1267 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 1268 timespecadd(&tfd->tfd_time.it_value, &cts); 1269 } else { 1270 timespecsub(&ts, &cts); 1271 } 1272 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1273 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1274 linux_timerfd_expire, tfd); 1275 tfd->tfd_canceled = false; 1276 } else { 1277 tfd->tfd_canceled = true; 1278 callout_stop(&tfd->tfd_callout); 1279 } 1280 mtx_unlock(&tfd->tfd_lock); 1281 1282 if (args->old_value != NULL) { 1283 error = native_to_linux_itimerspec(&lots, &ots); 1284 if (error == 0) 1285 error = copyout(&lots, args->old_value, sizeof(lots)); 1286 } 1287 1288 out: 1289 fdrop(fp, td); 1290 return (error); 1291 } 1292 1293 static void 1294 linux_timerfd_expire(void *arg) 1295 { 1296 struct timespec cts, ts; 1297 struct timeval tv; 1298 struct timerfd *tfd; 1299 1300 tfd = (struct timerfd *)arg; 1301 1302 linux_timerfd_clocktime(tfd, &cts); 1303 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1304 if (timespecisset(&tfd->tfd_time.it_interval)) 1305 timespecadd(&tfd->tfd_time.it_value, 1306 &tfd->tfd_time.it_interval); 1307 else 1308 /* single shot timer */ 1309 timespecclear(&tfd->tfd_time.it_value); 1310 if (timespecisset(&tfd->tfd_time.it_value)) { 1311 ts = tfd->tfd_time.it_value; 1312 timespecsub(&ts, &cts); 1313 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1314 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1315 linux_timerfd_expire, tfd); 1316 } 1317 tfd->tfd_count++; 1318 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1319 selwakeup(&tfd->tfd_sel); 1320 wakeup(&tfd->tfd_count); 1321 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1322 ts = tfd->tfd_time.it_value; 1323 timespecsub(&ts, &cts); 1324 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1325 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1326 linux_timerfd_expire, tfd); 1327 } 1328 } 1329