1 /*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/capsicum.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/errno.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/timespec.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif

#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_event.h>
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
 * data verbatim. Therefore we allocate 64-bit memory block to pass
 * user supplied data for every file descriptor.
 */

typedef uint64_t	epoll_udata_t;

struct epoll_emuldata {
	uint32_t fdc;			/* epoll udata max index */
	epoll_udata_t udata[1];		/* epoll user data vector */
};

#define	EPOLL_DEF_SZ		16
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))

struct epoll_event {
	uint32_t	events;
	epoll_udata_t	data;
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))

static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, int fd,
		    struct epoll_event *l_event, struct kevent *kevent,
		    int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_register_kevent(struct thread *td, struct file *epfp,
		    int fd, int filter, unsigned int flags);
static int	epoll_fd_registered(struct thread *td, struct file *epfp,
		    int fd);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);

/* Argument block for the kevent copyin callback (kernel->kernel copy). */
struct epoll_copyin_args {
	struct kevent	*changelist;
};

/* Argument block for the kevent copyout callback (kernel->user copy). */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next user slot to fill */
	struct proc		*p;		/* proc owning the udata vector */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout error seen */
};

/* eventfd */
typedef uint64_t	eventfd_t;

static fo_rdwr_t	eventfd_read;
static fo_rdwr_t	eventfd_write;
static fo_ioctl_t	eventfd_ioctl;
static fo_poll_t	eventfd_poll;
static fo_kqfilter_t	eventfd_kqfilter;
static fo_stat_t	eventfd_stat;
static fo_close_t	eventfd_close;
static fo_fill_kinfo_t	eventfd_fill_kinfo;

static struct fileops eventfdops = {
	.fo_read = eventfd_read,
	.fo_write = eventfd_write,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,
	.fo_poll = eventfd_poll,
	.fo_kqfilter = eventfd_kqfilter,
	.fo_stat = eventfd_stat,
	.fo_close = eventfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = eventfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_eventfddetach(struct knote *kn);
static int	filt_eventfdread(struct knote *kn, long hint);
static int	filt_eventfdwrite(struct knote *kn, long hint);

static struct filterops eventfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdread
};
static struct filterops eventfd_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_eventfddetach,
	.f_event = filt_eventfdwrite
};

/* timerfd */
typedef uint64_t	timerfd_t;

static fo_rdwr_t	timerfd_read;
static fo_poll_t	timerfd_poll;
static fo_kqfilter_t	timerfd_kqfilter;
static fo_stat_t	timerfd_stat;
static fo_close_t	timerfd_close;
static fo_fill_kinfo_t	timerfd_fill_kinfo;

static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = eventfd_ioctl,	/* FIONBIO/FIOASYNC handler is shared */
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_timerfddetach(struct knote *kn);
static int	filt_timerfdread(struct knote *kn, long hint);

static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};

struct eventfd {
	eventfd_t	efd_count;	/* 64-bit counter, see eventfd(2) */
	uint32_t	efd_flags;	/* LINUX_EFD_* creation flags */
	struct selinfo	efd_sel;
	struct mtx	efd_lock;	/* protects all fields above */
};

struct timerfd {
	clockid_t	tfd_clockid;	/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* absolute expiry time + interval */
	struct callout	tfd_callout;	/* fires linux_timerfd_expire() */
	timerfd_t	tfd_count;	/* expirations since last read */
	bool		tfd_canceled;	/* timer disarmed; read gets ECANCELED */
	struct selinfo	tfd_sel;
	struct mtx	tfd_lock;	/* protects all fields above */
};

static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
static void	linux_timerfd_expire(void *);
static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);

/*
 * Remember the 64-bit user data supplied for file descriptor 'fd',
 * growing (or first allocating) the per-process udata vector as needed
 * under the pemuldata exclusive lock.
 */
static void
epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
{
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct proc *p;

	p = td->td_proc;

	pem = pem_find(p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));

	LINUX_PEM_XLOCK(pem);
	if (pem->epoll == NULL) {
		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
		emd->fdc = fd;
		pem->epoll = emd;
	} else {
		emd = pem->epoll;
		if (fd > emd->fdc) {
			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
			emd->fdc = fd;
			pem->epoll = emd;
		}
	}
	emd->udata[fd] = udata;
	LINUX_PEM_XUNLOCK(pem);
}

/*
 * Common backend for epoll_create(2)/epoll_create1(2): an epoll fd is
 * just a kqueue, plus a pre-sized udata vector for this process.
 */
static int
epoll_create_common(struct thread *td, int flags)
{
	int error;

	error = kern_kqueue(td, flags, NULL);
	if (error != 0)
		return (error);

	epoll_fd_install(td, EPOLL_DEF_SZ, 0);

	return (0);
}

#ifdef LINUX_LEGACY_SYSCALLS
int
linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
{

	/*
	 * args->size is unused. Linux just tests it
	 * and then forgets it as well.
	 */
	if (args->size <= 0)
		return (EINVAL);

	return (epoll_create_common(td, 0));
}
#endif

int
linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
{
	int flags;

	/* LINUX_O_CLOEXEC is the only flag epoll_create1(2) accepts. */
	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
		return (EINVAL);

	flags = 0;
	if ((args->flags & LINUX_O_CLOEXEC) != 0)
		flags |= O_CLOEXEC;

	return (epoll_create_common(td, flags));
}

/* Structure converting function from epoll to kevent. */
static int
epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;
	unsigned short kev_flags = EV_ADD | EV_ENABLE;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		kev_flags |= EV_DISPATCH;
	if ((levents & LINUX_EPOLLET) != 0)
		kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	/* zero event mask is legal */
	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
		++(*nkevents);
	}

	/*
	 * Reject event bits we do not emulate; warn once per process so
	 * the log is not flooded by a retrying application.
	 */
	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}

/*
 * Structure converting function from kevent to epoll. In a case
 * this is called on error in registration we store the error in
 * event->data and pick it up later in linux_epoll_ctl().
 */
static void
kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
{

	if ((kevent->flags & EV_ERROR) != 0) {
		l_event->events = LINUX_EPOLLERR;
		return;
	}

	/* XXX EPOLLPRI, EPOLLHUP */
	switch (kevent->filter) {
	case EVFILT_READ:
		l_event->events = LINUX_EPOLLIN;
		if ((kevent->flags & EV_EOF) != 0)
			l_event->events |= LINUX_EPOLLRDHUP;
		break;
	case EVFILT_WRITE:
		l_event->events = LINUX_EPOLLOUT;
		break;
	}
}

/*
 * Copyout callback used by kevent. This converts kevent
 * events to epoll events and copies them back to the
 * userspace. This is also called on error on registering
 * of the filter.
 */
static int
epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyout_args *args;
	struct linux_pemuldata *pem;
	struct epoll_emuldata *emd;
	struct epoll_event *eep;
	int error, fd, i;

	args = (struct epoll_copyout_args*) arg;
	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);

	pem = pem_find(args->p);
	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
	LINUX_PEM_SLOCK(pem);
	emd = pem->epoll;
	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));

	/* Attach the 64-bit user data saved at registration time. */
	for (i = 0; i < count; i++) {
		kevent_to_epoll(&kevp[i], &eep[i]);

		fd = kevp[i].ident;
		KASSERT(fd <= emd->fdc, ("epoll user data vector"
		    " is too small.\n"));
		eep[i].data = emd->udata[fd];
	}
	LINUX_PEM_SUNLOCK(pem);

	error = copyout(eep, args->leventlist, count * sizeof(*eep));
	if (error == 0) {
		args->leventlist += count;
		args->count += count;
	} else if (args->error == 0)
		args->error = error;

	free(eep, M_EPOLL);
	return (error);
}

/*
 * Copyin callback used by kevent. This copies already
 * converted filters from kernel memory to the kevent
 * internal kernel memory. Hence the memcpy instead of
 * copyin.
 */
static int
epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
{
	struct epoll_copyin_args *args;

	args = (struct epoll_copyin_args*) arg;

	memcpy(kevp, args->changelist, count * sizeof(*kevp));
	args->changelist += count;

	return (0);
}

/*
 * Load epoll filter, convert it to kevent filter
 * and load it into kevent subsystem.
 */
int
linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
{
	struct file *epfp, *fp;
	struct epoll_copyin_args ciargs;
	struct kevent kev[2];
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};
	struct epoll_event le;
	cap_rights_t rights;
	int nchanges = 0;
	int error;

	/* CTL_DEL passes no event structure. */
	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = copyin(args->event, &le, sizeof(le));
		if (error != 0)
			return (error);
	}

	error = fget(td, args->epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave1;
	}

	/* Protect user data vector from incorrectly supplied fd. */
	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
	if (error != 0)
		goto leave1;

	/* Linux disallows spying on itself */
	if (epfp == fp) {
		error = EINVAL;
		goto leave0;
	}

	ciargs.changelist = kev;

	if (args->op != LINUX_EPOLL_CTL_DEL) {
		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
		if (error != 0)
			goto leave0;
	}

	switch (args->op) {
	case LINUX_EPOLL_CTL_MOD:
		/* MOD = delete old registration, then re-add below. */
		error = epoll_delete_all_events(td, epfp, args->fd);
		if (error != 0)
			goto leave0;
		break;

	case LINUX_EPOLL_CTL_ADD:
		if (epoll_fd_registered(td, epfp, args->fd)) {
			error = EEXIST;
			goto leave0;
		}
		break;

	case LINUX_EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd with this epoll */
		error = epoll_delete_all_events(td, epfp, args->fd);
		goto leave0;

	default:
		error = EINVAL;
		goto leave0;
	}

	epoll_fd_install(td, args->fd, le.data);

	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);

leave0:
	fdrop(fp, td);

leave1:
	fdrop(epfp, td);
	return (error);
}

/*
 * Wait for a filter to be triggered on
 * the epoll file descriptor.
 */
static int
linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
    int maxevents, int timeout, sigset_t *uset)
{
	struct epoll_copyout_args coargs;
	struct kevent_copyops k_ops = { &coargs,
					epoll_kev_copyout,
					NULL};
	struct timespec ts, *tsp;
	cap_rights_t rights;
	struct file *epfp;
	sigset_t omask;
	int error;

	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
		return (EINVAL);

	error = fget(td, epfd,
	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
	if (error != 0)
		return (error);
	if (epfp->f_type != DTYPE_KQUEUE) {
		error = EINVAL;
		goto leave;
	}
	/* epoll_pwait(2): install the temporary signal mask for the wait. */
	if (uset != NULL) {
		error = kern_sigprocmask(td, SIG_SETMASK, uset,
		    &omask, 0);
		if (error != 0)
			goto leave;
		td->td_pflags |= TDP_OLDMASK;
		/*
		 * Make sure that ast() is called on return to
		 * usermode and TDP_OLDMASK is cleared, restoring old
		 * sigmask.
		 */
		thread_lock(td);
		td->td_flags |= TDF_ASTPENDING;
		thread_unlock(td);
	}

	coargs.leventlist = events;
	coargs.p = td->td_proc;
	coargs.count = 0;
	coargs.error = 0;

	/*
	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
	 * to block indefinitely. Real implementation does it if any negative
	 * timeout value is passed.
	 */
	if (timeout >= 0) {
		/* Convert from milliseconds to timespec. */
		ts.tv_sec = timeout / 1000;
		ts.tv_nsec = (timeout % 1000) * 1000000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
	if (error == 0 && coargs.error != 0)
		error = coargs.error;

	/*
	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
	 * Maybe we should translate that but I don't think it matters at all.
	 */
	if (error == 0)
		td->td_retval[0] = coargs.count;

	if (uset != NULL)
		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
		    NULL, 0);
leave:
	fdrop(epfp, td);
	return (error);
}

#ifdef LINUX_LEGACY_SYSCALLS
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{

	return (linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL));
}
#endif

int
linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
{
	sigset_t mask, *pmask;
	l_sigset_t lmask;
	int error;

	if (args->mask != NULL) {
		if (args->sigsetsize != sizeof(l_sigset_t))
			return (EINVAL);
		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
		if (error != 0)
			return (error);
		linux_to_bsd_sigset(&lmask, &mask);
		pmask = &mask;
	} else
		pmask = NULL;
	return (linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, pmask));
}

/*
 * Register a single kevent change (filter + flags) for 'fd' on the
 * given epoll/kqueue file and return the kevent error, if any.
 */
static int
epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
    unsigned int flags)
{
	struct epoll_copyin_args ciargs;
	struct kevent kev;
	struct kevent_copyops k_ops = { &ciargs,
					NULL,
					epoll_kev_copyin};

	ciargs.changelist = &kev;
	EV_SET(&kev, fd, filter, flags, 0, 0, 0);

	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
}

static int
epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
{
	/*
	 * Set empty filter flags to avoid accidental modification of already
	 * registered events. In the case of event re-registration:
	 * 1. If event does not exists kevent() does nothing and returns ENOENT
	 * 2. If event does exists, it's enabled/disabled state is preserved
	 *    but fflags, data and udata fields are overwritten. So we can not
	 *    set socket lowats and store user's context pointer in udata.
	 */
	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
		return (1);

	return (0);
}

static int
epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
{
	int error1, error2;

	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);

	/* return 0 if at least one result positive */
	return (error1 == 0 ? 0 : error2);
}

/*
 * Allocate and install an eventfd file with the given initial counter
 * value; returns the new descriptor in td_retval[0].
 */
static int
eventfd_create(struct thread *td, uint32_t initval, int flags)
{
	struct filedesc *fdp;
	struct eventfd *efd;
	struct file *fp;
	int fflags, fd, error;

	fflags = 0;
	if ((flags & LINUX_O_CLOEXEC) != 0)
		fflags |= O_CLOEXEC;

	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd, fflags);
	if (error != 0)
		return (error);

	efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
	efd->efd_flags = flags;
	efd->efd_count = initval;
	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);

	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);

	fflags = FREAD | FWRITE;
	if ((flags & LINUX_O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
	fdrop(fp, td);	/* drop falloc's extra reference; fd table holds one */

	td->td_retval[0] = fd;
	return (error);
}

#ifdef LINUX_LEGACY_SYSCALLS
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{

	return (eventfd_create(td, args->initval, 0));
}
#endif

int
linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
{

	if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
		return (EINVAL);

	return (eventfd_create(td, args->initval, args->flags));
}

static int
eventfd_close(struct file *fp, struct thread *td)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	seldrain(&efd->efd_sel);
	knlist_destroy(&efd->efd_sel.si_note);

	fp->f_ops = &badfileops;
	mtx_destroy(&efd->efd_lock);
	free(efd, M_EPOLL);

	return (0);
}

/*
 * eventfd(2) read: block (unless FNONBLOCK) until the counter is
 * non-zero, then return either 1 (EFD_SEMAPHORE) or the whole counter.
 */
static int
eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&efd->efd_lock);
retry:
	if (efd->efd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			return (EAGAIN);
		}
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
			count = 1;
			--efd->efd_count;
		} else {
			count = efd->efd_count;
			efd->efd_count = 0;
		}
		/* Counter dropped: wake writers blocked on overflow. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
		mtx_unlock(&efd->efd_lock);
		error = uiomove(&count, sizeof(eventfd_t), uio);
	} else
		mtx_unlock(&efd->efd_lock);

	return (error);
}

/*
 * eventfd(2) write: add the supplied value to the counter, blocking
 * (unless FNONBLOCK) while the addition would exceed UINT64_MAX - 1.
 */
static int
eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct eventfd *efd;
	eventfd_t count;
	int error;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	if (uio->uio_resid < sizeof(eventfd_t))
		return (EINVAL);

	error = uiomove(&count, sizeof(eventfd_t), uio);
	if (error != 0)
		return (error);
	if (count == UINT64_MAX)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
retry:
	if (UINT64_MAX - efd->efd_count <= count) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&efd->efd_lock);
			/* Do not return the number of bytes written */
			uio->uio_resid += sizeof(eventfd_t);
			return (EAGAIN);
		}
		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
		    PCATCH, "lefdwr", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		efd->efd_count += count;
		/* Counter grew: wake readers waiting for a non-zero value. */
		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
		selwakeup(&efd->efd_sel);
		wakeup(&efd->efd_count);
	}
	mtx_unlock(&efd->efd_lock);

	return (error);
}

static int
eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct eventfd *efd;
	int revents = 0;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (POLLERR);

	mtx_lock(&efd->efd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
		revents |= events & (POLLOUT|POLLWRNORM);
	if (revents == 0)
		selrecord(td, &efd->efd_sel);
	mtx_unlock(&efd->efd_lock);

	return (revents);
}

/*ARGSUSED*/
static int
eventfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &eventfd_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &eventfd_wfiltops;
		break;
	default:
		mtx_unlock(&efd->efd_lock);
		return (EINVAL);
	}

	kn->kn_hook = efd;
	knlist_add(&efd->efd_sel.si_note, kn, 1);
	mtx_unlock(&efd->efd_lock);

	return (0);
}

static void
filt_eventfddetach(struct knote *kn)
{
	struct eventfd *efd = kn->kn_hook;

	mtx_lock(&efd->efd_lock);
	knlist_remove(&efd->efd_sel.si_note, kn, 1);
	mtx_unlock(&efd->efd_lock);
}

/*ARGSUSED*/
static int
filt_eventfdread(struct knote *kn, long hint)
{
	struct eventfd *efd = kn->kn_hook;
	int ret;

	mtx_assert(&efd->efd_lock, MA_OWNED);
	ret = (efd->efd_count > 0);

	return (ret);
}

/*ARGSUSED*/
static int
filt_eventfdwrite(struct knote *kn, long hint)
{
	struct eventfd *efd = kn->kn_hook;
	int ret;

	mtx_assert(&efd->efd_lock, MA_OWNED);
	ret = (UINT64_MAX - 1 > efd->efd_count);

	return (ret);
}

/*
 * Shared FIONBIO/FIOASYNC handler for both eventfd and timerfd files.
 */
/*ARGSUSED*/
static int
eventfd_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{

	if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
	    fp->f_type != DTYPE_LINUXTFD))
		return (EINVAL);

	switch (cmd)
	{
	case FIONBIO:
		if ((*(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		/* FALLTHROUGH */
	case FIOASYNC:
		return (0);
	default:
		return (ENXIO);
	}
}

/*ARGSUSED*/
static int
eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	return (ENXIO);
}

/*ARGSUSED*/
static int
eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}

int
linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
{
	struct filedesc *fdp;
	struct timerfd *tfd;
	struct file *fp;
	clockid_t clockid;
	int fflags, fd, error;

	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
		return (EINVAL);

	error = linux_to_native_clockid(&clockid, args->clockid);
	if (error != 0)
		return (error);
	/* Only these two clocks are emulated here. */
	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
		return (EINVAL);

	fflags = 0;
	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
		fflags |= O_CLOEXEC;

	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd, fflags);
	if (error != 0)
		return (error);

	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
	tfd->tfd_clockid = clockid;
	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);

	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);

	fflags = FREAD;
	if ((args->flags & LINUX_O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
	fdrop(fp, td);

	td->td_retval[0] = fd;
	return (error);
}

static int
timerfd_close(struct file *fp, struct thread *td)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	/* Disarm before draining so the callout does not rearm itself. */
	timespecclear(&tfd->tfd_time.it_value);
	timespecclear(&tfd->tfd_time.it_interval);

	mtx_lock(&tfd->tfd_lock);
	callout_drain(&tfd->tfd_callout);
	mtx_unlock(&tfd->tfd_lock);

	seldrain(&tfd->tfd_sel);
	knlist_destroy(&tfd->tfd_sel.si_note);

	fp->f_ops = &badfileops;
	mtx_destroy(&tfd->tfd_lock);
	free(tfd, M_EPOLL);

	return (0);
}

/*
 * timerfd read: block (unless FNONBLOCK) until at least one expiration
 * occurred, then return and reset the expiration count.
 */
static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct timerfd *tfd;
	timerfd_t count;
	int error;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}

static int
timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct timerfd *tfd;
	int revents = 0;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (POLLERR);

	mtx_lock(&tfd->tfd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if (revents == 0)
		selrecord(td, &tfd->tfd_sel);
	mtx_unlock(&tfd->tfd_lock);

	return (revents);
}

/*ARGSUSED*/
static int
timerfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	if (kn->kn_filter == EVFILT_READ)
		kn->kn_fop = &timerfd_rfiltops;
	else
		return (EINVAL);

	kn->kn_hook = tfd;
	knlist_add(&tfd->tfd_sel.si_note, kn, 0);

	return (0);
}

static void
filt_timerfddetach(struct knote *kn)
{
	struct timerfd *tfd = kn->kn_hook;

	mtx_lock(&tfd->tfd_lock);
	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
	mtx_unlock(&tfd->tfd_lock);
}

/*ARGSUSED*/
static int
filt_timerfdread(struct knote *kn, long hint)
{
	struct timerfd *tfd = kn->kn_hook;

	/*
	 * NOTE(review): unlike filt_eventfdread, no mtx_assert here;
	 * tfd_count is read without asserting tfd_lock — confirm all
	 * callers hold the knlist (tfd_lock-backed) lock.
	 */
	return (tfd->tfd_count > 0);
}

/*ARGSUSED*/
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	return (ENXIO);
}

/*ARGSUSED*/
static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
	return (0);
}

/* Read the current time on the clock this timerfd was created with. */
static void
linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
{

	if (tfd->tfd_clockid == CLOCK_REALTIME)
		getnanotime(ts);
	else /* CLOCK_MONOTONIC */
		getnanouptime(ts);
}

/*
 * Compute the time remaining until expiry (relative form) for
 * timerfd_gettime(2); an already-expired timer reports 1ns, not zero,
 * since zero means "disarmed".
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		timespecsub(&ots->it_value, &cts, &ots->it_value);
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}

int
linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec ots;
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, args->fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, &ots);
	mtx_unlock(&tfd->tfd_lock);

	error = native_to_linux_itimerspec(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));

out:
	fdrop(fp, td);
	return (error);
}

int
linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec nts, ots;
	struct timespec cts, ts;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec(&nts, &lots);
	if (error != 0)
		return (error);

	error = fget(td, args->fd, &cap_write_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* A zero it_value disarms; the interval is meaningless then. */
	if (!timespecisset(&nts.it_value))
		timespecclear(&nts.it_interval);
	if (args->old_value != NULL)
		linux_timerfd_curval(tfd, &ots);

	/* tfd_time holds the ABSOLUTE expiry; convert relative values. */
	tfd->tfd_time = nts;
	if (timespecisset(&nts.it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		ts = nts.it_value;
		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			timespecadd(&tfd->tfd_time.it_value, &cts,
			    &tfd->tfd_time.it_value);
		} else {
			timespecsub(&ts, &cts, &ts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

	if (args->old_value != NULL) {
		error = native_to_linux_itimerspec(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}

out:
	fdrop(fp, td);
	return (error);
}

/*
 * Callout handler: count the expiration, rearm for the next interval
 * (periodic timers), and wake any readers/pollers.  Runs with tfd_lock
 * held (callout_init_mtx).
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		if (timespecisset(&tfd->tfd_time.it_interval))
			timespecadd(&tfd->tfd_time.it_value,
			    &tfd->tfd_time.it_interval,
			    &tfd->tfd_time.it_value);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			    linux_timerfd_expire, tfd);
		}
		tfd->tfd_count++;
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/* Fired early (clock stepped?): rearm for the remainder. */
		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}