1 /*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_compat.h" 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/imgact.h> 36 #include <sys/kernel.h> 37 #include <sys/limits.h> 38 #include <sys/lock.h> 39 #include <sys/mutex.h> 40 #include <sys/callout.h> 41 #include <sys/capsicum.h> 42 #include <sys/types.h> 43 #include <sys/user.h> 44 #include <sys/file.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/errno.h> 48 #include <sys/event.h> 49 #include <sys/poll.h> 50 #include <sys/proc.h> 51 #include <sys/selinfo.h> 52 #include <sys/sx.h> 53 #include <sys/syscallsubr.h> 54 #include <sys/timespec.h> 55 56 #ifdef COMPAT_LINUX32 57 #include <machine/../linux32/linux.h> 58 #include <machine/../linux32/linux32_proto.h> 59 #else 60 #include <machine/../linux/linux.h> 61 #include <machine/../linux/linux_proto.h> 62 #endif 63 64 #include <compat/linux/linux_emul.h> 65 #include <compat/linux/linux_event.h> 66 #include <compat/linux/linux_file.h> 67 #include <compat/linux/linux_timer.h> 68 #include <compat/linux/linux_util.h> 69 70 /* 71 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 72 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 73 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 74 * data verbatuim. Therefore we allocate 64-bit memory block to pass 75 * user supplied data for every file descriptor. 76 */ 77 78 typedef uint64_t epoll_udata_t; 79 80 struct epoll_emuldata { 81 uint32_t fdc; /* epoll udata max index */ 82 epoll_udata_t udata[1]; /* epoll user data vector */ 83 }; 84 85 #define EPOLL_DEF_SZ 16 86 #define EPOLL_SIZE(fdn) \ 87 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 88 89 struct epoll_event { 90 uint32_t events; 91 epoll_udata_t data; 92 } 93 #if defined(__amd64__) 94 __attribute__((packed)) 95 #endif 96 ; 97 98 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 99 100 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 101 static int epoll_to_kevent(struct thread *td, int fd, 102 struct epoll_event *l_event, struct kevent *kevent, 103 int *nkevents); 104 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 105 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 106 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 107 static int epoll_register_kevent(struct thread *td, struct file *epfp, 108 int fd, int filter, unsigned int flags); 109 static int epoll_fd_registered(struct thread *td, struct file *epfp, 110 int fd); 111 static int epoll_delete_all_events(struct thread *td, struct file *epfp, 112 int fd); 113 114 struct epoll_copyin_args { 115 struct kevent *changelist; 116 }; 117 118 struct epoll_copyout_args { 119 struct epoll_event *leventlist; 120 struct proc *p; 121 uint32_t count; 122 int error; 123 }; 124 125 /* eventfd */ 126 typedef uint64_t eventfd_t; 127 128 static fo_rdwr_t eventfd_read; 129 static fo_rdwr_t eventfd_write; 130 static fo_ioctl_t eventfd_ioctl; 131 static fo_poll_t eventfd_poll; 132 static fo_kqfilter_t eventfd_kqfilter; 133 static fo_stat_t eventfd_stat; 134 static fo_close_t eventfd_close; 135 static fo_fill_kinfo_t eventfd_fill_kinfo; 136 137 static struct fileops eventfdops = { 138 .fo_read = eventfd_read, 139 .fo_write = eventfd_write, 140 .fo_truncate = invfo_truncate, 141 .fo_ioctl = eventfd_ioctl, 142 .fo_poll = eventfd_poll, 143 .fo_kqfilter = eventfd_kqfilter, 144 .fo_stat = eventfd_stat, 145 .fo_close = eventfd_close, 146 .fo_chmod = invfo_chmod, 147 .fo_chown = invfo_chown, 148 .fo_sendfile = invfo_sendfile, 149 .fo_fill_kinfo = eventfd_fill_kinfo, 150 .fo_flags = DFLAG_PASSABLE 151 }; 152 153 static void filt_eventfddetach(struct knote *kn); 154 static int filt_eventfdread(struct knote *kn, long hint); 155 static int filt_eventfdwrite(struct knote *kn, long hint); 156 157 static struct filterops eventfd_rfiltops = { 158 .f_isfd = 1, 159 .f_detach = filt_eventfddetach, 160 .f_event = filt_eventfdread 161 }; 162 static struct filterops eventfd_wfiltops = { 163 .f_isfd = 1, 164 .f_detach = filt_eventfddetach, 165 .f_event = filt_eventfdwrite 166 }; 167 168 /* timerfd */ 169 typedef uint64_t timerfd_t; 170 171 static fo_rdwr_t timerfd_read; 172 static fo_poll_t timerfd_poll; 173 static fo_kqfilter_t timerfd_kqfilter; 174 static fo_stat_t timerfd_stat; 175 static fo_close_t timerfd_close; 176 static fo_fill_kinfo_t timerfd_fill_kinfo; 177 178 static struct fileops timerfdops = { 179 .fo_read = timerfd_read, 180 .fo_write = invfo_rdwr, 181 .fo_truncate = invfo_truncate, 182 .fo_ioctl = eventfd_ioctl, 183 .fo_poll = timerfd_poll, 184 .fo_kqfilter = timerfd_kqfilter, 185 .fo_stat = timerfd_stat, 186 .fo_close = timerfd_close, 187 .fo_chmod = invfo_chmod, 188 .fo_chown = invfo_chown, 189 .fo_sendfile = invfo_sendfile, 190 .fo_fill_kinfo = timerfd_fill_kinfo, 191 .fo_flags = DFLAG_PASSABLE 192 }; 193 194 static void filt_timerfddetach(struct knote *kn); 195 static int filt_timerfdread(struct knote *kn, long hint); 196 197 static struct filterops timerfd_rfiltops = { 198 .f_isfd = 1, 199 .f_detach = filt_timerfddetach, 200 .f_event = filt_timerfdread 201 }; 202 203 struct eventfd { 204 eventfd_t efd_count; 205 uint32_t efd_flags; 206 struct selinfo efd_sel; 207 struct mtx efd_lock; 208 }; 209 210 struct timerfd { 211 clockid_t tfd_clockid; 212 struct itimerspec tfd_time; 213 struct callout tfd_callout; 214 timerfd_t tfd_count; 215 bool tfd_canceled; 216 struct selinfo tfd_sel; 217 struct mtx tfd_lock; 218 }; 219 220 static int eventfd_create(struct thread *td, uint32_t initval, int flags); 221 static void linux_timerfd_expire(void *); 222 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); 223 224 225 static void 226 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 227 { 228 struct linux_pemuldata *pem; 229 struct epoll_emuldata *emd; 230 struct proc *p; 231 232 p = td->td_proc; 233 234 pem = pem_find(p); 235 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 236 237 LINUX_PEM_XLOCK(pem); 238 if (pem->epoll == NULL) { 239 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 240 emd->fdc = fd; 241 pem->epoll = emd; 242 } else { 243 emd = pem->epoll; 244 if (fd > emd->fdc) { 245 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 246 emd->fdc = fd; 247 pem->epoll = emd; 248 } 249 } 250 emd->udata[fd] = udata; 251 LINUX_PEM_XUNLOCK(pem); 252 } 253 254 static int 255 epoll_create_common(struct thread *td, int flags) 256 { 257 int error; 258 259 error = kern_kqueue(td, flags, NULL); 260 if (error != 0) 261 return (error); 262 263 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 264 265 return (0); 266 } 267 268 #ifdef LINUX_LEGACY_SYSCALLS 269 int 270 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 271 { 272 273 /* 274 * args->size is unused. Linux just tests it 275 * and then forgets it as well. 276 */ 277 if (args->size <= 0) 278 return (EINVAL); 279 280 return (epoll_create_common(td, 0)); 281 } 282 #endif 283 284 int 285 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 286 { 287 int flags; 288 289 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 290 return (EINVAL); 291 292 flags = 0; 293 if ((args->flags & LINUX_O_CLOEXEC) != 0) 294 flags |= O_CLOEXEC; 295 296 return (epoll_create_common(td, flags)); 297 } 298 299 /* Structure converting function from epoll to kevent. */ 300 static int 301 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, 302 struct kevent *kevent, int *nkevents) 303 { 304 uint32_t levents = l_event->events; 305 struct linux_pemuldata *pem; 306 struct proc *p; 307 unsigned short kev_flags = EV_ADD | EV_ENABLE; 308 309 /* flags related to how event is registered */ 310 if ((levents & LINUX_EPOLLONESHOT) != 0) 311 kev_flags |= EV_DISPATCH; 312 if ((levents & LINUX_EPOLLET) != 0) 313 kev_flags |= EV_CLEAR; 314 if ((levents & LINUX_EPOLLERR) != 0) 315 kev_flags |= EV_ERROR; 316 if ((levents & LINUX_EPOLLRDHUP) != 0) 317 kev_flags |= EV_EOF; 318 319 /* flags related to what event is registered */ 320 if ((levents & LINUX_EPOLL_EVRD) != 0) { 321 EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0); 322 ++(*nkevents); 323 } 324 if ((levents & LINUX_EPOLL_EVWR) != 0) { 325 EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0); 326 ++(*nkevents); 327 } 328 /* zero event mask is legal */ 329 if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) { 330 EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0); 331 ++(*nkevents); 332 } 333 334 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 335 p = td->td_proc; 336 337 pem = pem_find(p); 338 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 339 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 340 341 LINUX_PEM_XLOCK(pem); 342 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 343 pem->flags |= LINUX_XUNSUP_EPOLL; 344 LINUX_PEM_XUNLOCK(pem); 345 linux_msg(td, "epoll_ctl unsupported flags: 0x%x", 346 levents); 347 } else 348 LINUX_PEM_XUNLOCK(pem); 349 return (EINVAL); 350 } 351 352 return (0); 353 } 354 355 /* 356 * Structure converting function from kevent to epoll. In a case 357 * this is called on error in registration we store the error in 358 * event->data and pick it up later in linux_epoll_ctl(). 359 */ 360 static void 361 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 362 { 363 364 if ((kevent->flags & EV_ERROR) != 0) { 365 l_event->events = LINUX_EPOLLERR; 366 return; 367 } 368 369 /* XXX EPOLLPRI, EPOLLHUP */ 370 switch (kevent->filter) { 371 case EVFILT_READ: 372 l_event->events = LINUX_EPOLLIN; 373 if ((kevent->flags & EV_EOF) != 0) 374 l_event->events |= LINUX_EPOLLRDHUP; 375 break; 376 case EVFILT_WRITE: 377 l_event->events = LINUX_EPOLLOUT; 378 break; 379 } 380 } 381 382 /* 383 * Copyout callback used by kevent. This converts kevent 384 * events to epoll events and copies them back to the 385 * userspace. This is also called on error on registering 386 * of the filter. 387 */ 388 static int 389 epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 390 { 391 struct epoll_copyout_args *args; 392 struct linux_pemuldata *pem; 393 struct epoll_emuldata *emd; 394 struct epoll_event *eep; 395 int error, fd, i; 396 397 args = (struct epoll_copyout_args*) arg; 398 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 399 400 pem = pem_find(args->p); 401 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 402 LINUX_PEM_SLOCK(pem); 403 emd = pem->epoll; 404 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 405 406 for (i = 0; i < count; i++) { 407 kevent_to_epoll(&kevp[i], &eep[i]); 408 409 fd = kevp[i].ident; 410 KASSERT(fd <= emd->fdc, ("epoll user data vector" 411 " is too small.\n")); 412 eep[i].data = emd->udata[fd]; 413 } 414 LINUX_PEM_SUNLOCK(pem); 415 416 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 417 if (error == 0) { 418 args->leventlist += count; 419 args->count += count; 420 } else if (args->error == 0) 421 args->error = error; 422 423 free(eep, M_EPOLL); 424 return (error); 425 } 426 427 /* 428 * Copyin callback used by kevent. This copies already 429 * converted filters from kernel memory to the kevent 430 * internal kernel memory. Hence the memcpy instead of 431 * copyin. 432 */ 433 static int 434 epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 435 { 436 struct epoll_copyin_args *args; 437 438 args = (struct epoll_copyin_args*) arg; 439 440 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 441 args->changelist += count; 442 443 return (0); 444 } 445 446 /* 447 * Load epoll filter, convert it to kevent filter 448 * and load it into kevent subsystem. 449 */ 450 int 451 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 452 { 453 struct file *epfp, *fp; 454 struct epoll_copyin_args ciargs; 455 struct kevent kev[2]; 456 struct kevent_copyops k_ops = { &ciargs, 457 NULL, 458 epoll_kev_copyin}; 459 struct epoll_event le; 460 cap_rights_t rights; 461 int nchanges = 0; 462 int error; 463 464 if (args->op != LINUX_EPOLL_CTL_DEL) { 465 error = copyin(args->event, &le, sizeof(le)); 466 if (error != 0) 467 return (error); 468 } 469 470 error = fget(td, args->epfd, 471 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 472 if (error != 0) 473 return (error); 474 if (epfp->f_type != DTYPE_KQUEUE) { 475 error = EINVAL; 476 goto leave1; 477 } 478 479 /* Protect user data vector from incorrectly supplied fd. */ 480 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 481 if (error != 0) 482 goto leave1; 483 484 /* Linux disallows spying on himself */ 485 if (epfp == fp) { 486 error = EINVAL; 487 goto leave0; 488 } 489 490 ciargs.changelist = kev; 491 492 if (args->op != LINUX_EPOLL_CTL_DEL) { 493 error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); 494 if (error != 0) 495 goto leave0; 496 } 497 498 switch (args->op) { 499 case LINUX_EPOLL_CTL_MOD: 500 error = epoll_delete_all_events(td, epfp, args->fd); 501 if (error != 0) 502 goto leave0; 503 break; 504 505 case LINUX_EPOLL_CTL_ADD: 506 if (epoll_fd_registered(td, epfp, args->fd)) { 507 error = EEXIST; 508 goto leave0; 509 } 510 break; 511 512 case LINUX_EPOLL_CTL_DEL: 513 /* CTL_DEL means unregister this fd with this epoll */ 514 error = epoll_delete_all_events(td, epfp, args->fd); 515 goto leave0; 516 517 default: 518 error = EINVAL; 519 goto leave0; 520 } 521 522 epoll_fd_install(td, args->fd, le.data); 523 524 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 525 526 leave0: 527 fdrop(fp, td); 528 529 leave1: 530 fdrop(epfp, td); 531 return (error); 532 } 533 534 /* 535 * Wait for a filter to be triggered on the epoll file descriptor. 536 */ 537 static int 538 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 539 int maxevents, int timeout, sigset_t *uset) 540 { 541 struct epoll_copyout_args coargs; 542 struct kevent_copyops k_ops = { &coargs, 543 epoll_kev_copyout, 544 NULL}; 545 struct timespec ts, *tsp; 546 cap_rights_t rights; 547 struct file *epfp; 548 sigset_t omask; 549 int error; 550 551 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 552 return (EINVAL); 553 554 error = fget(td, epfd, 555 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 556 if (error != 0) 557 return (error); 558 if (epfp->f_type != DTYPE_KQUEUE) { 559 error = EINVAL; 560 goto leave; 561 } 562 if (uset != NULL) { 563 error = kern_sigprocmask(td, SIG_SETMASK, uset, 564 &omask, 0); 565 if (error != 0) 566 goto leave; 567 td->td_pflags |= TDP_OLDMASK; 568 /* 569 * Make sure that ast() is called on return to 570 * usermode and TDP_OLDMASK is cleared, restoring old 571 * sigmask. 572 */ 573 thread_lock(td); 574 td->td_flags |= TDF_ASTPENDING; 575 thread_unlock(td); 576 } 577 578 579 coargs.leventlist = events; 580 coargs.p = td->td_proc; 581 coargs.count = 0; 582 coargs.error = 0; 583 584 /* 585 * Linux epoll_wait(2) man page states that timeout of -1 causes caller 586 * to block indefinitely. Real implementation does it if any negative 587 * timeout value is passed. 588 */ 589 if (timeout >= 0) { 590 /* Convert from milliseconds to timespec. */ 591 ts.tv_sec = timeout / 1000; 592 ts.tv_nsec = (timeout % 1000) * 1000000; 593 tsp = &ts; 594 } else { 595 tsp = NULL; 596 } 597 598 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 599 if (error == 0 && coargs.error != 0) 600 error = coargs.error; 601 602 /* 603 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 604 * Maybe we should translate that but I don't think it matters at all. 605 */ 606 if (error == 0) 607 td->td_retval[0] = coargs.count; 608 609 if (uset != NULL) 610 error = kern_sigprocmask(td, SIG_SETMASK, &omask, 611 NULL, 0); 612 leave: 613 fdrop(epfp, td); 614 return (error); 615 } 616 617 #ifdef LINUX_LEGACY_SYSCALLS 618 int 619 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 620 { 621 622 return (linux_epoll_wait_common(td, args->epfd, args->events, 623 args->maxevents, args->timeout, NULL)); 624 } 625 #endif 626 627 int 628 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 629 { 630 sigset_t mask, *pmask; 631 l_sigset_t lmask; 632 int error; 633 634 if (args->mask != NULL) { 635 if (args->sigsetsize != sizeof(l_sigset_t)) 636 return (EINVAL); 637 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 638 if (error != 0) 639 return (error); 640 linux_to_bsd_sigset(&lmask, &mask); 641 pmask = &mask; 642 } else 643 pmask = NULL; 644 return (linux_epoll_wait_common(td, args->epfd, args->events, 645 args->maxevents, args->timeout, pmask)); 646 } 647 648 static int 649 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, 650 unsigned int flags) 651 { 652 struct epoll_copyin_args ciargs; 653 struct kevent kev; 654 struct kevent_copyops k_ops = { &ciargs, 655 NULL, 656 epoll_kev_copyin}; 657 658 ciargs.changelist = &kev; 659 EV_SET(&kev, fd, filter, flags, 0, 0, 0); 660 661 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); 662 } 663 664 static int 665 epoll_fd_registered(struct thread *td, struct file *epfp, int fd) 666 { 667 /* 668 * Set empty filter flags to avoid accidental modification of already 669 * registered events. In the case of event re-registration: 670 * 1. If event does not exists kevent() does nothing and returns ENOENT 671 * 2. If event does exists, it's enabled/disabled state is preserved 672 * but fflags, data and udata fields are overwritten. So we can not 673 * set socket lowats and store user's context pointer in udata. 674 */ 675 if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || 676 epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) 677 return (1); 678 679 return (0); 680 } 681 682 static int 683 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 684 { 685 int error1, error2; 686 687 error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); 688 error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); 689 690 /* return 0 if at least one result positive */ 691 return (error1 == 0 ? 0 : error2); 692 } 693 694 static int 695 eventfd_create(struct thread *td, uint32_t initval, int flags) 696 { 697 struct filedesc *fdp; 698 struct eventfd *efd; 699 struct file *fp; 700 int fflags, fd, error; 701 702 fflags = 0; 703 if ((flags & LINUX_O_CLOEXEC) != 0) 704 fflags |= O_CLOEXEC; 705 706 fdp = td->td_proc->p_fd; 707 error = falloc(td, &fp, &fd, fflags); 708 if (error != 0) 709 return (error); 710 711 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 712 efd->efd_flags = flags; 713 efd->efd_count = initval; 714 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 715 716 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 717 718 fflags = FREAD | FWRITE; 719 if ((flags & LINUX_O_NONBLOCK) != 0) 720 fflags |= FNONBLOCK; 721 722 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 723 fdrop(fp, td); 724 725 td->td_retval[0] = fd; 726 return (error); 727 } 728 729 #ifdef LINUX_LEGACY_SYSCALLS 730 int 731 linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 732 { 733 734 return (eventfd_create(td, args->initval, 0)); 735 } 736 #endif 737 738 int 739 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 740 { 741 742 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 743 return (EINVAL); 744 745 return (eventfd_create(td, args->initval, args->flags)); 746 } 747 748 static int 749 eventfd_close(struct file *fp, struct thread *td) 750 { 751 struct eventfd *efd; 752 753 efd = fp->f_data; 754 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 755 return (EINVAL); 756 757 seldrain(&efd->efd_sel); 758 knlist_destroy(&efd->efd_sel.si_note); 759 760 fp->f_ops = &badfileops; 761 mtx_destroy(&efd->efd_lock); 762 free(efd, M_EPOLL); 763 764 return (0); 765 } 766 767 static int 768 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 769 int flags, struct thread *td) 770 { 771 struct eventfd *efd; 772 eventfd_t count; 773 int error; 774 775 efd = fp->f_data; 776 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 777 return (EINVAL); 778 779 if (uio->uio_resid < sizeof(eventfd_t)) 780 return (EINVAL); 781 782 error = 0; 783 mtx_lock(&efd->efd_lock); 784 retry: 785 if (efd->efd_count == 0) { 786 if ((fp->f_flag & FNONBLOCK) != 0) { 787 mtx_unlock(&efd->efd_lock); 788 return (EAGAIN); 789 } 790 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 791 if (error == 0) 792 goto retry; 793 } 794 if (error == 0) { 795 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 796 count = 1; 797 --efd->efd_count; 798 } else { 799 count = efd->efd_count; 800 efd->efd_count = 0; 801 } 802 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 803 selwakeup(&efd->efd_sel); 804 wakeup(&efd->efd_count); 805 mtx_unlock(&efd->efd_lock); 806 error = uiomove(&count, sizeof(eventfd_t), uio); 807 } else 808 mtx_unlock(&efd->efd_lock); 809 810 return (error); 811 } 812 813 static int 814 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 815 int flags, struct thread *td) 816 { 817 struct eventfd *efd; 818 eventfd_t count; 819 int error; 820 821 efd = fp->f_data; 822 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 823 return (EINVAL); 824 825 if (uio->uio_resid < sizeof(eventfd_t)) 826 return (EINVAL); 827 828 error = uiomove(&count, sizeof(eventfd_t), uio); 829 if (error != 0) 830 return (error); 831 if (count == UINT64_MAX) 832 return (EINVAL); 833 834 mtx_lock(&efd->efd_lock); 835 retry: 836 if (UINT64_MAX - efd->efd_count <= count) { 837 if ((fp->f_flag & FNONBLOCK) != 0) { 838 mtx_unlock(&efd->efd_lock); 839 /* Do not not return the number of bytes written */ 840 uio->uio_resid += sizeof(eventfd_t); 841 return (EAGAIN); 842 } 843 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 844 PCATCH, "lefdwr", 0); 845 if (error == 0) 846 goto retry; 847 } 848 if (error == 0) { 849 efd->efd_count += count; 850 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 851 selwakeup(&efd->efd_sel); 852 wakeup(&efd->efd_count); 853 } 854 mtx_unlock(&efd->efd_lock); 855 856 return (error); 857 } 858 859 static int 860 eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 861 struct thread *td) 862 { 863 struct eventfd *efd; 864 int revents = 0; 865 866 efd = fp->f_data; 867 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 868 return (POLLERR); 869 870 mtx_lock(&efd->efd_lock); 871 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 872 revents |= events & (POLLIN|POLLRDNORM); 873 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 874 revents |= events & (POLLOUT|POLLWRNORM); 875 if (revents == 0) 876 selrecord(td, &efd->efd_sel); 877 mtx_unlock(&efd->efd_lock); 878 879 return (revents); 880 } 881 882 /*ARGSUSED*/ 883 static int 884 eventfd_kqfilter(struct file *fp, struct knote *kn) 885 { 886 struct eventfd *efd; 887 888 efd = fp->f_data; 889 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 890 return (EINVAL); 891 892 mtx_lock(&efd->efd_lock); 893 switch (kn->kn_filter) { 894 case EVFILT_READ: 895 kn->kn_fop = &eventfd_rfiltops; 896 break; 897 case EVFILT_WRITE: 898 kn->kn_fop = &eventfd_wfiltops; 899 break; 900 default: 901 mtx_unlock(&efd->efd_lock); 902 return (EINVAL); 903 } 904 905 kn->kn_hook = efd; 906 knlist_add(&efd->efd_sel.si_note, kn, 1); 907 mtx_unlock(&efd->efd_lock); 908 909 return (0); 910 } 911 912 static void 913 filt_eventfddetach(struct knote *kn) 914 { 915 struct eventfd *efd = kn->kn_hook; 916 917 mtx_lock(&efd->efd_lock); 918 knlist_remove(&efd->efd_sel.si_note, kn, 1); 919 mtx_unlock(&efd->efd_lock); 920 } 921 922 /*ARGSUSED*/ 923 static int 924 filt_eventfdread(struct knote *kn, long hint) 925 { 926 struct eventfd *efd = kn->kn_hook; 927 int ret; 928 929 mtx_assert(&efd->efd_lock, MA_OWNED); 930 ret = (efd->efd_count > 0); 931 932 return (ret); 933 } 934 935 /*ARGSUSED*/ 936 static int 937 filt_eventfdwrite(struct knote *kn, long hint) 938 { 939 struct eventfd *efd = kn->kn_hook; 940 int ret; 941 942 mtx_assert(&efd->efd_lock, MA_OWNED); 943 ret = (UINT64_MAX - 1 > efd->efd_count); 944 945 return (ret); 946 } 947 948 /*ARGSUSED*/ 949 static int 950 eventfd_ioctl(struct file *fp, u_long cmd, void *data, 951 struct ucred *active_cred, struct thread *td) 952 { 953 954 if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && 955 fp->f_type != DTYPE_LINUXTFD)) 956 return (EINVAL); 957 958 switch (cmd) 959 { 960 case FIONBIO: 961 if ((*(int *)data)) 962 atomic_set_int(&fp->f_flag, FNONBLOCK); 963 else 964 atomic_clear_int(&fp->f_flag, FNONBLOCK); 965 case FIOASYNC: 966 return (0); 967 default: 968 return (ENXIO); 969 } 970 } 971 972 /*ARGSUSED*/ 973 static int 974 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 975 struct thread *td) 976 { 977 978 return (ENXIO); 979 } 980 981 /*ARGSUSED*/ 982 static int 983 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 984 { 985 986 kif->kf_type = KF_TYPE_UNKNOWN; 987 return (0); 988 } 989 990 int 991 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) 992 { 993 struct filedesc *fdp; 994 struct timerfd *tfd; 995 struct file *fp; 996 clockid_t clockid; 997 int fflags, fd, error; 998 999 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) 1000 return (EINVAL); 1001 1002 error = linux_to_native_clockid(&clockid, args->clockid); 1003 if (error != 0) 1004 return (error); 1005 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) 1006 return (EINVAL); 1007 1008 fflags = 0; 1009 if ((args->flags & LINUX_TFD_CLOEXEC) != 0) 1010 fflags |= O_CLOEXEC; 1011 1012 fdp = td->td_proc->p_fd; 1013 error = falloc(td, &fp, &fd, fflags); 1014 if (error != 0) 1015 return (error); 1016 1017 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); 1018 tfd->tfd_clockid = clockid; 1019 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); 1020 1021 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); 1022 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); 1023 1024 fflags = FREAD; 1025 if ((args->flags & LINUX_O_NONBLOCK) != 0) 1026 fflags |= FNONBLOCK; 1027 1028 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); 1029 fdrop(fp, td); 1030 1031 td->td_retval[0] = fd; 1032 return (error); 1033 } 1034 1035 static int 1036 timerfd_close(struct file *fp, struct thread *td) 1037 { 1038 struct timerfd *tfd; 1039 1040 tfd = fp->f_data; 1041 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1042 return (EINVAL); 1043 1044 timespecclear(&tfd->tfd_time.it_value); 1045 timespecclear(&tfd->tfd_time.it_interval); 1046 1047 mtx_lock(&tfd->tfd_lock); 1048 callout_drain(&tfd->tfd_callout); 1049 mtx_unlock(&tfd->tfd_lock); 1050 1051 seldrain(&tfd->tfd_sel); 1052 knlist_destroy(&tfd->tfd_sel.si_note); 1053 1054 fp->f_ops = &badfileops; 1055 mtx_destroy(&tfd->tfd_lock); 1056 free(tfd, M_EPOLL); 1057 1058 return (0); 1059 } 1060 1061 static int 1062 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 1063 int flags, struct thread *td) 1064 { 1065 struct timerfd *tfd; 1066 timerfd_t count; 1067 int error; 1068 1069 tfd = fp->f_data; 1070 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1071 return (EINVAL); 1072 1073 if (uio->uio_resid < sizeof(timerfd_t)) 1074 return (EINVAL); 1075 1076 error = 0; 1077 mtx_lock(&tfd->tfd_lock); 1078 retry: 1079 if (tfd->tfd_canceled) { 1080 tfd->tfd_count = 0; 1081 mtx_unlock(&tfd->tfd_lock); 1082 return (ECANCELED); 1083 } 1084 if (tfd->tfd_count == 0) { 1085 if ((fp->f_flag & FNONBLOCK) != 0) { 1086 mtx_unlock(&tfd->tfd_lock); 1087 return (EAGAIN); 1088 } 1089 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); 1090 if (error == 0) 1091 goto retry; 1092 } 1093 if (error == 0) { 1094 count = tfd->tfd_count; 1095 tfd->tfd_count = 0; 1096 mtx_unlock(&tfd->tfd_lock); 1097 error = uiomove(&count, sizeof(timerfd_t), uio); 1098 } else 1099 mtx_unlock(&tfd->tfd_lock); 1100 1101 return (error); 1102 } 1103 1104 static int 1105 timerfd_poll(struct file *fp, int events, struct ucred *active_cred, 1106 struct thread *td) 1107 { 1108 struct timerfd *tfd; 1109 int revents = 0; 1110 1111 tfd = fp->f_data; 1112 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1113 return (POLLERR); 1114 1115 mtx_lock(&tfd->tfd_lock); 1116 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) 1117 revents |= events & (POLLIN|POLLRDNORM); 1118 if (revents == 0) 1119 selrecord(td, &tfd->tfd_sel); 1120 mtx_unlock(&tfd->tfd_lock); 1121 1122 return (revents); 1123 } 1124 1125 /*ARGSUSED*/ 1126 static int 1127 timerfd_kqfilter(struct file *fp, struct knote *kn) 1128 { 1129 struct timerfd *tfd; 1130 1131 tfd = fp->f_data; 1132 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) 1133 return (EINVAL); 1134 1135 if (kn->kn_filter == EVFILT_READ) 1136 kn->kn_fop = &timerfd_rfiltops; 1137 else 1138 return (EINVAL); 1139 1140 kn->kn_hook = tfd; 1141 knlist_add(&tfd->tfd_sel.si_note, kn, 0); 1142 1143 return (0); 1144 } 1145 1146 static void 1147 filt_timerfddetach(struct knote *kn) 1148 { 1149 struct timerfd *tfd = kn->kn_hook; 1150 1151 mtx_lock(&tfd->tfd_lock); 1152 knlist_remove(&tfd->tfd_sel.si_note, kn, 1); 1153 mtx_unlock(&tfd->tfd_lock); 1154 } 1155 1156 /*ARGSUSED*/ 1157 static int 1158 filt_timerfdread(struct knote *kn, long hint) 1159 { 1160 struct timerfd *tfd = kn->kn_hook; 1161 1162 return (tfd->tfd_count > 0); 1163 } 1164 1165 /*ARGSUSED*/ 1166 static int 1167 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 1168 struct thread *td) 1169 { 1170 1171 return (ENXIO); 1172 } 1173 1174 /*ARGSUSED*/ 1175 static int 1176 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1177 { 1178 1179 kif->kf_type = KF_TYPE_UNKNOWN; 1180 return (0); 1181 } 1182 1183 static void 1184 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) 1185 { 1186 1187 if (tfd->tfd_clockid == CLOCK_REALTIME) 1188 getnanotime(ts); 1189 else /* CLOCK_MONOTONIC */ 1190 getnanouptime(ts); 1191 } 1192 1193 static void 1194 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) 1195 { 1196 struct timespec cts; 1197 1198 linux_timerfd_clocktime(tfd, &cts); 1199 *ots = tfd->tfd_time; 1200 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { 1201 timespecsub(&ots->it_value, &cts, &ots->it_value); 1202 if (ots->it_value.tv_sec < 0 || 1203 (ots->it_value.tv_sec == 0 && 1204 ots->it_value.tv_nsec == 0)) { 1205 ots->it_value.tv_sec = 0; 1206 ots->it_value.tv_nsec = 1; 1207 } 1208 } 1209 } 1210 1211 int 1212 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) 1213 { 1214 struct l_itimerspec lots; 1215 struct itimerspec ots; 1216 struct timerfd *tfd; 1217 struct file *fp; 1218 int error; 1219 1220 error = fget(td, args->fd, &cap_read_rights, &fp); 1221 if (error != 0) 1222 return (error); 1223 tfd = fp->f_data; 1224 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1225 error = EINVAL; 1226 goto out; 1227 } 1228 1229 mtx_lock(&tfd->tfd_lock); 1230 linux_timerfd_curval(tfd, &ots); 1231 mtx_unlock(&tfd->tfd_lock); 1232 1233 error = native_to_linux_itimerspec(&lots, &ots); 1234 if (error == 0) 1235 error = copyout(&lots, args->old_value, sizeof(lots)); 1236 1237 out: 1238 fdrop(fp, td); 1239 return (error); 1240 } 1241 1242 int 1243 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) 1244 { 1245 struct l_itimerspec lots; 1246 struct itimerspec nts, ots; 1247 struct timespec cts, ts; 1248 struct timerfd *tfd; 1249 struct timeval tv; 1250 struct file *fp; 1251 int error; 1252 1253 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) 1254 return (EINVAL); 1255 1256 error = copyin(args->new_value, &lots, sizeof(lots)); 1257 if (error != 0) 1258 return (error); 1259 error = linux_to_native_itimerspec(&nts, &lots); 1260 if (error != 0) 1261 return (error); 1262 1263 error = fget(td, args->fd, &cap_write_rights, &fp); 1264 if (error != 0) 1265 return (error); 1266 tfd = fp->f_data; 1267 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { 1268 error = EINVAL; 1269 goto out; 1270 } 1271 1272 mtx_lock(&tfd->tfd_lock); 1273 if (!timespecisset(&nts.it_value)) 1274 timespecclear(&nts.it_interval); 1275 if (args->old_value != NULL) 1276 linux_timerfd_curval(tfd, &ots); 1277 1278 tfd->tfd_time = nts; 1279 if (timespecisset(&nts.it_value)) { 1280 linux_timerfd_clocktime(tfd, &cts); 1281 ts = nts.it_value; 1282 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { 1283 timespecadd(&tfd->tfd_time.it_value, &cts, 1284 &tfd->tfd_time.it_value); 1285 } else { 1286 timespecsub(&ts, &cts, &ts); 1287 } 1288 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1289 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1290 linux_timerfd_expire, tfd); 1291 tfd->tfd_canceled = false; 1292 } else { 1293 tfd->tfd_canceled = true; 1294 callout_stop(&tfd->tfd_callout); 1295 } 1296 mtx_unlock(&tfd->tfd_lock); 1297 1298 if (args->old_value != NULL) { 1299 error = native_to_linux_itimerspec(&lots, &ots); 1300 if (error == 0) 1301 error = copyout(&lots, args->old_value, sizeof(lots)); 1302 } 1303 1304 out: 1305 fdrop(fp, td); 1306 return (error); 1307 } 1308 1309 static void 1310 linux_timerfd_expire(void *arg) 1311 { 1312 struct timespec cts, ts; 1313 struct timeval tv; 1314 struct timerfd *tfd; 1315 1316 tfd = (struct timerfd *)arg; 1317 1318 linux_timerfd_clocktime(tfd, &cts); 1319 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { 1320 if (timespecisset(&tfd->tfd_time.it_interval)) 1321 timespecadd(&tfd->tfd_time.it_value, 1322 &tfd->tfd_time.it_interval, 1323 &tfd->tfd_time.it_value); 1324 else 1325 /* single shot timer */ 1326 timespecclear(&tfd->tfd_time.it_value); 1327 if (timespecisset(&tfd->tfd_time.it_value)) { 1328 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1329 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1330 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1331 linux_timerfd_expire, tfd); 1332 } 1333 tfd->tfd_count++; 1334 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); 1335 selwakeup(&tfd->tfd_sel); 1336 wakeup(&tfd->tfd_count); 1337 } else if (timespecisset(&tfd->tfd_time.it_value)) { 1338 timespecsub(&tfd->tfd_time.it_value, &cts, &ts); 1339 TIMESPEC_TO_TIMEVAL(&tv, &ts); 1340 callout_reset(&tfd->tfd_callout, tvtohz(&tv), 1341 linux_timerfd_expire, tfd); 1342 } 1343 } 1344