1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_capsicum.h" 41 #include "opt_compat.h" 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/sysproto.h> 47 #include <sys/capability.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/lock.h> 53 #include <sys/proc.h> 54 #include <sys/signalvar.h> 55 #include <sys/socketvar.h> 56 #include <sys/uio.h> 57 #include <sys/kernel.h> 58 #include <sys/ktr.h> 59 #include <sys/limits.h> 60 #include <sys/malloc.h> 61 #include <sys/poll.h> 62 #include <sys/resourcevar.h> 63 #include <sys/selinfo.h> 64 #include <sys/sleepqueue.h> 65 #include <sys/syscallsubr.h> 66 #include <sys/sysctl.h> 67 #include <sys/sysent.h> 68 #include <sys/vnode.h> 69 #include <sys/bio.h> 70 #include <sys/buf.h> 71 #include <sys/condvar.h> 72 #ifdef KTRACE 73 #include <sys/ktrace.h> 74 #endif 75 76 #include <security/audit/audit.h> 77 78 int iosize_max_clamp = 1; 79 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, 80 &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); 81 /* 82 * Assert that the return value of read(2) and write(2) syscalls fits 83 * into a register. If not, an architecture will need to provide the 84 * usermode wrappers to reconstruct the result. 85 */ 86 CTASSERT(sizeof(register_t) >= sizeof(size_t)); 87 88 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 89 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 90 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 91 92 static int pollout(struct thread *, struct pollfd *, struct pollfd *, 93 u_int); 94 static int pollscan(struct thread *, struct pollfd *, u_int); 95 static int pollrescan(struct thread *); 96 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 97 static int selrescan(struct thread *, fd_mask **, fd_mask **); 98 static void selfdalloc(struct thread *, void *); 99 static void selfdfree(struct seltd *, struct selfd *); 100 static int dofileread(struct thread *, int, struct file *, struct uio *, 101 off_t, int); 102 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 103 off_t, int); 104 static void doselwakeup(struct selinfo *, int); 105 static void seltdinit(struct thread *); 106 static int seltdwait(struct thread *, sbintime_t, sbintime_t); 107 static void seltdclear(struct thread *); 108 109 /* 110 * One seltd per-thread allocated on demand as needed. 111 * 112 * t - protected by st_mtx 113 * k - Only accessed by curthread or read-only 114 */ 115 struct seltd { 116 STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ 117 struct selfd *st_free1; /* (k) free fd for read set. */ 118 struct selfd *st_free2; /* (k) free fd for write set. */ 119 struct mtx st_mtx; /* Protects struct seltd */ 120 struct cv st_wait; /* (t) Wait channel. */ 121 int st_flags; /* (t) SELTD_ flags. */ 122 }; 123 124 #define SELTD_PENDING 0x0001 /* We have pending events. */ 125 #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ 126 127 /* 128 * One selfd allocated per-thread per-file-descriptor. 129 * f - protected by sf_mtx 130 */ 131 struct selfd { 132 STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ 133 TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ 134 struct selinfo *sf_si; /* (f) selinfo when linked. */ 135 struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ 136 struct seltd *sf_td; /* (k) owning seltd. */ 137 void *sf_cookie; /* (k) fd or pollfd. */ 138 }; 139 140 static uma_zone_t selfd_zone; 141 static struct mtx_pool *mtxpool_select; 142 143 #ifndef _SYS_SYSPROTO_H_ 144 struct read_args { 145 int fd; 146 void *buf; 147 size_t nbyte; 148 }; 149 #endif 150 int 151 sys_read(td, uap) 152 struct thread *td; 153 struct read_args *uap; 154 { 155 struct uio auio; 156 struct iovec aiov; 157 int error; 158 159 if (uap->nbyte > IOSIZE_MAX) 160 return (EINVAL); 161 aiov.iov_base = uap->buf; 162 aiov.iov_len = uap->nbyte; 163 auio.uio_iov = &aiov; 164 auio.uio_iovcnt = 1; 165 auio.uio_resid = uap->nbyte; 166 auio.uio_segflg = UIO_USERSPACE; 167 error = kern_readv(td, uap->fd, &auio); 168 return(error); 169 } 170 171 /* 172 * Positioned read system call 173 */ 174 #ifndef _SYS_SYSPROTO_H_ 175 struct pread_args { 176 int fd; 177 void *buf; 178 size_t nbyte; 179 int pad; 180 off_t offset; 181 }; 182 #endif 183 int 184 sys_pread(td, uap) 185 struct thread *td; 186 struct pread_args *uap; 187 { 188 struct uio auio; 189 struct iovec aiov; 190 int error; 191 192 if (uap->nbyte > IOSIZE_MAX) 193 return (EINVAL); 194 aiov.iov_base = uap->buf; 195 aiov.iov_len = uap->nbyte; 196 auio.uio_iov = &aiov; 197 auio.uio_iovcnt = 1; 198 auio.uio_resid = uap->nbyte; 199 auio.uio_segflg = UIO_USERSPACE; 200 error = kern_preadv(td, uap->fd, &auio, uap->offset); 201 return(error); 202 } 203 204 int 205 freebsd6_pread(td, uap) 206 struct thread *td; 207 struct freebsd6_pread_args *uap; 208 { 209 struct pread_args oargs; 210 211 oargs.fd = uap->fd; 212 oargs.buf = uap->buf; 213 oargs.nbyte = uap->nbyte; 214 oargs.offset = uap->offset; 215 return (sys_pread(td, &oargs)); 216 } 217 218 /* 219 * Scatter read system call. 220 */ 221 #ifndef _SYS_SYSPROTO_H_ 222 struct readv_args { 223 int fd; 224 struct iovec *iovp; 225 u_int iovcnt; 226 }; 227 #endif 228 int 229 sys_readv(struct thread *td, struct readv_args *uap) 230 { 231 struct uio *auio; 232 int error; 233 234 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 235 if (error) 236 return (error); 237 error = kern_readv(td, uap->fd, auio); 238 free(auio, M_IOV); 239 return (error); 240 } 241 242 int 243 kern_readv(struct thread *td, int fd, struct uio *auio) 244 { 245 struct file *fp; 246 cap_rights_t rights; 247 int error; 248 249 error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp); 250 if (error) 251 return (error); 252 error = dofileread(td, fd, fp, auio, (off_t)-1, 0); 253 fdrop(fp, td); 254 return (error); 255 } 256 257 /* 258 * Scatter positioned read system call. 259 */ 260 #ifndef _SYS_SYSPROTO_H_ 261 struct preadv_args { 262 int fd; 263 struct iovec *iovp; 264 u_int iovcnt; 265 off_t offset; 266 }; 267 #endif 268 int 269 sys_preadv(struct thread *td, struct preadv_args *uap) 270 { 271 struct uio *auio; 272 int error; 273 274 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 275 if (error) 276 return (error); 277 error = kern_preadv(td, uap->fd, auio, uap->offset); 278 free(auio, M_IOV); 279 return (error); 280 } 281 282 int 283 kern_preadv(td, fd, auio, offset) 284 struct thread *td; 285 int fd; 286 struct uio *auio; 287 off_t offset; 288 { 289 struct file *fp; 290 cap_rights_t rights; 291 int error; 292 293 error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp); 294 if (error) 295 return (error); 296 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 297 error = ESPIPE; 298 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 299 error = EINVAL; 300 else 301 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); 302 fdrop(fp, td); 303 return (error); 304 } 305 306 /* 307 * Common code for readv and preadv that reads data in 308 * from a file using the passed in uio, offset, and flags. 309 */ 310 static int 311 dofileread(td, fd, fp, auio, offset, flags) 312 struct thread *td; 313 int fd; 314 struct file *fp; 315 struct uio *auio; 316 off_t offset; 317 int flags; 318 { 319 ssize_t cnt; 320 int error; 321 #ifdef KTRACE 322 struct uio *ktruio = NULL; 323 #endif 324 325 /* Finish zero length reads right here */ 326 if (auio->uio_resid == 0) { 327 td->td_retval[0] = 0; 328 return(0); 329 } 330 auio->uio_rw = UIO_READ; 331 auio->uio_offset = offset; 332 auio->uio_td = td; 333 #ifdef KTRACE 334 if (KTRPOINT(td, KTR_GENIO)) 335 ktruio = cloneuio(auio); 336 #endif 337 cnt = auio->uio_resid; 338 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 339 if (auio->uio_resid != cnt && (error == ERESTART || 340 error == EINTR || error == EWOULDBLOCK)) 341 error = 0; 342 } 343 cnt -= auio->uio_resid; 344 #ifdef KTRACE 345 if (ktruio != NULL) { 346 ktruio->uio_resid = cnt; 347 ktrgenio(fd, UIO_READ, ktruio, error); 348 } 349 #endif 350 td->td_retval[0] = cnt; 351 return (error); 352 } 353 354 #ifndef _SYS_SYSPROTO_H_ 355 struct write_args { 356 int fd; 357 const void *buf; 358 size_t nbyte; 359 }; 360 #endif 361 int 362 sys_write(td, uap) 363 struct thread *td; 364 struct write_args *uap; 365 { 366 struct uio auio; 367 struct iovec aiov; 368 int error; 369 370 if (uap->nbyte > IOSIZE_MAX) 371 return (EINVAL); 372 aiov.iov_base = (void *)(uintptr_t)uap->buf; 373 aiov.iov_len = uap->nbyte; 374 auio.uio_iov = &aiov; 375 auio.uio_iovcnt = 1; 376 auio.uio_resid = uap->nbyte; 377 auio.uio_segflg = UIO_USERSPACE; 378 error = kern_writev(td, uap->fd, &auio); 379 return(error); 380 } 381 382 /* 383 * Positioned write system call. 384 */ 385 #ifndef _SYS_SYSPROTO_H_ 386 struct pwrite_args { 387 int fd; 388 const void *buf; 389 size_t nbyte; 390 int pad; 391 off_t offset; 392 }; 393 #endif 394 int 395 sys_pwrite(td, uap) 396 struct thread *td; 397 struct pwrite_args *uap; 398 { 399 struct uio auio; 400 struct iovec aiov; 401 int error; 402 403 if (uap->nbyte > IOSIZE_MAX) 404 return (EINVAL); 405 aiov.iov_base = (void *)(uintptr_t)uap->buf; 406 aiov.iov_len = uap->nbyte; 407 auio.uio_iov = &aiov; 408 auio.uio_iovcnt = 1; 409 auio.uio_resid = uap->nbyte; 410 auio.uio_segflg = UIO_USERSPACE; 411 error = kern_pwritev(td, uap->fd, &auio, uap->offset); 412 return(error); 413 } 414 415 int 416 freebsd6_pwrite(td, uap) 417 struct thread *td; 418 struct freebsd6_pwrite_args *uap; 419 { 420 struct pwrite_args oargs; 421 422 oargs.fd = uap->fd; 423 oargs.buf = uap->buf; 424 oargs.nbyte = uap->nbyte; 425 oargs.offset = uap->offset; 426 return (sys_pwrite(td, &oargs)); 427 } 428 429 /* 430 * Gather write system call. 431 */ 432 #ifndef _SYS_SYSPROTO_H_ 433 struct writev_args { 434 int fd; 435 struct iovec *iovp; 436 u_int iovcnt; 437 }; 438 #endif 439 int 440 sys_writev(struct thread *td, struct writev_args *uap) 441 { 442 struct uio *auio; 443 int error; 444 445 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 446 if (error) 447 return (error); 448 error = kern_writev(td, uap->fd, auio); 449 free(auio, M_IOV); 450 return (error); 451 } 452 453 int 454 kern_writev(struct thread *td, int fd, struct uio *auio) 455 { 456 struct file *fp; 457 cap_rights_t rights; 458 int error; 459 460 error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp); 461 if (error) 462 return (error); 463 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 464 fdrop(fp, td); 465 return (error); 466 } 467 468 /* 469 * Gather positioned write system call. 470 */ 471 #ifndef _SYS_SYSPROTO_H_ 472 struct pwritev_args { 473 int fd; 474 struct iovec *iovp; 475 u_int iovcnt; 476 off_t offset; 477 }; 478 #endif 479 int 480 sys_pwritev(struct thread *td, struct pwritev_args *uap) 481 { 482 struct uio *auio; 483 int error; 484 485 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 486 if (error) 487 return (error); 488 error = kern_pwritev(td, uap->fd, auio, uap->offset); 489 free(auio, M_IOV); 490 return (error); 491 } 492 493 int 494 kern_pwritev(td, fd, auio, offset) 495 struct thread *td; 496 struct uio *auio; 497 int fd; 498 off_t offset; 499 { 500 struct file *fp; 501 cap_rights_t rights; 502 int error; 503 504 error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); 505 if (error) 506 return (error); 507 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 508 error = ESPIPE; 509 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 510 error = EINVAL; 511 else 512 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 513 fdrop(fp, td); 514 return (error); 515 } 516 517 /* 518 * Common code for writev and pwritev that writes data to 519 * a file using the passed in uio, offset, and flags. 520 */ 521 static int 522 dofilewrite(td, fd, fp, auio, offset, flags) 523 struct thread *td; 524 int fd; 525 struct file *fp; 526 struct uio *auio; 527 off_t offset; 528 int flags; 529 { 530 ssize_t cnt; 531 int error; 532 #ifdef KTRACE 533 struct uio *ktruio = NULL; 534 #endif 535 536 auio->uio_rw = UIO_WRITE; 537 auio->uio_td = td; 538 auio->uio_offset = offset; 539 #ifdef KTRACE 540 if (KTRPOINT(td, KTR_GENIO)) 541 ktruio = cloneuio(auio); 542 #endif 543 cnt = auio->uio_resid; 544 if (fp->f_type == DTYPE_VNODE && 545 (fp->f_vnread_flags & FDEVFS_VNODE) == 0) 546 bwillwrite(); 547 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { 548 if (auio->uio_resid != cnt && (error == ERESTART || 549 error == EINTR || error == EWOULDBLOCK)) 550 error = 0; 551 /* Socket layer is responsible for issuing SIGPIPE. */ 552 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { 553 PROC_LOCK(td->td_proc); 554 tdsignal(td, SIGPIPE); 555 PROC_UNLOCK(td->td_proc); 556 } 557 } 558 cnt -= auio->uio_resid; 559 #ifdef KTRACE 560 if (ktruio != NULL) { 561 ktruio->uio_resid = cnt; 562 ktrgenio(fd, UIO_WRITE, ktruio, error); 563 } 564 #endif 565 td->td_retval[0] = cnt; 566 return (error); 567 } 568 569 /* 570 * Truncate a file given a file descriptor. 571 * 572 * Can't use fget_write() here, since must return EINVAL and not EBADF if the 573 * descriptor isn't writable. 574 */ 575 int 576 kern_ftruncate(td, fd, length) 577 struct thread *td; 578 int fd; 579 off_t length; 580 { 581 struct file *fp; 582 cap_rights_t rights; 583 int error; 584 585 AUDIT_ARG_FD(fd); 586 if (length < 0) 587 return (EINVAL); 588 error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp); 589 if (error) 590 return (error); 591 AUDIT_ARG_FILE(td->td_proc, fp); 592 if (!(fp->f_flag & FWRITE)) { 593 fdrop(fp, td); 594 return (EINVAL); 595 } 596 error = fo_truncate(fp, length, td->td_ucred, td); 597 fdrop(fp, td); 598 return (error); 599 } 600 601 #ifndef _SYS_SYSPROTO_H_ 602 struct ftruncate_args { 603 int fd; 604 int pad; 605 off_t length; 606 }; 607 #endif 608 int 609 sys_ftruncate(td, uap) 610 struct thread *td; 611 struct ftruncate_args *uap; 612 { 613 614 return (kern_ftruncate(td, uap->fd, uap->length)); 615 } 616 617 #if defined(COMPAT_43) 618 #ifndef _SYS_SYSPROTO_H_ 619 struct oftruncate_args { 620 int fd; 621 long length; 622 }; 623 #endif 624 int 625 oftruncate(td, uap) 626 struct thread *td; 627 struct oftruncate_args *uap; 628 { 629 630 return (kern_ftruncate(td, uap->fd, uap->length)); 631 } 632 #endif /* COMPAT_43 */ 633 634 #ifndef _SYS_SYSPROTO_H_ 635 struct ioctl_args { 636 int fd; 637 u_long com; 638 caddr_t data; 639 }; 640 #endif 641 /* ARGSUSED */ 642 int 643 sys_ioctl(struct thread *td, struct ioctl_args *uap) 644 { 645 u_long com; 646 int arg, error; 647 u_int size; 648 caddr_t data; 649 650 if (uap->com > 0xffffffff) { 651 printf( 652 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 653 td->td_proc->p_pid, td->td_name, uap->com); 654 uap->com &= 0xffffffff; 655 } 656 com = uap->com; 657 658 /* 659 * Interpret high order word to find amount of data to be 660 * copied to/from the user's address space. 661 */ 662 size = IOCPARM_LEN(com); 663 if ((size > IOCPARM_MAX) || 664 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 665 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 666 ((com & IOC_OUT) && size == 0) || 667 #else 668 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 669 #endif 670 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 671 return (ENOTTY); 672 673 if (size > 0) { 674 if (com & IOC_VOID) { 675 /* Integer argument. */ 676 arg = (intptr_t)uap->data; 677 data = (void *)&arg; 678 size = 0; 679 } else 680 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 681 } else 682 data = (void *)&uap->data; 683 if (com & IOC_IN) { 684 error = copyin(uap->data, data, (u_int)size); 685 if (error) { 686 if (size > 0) 687 free(data, M_IOCTLOPS); 688 return (error); 689 } 690 } else if (com & IOC_OUT) { 691 /* 692 * Zero the buffer so the user always 693 * gets back something deterministic. 694 */ 695 bzero(data, size); 696 } 697 698 error = kern_ioctl(td, uap->fd, com, data); 699 700 if (error == 0 && (com & IOC_OUT)) 701 error = copyout(data, uap->data, (u_int)size); 702 703 if (size > 0) 704 free(data, M_IOCTLOPS); 705 return (error); 706 } 707 708 int 709 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 710 { 711 struct file *fp; 712 struct filedesc *fdp; 713 #ifndef CAPABILITIES 714 cap_rights_t rights; 715 #endif 716 int error, tmp, locked; 717 718 AUDIT_ARG_FD(fd); 719 AUDIT_ARG_CMD(com); 720 721 fdp = td->td_proc->p_fd; 722 723 switch (com) { 724 case FIONCLEX: 725 case FIOCLEX: 726 FILEDESC_XLOCK(fdp); 727 locked = LA_XLOCKED; 728 break; 729 default: 730 #ifdef CAPABILITIES 731 FILEDESC_SLOCK(fdp); 732 locked = LA_SLOCKED; 733 #else 734 locked = LA_UNLOCKED; 735 #endif 736 break; 737 } 738 739 #ifdef CAPABILITIES 740 if ((fp = fget_locked(fdp, fd)) == NULL) { 741 error = EBADF; 742 goto out; 743 } 744 if ((error = cap_ioctl_check(fdp, fd, com)) != 0) { 745 fp = NULL; /* fhold() was not called yet */ 746 goto out; 747 } 748 fhold(fp); 749 if (locked == LA_SLOCKED) { 750 FILEDESC_SUNLOCK(fdp); 751 locked = LA_UNLOCKED; 752 } 753 #else 754 error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp); 755 if (error != 0) { 756 fp = NULL; 757 goto out; 758 } 759 #endif 760 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 761 error = EBADF; 762 goto out; 763 } 764 765 switch (com) { 766 case FIONCLEX: 767 fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; 768 goto out; 769 case FIOCLEX: 770 fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; 771 goto out; 772 case FIONBIO: 773 if ((tmp = *(int *)data)) 774 atomic_set_int(&fp->f_flag, FNONBLOCK); 775 else 776 atomic_clear_int(&fp->f_flag, FNONBLOCK); 777 data = (void *)&tmp; 778 break; 779 case FIOASYNC: 780 if ((tmp = *(int *)data)) 781 atomic_set_int(&fp->f_flag, FASYNC); 782 else 783 atomic_clear_int(&fp->f_flag, FASYNC); 784 data = (void *)&tmp; 785 break; 786 } 787 788 error = fo_ioctl(fp, com, data, td->td_ucred, td); 789 out: 790 switch (locked) { 791 case LA_XLOCKED: 792 FILEDESC_XUNLOCK(fdp); 793 break; 794 #ifdef CAPABILITIES 795 case LA_SLOCKED: 796 FILEDESC_SUNLOCK(fdp); 797 break; 798 #endif 799 default: 800 FILEDESC_UNLOCK_ASSERT(fdp); 801 break; 802 } 803 if (fp != NULL) 804 fdrop(fp, td); 805 return (error); 806 } 807 808 int 809 poll_no_poll(int events) 810 { 811 /* 812 * Return true for read/write. If the user asked for something 813 * special, return POLLNVAL, so that clients have a way of 814 * determining reliably whether or not the extended 815 * functionality is present without hard-coding knowledge 816 * of specific filesystem implementations. 817 */ 818 if (events & ~POLLSTANDARD) 819 return (POLLNVAL); 820 821 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 822 } 823 824 int 825 sys_pselect(struct thread *td, struct pselect_args *uap) 826 { 827 struct timespec ts; 828 struct timeval tv, *tvp; 829 sigset_t set, *uset; 830 int error; 831 832 if (uap->ts != NULL) { 833 error = copyin(uap->ts, &ts, sizeof(ts)); 834 if (error != 0) 835 return (error); 836 TIMESPEC_TO_TIMEVAL(&tv, &ts); 837 tvp = &tv; 838 } else 839 tvp = NULL; 840 if (uap->sm != NULL) { 841 error = copyin(uap->sm, &set, sizeof(set)); 842 if (error != 0) 843 return (error); 844 uset = &set; 845 } else 846 uset = NULL; 847 return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 848 uset, NFDBITS)); 849 } 850 851 int 852 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, 853 struct timeval *tvp, sigset_t *uset, int abi_nfdbits) 854 { 855 int error; 856 857 if (uset != NULL) { 858 error = kern_sigprocmask(td, SIG_SETMASK, uset, 859 &td->td_oldsigmask, 0); 860 if (error != 0) 861 return (error); 862 td->td_pflags |= TDP_OLDMASK; 863 /* 864 * Make sure that ast() is called on return to 865 * usermode and TDP_OLDMASK is cleared, restoring old 866 * sigmask. 867 */ 868 thread_lock(td); 869 td->td_flags |= TDF_ASTPENDING; 870 thread_unlock(td); 871 } 872 error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); 873 return (error); 874 } 875 876 #ifndef _SYS_SYSPROTO_H_ 877 struct select_args { 878 int nd; 879 fd_set *in, *ou, *ex; 880 struct timeval *tv; 881 }; 882 #endif 883 int 884 sys_select(struct thread *td, struct select_args *uap) 885 { 886 struct timeval tv, *tvp; 887 int error; 888 889 if (uap->tv != NULL) { 890 error = copyin(uap->tv, &tv, sizeof(tv)); 891 if (error) 892 return (error); 893 tvp = &tv; 894 } else 895 tvp = NULL; 896 897 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 898 NFDBITS)); 899 } 900 901 /* 902 * In the unlikely case when user specified n greater then the last 903 * open file descriptor, check that no bits are set after the last 904 * valid fd. We must return EBADF if any is set. 905 * 906 * There are applications that rely on the behaviour. 907 * 908 * nd is fd_lastfile + 1. 909 */ 910 static int 911 select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) 912 { 913 char *addr, *oaddr; 914 int b, i, res; 915 uint8_t bits; 916 917 if (nd >= ndu || fd_in == NULL) 918 return (0); 919 920 oaddr = NULL; 921 bits = 0; /* silence gcc */ 922 for (i = nd; i < ndu; i++) { 923 b = i / NBBY; 924 #if BYTE_ORDER == LITTLE_ENDIAN 925 addr = (char *)fd_in + b; 926 #else 927 addr = (char *)fd_in; 928 if (abi_nfdbits == NFDBITS) { 929 addr += rounddown(b, sizeof(fd_mask)) + 930 sizeof(fd_mask) - 1 - b % sizeof(fd_mask); 931 } else { 932 addr += rounddown(b, sizeof(uint32_t)) + 933 sizeof(uint32_t) - 1 - b % sizeof(uint32_t); 934 } 935 #endif 936 if (addr != oaddr) { 937 res = fubyte(addr); 938 if (res == -1) 939 return (EFAULT); 940 oaddr = addr; 941 bits = res; 942 } 943 if ((bits & (1 << (i % NBBY))) != 0) 944 return (EBADF); 945 } 946 return (0); 947 } 948 949 int 950 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 951 fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) 952 { 953 struct filedesc *fdp; 954 /* 955 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 956 * infds with the new FD_SETSIZE of 1024, and more than enough for 957 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 958 * of 256. 959 */ 960 fd_mask s_selbits[howmany(2048, NFDBITS)]; 961 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 962 struct timeval rtv; 963 sbintime_t asbt, precision, rsbt; 964 u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; 965 int error, lf, ndu; 966 967 if (nd < 0) 968 return (EINVAL); 969 fdp = td->td_proc->p_fd; 970 ndu = nd; 971 lf = fdp->fd_lastfile; 972 if (nd > lf + 1) 973 nd = lf + 1; 974 975 error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); 976 if (error != 0) 977 return (error); 978 error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); 979 if (error != 0) 980 return (error); 981 error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); 982 if (error != 0) 983 return (error); 984 985 /* 986 * Allocate just enough bits for the non-null fd_sets. Use the 987 * preallocated auto buffer if possible. 988 */ 989 nfdbits = roundup(nd, NFDBITS); 990 ncpbytes = nfdbits / NBBY; 991 ncpubytes = roundup(nd, abi_nfdbits) / NBBY; 992 nbufbytes = 0; 993 if (fd_in != NULL) 994 nbufbytes += 2 * ncpbytes; 995 if (fd_ou != NULL) 996 nbufbytes += 2 * ncpbytes; 997 if (fd_ex != NULL) 998 nbufbytes += 2 * ncpbytes; 999 if (nbufbytes <= sizeof s_selbits) 1000 selbits = &s_selbits[0]; 1001 else 1002 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 1003 1004 /* 1005 * Assign pointers into the bit buffers and fetch the input bits. 1006 * Put the output buffers together so that they can be bzeroed 1007 * together. 1008 */ 1009 sbp = selbits; 1010 #define getbits(name, x) \ 1011 do { \ 1012 if (name == NULL) { \ 1013 ibits[x] = NULL; \ 1014 obits[x] = NULL; \ 1015 } else { \ 1016 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 1017 obits[x] = sbp; \ 1018 sbp += ncpbytes / sizeof *sbp; \ 1019 error = copyin(name, ibits[x], ncpubytes); \ 1020 if (error != 0) \ 1021 goto done; \ 1022 bzero((char *)ibits[x] + ncpubytes, \ 1023 ncpbytes - ncpubytes); \ 1024 } \ 1025 } while (0) 1026 getbits(fd_in, 0); 1027 getbits(fd_ou, 1); 1028 getbits(fd_ex, 2); 1029 #undef getbits 1030 1031 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) 1032 /* 1033 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, 1034 * we are running under 32-bit emulation. This should be more 1035 * generic. 1036 */ 1037 #define swizzle_fdset(bits) \ 1038 if (abi_nfdbits != NFDBITS && bits != NULL) { \ 1039 int i; \ 1040 for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ 1041 bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ 1042 } 1043 #else 1044 #define swizzle_fdset(bits) 1045 #endif 1046 1047 /* Make sure the bit order makes it through an ABI transition */ 1048 swizzle_fdset(ibits[0]); 1049 swizzle_fdset(ibits[1]); 1050 swizzle_fdset(ibits[2]); 1051 1052 if (nbufbytes != 0) 1053 bzero(selbits, nbufbytes / 2); 1054 1055 precision = 0; 1056 if (tvp != NULL) { 1057 rtv = *tvp; 1058 if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 1059 rtv.tv_usec >= 1000000) { 1060 error = EINVAL; 1061 goto done; 1062 } 1063 if (!timevalisset(&rtv)) 1064 asbt = 0; 1065 else if (rtv.tv_sec <= INT32_MAX) { 1066 rsbt = tvtosbt(rtv); 1067 precision = rsbt; 1068 precision >>= tc_precexp; 1069 if (TIMESEL(&asbt, rsbt)) 1070 asbt += tc_tick_sbt; 1071 if (asbt <= INT64_MAX - rsbt) 1072 asbt += rsbt; 1073 else 1074 asbt = -1; 1075 } else 1076 asbt = -1; 1077 } else 1078 asbt = -1; 1079 seltdinit(td); 1080 /* Iterate until the timeout expires or descriptors become ready. */ 1081 for (;;) { 1082 error = selscan(td, ibits, obits, nd); 1083 if (error || td->td_retval[0] != 0) 1084 break; 1085 error = seltdwait(td, asbt, precision); 1086 if (error) 1087 break; 1088 error = selrescan(td, ibits, obits); 1089 if (error || td->td_retval[0] != 0) 1090 break; 1091 } 1092 seltdclear(td); 1093 1094 done: 1095 /* select is not restarted after signals... */ 1096 if (error == ERESTART) 1097 error = EINTR; 1098 if (error == EWOULDBLOCK) 1099 error = 0; 1100 1101 /* swizzle bit order back, if necessary */ 1102 swizzle_fdset(obits[0]); 1103 swizzle_fdset(obits[1]); 1104 swizzle_fdset(obits[2]); 1105 #undef swizzle_fdset 1106 1107 #define putbits(name, x) \ 1108 if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ 1109 error = error2; 1110 if (error == 0) { 1111 int error2; 1112 1113 putbits(fd_in, 0); 1114 putbits(fd_ou, 1); 1115 putbits(fd_ex, 2); 1116 #undef putbits 1117 } 1118 if (selbits != &s_selbits[0]) 1119 free(selbits, M_SELECT); 1120 1121 return (error); 1122 } 1123 /* 1124 * Convert a select bit set to poll flags. 1125 * 1126 * The backend always returns POLLHUP/POLLERR if appropriate and we 1127 * return this as a set bit in any set. 1128 */ 1129 static int select_flags[3] = { 1130 POLLRDNORM | POLLHUP | POLLERR, 1131 POLLWRNORM | POLLHUP | POLLERR, 1132 POLLRDBAND | POLLERR 1133 }; 1134 1135 /* 1136 * Compute the fo_poll flags required for a fd given by the index and 1137 * bit position in the fd_mask array. 1138 */ 1139 static __inline int 1140 selflags(fd_mask **ibits, int idx, fd_mask bit) 1141 { 1142 int flags; 1143 int msk; 1144 1145 flags = 0; 1146 for (msk = 0; msk < 3; msk++) { 1147 if (ibits[msk] == NULL) 1148 continue; 1149 if ((ibits[msk][idx] & bit) == 0) 1150 continue; 1151 flags |= select_flags[msk]; 1152 } 1153 return (flags); 1154 } 1155 1156 /* 1157 * Set the appropriate output bits given a mask of fired events and the 1158 * input bits originally requested. 1159 */ 1160 static __inline int 1161 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) 1162 { 1163 int msk; 1164 int n; 1165 1166 n = 0; 1167 for (msk = 0; msk < 3; msk++) { 1168 if ((events & select_flags[msk]) == 0) 1169 continue; 1170 if (ibits[msk] == NULL) 1171 continue; 1172 if ((ibits[msk][idx] & bit) == 0) 1173 continue; 1174 /* 1175 * XXX Check for a duplicate set. This can occur because a 1176 * socket calls selrecord() twice for each poll() call 1177 * resulting in two selfds per real fd. selrescan() will 1178 * call selsetbits twice as a result. 1179 */ 1180 if ((obits[msk][idx] & bit) != 0) 1181 continue; 1182 obits[msk][idx] |= bit; 1183 n++; 1184 } 1185 1186 return (n); 1187 } 1188 1189 static __inline int 1190 getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp) 1191 { 1192 cap_rights_t rights; 1193 1194 return (fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_POLL_EVENT), 1195 0, fpp, NULL)); 1196 } 1197 1198 /* 1199 * Traverse the list of fds attached to this thread's seltd and check for 1200 * completion. 1201 */ 1202 static int 1203 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) 1204 { 1205 struct filedesc *fdp; 1206 struct selinfo *si; 1207 struct seltd *stp; 1208 struct selfd *sfp; 1209 struct selfd *sfn; 1210 struct file *fp; 1211 fd_mask bit; 1212 int fd, ev, n, idx; 1213 int error; 1214 1215 fdp = td->td_proc->p_fd; 1216 stp = td->td_sel; 1217 n = 0; 1218 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1219 fd = (int)(uintptr_t)sfp->sf_cookie; 1220 si = sfp->sf_si; 1221 selfdfree(stp, sfp); 1222 /* If the selinfo wasn't cleared the event didn't fire. */ 1223 if (si != NULL) 1224 continue; 1225 error = getselfd_cap(fdp, fd, &fp); 1226 if (error) 1227 return (error); 1228 idx = fd / NFDBITS; 1229 bit = (fd_mask)1 << (fd % NFDBITS); 1230 ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); 1231 fdrop(fp, td); 1232 if (ev != 0) 1233 n += selsetbits(ibits, obits, idx, bit, ev); 1234 } 1235 stp->st_flags = 0; 1236 td->td_retval[0] = n; 1237 return (0); 1238 } 1239 1240 /* 1241 * Perform the initial filedescriptor scan and register ourselves with 1242 * each selinfo. 1243 */ 1244 static int 1245 selscan(td, ibits, obits, nfd) 1246 struct thread *td; 1247 fd_mask **ibits, **obits; 1248 int nfd; 1249 { 1250 struct filedesc *fdp; 1251 struct file *fp; 1252 fd_mask bit; 1253 int ev, flags, end, fd; 1254 int n, idx; 1255 int error; 1256 1257 fdp = td->td_proc->p_fd; 1258 n = 0; 1259 for (idx = 0, fd = 0; fd < nfd; idx++) { 1260 end = imin(fd + NFDBITS, nfd); 1261 for (bit = 1; fd < end; bit <<= 1, fd++) { 1262 /* Compute the list of events we're interested in. */ 1263 flags = selflags(ibits, idx, bit); 1264 if (flags == 0) 1265 continue; 1266 error = getselfd_cap(fdp, fd, &fp); 1267 if (error) 1268 return (error); 1269 selfdalloc(td, (void *)(uintptr_t)fd); 1270 ev = fo_poll(fp, flags, td->td_ucred, td); 1271 fdrop(fp, td); 1272 if (ev != 0) 1273 n += selsetbits(ibits, obits, idx, bit, ev); 1274 } 1275 } 1276 1277 td->td_retval[0] = n; 1278 return (0); 1279 } 1280 1281 #ifndef _SYS_SYSPROTO_H_ 1282 struct poll_args { 1283 struct pollfd *fds; 1284 u_int nfds; 1285 int timeout; 1286 }; 1287 #endif 1288 int 1289 sys_poll(td, uap) 1290 struct thread *td; 1291 struct poll_args *uap; 1292 { 1293 struct pollfd *bits; 1294 struct pollfd smallbits[32]; 1295 sbintime_t asbt, precision, rsbt; 1296 u_int nfds; 1297 int error; 1298 size_t ni; 1299 1300 nfds = uap->nfds; 1301 if (nfds > maxfilesperproc && nfds > FD_SETSIZE) 1302 return (EINVAL); 1303 ni = nfds * sizeof(struct pollfd); 1304 if (ni > sizeof(smallbits)) 1305 bits = malloc(ni, M_TEMP, M_WAITOK); 1306 else 1307 bits = smallbits; 1308 error = copyin(uap->fds, bits, ni); 1309 if (error) 1310 goto done; 1311 precision = 0; 1312 if (uap->timeout != INFTIM) { 1313 if (uap->timeout < 0) { 1314 error = EINVAL; 1315 goto done; 1316 } 1317 if (uap->timeout == 0) 1318 asbt = 0; 1319 else { 1320 rsbt = SBT_1MS * uap->timeout; 1321 precision = rsbt; 1322 precision >>= tc_precexp; 1323 if (TIMESEL(&asbt, rsbt)) 1324 asbt += tc_tick_sbt; 1325 asbt += rsbt; 1326 } 1327 } else 1328 asbt = -1; 1329 seltdinit(td); 1330 /* Iterate until the timeout expires or descriptors become ready. */ 1331 for (;;) { 1332 error = pollscan(td, bits, nfds); 1333 if (error || td->td_retval[0] != 0) 1334 break; 1335 error = seltdwait(td, asbt, precision); 1336 if (error) 1337 break; 1338 error = pollrescan(td); 1339 if (error || td->td_retval[0] != 0) 1340 break; 1341 } 1342 seltdclear(td); 1343 1344 done: 1345 /* poll is not restarted after signals... */ 1346 if (error == ERESTART) 1347 error = EINTR; 1348 if (error == EWOULDBLOCK) 1349 error = 0; 1350 if (error == 0) { 1351 error = pollout(td, bits, uap->fds, nfds); 1352 if (error) 1353 goto out; 1354 } 1355 out: 1356 if (ni > sizeof(smallbits)) 1357 free(bits, M_TEMP); 1358 return (error); 1359 } 1360 1361 static int 1362 pollrescan(struct thread *td) 1363 { 1364 struct seltd *stp; 1365 struct selfd *sfp; 1366 struct selfd *sfn; 1367 struct selinfo *si; 1368 struct filedesc *fdp; 1369 struct file *fp; 1370 struct pollfd *fd; 1371 #ifdef CAPABILITIES 1372 cap_rights_t rights; 1373 #endif 1374 int n; 1375 1376 n = 0; 1377 fdp = td->td_proc->p_fd; 1378 stp = td->td_sel; 1379 FILEDESC_SLOCK(fdp); 1380 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1381 fd = (struct pollfd *)sfp->sf_cookie; 1382 si = sfp->sf_si; 1383 selfdfree(stp, sfp); 1384 /* If the selinfo wasn't cleared the event didn't fire. */ 1385 if (si != NULL) 1386 continue; 1387 fp = fdp->fd_ofiles[fd->fd].fde_file; 1388 #ifdef CAPABILITIES 1389 if (fp == NULL || 1390 cap_check(cap_rights(fdp, fd->fd), 1391 cap_rights_init(&rights, CAP_POLL_EVENT)) != 0) 1392 #else 1393 if (fp == NULL) 1394 #endif 1395 { 1396 fd->revents = POLLNVAL; 1397 n++; 1398 continue; 1399 } 1400 1401 /* 1402 * Note: backend also returns POLLHUP and 1403 * POLLERR if appropriate. 1404 */ 1405 fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); 1406 if (fd->revents != 0) 1407 n++; 1408 } 1409 FILEDESC_SUNLOCK(fdp); 1410 stp->st_flags = 0; 1411 td->td_retval[0] = n; 1412 return (0); 1413 } 1414 1415 1416 static int 1417 pollout(td, fds, ufds, nfd) 1418 struct thread *td; 1419 struct pollfd *fds; 1420 struct pollfd *ufds; 1421 u_int nfd; 1422 { 1423 int error = 0; 1424 u_int i = 0; 1425 u_int n = 0; 1426 1427 for (i = 0; i < nfd; i++) { 1428 error = copyout(&fds->revents, &ufds->revents, 1429 sizeof(ufds->revents)); 1430 if (error) 1431 return (error); 1432 if (fds->revents != 0) 1433 n++; 1434 fds++; 1435 ufds++; 1436 } 1437 td->td_retval[0] = n; 1438 return (0); 1439 } 1440 1441 static int 1442 pollscan(td, fds, nfd) 1443 struct thread *td; 1444 struct pollfd *fds; 1445 u_int nfd; 1446 { 1447 struct filedesc *fdp = td->td_proc->p_fd; 1448 struct file *fp; 1449 #ifdef CAPABILITIES 1450 cap_rights_t rights; 1451 #endif 1452 int i, n = 0; 1453 1454 FILEDESC_SLOCK(fdp); 1455 for (i = 0; i < nfd; i++, fds++) { 1456 if (fds->fd >= fdp->fd_nfiles) { 1457 fds->revents = POLLNVAL; 1458 n++; 1459 } else if (fds->fd < 0) { 1460 fds->revents = 0; 1461 } else { 1462 fp = fdp->fd_ofiles[fds->fd].fde_file; 1463 #ifdef CAPABILITIES 1464 if (fp == NULL || 1465 cap_check(cap_rights(fdp, fds->fd), 1466 cap_rights_init(&rights, CAP_POLL_EVENT)) != 0) 1467 #else 1468 if (fp == NULL) 1469 #endif 1470 { 1471 fds->revents = POLLNVAL; 1472 n++; 1473 } else { 1474 /* 1475 * Note: backend also returns POLLHUP and 1476 * POLLERR if appropriate. 1477 */ 1478 selfdalloc(td, fds); 1479 fds->revents = fo_poll(fp, fds->events, 1480 td->td_ucred, td); 1481 /* 1482 * POSIX requires POLLOUT to be never 1483 * set simultaneously with POLLHUP. 1484 */ 1485 if ((fds->revents & POLLHUP) != 0) 1486 fds->revents &= ~POLLOUT; 1487 1488 if (fds->revents != 0) 1489 n++; 1490 } 1491 } 1492 } 1493 FILEDESC_SUNLOCK(fdp); 1494 td->td_retval[0] = n; 1495 return (0); 1496 } 1497 1498 /* 1499 * OpenBSD poll system call. 1500 * 1501 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1502 */ 1503 #ifndef _SYS_SYSPROTO_H_ 1504 struct openbsd_poll_args { 1505 struct pollfd *fds; 1506 u_int nfds; 1507 int timeout; 1508 }; 1509 #endif 1510 int 1511 sys_openbsd_poll(td, uap) 1512 register struct thread *td; 1513 register struct openbsd_poll_args *uap; 1514 { 1515 return (sys_poll(td, (struct poll_args *)uap)); 1516 } 1517 1518 /* 1519 * XXX This was created specifically to support netncp and netsmb. This 1520 * allows the caller to specify a socket to wait for events on. It returns 1521 * 0 if any events matched and an error otherwise. There is no way to 1522 * determine which events fired. 1523 */ 1524 int 1525 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) 1526 { 1527 struct timeval rtv; 1528 sbintime_t asbt, precision, rsbt; 1529 int error; 1530 1531 precision = 0; /* stupid gcc! */ 1532 if (tvp != NULL) { 1533 rtv = *tvp; 1534 if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 1535 rtv.tv_usec >= 1000000) 1536 return (EINVAL); 1537 if (!timevalisset(&rtv)) 1538 asbt = 0; 1539 else if (rtv.tv_sec <= INT32_MAX) { 1540 rsbt = tvtosbt(rtv); 1541 precision = rsbt; 1542 precision >>= tc_precexp; 1543 if (TIMESEL(&asbt, rsbt)) 1544 asbt += tc_tick_sbt; 1545 if (asbt <= INT64_MAX - rsbt) 1546 asbt += rsbt; 1547 else 1548 asbt = -1; 1549 } else 1550 asbt = -1; 1551 } else 1552 asbt = -1; 1553 seltdinit(td); 1554 /* 1555 * Iterate until the timeout expires or the socket becomes ready. 1556 */ 1557 for (;;) { 1558 selfdalloc(td, NULL); 1559 error = sopoll(so, events, NULL, td); 1560 /* error here is actually the ready events. */ 1561 if (error) 1562 return (0); 1563 error = seltdwait(td, asbt, precision); 1564 if (error) 1565 break; 1566 } 1567 seltdclear(td); 1568 /* XXX Duplicates ncp/smb behavior. */ 1569 if (error == ERESTART) 1570 error = 0; 1571 return (error); 1572 } 1573 1574 /* 1575 * Preallocate two selfds associated with 'cookie'. Some fo_poll routines 1576 * have two select sets, one for read and another for write. 1577 */ 1578 static void 1579 selfdalloc(struct thread *td, void *cookie) 1580 { 1581 struct seltd *stp; 1582 1583 stp = td->td_sel; 1584 if (stp->st_free1 == NULL) 1585 stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); 1586 stp->st_free1->sf_td = stp; 1587 stp->st_free1->sf_cookie = cookie; 1588 if (stp->st_free2 == NULL) 1589 stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); 1590 stp->st_free2->sf_td = stp; 1591 stp->st_free2->sf_cookie = cookie; 1592 } 1593 1594 static void 1595 selfdfree(struct seltd *stp, struct selfd *sfp) 1596 { 1597 STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); 1598 mtx_lock(sfp->sf_mtx); 1599 if (sfp->sf_si) 1600 TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); 1601 mtx_unlock(sfp->sf_mtx); 1602 uma_zfree(selfd_zone, sfp); 1603 } 1604 1605 /* Drain the waiters tied to all the selfd belonging the specified selinfo. */ 1606 void 1607 seldrain(sip) 1608 struct selinfo *sip; 1609 { 1610 1611 /* 1612 * This feature is already provided by doselwakeup(), thus it is 1613 * enough to go for it. 1614 * Eventually, the context, should take care to avoid races 1615 * between thread calling select()/poll() and file descriptor 1616 * detaching, but, again, the races are just the same as 1617 * selwakeup(). 1618 */ 1619 doselwakeup(sip, -1); 1620 } 1621 1622 /* 1623 * Record a select request. 1624 */ 1625 void 1626 selrecord(selector, sip) 1627 struct thread *selector; 1628 struct selinfo *sip; 1629 { 1630 struct selfd *sfp; 1631 struct seltd *stp; 1632 struct mtx *mtxp; 1633 1634 stp = selector->td_sel; 1635 /* 1636 * Don't record when doing a rescan. 1637 */ 1638 if (stp->st_flags & SELTD_RESCAN) 1639 return; 1640 /* 1641 * Grab one of the preallocated descriptors. 1642 */ 1643 sfp = NULL; 1644 if ((sfp = stp->st_free1) != NULL) 1645 stp->st_free1 = NULL; 1646 else if ((sfp = stp->st_free2) != NULL) 1647 stp->st_free2 = NULL; 1648 else 1649 panic("selrecord: No free selfd on selq"); 1650 mtxp = sip->si_mtx; 1651 if (mtxp == NULL) 1652 mtxp = mtx_pool_find(mtxpool_select, sip); 1653 /* 1654 * Initialize the sfp and queue it in the thread. 1655 */ 1656 sfp->sf_si = sip; 1657 sfp->sf_mtx = mtxp; 1658 STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); 1659 /* 1660 * Now that we've locked the sip, check for initialization. 1661 */ 1662 mtx_lock(mtxp); 1663 if (sip->si_mtx == NULL) { 1664 sip->si_mtx = mtxp; 1665 TAILQ_INIT(&sip->si_tdlist); 1666 } 1667 /* 1668 * Add this thread to the list of selfds listening on this selinfo. 1669 */ 1670 TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); 1671 mtx_unlock(sip->si_mtx); 1672 } 1673 1674 /* Wake up a selecting thread. */ 1675 void 1676 selwakeup(sip) 1677 struct selinfo *sip; 1678 { 1679 doselwakeup(sip, -1); 1680 } 1681 1682 /* Wake up a selecting thread, and set its priority. */ 1683 void 1684 selwakeuppri(sip, pri) 1685 struct selinfo *sip; 1686 int pri; 1687 { 1688 doselwakeup(sip, pri); 1689 } 1690 1691 /* 1692 * Do a wakeup when a selectable event occurs. 1693 */ 1694 static void 1695 doselwakeup(sip, pri) 1696 struct selinfo *sip; 1697 int pri; 1698 { 1699 struct selfd *sfp; 1700 struct selfd *sfn; 1701 struct seltd *stp; 1702 1703 /* If it's not initialized there can't be any waiters. */ 1704 if (sip->si_mtx == NULL) 1705 return; 1706 /* 1707 * Locking the selinfo locks all selfds associated with it. 1708 */ 1709 mtx_lock(sip->si_mtx); 1710 TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { 1711 /* 1712 * Once we remove this sfp from the list and clear the 1713 * sf_si seltdclear will know to ignore this si. 1714 */ 1715 TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); 1716 sfp->sf_si = NULL; 1717 stp = sfp->sf_td; 1718 mtx_lock(&stp->st_mtx); 1719 stp->st_flags |= SELTD_PENDING; 1720 cv_broadcastpri(&stp->st_wait, pri); 1721 mtx_unlock(&stp->st_mtx); 1722 } 1723 mtx_unlock(sip->si_mtx); 1724 } 1725 1726 static void 1727 seltdinit(struct thread *td) 1728 { 1729 struct seltd *stp; 1730 1731 if ((stp = td->td_sel) != NULL) 1732 goto out; 1733 td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); 1734 mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); 1735 cv_init(&stp->st_wait, "select"); 1736 out: 1737 stp->st_flags = 0; 1738 STAILQ_INIT(&stp->st_selq); 1739 } 1740 1741 static int 1742 seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision) 1743 { 1744 struct seltd *stp; 1745 int error; 1746 1747 stp = td->td_sel; 1748 /* 1749 * An event of interest may occur while we do not hold the seltd 1750 * locked so check the pending flag before we sleep. 1751 */ 1752 mtx_lock(&stp->st_mtx); 1753 /* 1754 * Any further calls to selrecord will be a rescan. 1755 */ 1756 stp->st_flags |= SELTD_RESCAN; 1757 if (stp->st_flags & SELTD_PENDING) { 1758 mtx_unlock(&stp->st_mtx); 1759 return (0); 1760 } 1761 if (sbt == 0) 1762 error = EWOULDBLOCK; 1763 else if (sbt != -1) 1764 error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx, 1765 sbt, precision, C_ABSOLUTE); 1766 else 1767 error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); 1768 mtx_unlock(&stp->st_mtx); 1769 1770 return (error); 1771 } 1772 1773 void 1774 seltdfini(struct thread *td) 1775 { 1776 struct seltd *stp; 1777 1778 stp = td->td_sel; 1779 if (stp == NULL) 1780 return; 1781 if (stp->st_free1) 1782 uma_zfree(selfd_zone, stp->st_free1); 1783 if (stp->st_free2) 1784 uma_zfree(selfd_zone, stp->st_free2); 1785 td->td_sel = NULL; 1786 free(stp, M_SELECT); 1787 } 1788 1789 /* 1790 * Remove the references to the thread from all of the objects we were 1791 * polling. 1792 */ 1793 static void 1794 seltdclear(struct thread *td) 1795 { 1796 struct seltd *stp; 1797 struct selfd *sfp; 1798 struct selfd *sfn; 1799 1800 stp = td->td_sel; 1801 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) 1802 selfdfree(stp, sfp); 1803 stp->st_flags = 0; 1804 } 1805 1806 static void selectinit(void *); 1807 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); 1808 static void 1809 selectinit(void *dummy __unused) 1810 { 1811 1812 selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, 1813 NULL, NULL, UMA_ALIGN_PTR, 0); 1814 mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); 1815 } 1816