/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

int iosize_max_clamp = 1;
SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0,
    "Clamp max i/o size to INT_MAX");

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int      pollout(struct thread *, struct pollfd *, struct pollfd *,
                    u_int);
static int      pollscan(struct thread *, struct pollfd *, u_int);
static int      pollrescan(struct thread *);
static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
static int      selrescan(struct thread *, fd_mask **, fd_mask **);
static void     selfdalloc(struct thread *, void *);
static void     selfdfree(struct seltd *, struct selfd *);
static int      dofileread(struct thread *, int, struct file *, struct uio *,
                    off_t, int);
static int      dofilewrite(struct thread *, int, struct file *, struct uio *,
                    off_t, int);
static void     doselwakeup(struct selinfo *, int);
static void     seltdinit(struct thread *);
static int      seltdwait(struct thread *, int);
static void     seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *      t - protected by st_mtx
 *      k - Only accessed by curthread or read-only
 */
struct seltd {
        STAILQ_HEAD(, selfd)    st_selq;        /* (k) List of selfds. */
        struct selfd            *st_free1;      /* (k) free fd for read set. */
        struct selfd            *st_free2;      /* (k) free fd for write set. */
        struct mtx              st_mtx;         /* Protects struct seltd */
        struct cv               st_wait;        /* (t) Wait channel. */
        int                     st_flags;       /* (t) SELTD_ flags. */
};

#define SELTD_PENDING   0x0001                  /* We have pending events. */
#define SELTD_RESCAN    0x0002                  /* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *      f - protected by sf_mtx
 */
struct selfd {
        STAILQ_ENTRY(selfd)     sf_link;        /* (k) fds owned by this td. */
        TAILQ_ENTRY(selfd)      sf_threads;     /* (f) fds on this selinfo. */
        struct selinfo          *sf_si;         /* (f) selinfo when linked. */
        struct mtx              *sf_mtx;        /* Pointer to selinfo mtx. */
        struct seltd            *sf_td;         /* (k) owning seltd. */
        void                    *sf_cookie;     /* (k) fd or pollfd. */
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;

#ifndef _SYS_SYSPROTO_H_
struct read_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
};
#endif
int
sys_read(td, uap)
        struct thread *td;
        struct read_args *uap;
{
        struct uio auio;
        struct iovec aiov;
        int error;

        if (uap->nbyte > IOSIZE_MAX)
                return (EINVAL);
        aiov.iov_base = uap->buf;
        aiov.iov_len = uap->nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = uap->nbyte;
        auio.uio_segflg = UIO_USERSPACE;
        error = kern_readv(td, uap->fd, &auio);
        return(error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
        int     fd;
        void    *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
int
sys_pread(td, uap)
        struct thread *td;
        struct pread_args *uap;
{
        struct uio auio;
        struct iovec aiov;
        int error;

        if (uap->nbyte > IOSIZE_MAX)
                return (EINVAL);
        aiov.iov_base = uap->buf;
        aiov.iov_len = uap->nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = uap->nbyte;
        auio.uio_segflg = UIO_USERSPACE;
        error = kern_preadv(td, uap->fd, &auio, uap->offset);
        return(error);
}

int
freebsd6_pread(td, uap)
        struct thread *td;
        struct freebsd6_pread_args *uap;
{
        struct pread_args oargs;

        oargs.fd = uap->fd;
        oargs.buf = uap->buf;
        oargs.nbyte = uap->nbyte;
        oargs.offset = uap->offset;
        return (sys_pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
};
#endif
int
sys_readv(struct thread *td, struct readv_args *uap)
{
        struct uio *auio;
        int error;

        error = copyinuio(uap->iovp, uap->iovcnt, &auio);
        if (error)
                return (error);
        error = kern_readv(td, uap->fd, auio);
        free(auio, M_IOV);
        return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
        struct file *fp;
        int error;

        error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
        if (error)
                return (error);
        error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
        fdrop(fp, td);
        return (error);
}
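
/*
 * Illustrative userland counterpart (a sketch, not part of this file):
 * one readv(2) call fills two separate buffers, exercising the
 * copyinuio() -> kern_readv() path above.  The buffer names and sizes
 * are arbitrary examples.
 *
 *      #include <sys/uio.h>
 *      #include <unistd.h>
 *
 *      char hdr[16], body[4096];
 *      struct iovec iov[2] = {
 *              { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *              { .iov_base = body, .iov_len = sizeof(body) },
 *      };
 *      ssize_t n = readv(fd, iov, 2);
 */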

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
        off_t   offset;
};
#endif
int
sys_preadv(struct thread *td, struct preadv_args *uap)
{
        struct uio *auio;
        int error;

        error = copyinuio(uap->iovp, uap->iovcnt, &auio);
        if (error)
                return (error);
        error = kern_preadv(td, uap->fd, auio, uap->offset);
        free(auio, M_IOV);
        return (error);
}

int
kern_preadv(td, fd, auio, offset)
        struct thread *td;
        int fd;
        struct uio *auio;
        off_t offset;
{
        struct file *fp;
        int error;

        error = fget_read(td, fd, CAP_READ, &fp);
        if (error)
                return (error);
        if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
                error = ESPIPE;
        else if (offset < 0 && fp->f_vnode->v_type != VCHR)
                error = EINVAL;
        else
                error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
        fdrop(fp, td);
        return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
        struct thread *td;
        int fd;
        struct file *fp;
        struct uio *auio;
        off_t offset;
        int flags;
{
        ssize_t cnt;
        int error;
#ifdef KTRACE
        struct uio *ktruio = NULL;
#endif

        /* Finish zero length reads right here */
        if (auio->uio_resid == 0) {
                td->td_retval[0] = 0;
                return(0);
        }
        auio->uio_rw = UIO_READ;
        auio->uio_offset = offset;
        auio->uio_td = td;
#ifdef KTRACE
        if (KTRPOINT(td, KTR_GENIO))
                ktruio = cloneuio(auio);
#endif
        cnt = auio->uio_resid;
        if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
                if (auio->uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        cnt -= auio->uio_resid;
#ifdef KTRACE
        if (ktruio != NULL) {
                ktruio->uio_resid = cnt;
                ktrgenio(fd, UIO_READ, ktruio, error);
        }
#endif
#if SSIZE_MAX > LONG_MAX
        td->td_retval[1] = cnt >> (sizeof(register_t) * CHAR_BIT);
        td->td_retval[0] = cnt;
#else
        td->td_retval[0] = cnt;
#endif
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
};
#endif
int
sys_write(td, uap)
        struct thread *td;
        struct write_args *uap;
{
        struct uio auio;
        struct iovec aiov;
        int error;

        if (uap->nbyte > IOSIZE_MAX)
                return (EINVAL);
        aiov.iov_base = (void *)(uintptr_t)uap->buf;
        aiov.iov_len = uap->nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = uap->nbyte;
        auio.uio_segflg = UIO_USERSPACE;
        error = kern_writev(td, uap->fd, &auio);
        return(error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
        int     fd;
        const void *buf;
        size_t  nbyte;
        int     pad;
        off_t   offset;
};
#endif
int
sys_pwrite(td, uap)
        struct thread *td;
        struct pwrite_args *uap;
{
        struct uio auio;
        struct iovec aiov;
        int error;

        if (uap->nbyte > IOSIZE_MAX)
                return (EINVAL);
        aiov.iov_base = (void *)(uintptr_t)uap->buf;
        aiov.iov_len = uap->nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = uap->nbyte;
        auio.uio_segflg = UIO_USERSPACE;
        error = kern_pwritev(td, uap->fd, &auio, uap->offset);
        return(error);
}

int
freebsd6_pwrite(td, uap)
        struct thread *td;
        struct freebsd6_pwrite_args *uap;
{
        struct pwrite_args oargs;

        oargs.fd = uap->fd;
        oargs.buf = uap->buf;
        oargs.nbyte = uap->nbyte;
        oargs.offset = uap->offset;
        return (sys_pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
};
#endif
int
sys_writev(struct thread *td, struct writev_args *uap)
{
        struct uio *auio;
        int error;

        error = copyinuio(uap->iovp, uap->iovcnt, &auio);
        if (error)
                return (error);
        error = kern_writev(td, uap->fd, auio);
        free(auio, M_IOV);
        return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
        struct file *fp;
        int error;

        error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
        if (error)
                return (error);
        error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
        fdrop(fp, td);
        return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
        int     fd;
        struct iovec *iovp;
        u_int   iovcnt;
        off_t   offset;
};
#endif
int
sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
        struct uio *auio;
        int error;

        error = copyinuio(uap->iovp, uap->iovcnt, &auio);
        if (error)
                return (error);
        error = kern_pwritev(td, uap->fd, auio, uap->offset);
        free(auio, M_IOV);
        return (error);
}

int
kern_pwritev(td, fd, auio, offset)
        struct thread *td;
        struct uio *auio;
        int fd;
        off_t offset;
{
        struct file *fp;
        int error;

        error = fget_write(td, fd, CAP_WRITE, &fp);
        if (error)
                return (error);
        if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
                error = ESPIPE;
        else if (offset < 0 && fp->f_vnode->v_type != VCHR)
                error = EINVAL;
        else
                error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
        fdrop(fp, td);
        return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
        struct thread *td;
        int fd;
        struct file *fp;
        struct uio *auio;
        off_t offset;
        int flags;
{
        ssize_t cnt;
        int error;
#ifdef KTRACE
        struct uio *ktruio = NULL;
#endif

        auio->uio_rw = UIO_WRITE;
        auio->uio_td = td;
        auio->uio_offset = offset;
#ifdef KTRACE
        if (KTRPOINT(td, KTR_GENIO))
                ktruio = cloneuio(auio);
#endif
        cnt = auio->uio_resid;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();
        if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
                if (auio->uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                /* Socket layer is responsible for issuing SIGPIPE. */
                if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
                        PROC_LOCK(td->td_proc);
                        tdsignal(td, SIGPIPE);
                        PROC_UNLOCK(td->td_proc);
                }
        }
        cnt -= auio->uio_resid;
#ifdef KTRACE
        if (ktruio != NULL) {
                ktruio->uio_resid = cnt;
                ktrgenio(fd, UIO_WRITE, ktruio, error);
        }
#endif
#if SSIZE_MAX > LONG_MAX
        td->td_retval[1] = cnt >> (sizeof(register_t) * CHAR_BIT);
        td->td_retval[0] = cnt;
#else
        td->td_retval[0] = cnt;
#endif
        return (error);
}

/*
 * Truncate a file given a file descriptor.
 *
 * Can't use fget_write() here, since we must return EINVAL and not EBADF
 * if the descriptor isn't writable.
 */
int
kern_ftruncate(td, fd, length)
        struct thread *td;
        int fd;
        off_t length;
{
        struct file *fp;
        int error;

        AUDIT_ARG_FD(fd);
        if (length < 0)
                return (EINVAL);
        error = fget(td, fd, CAP_FTRUNCATE, &fp);
        if (error)
                return (error);
        AUDIT_ARG_FILE(td->td_proc, fp);
        if (!(fp->f_flag & FWRITE)) {
                fdrop(fp, td);
                return (EINVAL);
        }
        error = fo_truncate(fp, length, td->td_ucred, td);
        fdrop(fp, td);
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
        int     fd;
        int     pad;
        off_t   length;
};
#endif
int
sys_ftruncate(td, uap)
        struct thread *td;
        struct ftruncate_args *uap;
{

        return (kern_ftruncate(td, uap->fd, uap->length));
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
        int     fd;
        long    length;
};
#endif
int
oftruncate(td, uap)
        struct thread *td;
        struct oftruncate_args *uap;
{

        return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
        int     fd;
        u_long  com;
        caddr_t data;
};
#endif
/* ARGSUSED */
int
sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
        u_long com;
        int arg, error;
        u_int size;
        caddr_t data;

        if (uap->com > 0xffffffff) {
                printf(
                    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
                    td->td_proc->p_pid, td->td_name, uap->com);
                uap->com &= 0xffffffff;
        }
        com = uap->com;

        /*
         * Interpret high order word to find amount of data to be
         * copied to/from the user's address space.
         */
        size = IOCPARM_LEN(com);
        if ((size > IOCPARM_MAX) ||
            ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
            ((com & IOC_OUT) && size == 0) ||
#else
            ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
            ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
                return (ENOTTY);

        if (size > 0) {
                if (com & IOC_VOID) {
                        /* Integer argument. */
                        arg = (intptr_t)uap->data;
                        data = (void *)&arg;
                        size = 0;
                } else
                        data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
        } else
                data = (void *)&uap->data;
        if (com & IOC_IN) {
                error = copyin(uap->data, data, (u_int)size);
                if (error) {
                        if (size > 0)
                                free(data, M_IOCTLOPS);
                        return (error);
                }
        } else if (com & IOC_OUT) {
                /*
                 * Zero the buffer so the user always
                 * gets back something deterministic.
                 */
                bzero(data, size);
        }

        error = kern_ioctl(td, uap->fd, com, data);

        if (error == 0 && (com & IOC_OUT))
                error = copyout(data, uap->data, (u_int)size);

        if (size > 0)
                free(data, M_IOCTLOPS);
        return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
        struct file *fp;
        struct filedesc *fdp;
        int error;
        int tmp;

        AUDIT_ARG_FD(fd);
        AUDIT_ARG_CMD(com);
        if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0)
                return (error);
        if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
                fdrop(fp, td);
                return (EBADF);
        }
        fdp = td->td_proc->p_fd;
        switch (com) {
        case FIONCLEX:
                FILEDESC_XLOCK(fdp);
                fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
                FILEDESC_XUNLOCK(fdp);
                goto out;
        case FIOCLEX:
                FILEDESC_XLOCK(fdp);
                fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
                FILEDESC_XUNLOCK(fdp);
                goto out;
        case FIONBIO:
                if ((tmp = *(int *)data))
                        atomic_set_int(&fp->f_flag, FNONBLOCK);
                else
                        atomic_clear_int(&fp->f_flag, FNONBLOCK);
                data = (void *)&tmp;
                break;
        case FIOASYNC:
                if ((tmp = *(int *)data))
                        atomic_set_int(&fp->f_flag, FASYNC);
                else
                        atomic_clear_int(&fp->f_flag, FASYNC);
                data = (void *)&tmp;
                break;
        }

        error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
        fdrop(fp, td);
        return (error);
}
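
/*
 * For reference (a sketch, not used by the code): the command word that
 * sys_ioctl() decodes with IOCPARM_LEN() above packs the direction bits
 * and the argument size at encoding time.  For example, <sys/filio.h>
 * defines
 *
 *      #define FIONBIO _IOW('f', 126, int)
 *
 * i.e. IOC_IN with size == sizeof(int), so sys_ioctl() copyin()s an int
 * and kern_ioctl() handles it generically by toggling FNONBLOCK.  A
 * userland caller ('s' being any open descriptor):
 *
 *      int on = 1;
 *      if (ioctl(s, FIONBIO, &on) == -1)
 *              err(1, "FIONBIO");
 */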

int
poll_no_poll(int events)
{
        /*
         * Return true for read/write.  If the user asked for something
         * special, return POLLNVAL, so that clients have a way of
         * determining reliably whether or not the extended
         * functionality is present without hard-coding knowledge
         * of specific filesystem implementations.
         */
        if (events & ~POLLSTANDARD)
                return (POLLNVAL);

        return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

int
sys_pselect(struct thread *td, struct pselect_args *uap)
{
        struct timespec ts;
        struct timeval tv, *tvp;
        sigset_t set, *uset;
        int error;

        if (uap->ts != NULL) {
                error = copyin(uap->ts, &ts, sizeof(ts));
                if (error != 0)
                        return (error);
                TIMESPEC_TO_TIMEVAL(&tv, &ts);
                tvp = &tv;
        } else
                tvp = NULL;
        if (uap->sm != NULL) {
                error = copyin(uap->sm, &set, sizeof(set));
                if (error != 0)
                        return (error);
                uset = &set;
        } else
                uset = NULL;
        return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
            uset, NFDBITS));
}

int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
        int error;

        if (uset != NULL) {
                error = kern_sigprocmask(td, SIG_SETMASK, uset,
                    &td->td_oldsigmask, 0);
                if (error != 0)
                        return (error);
                td->td_pflags |= TDP_OLDMASK;
                /*
                 * Make sure that ast() is called on return to
                 * usermode and TDP_OLDMASK is cleared, restoring old
                 * sigmask.
                 */
                thread_lock(td);
                td->td_flags |= TDF_ASTPENDING;
                thread_unlock(td);
        }
        error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
        return (error);
}
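
/*
 * Illustrative userland use of the mask swap above (a sketch, not part
 * of this file): pselect(2) installs the caller's mask and sleeps
 * atomically, closing the window in which a signal delivered between a
 * separate sigprocmask() and select() would go unnoticed.  'got_sigint'
 * is a hypothetical flag set by a SIGINT handler; rfds and nfds are
 * prepared as for select(2).
 *
 *      sigset_t mask, omask;
 *
 *      sigemptyset(&mask);
 *      sigaddset(&mask, SIGINT);
 *      sigprocmask(SIG_BLOCK, &mask, &omask);
 *      if (!got_sigint)
 *              n = pselect(nfds, &rfds, NULL, NULL, NULL, &omask);
 */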

#ifndef _SYS_SYSPROTO_H_
struct select_args {
        int     nd;
        fd_set  *in, *ou, *ex;
        struct  timeval *tv;
};
#endif
int
sys_select(struct thread *td, struct select_args *uap)
{
        struct timeval tv, *tvp;
        int error;

        if (uap->tv != NULL) {
                error = copyin(uap->tv, &tv, sizeof(tv));
                if (error)
                        return (error);
                tvp = &tv;
        } else
                tvp = NULL;

        return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
            NFDBITS));
}

/*
 * In the unlikely case when the user specified n greater than the last
 * open file descriptor, check that no bits are set after the last
 * valid fd.  We must return EBADF if any is set.
 *
 * There are applications that rely on the behaviour.
 *
 * nd is fd_lastfile + 1.
 */
static int
select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
{
        char *addr, *oaddr;
        int b, i, res;
        uint8_t bits;

        if (nd >= ndu || fd_in == NULL)
                return (0);

        oaddr = NULL;
        bits = 0; /* silence gcc */
        for (i = nd; i < ndu; i++) {
                b = i / NBBY;
#if BYTE_ORDER == LITTLE_ENDIAN
                addr = (char *)fd_in + b;
#else
                addr = (char *)fd_in;
                if (abi_nfdbits == NFDBITS) {
                        addr += rounddown(b, sizeof(fd_mask)) +
                            sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
                } else {
                        addr += rounddown(b, sizeof(uint32_t)) +
                            sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
                }
#endif
                if (addr != oaddr) {
                        res = fubyte(addr);
                        if (res == -1)
                                return (EFAULT);
                        oaddr = addr;
                        bits = res;
                }
                if ((bits & (1 << (i % NBBY))) != 0)
                        return (EBADF);
        }
        return (0);
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
        struct filedesc *fdp;
        /*
         * The magic 2048 here is chosen to be just enough for FD_SETSIZE
         * infds with the new FD_SETSIZE of 1024, and more than enough for
         * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
         * of 256.
         */
        fd_mask s_selbits[howmany(2048, NFDBITS)];
        fd_mask *ibits[3], *obits[3], *selbits, *sbp;
        struct timeval atv, rtv, ttv;
        int error, lf, ndu, timo;
        u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;

        if (nd < 0)
                return (EINVAL);
        fdp = td->td_proc->p_fd;
        ndu = nd;
        lf = fdp->fd_lastfile;
        if (nd > lf + 1)
                nd = lf + 1;

        error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
        if (error != 0)
                return (error);
        error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
        if (error != 0)
                return (error);
        error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
        if (error != 0)
                return (error);

        /*
         * Allocate just enough bits for the non-null fd_sets.  Use the
         * preallocated auto buffer if possible.
         */
        nfdbits = roundup(nd, NFDBITS);
        ncpbytes = nfdbits / NBBY;
        ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
        nbufbytes = 0;
        if (fd_in != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ou != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ex != NULL)
                nbufbytes += 2 * ncpbytes;
        if (nbufbytes <= sizeof s_selbits)
                selbits = &s_selbits[0];
        else
                selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

        /*
         * Assign pointers into the bit buffers and fetch the input bits.
         * Put the output buffers together so that they can be bzeroed
         * together.
         */
        sbp = selbits;
#define getbits(name, x) \
        do {                                                            \
                if (name == NULL) {                                     \
                        ibits[x] = NULL;                                \
                        obits[x] = NULL;                                \
                } else {                                                \
                        ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
                        obits[x] = sbp;                                 \
                        sbp += ncpbytes / sizeof *sbp;                  \
                        error = copyin(name, ibits[x], ncpubytes);      \
                        if (error != 0)                                 \
                                goto done;                              \
                        bzero((char *)ibits[x] + ncpubytes,             \
                            ncpbytes - ncpubytes);                      \
                }                                                       \
        } while (0)
        getbits(fd_in, 0);
        getbits(fd_ou, 1);
        getbits(fd_ex, 2);
#undef getbits

#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
        /*
         * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
         * we are running under 32-bit emulation.  This should be more
         * generic.
         */
#define swizzle_fdset(bits)                                             \
        if (abi_nfdbits != NFDBITS && bits != NULL) {                   \
                int i;                                                  \
                for (i = 0; i < ncpbytes / sizeof *sbp; i++)            \
                        bits[i] = (bits[i] >> 32) | (bits[i] << 32);    \
        }
#else
#define swizzle_fdset(bits)
#endif

        /* Make sure the bit order makes it through an ABI transition */
        swizzle_fdset(ibits[0]);
        swizzle_fdset(ibits[1]);
        swizzle_fdset(ibits[2]);

        if (nbufbytes != 0)
                bzero(selbits, nbufbytes / 2);

        if (tvp != NULL) {
                atv = *tvp;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        timo = 0;
        seltdinit(td);
        /* Iterate until the timeout expires or descriptors become ready. */
        for (;;) {
                error = selscan(td, ibits, obits, nd);
                if (error || td->td_retval[0] != 0)
                        break;
                if (atv.tv_sec || atv.tv_usec) {
                        getmicrouptime(&rtv);
                        if (timevalcmp(&rtv, &atv, >=))
                                break;
                        ttv = atv;
                        timevalsub(&ttv, &rtv);
                        timo = ttv.tv_sec > 24 * 60 * 60 ?
                            24 * 60 * 60 * hz : tvtohz(&ttv);
                }
                error = seltdwait(td, timo);
                if (error)
                        break;
                error = selrescan(td, ibits, obits);
                if (error || td->td_retval[0] != 0)
                        break;
        }
        seltdclear(td);

done:
        /* select is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;

        /* swizzle bit order back, if necessary */
        swizzle_fdset(obits[0]);
        swizzle_fdset(obits[1]);
        swizzle_fdset(obits[2]);
#undef swizzle_fdset

#define putbits(name, x) \
        if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
                error = error2;
        if (error == 0) {
                int error2;

                putbits(fd_in, 0);
                putbits(fd_ou, 1);
                putbits(fd_ex, 2);
#undef putbits
        }
        if (selbits != &s_selbits[0])
                free(selbits, M_SELECT);

        return (error);
}
/*
 * Convert a select bit set to poll flags.
 *
 * The backend always returns POLLHUP/POLLERR if appropriate and we
 * return this as a set bit in any set.
 */
static int select_flags[3] = {
    POLLRDNORM | POLLHUP | POLLERR,
    POLLWRNORM | POLLHUP | POLLERR,
    POLLRDBAND | POLLERR
};

/*
 * Compute the fo_poll flags required for a fd given by the index and
 * bit position in the fd_mask array.
 */
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
        int flags;
        int msk;

        flags = 0;
        for (msk = 0; msk < 3; msk++) {
                if (ibits[msk] == NULL)
                        continue;
                if ((ibits[msk][idx] & bit) == 0)
                        continue;
                flags |= select_flags[msk];
        }
        return (flags);
}

/*
 * Set the appropriate output bits given a mask of fired events and the
 * input bits originally requested.
 */
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
        int msk;
        int n;

        n = 0;
        for (msk = 0; msk < 3; msk++) {
                if ((events & select_flags[msk]) == 0)
                        continue;
                if (ibits[msk] == NULL)
                        continue;
                if ((ibits[msk][idx] & bit) == 0)
                        continue;
                /*
                 * XXX Check for a duplicate set.  This can occur because a
                 * socket calls selrecord() twice for each poll() call
                 * resulting in two selfds per real fd.  selrescan() will
                 * call selsetbits twice as a result.
                 */
                if ((obits[msk][idx] & bit) != 0)
                        continue;
                obits[msk][idx] |= bit;
                n++;
        }

        return (n);
}

static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
        struct file *fp;
#ifdef CAPABILITIES
        struct file *fp_fromcap;
        int error;
#endif

        if ((fp = fget_unlocked(fdp, fd)) == NULL)
                return (EBADF);
#ifdef CAPABILITIES
        /*
         * If the file descriptor is for a capability, test rights and use
         * the file descriptor referenced by the capability.
         */
        error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap);
        if (error) {
                fdrop(fp, curthread);
                return (error);
        }
        if (fp != fp_fromcap) {
                fhold(fp_fromcap);
                fdrop(fp, curthread);
                fp = fp_fromcap;
        }
#endif /* CAPABILITIES */
        *fpp = fp;
        return (0);
}
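
/*
 * Worked example for the two helpers above (illustrative): with
 * NFDBITS == 64, fd 68 maps to idx = 68 / 64 = 1 and
 * bit = (fd_mask)1 << (68 % 64) = 1 << 4.  If only the read set has that
 * bit, selflags() returns POLLRDNORM | POLLHUP | POLLERR, and a plain
 * POLLHUP answer from fo_poll() still makes selsetbits() mark the read
 * bit in obits[0].
 */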

/*
 * Traverse the list of fds attached to this thread's seltd and check for
 * completion.
 */
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
        struct filedesc *fdp;
        struct selinfo *si;
        struct seltd *stp;
        struct selfd *sfp;
        struct selfd *sfn;
        struct file *fp;
        fd_mask bit;
        int fd, ev, n, idx;
        int error;

        fdp = td->td_proc->p_fd;
        stp = td->td_sel;
        n = 0;
        STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
                fd = (int)(uintptr_t)sfp->sf_cookie;
                si = sfp->sf_si;
                selfdfree(stp, sfp);
                /* If the selinfo wasn't cleared the event didn't fire. */
                if (si != NULL)
                        continue;
                error = getselfd_cap(fdp, fd, &fp);
                if (error)
                        return (error);
                idx = fd / NFDBITS;
                bit = (fd_mask)1 << (fd % NFDBITS);
                ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
                fdrop(fp, td);
                if (ev != 0)
                        n += selsetbits(ibits, obits, idx, bit, ev);
        }
        stp->st_flags = 0;
        td->td_retval[0] = n;
        return (0);
}

/*
 * Perform the initial filedescriptor scan and register ourselves with
 * each selinfo.
 */
static int
selscan(td, ibits, obits, nfd)
        struct thread *td;
        fd_mask **ibits, **obits;
        int nfd;
{
        struct filedesc *fdp;
        struct file *fp;
        fd_mask bit;
        int ev, flags, end, fd;
        int n, idx;
        int error;

        fdp = td->td_proc->p_fd;
        n = 0;
        for (idx = 0, fd = 0; fd < nfd; idx++) {
                end = imin(fd + NFDBITS, nfd);
                for (bit = 1; fd < end; bit <<= 1, fd++) {
                        /* Compute the list of events we're interested in. */
                        flags = selflags(ibits, idx, bit);
                        if (flags == 0)
                                continue;
                        error = getselfd_cap(fdp, fd, &fp);
                        if (error)
                                return (error);
                        selfdalloc(td, (void *)(uintptr_t)fd);
                        ev = fo_poll(fp, flags, td->td_ucred, td);
                        fdrop(fp, td);
                        if (ev != 0)
                                n += selsetbits(ibits, obits, idx, bit, ev);
                }
        }

        td->td_retval[0] = n;
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
int
sys_poll(td, uap)
        struct thread *td;
        struct poll_args *uap;
{
        struct pollfd *bits;
        struct pollfd smallbits[32];
        struct timeval atv, rtv, ttv;
        int error = 0, timo;
        u_int nfds;
        size_t ni;

        nfds = uap->nfds;
        if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
                return (EINVAL);
        ni = nfds * sizeof(struct pollfd);
        if (ni > sizeof(smallbits))
                bits = malloc(ni, M_TEMP, M_WAITOK);
        else
                bits = smallbits;
        error = copyin(uap->fds, bits, ni);
        if (error)
                goto done;
        if (uap->timeout != INFTIM) {
                atv.tv_sec = uap->timeout / 1000;
                atv.tv_usec = (uap->timeout % 1000) * 1000;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        timo = 0;
        seltdinit(td);
        /* Iterate until the timeout expires or descriptors become ready. */
        for (;;) {
                error = pollscan(td, bits, nfds);
                if (error || td->td_retval[0] != 0)
                        break;
                if (atv.tv_sec || atv.tv_usec) {
                        getmicrouptime(&rtv);
                        if (timevalcmp(&rtv, &atv, >=))
                                break;
                        ttv = atv;
                        timevalsub(&ttv, &rtv);
                        timo = ttv.tv_sec > 24 * 60 * 60 ?
                            24 * 60 * 60 * hz : tvtohz(&ttv);
                }
                error = seltdwait(td, timo);
                if (error)
                        break;
                error = pollrescan(td);
                if (error || td->td_retval[0] != 0)
                        break;
        }
        seltdclear(td);

done:
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0) {
                error = pollout(td, bits, uap->fds, nfds);
                if (error)
                        goto out;
        }
out:
        if (ni > sizeof(smallbits))
                free(bits, M_TEMP);
        return (error);
}

static int
pollrescan(struct thread *td)
{
        struct seltd *stp;
        struct selfd *sfp;
        struct selfd *sfn;
        struct selinfo *si;
        struct filedesc *fdp;
        struct file *fp;
        struct pollfd *fd;
        int n;

        n = 0;
        fdp = td->td_proc->p_fd;
        stp = td->td_sel;
        FILEDESC_SLOCK(fdp);
        STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
                fd = (struct pollfd *)sfp->sf_cookie;
                si = sfp->sf_si;
                selfdfree(stp, sfp);
                /* If the selinfo wasn't cleared the event didn't fire. */
                if (si != NULL)
                        continue;
                fp = fdp->fd_ofiles[fd->fd];
#ifdef CAPABILITIES
                if ((fp == NULL)
                    || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
                if (fp == NULL) {
#endif
                        fd->revents = POLLNVAL;
                        n++;
                        continue;
                }

                /*
                 * Note: backend also returns POLLHUP and
                 * POLLERR if appropriate.
                 */
                fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
                if (fd->revents != 0)
                        n++;
        }
        FILEDESC_SUNLOCK(fdp);
        stp->st_flags = 0;
        td->td_retval[0] = n;
        return (0);
}


static int
pollout(td, fds, ufds, nfd)
        struct thread *td;
        struct pollfd *fds;
        struct pollfd *ufds;
        u_int nfd;
{
        int error = 0;
        u_int i = 0;
        u_int n = 0;

        for (i = 0; i < nfd; i++) {
                error = copyout(&fds->revents, &ufds->revents,
                    sizeof(ufds->revents));
                if (error)
                        return (error);
                if (fds->revents != 0)
                        n++;
                fds++;
                ufds++;
        }
        td->td_retval[0] = n;
        return (0);
}

static int
pollscan(td, fds, nfd)
        struct thread *td;
        struct pollfd *fds;
        u_int nfd;
{
        struct filedesc *fdp = td->td_proc->p_fd;
        int i;
        struct file *fp;
        int n = 0;

        FILEDESC_SLOCK(fdp);
        for (i = 0; i < nfd; i++, fds++) {
                if (fds->fd >= fdp->fd_nfiles) {
                        fds->revents = POLLNVAL;
                        n++;
                } else if (fds->fd < 0) {
                        fds->revents = 0;
                } else {
                        fp = fdp->fd_ofiles[fds->fd];
#ifdef CAPABILITIES
                        if ((fp == NULL)
                            || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
                        if (fp == NULL) {
#endif
                                fds->revents = POLLNVAL;
                                n++;
                        } else {
                                /*
                                 * Note: backend also returns POLLHUP and
                                 * POLLERR if appropriate.
                                 */
                                selfdalloc(td, fds);
                                fds->revents = fo_poll(fp, fds->events,
                                    td->td_ucred, td);
                                /*
                                 * POSIX requires POLLOUT to be never
                                 * set simultaneously with POLLHUP.
                                 */
                                if ((fds->revents & POLLHUP) != 0)
                                        fds->revents &= ~POLLOUT;

                                if (fds->revents != 0)
                                        n++;
                        }
                }
        }
        FILEDESC_SUNLOCK(fdp);
        td->td_retval[0] = n;
        return (0);
}
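
/*
 * Illustrative userland counterpart (a sketch, not part of this file):
 * a minimal poll(2) call over one socket, driving the
 * pollscan()/pollrescan() machinery above; 'sock' and 'handle_input' are
 * invented names.
 *
 *      #include <poll.h>
 *
 *      struct pollfd pfd = { .fd = sock, .events = POLLIN };
 *      int n = poll(&pfd, 1, 5000);            // wait at most 5 seconds
 *      if (n > 0 && (pfd.revents & (POLLIN | POLLHUP)))
 *              handle_input(sock);             // hypothetical handler
 */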

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
        struct pollfd *fds;
        u_int   nfds;
        int     timeout;
};
#endif
int
sys_openbsd_poll(td, uap)
        register struct thread *td;
        register struct openbsd_poll_args *uap;
{
        return (sys_poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
        struct timeval atv, rtv, ttv;
        int error, timo;

        if (tvp != NULL) {
                atv = *tvp;
                if (itimerfix(&atv))
                        return (EINVAL);
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }

        timo = 0;
        seltdinit(td);
        /*
         * Iterate until the timeout expires or the socket becomes ready.
         */
        for (;;) {
                selfdalloc(td, NULL);
                error = sopoll(so, events, NULL, td);
                /* error here is actually the ready events. */
                if (error)
                        return (0);
                if (atv.tv_sec || atv.tv_usec) {
                        getmicrouptime(&rtv);
                        if (timevalcmp(&rtv, &atv, >=)) {
                                seltdclear(td);
                                return (EWOULDBLOCK);
                        }
                        ttv = atv;
                        timevalsub(&ttv, &rtv);
                        timo = ttv.tv_sec > 24 * 60 * 60 ?
                            24 * 60 * 60 * hz : tvtohz(&ttv);
                }
                error = seltdwait(td, timo);
                seltdclear(td);
                if (error)
                        break;
        }
        /* XXX Duplicates ncp/smb behavior. */
        if (error == ERESTART)
                error = 0;
        return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
        struct seltd *stp;

        stp = td->td_sel;
        if (stp->st_free1 == NULL)
                stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
        stp->st_free1->sf_td = stp;
        stp->st_free1->sf_cookie = cookie;
        if (stp->st_free2 == NULL)
                stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
        stp->st_free2->sf_td = stp;
        stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
        STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
        mtx_lock(sfp->sf_mtx);
        if (sfp->sf_si)
                TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
        mtx_unlock(sfp->sf_mtx);
        uma_zfree(selfd_zone, sfp);
}

/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void
seldrain(sip)
        struct selinfo *sip;
{

        /*
         * This feature is already provided by doselwakeup(), thus it is
         * enough to call it.  The calling context should take care to
         * avoid races between a thread calling select()/poll() and file
         * descriptor detach, but, again, the races are just the same as
         * for selwakeup().
         */
        doselwakeup(sip, -1);
}
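
/*
 * Usage sketch for the selrecord()/selwakeup() pair below (a hypothetical
 * driver, not part of this file): a poll method reports ready events
 * immediately when it can, and otherwise calls selrecord() so that the
 * code that later produces data can call selwakeup() on the same selinfo.
 * foo_softc, sc_mtx, sc_ready and sc_rsel are invented names.
 *
 *      static int
 *      foo_poll(struct cdev *dev, int events, struct thread *td)
 *      {
 *              struct foo_softc *sc = dev->si_drv1;
 *              int revents = 0;
 *
 *              mtx_lock(&sc->sc_mtx);
 *              if (events & (POLLIN | POLLRDNORM)) {
 *                      if (sc->sc_ready)
 *                              revents = events & (POLLIN | POLLRDNORM);
 *                      else
 *                              selrecord(td, &sc->sc_rsel);
 *              }
 *              mtx_unlock(&sc->sc_mtx);
 *              return (revents);
 *      }
 *
 * and later, from the producer (e.g. an interrupt handler), after setting
 * sc_ready:
 *
 *      selwakeuppri(&sc->sc_rsel, PZERO);
 */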

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
        struct thread *selector;
        struct selinfo *sip;
{
        struct selfd *sfp;
        struct seltd *stp;
        struct mtx *mtxp;

        stp = selector->td_sel;
        /*
         * Don't record when doing a rescan.
         */
        if (stp->st_flags & SELTD_RESCAN)
                return;
        /*
         * Grab one of the preallocated descriptors.
         */
        sfp = NULL;
        if ((sfp = stp->st_free1) != NULL)
                stp->st_free1 = NULL;
        else if ((sfp = stp->st_free2) != NULL)
                stp->st_free2 = NULL;
        else
                panic("selrecord: No free selfd on selq");
        mtxp = sip->si_mtx;
        if (mtxp == NULL)
                mtxp = mtx_pool_find(mtxpool_select, sip);
        /*
         * Initialize the sfp and queue it in the thread.
         */
        sfp->sf_si = sip;
        sfp->sf_mtx = mtxp;
        STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
        /*
         * Now that we've locked the sip, check for initialization.
         */
        mtx_lock(mtxp);
        if (sip->si_mtx == NULL) {
                sip->si_mtx = mtxp;
                TAILQ_INIT(&sip->si_tdlist);
        }
        /*
         * Add this thread to the list of selfds listening on this selinfo.
         */
        TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
        mtx_unlock(sip->si_mtx);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
        struct selinfo *sip;
{
        doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
        struct selinfo *sip;
        int pri;
{
        doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
        struct selinfo *sip;
        int pri;
{
        struct selfd *sfp;
        struct selfd *sfn;
        struct seltd *stp;

        /* If it's not initialized there can't be any waiters. */
        if (sip->si_mtx == NULL)
                return;
        /*
         * Locking the selinfo locks all selfds associated with it.
         */
        mtx_lock(sip->si_mtx);
        TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
                /*
                 * Once we remove this sfp from the list and clear the
                 * sf_si, seltdclear will know to ignore this si.
                 */
                TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
                sfp->sf_si = NULL;
                stp = sfp->sf_td;
                mtx_lock(&stp->st_mtx);
                stp->st_flags |= SELTD_PENDING;
                cv_broadcastpri(&stp->st_wait, pri);
                mtx_unlock(&stp->st_mtx);
        }
        mtx_unlock(sip->si_mtx);
}

static void
seltdinit(struct thread *td)
{
        struct seltd *stp;

        if ((stp = td->td_sel) != NULL)
                goto out;
        td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
        mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
        cv_init(&stp->st_wait, "select");
out:
        stp->st_flags = 0;
        STAILQ_INIT(&stp->st_selq);
}

static int
seltdwait(struct thread *td, int timo)
{
        struct seltd *stp;
        int error;

        stp = td->td_sel;
        /*
         * An event of interest may occur while we do not hold the seltd
         * locked so check the pending flag before we sleep.
         */
        mtx_lock(&stp->st_mtx);
        /*
         * Any further calls to selrecord will be a rescan.
         */
        stp->st_flags |= SELTD_RESCAN;
        if (stp->st_flags & SELTD_PENDING) {
                mtx_unlock(&stp->st_mtx);
                return (0);
        }
        if (timo > 0)
                error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
        else
                error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
        mtx_unlock(&stp->st_mtx);

        return (error);
}

void
seltdfini(struct thread *td)
{
        struct seltd *stp;

        stp = td->td_sel;
        if (stp == NULL)
                return;
        if (stp->st_free1)
                uma_zfree(selfd_zone, stp->st_free1);
        if (stp->st_free2)
                uma_zfree(selfd_zone, stp->st_free2);
        td->td_sel = NULL;
        free(stp, M_SELECT);
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 */
static void
seltdclear(struct thread *td)
{
        struct seltd *stp;
        struct selfd *sfp;
        struct selfd *sfn;

        stp = td->td_sel;
        STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
                selfdfree(stp, sfp);
        stp->st_flags = 0;
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{

        selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
            NULL, NULL, UMA_ALIGN_PTR, 0);
        mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}