/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <security/audit/audit.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
		    u_int);
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	pollrescan(struct thread *);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	selrescan(struct thread *, fd_mask **, fd_mask **);
static void	selfdalloc(struct thread *, void *);
static void	selfdfree(struct seltd *, struct selfd *);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);
static void	seltdinit(struct thread *);
static int	seltdwait(struct thread *, int);
static void	seltdclear(struct thread *);

/*
 * One seltd per-thread allocated on demand as needed.
 *
 *	t - protected by st_mtx
 *	k - Only accessed by curthread or read-only
 */
struct seltd {
	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
	struct selfd		*st_free1;	/* (k) free fd for read set. */
	struct selfd		*st_free2;	/* (k) free fd for write set. */
	struct mtx		st_mtx;		/* Protects struct seltd */
	struct cv		st_wait;	/* (t) Wait channel. */
	int			st_flags;	/* (t) SELTD_ flags. */
};

#define	SELTD_PENDING	0x0001			/* We have pending events. */
#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */

/*
 * One selfd allocated per-thread per-file-descriptor.
 *	f - protected by sf_mtx
 */
struct selfd {
	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
	struct seltd		*sf_td;		/* (k) owning seltd. */
	void			*sf_cookie;	/* (k) fd or pollfd. */
};

static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
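
/*
 * Illustration (informational only, derived from the structures above):
 * each thread doing select(2)/poll(2) owns one seltd; the seltd owns a
 * list of selfd records, one per descriptor scanned; and each selfd is
 * also linked onto the selinfo of the object it waits on, so that
 * doselwakeup() can find every waiting thread:
 *
 *	thread -> seltd   -> { selfd, selfd, ... }  (st_selq, sf_link)
 *	object -> selinfo -> { selfd, selfd, ... }  (si_tdlist, sf_threads)
 */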

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

int
freebsd6_pread(td, uap)
	struct thread *td;
	struct freebsd6_pread_args *uap;
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}
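
/*
 * Example (userland sketch, not part of this file): kern_preadv() gives
 * pread(2)/preadv(2) their documented error semantics.  Because the
 * DFLAG_SEEKABLE check above rejects non-seekable descriptors, a
 * positioned read on a pipe fails with ESPIPE:
 *
 *	char buf[16];
 *	int fds[2];
 *
 *	pipe(fds);
 *	if (pread(fds[0], buf, sizeof(buf), 0) == -1)
 *		assert(errno == ESPIPE);
 */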
294 */ 295 static int 296 dofileread(td, fd, fp, auio, offset, flags) 297 struct thread *td; 298 int fd; 299 struct file *fp; 300 struct uio *auio; 301 off_t offset; 302 int flags; 303 { 304 ssize_t cnt; 305 int error; 306 #ifdef KTRACE 307 struct uio *ktruio = NULL; 308 #endif 309 310 /* Finish zero length reads right here */ 311 if (auio->uio_resid == 0) { 312 td->td_retval[0] = 0; 313 return(0); 314 } 315 auio->uio_rw = UIO_READ; 316 auio->uio_offset = offset; 317 auio->uio_td = td; 318 #ifdef KTRACE 319 if (KTRPOINT(td, KTR_GENIO)) 320 ktruio = cloneuio(auio); 321 #endif 322 cnt = auio->uio_resid; 323 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 324 if (auio->uio_resid != cnt && (error == ERESTART || 325 error == EINTR || error == EWOULDBLOCK)) 326 error = 0; 327 } 328 cnt -= auio->uio_resid; 329 #ifdef KTRACE 330 if (ktruio != NULL) { 331 ktruio->uio_resid = cnt; 332 ktrgenio(fd, UIO_READ, ktruio, error); 333 } 334 #endif 335 td->td_retval[0] = cnt; 336 return (error); 337 } 338 339 #ifndef _SYS_SYSPROTO_H_ 340 struct write_args { 341 int fd; 342 const void *buf; 343 size_t nbyte; 344 }; 345 #endif 346 int 347 write(td, uap) 348 struct thread *td; 349 struct write_args *uap; 350 { 351 struct uio auio; 352 struct iovec aiov; 353 int error; 354 355 if (uap->nbyte > INT_MAX) 356 return (EINVAL); 357 aiov.iov_base = (void *)(uintptr_t)uap->buf; 358 aiov.iov_len = uap->nbyte; 359 auio.uio_iov = &aiov; 360 auio.uio_iovcnt = 1; 361 auio.uio_resid = uap->nbyte; 362 auio.uio_segflg = UIO_USERSPACE; 363 error = kern_writev(td, uap->fd, &auio); 364 return(error); 365 } 366 367 /* 368 * Positioned write system call. 369 */ 370 #ifndef _SYS_SYSPROTO_H_ 371 struct pwrite_args { 372 int fd; 373 const void *buf; 374 size_t nbyte; 375 int pad; 376 off_t offset; 377 }; 378 #endif 379 int 380 pwrite(td, uap) 381 struct thread *td; 382 struct pwrite_args *uap; 383 { 384 struct uio auio; 385 struct iovec aiov; 386 int error; 387 388 if (uap->nbyte > INT_MAX) 389 return (EINVAL); 390 aiov.iov_base = (void *)(uintptr_t)uap->buf; 391 aiov.iov_len = uap->nbyte; 392 auio.uio_iov = &aiov; 393 auio.uio_iovcnt = 1; 394 auio.uio_resid = uap->nbyte; 395 auio.uio_segflg = UIO_USERSPACE; 396 error = kern_pwritev(td, uap->fd, &auio, uap->offset); 397 return(error); 398 } 399 400 int 401 freebsd6_pwrite(td, uap) 402 struct thread *td; 403 struct freebsd6_pwrite_args *uap; 404 { 405 struct pwrite_args oargs; 406 407 oargs.fd = uap->fd; 408 oargs.buf = uap->buf; 409 oargs.nbyte = uap->nbyte; 410 oargs.offset = uap->offset; 411 return (pwrite(td, &oargs)); 412 } 413 414 /* 415 * Gather write system call. 416 */ 417 #ifndef _SYS_SYSPROTO_H_ 418 struct writev_args { 419 int fd; 420 struct iovec *iovp; 421 u_int iovcnt; 422 }; 423 #endif 424 int 425 writev(struct thread *td, struct writev_args *uap) 426 { 427 struct uio *auio; 428 int error; 429 430 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 431 if (error) 432 return (error); 433 error = kern_writev(td, uap->fd, auio); 434 free(auio, M_IOV); 435 return (error); 436 } 437 438 int 439 kern_writev(struct thread *td, int fd, struct uio *auio) 440 { 441 struct file *fp; 442 int error; 443 444 error = fget_write(td, fd, &fp); 445 if (error) 446 return (error); 447 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 448 fdrop(fp, td); 449 return (error); 450 } 451 452 /* 453 * Gather positioned write system call. 
454 */ 455 #ifndef _SYS_SYSPROTO_H_ 456 struct pwritev_args { 457 int fd; 458 struct iovec *iovp; 459 u_int iovcnt; 460 off_t offset; 461 }; 462 #endif 463 int 464 pwritev(struct thread *td, struct pwritev_args *uap) 465 { 466 struct uio *auio; 467 int error; 468 469 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 470 if (error) 471 return (error); 472 error = kern_pwritev(td, uap->fd, auio, uap->offset); 473 free(auio, M_IOV); 474 return (error); 475 } 476 477 int 478 kern_pwritev(td, fd, auio, offset) 479 struct thread *td; 480 struct uio *auio; 481 int fd; 482 off_t offset; 483 { 484 struct file *fp; 485 int error; 486 487 error = fget_write(td, fd, &fp); 488 if (error) 489 return (error); 490 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 491 error = ESPIPE; 492 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 493 error = EINVAL; 494 else 495 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 496 fdrop(fp, td); 497 return (error); 498 } 499 500 /* 501 * Common code for writev and pwritev that writes data to 502 * a file using the passed in uio, offset, and flags. 503 */ 504 static int 505 dofilewrite(td, fd, fp, auio, offset, flags) 506 struct thread *td; 507 int fd; 508 struct file *fp; 509 struct uio *auio; 510 off_t offset; 511 int flags; 512 { 513 ssize_t cnt; 514 int error; 515 #ifdef KTRACE 516 struct uio *ktruio = NULL; 517 #endif 518 519 auio->uio_rw = UIO_WRITE; 520 auio->uio_td = td; 521 auio->uio_offset = offset; 522 #ifdef KTRACE 523 if (KTRPOINT(td, KTR_GENIO)) 524 ktruio = cloneuio(auio); 525 #endif 526 cnt = auio->uio_resid; 527 if (fp->f_type == DTYPE_VNODE) 528 bwillwrite(); 529 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { 530 if (auio->uio_resid != cnt && (error == ERESTART || 531 error == EINTR || error == EWOULDBLOCK)) 532 error = 0; 533 /* Socket layer is responsible for issuing SIGPIPE. */ 534 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { 535 PROC_LOCK(td->td_proc); 536 tdsignal(td, SIGPIPE); 537 PROC_UNLOCK(td->td_proc); 538 } 539 } 540 cnt -= auio->uio_resid; 541 #ifdef KTRACE 542 if (ktruio != NULL) { 543 ktruio->uio_resid = cnt; 544 ktrgenio(fd, UIO_WRITE, ktruio, error); 545 } 546 #endif 547 td->td_retval[0] = cnt; 548 return (error); 549 } 550 551 /* 552 * Truncate a file given a file descriptor. 553 * 554 * Can't use fget_write() here, since must return EINVAL and not EBADF if the 555 * descriptor isn't writable. 
556 */ 557 int 558 kern_ftruncate(td, fd, length) 559 struct thread *td; 560 int fd; 561 off_t length; 562 { 563 struct file *fp; 564 int error; 565 566 AUDIT_ARG_FD(fd); 567 if (length < 0) 568 return (EINVAL); 569 error = fget(td, fd, &fp); 570 if (error) 571 return (error); 572 AUDIT_ARG_FILE(td->td_proc, fp); 573 if (!(fp->f_flag & FWRITE)) { 574 fdrop(fp, td); 575 return (EINVAL); 576 } 577 error = fo_truncate(fp, length, td->td_ucred, td); 578 fdrop(fp, td); 579 return (error); 580 } 581 582 #ifndef _SYS_SYSPROTO_H_ 583 struct ftruncate_args { 584 int fd; 585 int pad; 586 off_t length; 587 }; 588 #endif 589 int 590 ftruncate(td, uap) 591 struct thread *td; 592 struct ftruncate_args *uap; 593 { 594 595 return (kern_ftruncate(td, uap->fd, uap->length)); 596 } 597 598 #if defined(COMPAT_43) 599 #ifndef _SYS_SYSPROTO_H_ 600 struct oftruncate_args { 601 int fd; 602 long length; 603 }; 604 #endif 605 int 606 oftruncate(td, uap) 607 struct thread *td; 608 struct oftruncate_args *uap; 609 { 610 611 return (kern_ftruncate(td, uap->fd, uap->length)); 612 } 613 #endif /* COMPAT_43 */ 614 615 #ifndef _SYS_SYSPROTO_H_ 616 struct ioctl_args { 617 int fd; 618 u_long com; 619 caddr_t data; 620 }; 621 #endif 622 /* ARGSUSED */ 623 int 624 ioctl(struct thread *td, struct ioctl_args *uap) 625 { 626 u_long com; 627 int arg, error; 628 u_int size; 629 caddr_t data; 630 631 if (uap->com > 0xffffffff) { 632 printf( 633 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 634 td->td_proc->p_pid, td->td_name, uap->com); 635 uap->com &= 0xffffffff; 636 } 637 com = uap->com; 638 639 /* 640 * Interpret high order word to find amount of data to be 641 * copied to/from the user's address space. 642 */ 643 size = IOCPARM_LEN(com); 644 if ((size > IOCPARM_MAX) || 645 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 646 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 647 ((com & IOC_OUT) && size == 0) || 648 #else 649 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 650 #endif 651 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 652 return (ENOTTY); 653 654 if (size > 0) { 655 if (com & IOC_VOID) { 656 /* Integer argument. */ 657 arg = (intptr_t)uap->data; 658 data = (void *)&arg; 659 size = 0; 660 } else 661 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 662 } else 663 data = (void *)&uap->data; 664 if (com & IOC_IN) { 665 error = copyin(uap->data, data, (u_int)size); 666 if (error) { 667 if (size > 0) 668 free(data, M_IOCTLOPS); 669 return (error); 670 } 671 } else if (com & IOC_OUT) { 672 /* 673 * Zero the buffer so the user always 674 * gets back something deterministic. 
675 */ 676 bzero(data, size); 677 } 678 679 error = kern_ioctl(td, uap->fd, com, data); 680 681 if (error == 0 && (com & IOC_OUT)) 682 error = copyout(data, uap->data, (u_int)size); 683 684 if (size > 0) 685 free(data, M_IOCTLOPS); 686 return (error); 687 } 688 689 int 690 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 691 { 692 struct file *fp; 693 struct filedesc *fdp; 694 int error; 695 int tmp; 696 697 AUDIT_ARG_FD(fd); 698 AUDIT_ARG_CMD(com); 699 if ((error = fget(td, fd, &fp)) != 0) 700 return (error); 701 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 702 fdrop(fp, td); 703 return (EBADF); 704 } 705 fdp = td->td_proc->p_fd; 706 switch (com) { 707 case FIONCLEX: 708 FILEDESC_XLOCK(fdp); 709 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; 710 FILEDESC_XUNLOCK(fdp); 711 goto out; 712 case FIOCLEX: 713 FILEDESC_XLOCK(fdp); 714 fdp->fd_ofileflags[fd] |= UF_EXCLOSE; 715 FILEDESC_XUNLOCK(fdp); 716 goto out; 717 case FIONBIO: 718 if ((tmp = *(int *)data)) 719 atomic_set_int(&fp->f_flag, FNONBLOCK); 720 else 721 atomic_clear_int(&fp->f_flag, FNONBLOCK); 722 data = (void *)&tmp; 723 break; 724 case FIOASYNC: 725 if ((tmp = *(int *)data)) 726 atomic_set_int(&fp->f_flag, FASYNC); 727 else 728 atomic_clear_int(&fp->f_flag, FASYNC); 729 data = (void *)&tmp; 730 break; 731 } 732 733 error = fo_ioctl(fp, com, data, td->td_ucred, td); 734 out: 735 fdrop(fp, td); 736 return (error); 737 } 738 739 int 740 poll_no_poll(int events) 741 { 742 /* 743 * Return true for read/write. If the user asked for something 744 * special, return POLLNVAL, so that clients have a way of 745 * determining reliably whether or not the extended 746 * functionality is present without hard-coding knowledge 747 * of specific filesystem implementations. 748 */ 749 if (events & ~POLLSTANDARD) 750 return (POLLNVAL); 751 752 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 753 } 754 755 int 756 pselect(struct thread *td, struct pselect_args *uap) 757 { 758 struct timespec ts; 759 struct timeval tv, *tvp; 760 sigset_t set, *uset; 761 int error; 762 763 if (uap->ts != NULL) { 764 error = copyin(uap->ts, &ts, sizeof(ts)); 765 if (error != 0) 766 return (error); 767 TIMESPEC_TO_TIMEVAL(&tv, &ts); 768 tvp = &tv; 769 } else 770 tvp = NULL; 771 if (uap->sm != NULL) { 772 error = copyin(uap->sm, &set, sizeof(set)); 773 if (error != 0) 774 return (error); 775 uset = &set; 776 } else 777 uset = NULL; 778 return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 779 uset, NFDBITS)); 780 } 781 782 int 783 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, 784 struct timeval *tvp, sigset_t *uset, int abi_nfdbits) 785 { 786 int error; 787 788 if (uset != NULL) { 789 error = kern_sigprocmask(td, SIG_SETMASK, uset, 790 &td->td_oldsigmask, 0); 791 if (error != 0) 792 return (error); 793 td->td_pflags |= TDP_OLDMASK; 794 /* 795 * Make sure that ast() is called on return to 796 * usermode and TDP_OLDMASK is cleared, restoring old 797 * sigmask. 
798 */ 799 thread_lock(td); 800 td->td_flags |= TDF_ASTPENDING; 801 thread_unlock(td); 802 } 803 error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); 804 return (error); 805 } 806 807 #ifndef _SYS_SYSPROTO_H_ 808 struct select_args { 809 int nd; 810 fd_set *in, *ou, *ex; 811 struct timeval *tv; 812 }; 813 #endif 814 int 815 select(struct thread *td, struct select_args *uap) 816 { 817 struct timeval tv, *tvp; 818 int error; 819 820 if (uap->tv != NULL) { 821 error = copyin(uap->tv, &tv, sizeof(tv)); 822 if (error) 823 return (error); 824 tvp = &tv; 825 } else 826 tvp = NULL; 827 828 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 829 NFDBITS)); 830 } 831 832 int 833 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 834 fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) 835 { 836 struct filedesc *fdp; 837 /* 838 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 839 * infds with the new FD_SETSIZE of 1024, and more than enough for 840 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 841 * of 256. 842 */ 843 fd_mask s_selbits[howmany(2048, NFDBITS)]; 844 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 845 struct timeval atv, rtv, ttv; 846 int error, timo; 847 u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; 848 849 if (nd < 0) 850 return (EINVAL); 851 fdp = td->td_proc->p_fd; 852 if (nd > fdp->fd_lastfile + 1) 853 nd = fdp->fd_lastfile + 1; 854 855 /* 856 * Allocate just enough bits for the non-null fd_sets. Use the 857 * preallocated auto buffer if possible. 858 */ 859 nfdbits = roundup(nd, NFDBITS); 860 ncpbytes = nfdbits / NBBY; 861 ncpubytes = roundup(nd, abi_nfdbits) / NBBY; 862 nbufbytes = 0; 863 if (fd_in != NULL) 864 nbufbytes += 2 * ncpbytes; 865 if (fd_ou != NULL) 866 nbufbytes += 2 * ncpbytes; 867 if (fd_ex != NULL) 868 nbufbytes += 2 * ncpbytes; 869 if (nbufbytes <= sizeof s_selbits) 870 selbits = &s_selbits[0]; 871 else 872 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 873 874 /* 875 * Assign pointers into the bit buffers and fetch the input bits. 876 * Put the output buffers together so that they can be bzeroed 877 * together. 878 */ 879 sbp = selbits; 880 #define getbits(name, x) \ 881 do { \ 882 if (name == NULL) { \ 883 ibits[x] = NULL; \ 884 obits[x] = NULL; \ 885 } else { \ 886 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 887 obits[x] = sbp; \ 888 sbp += ncpbytes / sizeof *sbp; \ 889 error = copyin(name, ibits[x], ncpubytes); \ 890 if (error != 0) \ 891 goto done; \ 892 bzero((char *)ibits[x] + ncpubytes, \ 893 ncpbytes - ncpubytes); \ 894 } \ 895 } while (0) 896 getbits(fd_in, 0); 897 getbits(fd_ou, 1); 898 getbits(fd_ex, 2); 899 #undef getbits 900 901 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) 902 /* 903 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, 904 * we are running under 32-bit emulation. This should be more 905 * generic. 
906 */ 907 #define swizzle_fdset(bits) \ 908 if (abi_nfdbits != NFDBITS && bits != NULL) { \ 909 int i; \ 910 for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ 911 bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ 912 } 913 #else 914 #define swizzle_fdset(bits) 915 #endif 916 917 /* Make sure the bit order makes it through an ABI transition */ 918 swizzle_fdset(ibits[0]); 919 swizzle_fdset(ibits[1]); 920 swizzle_fdset(ibits[2]); 921 922 if (nbufbytes != 0) 923 bzero(selbits, nbufbytes / 2); 924 925 if (tvp != NULL) { 926 atv = *tvp; 927 if (itimerfix(&atv)) { 928 error = EINVAL; 929 goto done; 930 } 931 getmicrouptime(&rtv); 932 timevaladd(&atv, &rtv); 933 } else { 934 atv.tv_sec = 0; 935 atv.tv_usec = 0; 936 } 937 timo = 0; 938 seltdinit(td); 939 /* Iterate until the timeout expires or descriptors become ready. */ 940 for (;;) { 941 error = selscan(td, ibits, obits, nd); 942 if (error || td->td_retval[0] != 0) 943 break; 944 if (atv.tv_sec || atv.tv_usec) { 945 getmicrouptime(&rtv); 946 if (timevalcmp(&rtv, &atv, >=)) 947 break; 948 ttv = atv; 949 timevalsub(&ttv, &rtv); 950 timo = ttv.tv_sec > 24 * 60 * 60 ? 951 24 * 60 * 60 * hz : tvtohz(&ttv); 952 } 953 error = seltdwait(td, timo); 954 if (error) 955 break; 956 error = selrescan(td, ibits, obits); 957 if (error || td->td_retval[0] != 0) 958 break; 959 } 960 seltdclear(td); 961 962 done: 963 /* select is not restarted after signals... */ 964 if (error == ERESTART) 965 error = EINTR; 966 if (error == EWOULDBLOCK) 967 error = 0; 968 969 /* swizzle bit order back, if necessary */ 970 swizzle_fdset(obits[0]); 971 swizzle_fdset(obits[1]); 972 swizzle_fdset(obits[2]); 973 #undef swizzle_fdset 974 975 #define putbits(name, x) \ 976 if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ 977 error = error2; 978 if (error == 0) { 979 int error2; 980 981 putbits(fd_in, 0); 982 putbits(fd_ou, 1); 983 putbits(fd_ex, 2); 984 #undef putbits 985 } 986 if (selbits != &s_selbits[0]) 987 free(selbits, M_SELECT); 988 989 return (error); 990 } 991 /* 992 * Convert a select bit set to poll flags. 993 * 994 * The backend always returns POLLHUP/POLLERR if appropriate and we 995 * return this as a set bit in any set. 996 */ 997 static int select_flags[3] = { 998 POLLRDNORM | POLLHUP | POLLERR, 999 POLLWRNORM | POLLHUP | POLLERR, 1000 POLLRDBAND | POLLERR 1001 }; 1002 1003 /* 1004 * Compute the fo_poll flags required for a fd given by the index and 1005 * bit position in the fd_mask array. 1006 */ 1007 static __inline int 1008 selflags(fd_mask **ibits, int idx, fd_mask bit) 1009 { 1010 int flags; 1011 int msk; 1012 1013 flags = 0; 1014 for (msk = 0; msk < 3; msk++) { 1015 if (ibits[msk] == NULL) 1016 continue; 1017 if ((ibits[msk][idx] & bit) == 0) 1018 continue; 1019 flags |= select_flags[msk]; 1020 } 1021 return (flags); 1022 } 1023 1024 /* 1025 * Set the appropriate output bits given a mask of fired events and the 1026 * input bits originally requested. 1027 */ 1028 static __inline int 1029 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) 1030 { 1031 int msk; 1032 int n; 1033 1034 n = 0; 1035 for (msk = 0; msk < 3; msk++) { 1036 if ((events & select_flags[msk]) == 0) 1037 continue; 1038 if (ibits[msk] == NULL) 1039 continue; 1040 if ((ibits[msk][idx] & bit) == 0) 1041 continue; 1042 /* 1043 * XXX Check for a duplicate set. This can occur because a 1044 * socket calls selrecord() twice for each poll() call 1045 * resulting in two selfds per real fd. selrescan() will 1046 * call selsetbits twice as a result. 
1047 */ 1048 if ((obits[msk][idx] & bit) != 0) 1049 continue; 1050 obits[msk][idx] |= bit; 1051 n++; 1052 } 1053 1054 return (n); 1055 } 1056 1057 /* 1058 * Traverse the list of fds attached to this thread's seltd and check for 1059 * completion. 1060 */ 1061 static int 1062 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) 1063 { 1064 struct filedesc *fdp; 1065 struct selinfo *si; 1066 struct seltd *stp; 1067 struct selfd *sfp; 1068 struct selfd *sfn; 1069 struct file *fp; 1070 fd_mask bit; 1071 int fd, ev, n, idx; 1072 1073 fdp = td->td_proc->p_fd; 1074 stp = td->td_sel; 1075 n = 0; 1076 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1077 fd = (int)(uintptr_t)sfp->sf_cookie; 1078 si = sfp->sf_si; 1079 selfdfree(stp, sfp); 1080 /* If the selinfo wasn't cleared the event didn't fire. */ 1081 if (si != NULL) 1082 continue; 1083 if ((fp = fget_unlocked(fdp, fd)) == NULL) 1084 return (EBADF); 1085 idx = fd / NFDBITS; 1086 bit = (fd_mask)1 << (fd % NFDBITS); 1087 ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); 1088 fdrop(fp, td); 1089 if (ev != 0) 1090 n += selsetbits(ibits, obits, idx, bit, ev); 1091 } 1092 stp->st_flags = 0; 1093 td->td_retval[0] = n; 1094 return (0); 1095 } 1096 1097 /* 1098 * Perform the initial filedescriptor scan and register ourselves with 1099 * each selinfo. 1100 */ 1101 static int 1102 selscan(td, ibits, obits, nfd) 1103 struct thread *td; 1104 fd_mask **ibits, **obits; 1105 int nfd; 1106 { 1107 struct filedesc *fdp; 1108 struct file *fp; 1109 fd_mask bit; 1110 int ev, flags, end, fd; 1111 int n, idx; 1112 1113 fdp = td->td_proc->p_fd; 1114 n = 0; 1115 for (idx = 0, fd = 0; fd < nfd; idx++) { 1116 end = imin(fd + NFDBITS, nfd); 1117 for (bit = 1; fd < end; bit <<= 1, fd++) { 1118 /* Compute the list of events we're interested in. */ 1119 flags = selflags(ibits, idx, bit); 1120 if (flags == 0) 1121 continue; 1122 if ((fp = fget_unlocked(fdp, fd)) == NULL) 1123 return (EBADF); 1124 selfdalloc(td, (void *)(uintptr_t)fd); 1125 ev = fo_poll(fp, flags, td->td_ucred, td); 1126 fdrop(fp, td); 1127 if (ev != 0) 1128 n += selsetbits(ibits, obits, idx, bit, ev); 1129 } 1130 } 1131 1132 td->td_retval[0] = n; 1133 return (0); 1134 } 1135 1136 #ifndef _SYS_SYSPROTO_H_ 1137 struct poll_args { 1138 struct pollfd *fds; 1139 u_int nfds; 1140 int timeout; 1141 }; 1142 #endif 1143 int 1144 poll(td, uap) 1145 struct thread *td; 1146 struct poll_args *uap; 1147 { 1148 struct pollfd *bits; 1149 struct pollfd smallbits[32]; 1150 struct timeval atv, rtv, ttv; 1151 int error = 0, timo; 1152 u_int nfds; 1153 size_t ni; 1154 1155 nfds = uap->nfds; 1156 if (nfds > maxfilesperproc && nfds > FD_SETSIZE) 1157 return (EINVAL); 1158 ni = nfds * sizeof(struct pollfd); 1159 if (ni > sizeof(smallbits)) 1160 bits = malloc(ni, M_TEMP, M_WAITOK); 1161 else 1162 bits = smallbits; 1163 error = copyin(uap->fds, bits, ni); 1164 if (error) 1165 goto done; 1166 if (uap->timeout != INFTIM) { 1167 atv.tv_sec = uap->timeout / 1000; 1168 atv.tv_usec = (uap->timeout % 1000) * 1000; 1169 if (itimerfix(&atv)) { 1170 error = EINVAL; 1171 goto done; 1172 } 1173 getmicrouptime(&rtv); 1174 timevaladd(&atv, &rtv); 1175 } else { 1176 atv.tv_sec = 0; 1177 atv.tv_usec = 0; 1178 } 1179 timo = 0; 1180 seltdinit(td); 1181 /* Iterate until the timeout expires or descriptors become ready. 

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = uap->nfds;
	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	seltdinit(td);
	/* Iterate until the timeout expires or descriptors become ready. */
	for (;;) {
		error = pollscan(td, bits, nfds);
		if (error || td->td_retval[0] != 0)
			break;
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=))
				break;
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		if (error)
			break;
		error = pollrescan(td);
		if (error || td->td_retval[0] != 0)
			break;
	}
	seltdclear(td);

done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = pollout(td, bits, uap->fds, nfds);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

static int
pollrescan(struct thread *td)
{
	struct seltd *stp;
	struct selfd *sfp;
	struct selfd *sfn;
	struct selinfo *si;
	struct filedesc *fdp;
	struct file *fp;
	struct pollfd *fd;
	int n;

	n = 0;
	fdp = td->td_proc->p_fd;
	stp = td->td_sel;
	FILEDESC_SLOCK(fdp);
	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
		fd = (struct pollfd *)sfp->sf_cookie;
		si = sfp->sf_si;
		selfdfree(stp, sfp);
		/* If the selinfo wasn't cleared the event didn't fire. */
		if (si != NULL)
			continue;
		fp = fdp->fd_ofiles[fd->fd];
		if (fp == NULL) {
			fd->revents = POLLNVAL;
			n++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
		if (fd->revents != 0)
			n++;
	}
	FILEDESC_SUNLOCK(fdp);
	stp->st_flags = 0;
	td->td_retval[0] = n;
	return (0);
}

static int
pollout(td, fds, ufds, nfd)
	struct thread *td;
	struct pollfd *fds;
	struct pollfd *ufds;
	u_int nfd;
{
	int error = 0;
	u_int i = 0;
	u_int n = 0;

	for (i = 0; i < nfd; i++) {
		error = copyout(&fds->revents, &ufds->revents,
		    sizeof(ufds->revents));
		if (error)
			return (error);
		if (fds->revents != 0)
			n++;
		fds++;
		ufds++;
	}
	td->td_retval[0] = n;
	return (0);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				selfdalloc(td, fds);
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				/*
				 * POSIX requires POLLOUT to be never
				 * set simultaneously with POLLHUP.
				 */
				if ((fds->revents & POLLHUP) != 0)
					fds->revents &= ~POLLOUT;

				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
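
/*
 * Example (userland sketch, not part of this file; handle_input() is a
 * hypothetical helper): the POLLNVAL and POLLHUP conventions
 * implemented above are what callers test for:
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, 1000) > 0) {
 *		if (pfd.revents & POLLNVAL)
 *			errx(1, "bad descriptor");
 *		if (pfd.revents & (POLLIN | POLLHUP))
 *			handle_input();
 *	}
 */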

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation...  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
openbsd_poll(td, uap)
	struct thread *td;
	struct openbsd_poll_args *uap;
{

	return (poll(td, (struct poll_args *)uap));
}

/*
 * XXX This was created specifically to support netncp and netsmb.  This
 * allows the caller to specify a socket to wait for events on.  It returns
 * 0 if any events matched and an error otherwise.  There is no way to
 * determine which events fired.
 */
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
	struct timeval atv, rtv, ttv;
	int error, timo;

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv))
			return (EINVAL);
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}

	timo = 0;
	seltdinit(td);
	/*
	 * Iterate until the timeout expires or the socket becomes ready.
	 */
	for (;;) {
		selfdalloc(td, NULL);
		error = sopoll(so, events, NULL, td);
		/* error here is actually the ready events. */
		if (error)
			return (0);
		if (atv.tv_sec || atv.tv_usec) {
			getmicrouptime(&rtv);
			if (timevalcmp(&rtv, &atv, >=)) {
				seltdclear(td);
				return (EWOULDBLOCK);
			}
			ttv = atv;
			timevalsub(&ttv, &rtv);
			timo = ttv.tv_sec > 24 * 60 * 60 ?
			    24 * 60 * 60 * hz : tvtohz(&ttv);
		}
		error = seltdwait(td, timo);
		seltdclear(td);
		if (error)
			break;
	}
	/* XXX Duplicates ncp/smb behavior. */
	if (error == ERESTART)
		error = 0;
	return (error);
}

/*
 * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
 * have two select sets, one for read and another for write.
 */
static void
selfdalloc(struct thread *td, void *cookie)
{
	struct seltd *stp;

	stp = td->td_sel;
	if (stp->st_free1 == NULL)
		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free1->sf_td = stp;
	stp->st_free1->sf_cookie = cookie;
	if (stp->st_free2 == NULL)
		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
	stp->st_free2->sf_td = stp;
	stp->st_free2->sf_cookie = cookie;
}

static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
	mtx_lock(sfp->sf_mtx);
	if (sfp->sf_si)
		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
	mtx_unlock(sfp->sf_mtx);
	uma_zfree(selfd_zone, sfp);
}
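
/*
 * Example (informational sketch; the foo_* names are hypothetical): the
 * usual producer/consumer pattern around selrecord()/selwakeup() below
 * is a driver poll method that records the polling thread when no data
 * is ready:
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		mtx_lock(&sc->sc_mtx);
 *		if (sc->sc_ready)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		mtx_unlock(&sc->sc_mtx);
 *		return (revents);
 *	}
 *
 * paired with selwakeup(&sc->sc_rsel) in the interrupt or input path
 * once data arrives.
 */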
1465 */ 1466 sfp = NULL; 1467 if ((sfp = stp->st_free1) != NULL) 1468 stp->st_free1 = NULL; 1469 else if ((sfp = stp->st_free2) != NULL) 1470 stp->st_free2 = NULL; 1471 else 1472 panic("selrecord: No free selfd on selq"); 1473 mtxp = sip->si_mtx; 1474 if (mtxp == NULL) 1475 mtxp = mtx_pool_find(mtxpool_select, sip); 1476 /* 1477 * Initialize the sfp and queue it in the thread. 1478 */ 1479 sfp->sf_si = sip; 1480 sfp->sf_mtx = mtxp; 1481 STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); 1482 /* 1483 * Now that we've locked the sip, check for initialization. 1484 */ 1485 mtx_lock(mtxp); 1486 if (sip->si_mtx == NULL) { 1487 sip->si_mtx = mtxp; 1488 TAILQ_INIT(&sip->si_tdlist); 1489 } 1490 /* 1491 * Add this thread to the list of selfds listening on this selinfo. 1492 */ 1493 TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); 1494 mtx_unlock(sip->si_mtx); 1495 } 1496 1497 /* Wake up a selecting thread. */ 1498 void 1499 selwakeup(sip) 1500 struct selinfo *sip; 1501 { 1502 doselwakeup(sip, -1); 1503 } 1504 1505 /* Wake up a selecting thread, and set its priority. */ 1506 void 1507 selwakeuppri(sip, pri) 1508 struct selinfo *sip; 1509 int pri; 1510 { 1511 doselwakeup(sip, pri); 1512 } 1513 1514 /* 1515 * Do a wakeup when a selectable event occurs. 1516 */ 1517 static void 1518 doselwakeup(sip, pri) 1519 struct selinfo *sip; 1520 int pri; 1521 { 1522 struct selfd *sfp; 1523 struct selfd *sfn; 1524 struct seltd *stp; 1525 1526 /* If it's not initialized there can't be any waiters. */ 1527 if (sip->si_mtx == NULL) 1528 return; 1529 /* 1530 * Locking the selinfo locks all selfds associated with it. 1531 */ 1532 mtx_lock(sip->si_mtx); 1533 TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { 1534 /* 1535 * Once we remove this sfp from the list and clear the 1536 * sf_si seltdclear will know to ignore this si. 1537 */ 1538 TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); 1539 sfp->sf_si = NULL; 1540 stp = sfp->sf_td; 1541 mtx_lock(&stp->st_mtx); 1542 stp->st_flags |= SELTD_PENDING; 1543 cv_broadcastpri(&stp->st_wait, pri); 1544 mtx_unlock(&stp->st_mtx); 1545 } 1546 mtx_unlock(sip->si_mtx); 1547 } 1548 1549 static void 1550 seltdinit(struct thread *td) 1551 { 1552 struct seltd *stp; 1553 1554 if ((stp = td->td_sel) != NULL) 1555 goto out; 1556 td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); 1557 mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); 1558 cv_init(&stp->st_wait, "select"); 1559 out: 1560 stp->st_flags = 0; 1561 STAILQ_INIT(&stp->st_selq); 1562 } 1563 1564 static int 1565 seltdwait(struct thread *td, int timo) 1566 { 1567 struct seltd *stp; 1568 int error; 1569 1570 stp = td->td_sel; 1571 /* 1572 * An event of interest may occur while we do not hold the seltd 1573 * locked so check the pending flag before we sleep. 1574 */ 1575 mtx_lock(&stp->st_mtx); 1576 /* 1577 * Any further calls to selrecord will be a rescan. 
1578 */ 1579 stp->st_flags |= SELTD_RESCAN; 1580 if (stp->st_flags & SELTD_PENDING) { 1581 mtx_unlock(&stp->st_mtx); 1582 return (0); 1583 } 1584 if (timo > 0) 1585 error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); 1586 else 1587 error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); 1588 mtx_unlock(&stp->st_mtx); 1589 1590 return (error); 1591 } 1592 1593 void 1594 seltdfini(struct thread *td) 1595 { 1596 struct seltd *stp; 1597 1598 stp = td->td_sel; 1599 if (stp == NULL) 1600 return; 1601 if (stp->st_free1) 1602 uma_zfree(selfd_zone, stp->st_free1); 1603 if (stp->st_free2) 1604 uma_zfree(selfd_zone, stp->st_free2); 1605 td->td_sel = NULL; 1606 free(stp, M_SELECT); 1607 } 1608 1609 /* 1610 * Remove the references to the thread from all of the objects we were 1611 * polling. 1612 */ 1613 static void 1614 seltdclear(struct thread *td) 1615 { 1616 struct seltd *stp; 1617 struct selfd *sfp; 1618 struct selfd *sfn; 1619 1620 stp = td->td_sel; 1621 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) 1622 selfdfree(stp, sfp); 1623 stp->st_flags = 0; 1624 } 1625 1626 static void selectinit(void *); 1627 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); 1628 static void 1629 selectinit(void *dummy __unused) 1630 { 1631 1632 selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, 1633 NULL, NULL, UMA_ALIGN_PTR, 0); 1634 mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); 1635 } 1636