1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/sysproto.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/proc.h> 51 #include <sys/signalvar.h> 52 #include <sys/socketvar.h> 53 #include <sys/uio.h> 54 #include <sys/kernel.h> 55 #include <sys/ktr.h> 56 #include <sys/limits.h> 57 #include <sys/malloc.h> 58 #include <sys/poll.h> 59 #include <sys/resourcevar.h> 60 #include <sys/selinfo.h> 61 #include <sys/sleepqueue.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/vnode.h> 66 #include <sys/bio.h> 67 #include <sys/buf.h> 68 #include <sys/condvar.h> 69 #ifdef KTRACE 70 #include <sys/ktrace.h> 71 #endif 72 73 #include <security/audit/audit.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 77 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 78 79 static int pollout(struct pollfd *, struct pollfd *, u_int); 80 static int pollscan(struct thread *, struct pollfd *, u_int); 81 static int pollrescan(struct thread *); 82 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 83 static int selrescan(struct thread *, fd_mask **, fd_mask **); 84 static void selfdalloc(struct thread *, void *); 85 static void selfdfree(struct seltd *, struct selfd *); 86 static int dofileread(struct thread *, int, struct file *, struct uio *, 87 off_t, int); 88 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 89 off_t, int); 90 static void doselwakeup(struct selinfo *, int); 91 static void seltdinit(struct thread *); 92 static int seltdwait(struct thread *, int); 93 static void seltdclear(struct thread *); 94 95 /* 96 * One seltd per-thread allocated on demand as needed. 97 * 98 * t - protected by st_mtx 99 * k - Only accessed by curthread or read-only 100 */ 101 struct seltd { 102 STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ 103 struct selfd *st_free1; /* (k) free fd for read set. */ 104 struct selfd *st_free2; /* (k) free fd for write set. */ 105 struct mtx st_mtx; /* Protects struct seltd */ 106 struct cv st_wait; /* (t) Wait channel. */ 107 int st_flags; /* (t) SELTD_ flags. */ 108 }; 109 110 #define SELTD_PENDING 0x0001 /* We have pending events. */ 111 #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ 112 113 /* 114 * One selfd allocated per-thread per-file-descriptor. 115 * f - protected by sf_mtx 116 */ 117 struct selfd { 118 STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ 119 TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ 120 struct selinfo *sf_si; /* (f) selinfo when linked. */ 121 struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ 122 struct seltd *sf_td; /* (k) owning seltd. */ 123 void *sf_cookie; /* (k) fd or pollfd. */ 124 }; 125 126 static uma_zone_t selfd_zone; 127 static struct mtx_pool *mtxpool_select; 128 129 #ifndef _SYS_SYSPROTO_H_ 130 struct read_args { 131 int fd; 132 void *buf; 133 size_t nbyte; 134 }; 135 #endif 136 int 137 read(td, uap) 138 struct thread *td; 139 struct read_args *uap; 140 { 141 struct uio auio; 142 struct iovec aiov; 143 int error; 144 145 if (uap->nbyte > INT_MAX) 146 return (EINVAL); 147 aiov.iov_base = uap->buf; 148 aiov.iov_len = uap->nbyte; 149 auio.uio_iov = &aiov; 150 auio.uio_iovcnt = 1; 151 auio.uio_resid = uap->nbyte; 152 auio.uio_segflg = UIO_USERSPACE; 153 error = kern_readv(td, uap->fd, &auio); 154 return(error); 155 } 156 157 /* 158 * Positioned read system call 159 */ 160 #ifndef _SYS_SYSPROTO_H_ 161 struct pread_args { 162 int fd; 163 void *buf; 164 size_t nbyte; 165 int pad; 166 off_t offset; 167 }; 168 #endif 169 int 170 pread(td, uap) 171 struct thread *td; 172 struct pread_args *uap; 173 { 174 struct uio auio; 175 struct iovec aiov; 176 int error; 177 178 if (uap->nbyte > INT_MAX) 179 return (EINVAL); 180 aiov.iov_base = uap->buf; 181 aiov.iov_len = uap->nbyte; 182 auio.uio_iov = &aiov; 183 auio.uio_iovcnt = 1; 184 auio.uio_resid = uap->nbyte; 185 auio.uio_segflg = UIO_USERSPACE; 186 error = kern_preadv(td, uap->fd, &auio, uap->offset); 187 return(error); 188 } 189 190 int 191 freebsd6_pread(td, uap) 192 struct thread *td; 193 struct freebsd6_pread_args *uap; 194 { 195 struct pread_args oargs; 196 197 oargs.fd = uap->fd; 198 oargs.buf = uap->buf; 199 oargs.nbyte = uap->nbyte; 200 oargs.offset = uap->offset; 201 return (pread(td, &oargs)); 202 } 203 204 /* 205 * Scatter read system call. 206 */ 207 #ifndef _SYS_SYSPROTO_H_ 208 struct readv_args { 209 int fd; 210 struct iovec *iovp; 211 u_int iovcnt; 212 }; 213 #endif 214 int 215 readv(struct thread *td, struct readv_args *uap) 216 { 217 struct uio *auio; 218 int error; 219 220 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 221 if (error) 222 return (error); 223 error = kern_readv(td, uap->fd, auio); 224 free(auio, M_IOV); 225 return (error); 226 } 227 228 int 229 kern_readv(struct thread *td, int fd, struct uio *auio) 230 { 231 struct file *fp; 232 int error; 233 234 error = fget_read(td, fd, &fp); 235 if (error) 236 return (error); 237 error = dofileread(td, fd, fp, auio, (off_t)-1, 0); 238 fdrop(fp, td); 239 return (error); 240 } 241 242 /* 243 * Scatter positioned read system call. 244 */ 245 #ifndef _SYS_SYSPROTO_H_ 246 struct preadv_args { 247 int fd; 248 struct iovec *iovp; 249 u_int iovcnt; 250 off_t offset; 251 }; 252 #endif 253 int 254 preadv(struct thread *td, struct preadv_args *uap) 255 { 256 struct uio *auio; 257 int error; 258 259 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 260 if (error) 261 return (error); 262 error = kern_preadv(td, uap->fd, auio, uap->offset); 263 free(auio, M_IOV); 264 return (error); 265 } 266 267 int 268 kern_preadv(td, fd, auio, offset) 269 struct thread *td; 270 int fd; 271 struct uio *auio; 272 off_t offset; 273 { 274 struct file *fp; 275 int error; 276 277 error = fget_read(td, fd, &fp); 278 if (error) 279 return (error); 280 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 281 error = ESPIPE; 282 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 283 error = EINVAL; 284 else 285 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); 286 fdrop(fp, td); 287 return (error); 288 } 289 290 /* 291 * Common code for readv and preadv that reads data in 292 * from a file using the passed in uio, offset, and flags. 293 */ 294 static int 295 dofileread(td, fd, fp, auio, offset, flags) 296 struct thread *td; 297 int fd; 298 struct file *fp; 299 struct uio *auio; 300 off_t offset; 301 int flags; 302 { 303 ssize_t cnt; 304 int error; 305 #ifdef KTRACE 306 struct uio *ktruio = NULL; 307 #endif 308 309 /* Finish zero length reads right here */ 310 if (auio->uio_resid == 0) { 311 td->td_retval[0] = 0; 312 return(0); 313 } 314 auio->uio_rw = UIO_READ; 315 auio->uio_offset = offset; 316 auio->uio_td = td; 317 #ifdef KTRACE 318 if (KTRPOINT(td, KTR_GENIO)) 319 ktruio = cloneuio(auio); 320 #endif 321 cnt = auio->uio_resid; 322 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 323 if (auio->uio_resid != cnt && (error == ERESTART || 324 error == EINTR || error == EWOULDBLOCK)) 325 error = 0; 326 } 327 cnt -= auio->uio_resid; 328 #ifdef KTRACE 329 if (ktruio != NULL) { 330 ktruio->uio_resid = cnt; 331 ktrgenio(fd, UIO_READ, ktruio, error); 332 } 333 #endif 334 td->td_retval[0] = cnt; 335 return (error); 336 } 337 338 #ifndef _SYS_SYSPROTO_H_ 339 struct write_args { 340 int fd; 341 const void *buf; 342 size_t nbyte; 343 }; 344 #endif 345 int 346 write(td, uap) 347 struct thread *td; 348 struct write_args *uap; 349 { 350 struct uio auio; 351 struct iovec aiov; 352 int error; 353 354 if (uap->nbyte > INT_MAX) 355 return (EINVAL); 356 aiov.iov_base = (void *)(uintptr_t)uap->buf; 357 aiov.iov_len = uap->nbyte; 358 auio.uio_iov = &aiov; 359 auio.uio_iovcnt = 1; 360 auio.uio_resid = uap->nbyte; 361 auio.uio_segflg = UIO_USERSPACE; 362 error = kern_writev(td, uap->fd, &auio); 363 return(error); 364 } 365 366 /* 367 * Positioned write system call. 368 */ 369 #ifndef _SYS_SYSPROTO_H_ 370 struct pwrite_args { 371 int fd; 372 const void *buf; 373 size_t nbyte; 374 int pad; 375 off_t offset; 376 }; 377 #endif 378 int 379 pwrite(td, uap) 380 struct thread *td; 381 struct pwrite_args *uap; 382 { 383 struct uio auio; 384 struct iovec aiov; 385 int error; 386 387 if (uap->nbyte > INT_MAX) 388 return (EINVAL); 389 aiov.iov_base = (void *)(uintptr_t)uap->buf; 390 aiov.iov_len = uap->nbyte; 391 auio.uio_iov = &aiov; 392 auio.uio_iovcnt = 1; 393 auio.uio_resid = uap->nbyte; 394 auio.uio_segflg = UIO_USERSPACE; 395 error = kern_pwritev(td, uap->fd, &auio, uap->offset); 396 return(error); 397 } 398 399 int 400 freebsd6_pwrite(td, uap) 401 struct thread *td; 402 struct freebsd6_pwrite_args *uap; 403 { 404 struct pwrite_args oargs; 405 406 oargs.fd = uap->fd; 407 oargs.buf = uap->buf; 408 oargs.nbyte = uap->nbyte; 409 oargs.offset = uap->offset; 410 return (pwrite(td, &oargs)); 411 } 412 413 /* 414 * Gather write system call. 415 */ 416 #ifndef _SYS_SYSPROTO_H_ 417 struct writev_args { 418 int fd; 419 struct iovec *iovp; 420 u_int iovcnt; 421 }; 422 #endif 423 int 424 writev(struct thread *td, struct writev_args *uap) 425 { 426 struct uio *auio; 427 int error; 428 429 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 430 if (error) 431 return (error); 432 error = kern_writev(td, uap->fd, auio); 433 free(auio, M_IOV); 434 return (error); 435 } 436 437 int 438 kern_writev(struct thread *td, int fd, struct uio *auio) 439 { 440 struct file *fp; 441 int error; 442 443 error = fget_write(td, fd, &fp); 444 if (error) 445 return (error); 446 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 447 fdrop(fp, td); 448 return (error); 449 } 450 451 /* 452 * Gather positioned write system call. 453 */ 454 #ifndef _SYS_SYSPROTO_H_ 455 struct pwritev_args { 456 int fd; 457 struct iovec *iovp; 458 u_int iovcnt; 459 off_t offset; 460 }; 461 #endif 462 int 463 pwritev(struct thread *td, struct pwritev_args *uap) 464 { 465 struct uio *auio; 466 int error; 467 468 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 469 if (error) 470 return (error); 471 error = kern_pwritev(td, uap->fd, auio, uap->offset); 472 free(auio, M_IOV); 473 return (error); 474 } 475 476 int 477 kern_pwritev(td, fd, auio, offset) 478 struct thread *td; 479 struct uio *auio; 480 int fd; 481 off_t offset; 482 { 483 struct file *fp; 484 int error; 485 486 error = fget_write(td, fd, &fp); 487 if (error) 488 return (error); 489 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 490 error = ESPIPE; 491 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 492 error = EINVAL; 493 else 494 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 495 fdrop(fp, td); 496 return (error); 497 } 498 499 /* 500 * Common code for writev and pwritev that writes data to 501 * a file using the passed in uio, offset, and flags. 502 */ 503 static int 504 dofilewrite(td, fd, fp, auio, offset, flags) 505 struct thread *td; 506 int fd; 507 struct file *fp; 508 struct uio *auio; 509 off_t offset; 510 int flags; 511 { 512 ssize_t cnt; 513 int error; 514 #ifdef KTRACE 515 struct uio *ktruio = NULL; 516 #endif 517 518 auio->uio_rw = UIO_WRITE; 519 auio->uio_td = td; 520 auio->uio_offset = offset; 521 #ifdef KTRACE 522 if (KTRPOINT(td, KTR_GENIO)) 523 ktruio = cloneuio(auio); 524 #endif 525 cnt = auio->uio_resid; 526 if (fp->f_type == DTYPE_VNODE) 527 bwillwrite(); 528 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { 529 if (auio->uio_resid != cnt && (error == ERESTART || 530 error == EINTR || error == EWOULDBLOCK)) 531 error = 0; 532 /* Socket layer is responsible for issuing SIGPIPE. */ 533 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { 534 PROC_LOCK(td->td_proc); 535 psignal(td->td_proc, SIGPIPE); 536 PROC_UNLOCK(td->td_proc); 537 } 538 } 539 cnt -= auio->uio_resid; 540 #ifdef KTRACE 541 if (ktruio != NULL) { 542 ktruio->uio_resid = cnt; 543 ktrgenio(fd, UIO_WRITE, ktruio, error); 544 } 545 #endif 546 td->td_retval[0] = cnt; 547 return (error); 548 } 549 550 /* 551 * Truncate a file given a file descriptor. 552 * 553 * Can't use fget_write() here, since must return EINVAL and not EBADF if the 554 * descriptor isn't writable. 555 */ 556 int 557 kern_ftruncate(td, fd, length) 558 struct thread *td; 559 int fd; 560 off_t length; 561 { 562 struct file *fp; 563 int error; 564 565 AUDIT_ARG_FD(fd); 566 if (length < 0) 567 return (EINVAL); 568 error = fget(td, fd, &fp); 569 if (error) 570 return (error); 571 AUDIT_ARG_FILE(td->td_proc, fp); 572 if (!(fp->f_flag & FWRITE)) { 573 fdrop(fp, td); 574 return (EINVAL); 575 } 576 error = fo_truncate(fp, length, td->td_ucred, td); 577 fdrop(fp, td); 578 return (error); 579 } 580 581 #ifndef _SYS_SYSPROTO_H_ 582 struct ftruncate_args { 583 int fd; 584 int pad; 585 off_t length; 586 }; 587 #endif 588 int 589 ftruncate(td, uap) 590 struct thread *td; 591 struct ftruncate_args *uap; 592 { 593 594 return (kern_ftruncate(td, uap->fd, uap->length)); 595 } 596 597 #if defined(COMPAT_43) 598 #ifndef _SYS_SYSPROTO_H_ 599 struct oftruncate_args { 600 int fd; 601 long length; 602 }; 603 #endif 604 int 605 oftruncate(td, uap) 606 struct thread *td; 607 struct oftruncate_args *uap; 608 { 609 610 return (kern_ftruncate(td, uap->fd, uap->length)); 611 } 612 #endif /* COMPAT_43 */ 613 614 #ifndef _SYS_SYSPROTO_H_ 615 struct ioctl_args { 616 int fd; 617 u_long com; 618 caddr_t data; 619 }; 620 #endif 621 /* ARGSUSED */ 622 int 623 ioctl(struct thread *td, struct ioctl_args *uap) 624 { 625 u_long com; 626 int arg, error; 627 u_int size; 628 caddr_t data; 629 630 if (uap->com > 0xffffffff) { 631 printf( 632 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 633 td->td_proc->p_pid, td->td_name, uap->com); 634 uap->com &= 0xffffffff; 635 } 636 com = uap->com; 637 638 /* 639 * Interpret high order word to find amount of data to be 640 * copied to/from the user's address space. 641 */ 642 size = IOCPARM_LEN(com); 643 if ((size > IOCPARM_MAX) || 644 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 645 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 646 ((com & IOC_OUT) && size == 0) || 647 #else 648 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 649 #endif 650 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 651 return (ENOTTY); 652 653 if (size > 0) { 654 if (com & IOC_VOID) { 655 /* Integer argument. */ 656 arg = (intptr_t)uap->data; 657 data = (void *)&arg; 658 size = 0; 659 } else 660 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 661 } else 662 data = (void *)&uap->data; 663 if (com & IOC_IN) { 664 error = copyin(uap->data, data, (u_int)size); 665 if (error) { 666 if (size > 0) 667 free(data, M_IOCTLOPS); 668 return (error); 669 } 670 } else if (com & IOC_OUT) { 671 /* 672 * Zero the buffer so the user always 673 * gets back something deterministic. 674 */ 675 bzero(data, size); 676 } 677 678 error = kern_ioctl(td, uap->fd, com, data); 679 680 if (error == 0 && (com & IOC_OUT)) 681 error = copyout(data, uap->data, (u_int)size); 682 683 if (size > 0) 684 free(data, M_IOCTLOPS); 685 return (error); 686 } 687 688 int 689 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 690 { 691 struct file *fp; 692 struct filedesc *fdp; 693 int error; 694 int tmp; 695 696 AUDIT_ARG_FD(fd); 697 AUDIT_ARG_CMD(com); 698 if ((error = fget(td, fd, &fp)) != 0) 699 return (error); 700 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 701 fdrop(fp, td); 702 return (EBADF); 703 } 704 fdp = td->td_proc->p_fd; 705 switch (com) { 706 case FIONCLEX: 707 FILEDESC_XLOCK(fdp); 708 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; 709 FILEDESC_XUNLOCK(fdp); 710 goto out; 711 case FIOCLEX: 712 FILEDESC_XLOCK(fdp); 713 fdp->fd_ofileflags[fd] |= UF_EXCLOSE; 714 FILEDESC_XUNLOCK(fdp); 715 goto out; 716 case FIONBIO: 717 if ((tmp = *(int *)data)) 718 atomic_set_int(&fp->f_flag, FNONBLOCK); 719 else 720 atomic_clear_int(&fp->f_flag, FNONBLOCK); 721 data = (void *)&tmp; 722 break; 723 case FIOASYNC: 724 if ((tmp = *(int *)data)) 725 atomic_set_int(&fp->f_flag, FASYNC); 726 else 727 atomic_clear_int(&fp->f_flag, FASYNC); 728 data = (void *)&tmp; 729 break; 730 } 731 732 error = fo_ioctl(fp, com, data, td->td_ucred, td); 733 out: 734 fdrop(fp, td); 735 return (error); 736 } 737 738 int 739 poll_no_poll(int events) 740 { 741 /* 742 * Return true for read/write. If the user asked for something 743 * special, return POLLNVAL, so that clients have a way of 744 * determining reliably whether or not the extended 745 * functionality is present without hard-coding knowledge 746 * of specific filesystem implementations. 747 */ 748 if (events & ~POLLSTANDARD) 749 return (POLLNVAL); 750 751 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 752 } 753 754 int 755 pselect(struct thread *td, struct pselect_args *uap) 756 { 757 struct timespec ts; 758 struct timeval tv, *tvp; 759 sigset_t set, *uset; 760 int error; 761 762 if (uap->ts != NULL) { 763 error = copyin(uap->ts, &ts, sizeof(ts)); 764 if (error != 0) 765 return (error); 766 TIMESPEC_TO_TIMEVAL(&tv, &ts); 767 tvp = &tv; 768 } else 769 tvp = NULL; 770 if (uap->sm != NULL) { 771 error = copyin(uap->sm, &set, sizeof(set)); 772 if (error != 0) 773 return (error); 774 uset = &set; 775 } else 776 uset = NULL; 777 return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 778 uset, NFDBITS)); 779 } 780 781 int 782 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, 783 struct timeval *tvp, sigset_t *uset, int abi_nfdbits) 784 { 785 int error; 786 787 if (uset != NULL) { 788 error = kern_sigprocmask(td, SIG_SETMASK, uset, 789 &td->td_oldsigmask, 0); 790 if (error != 0) 791 return (error); 792 td->td_pflags |= TDP_OLDMASK; 793 /* 794 * Make sure that ast() is called on return to 795 * usermode and TDP_OLDMASK is cleared, restoring old 796 * sigmask. 797 */ 798 thread_lock(td); 799 td->td_flags |= TDF_ASTPENDING; 800 thread_unlock(td); 801 } 802 error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); 803 return (error); 804 } 805 806 #ifndef _SYS_SYSPROTO_H_ 807 struct select_args { 808 int nd; 809 fd_set *in, *ou, *ex; 810 struct timeval *tv; 811 }; 812 #endif 813 int 814 select(struct thread *td, struct select_args *uap) 815 { 816 struct timeval tv, *tvp; 817 int error; 818 819 if (uap->tv != NULL) { 820 error = copyin(uap->tv, &tv, sizeof(tv)); 821 if (error) 822 return (error); 823 tvp = &tv; 824 } else 825 tvp = NULL; 826 827 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, 828 NFDBITS)); 829 } 830 831 int 832 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 833 fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) 834 { 835 struct filedesc *fdp; 836 /* 837 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 838 * infds with the new FD_SETSIZE of 1024, and more than enough for 839 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 840 * of 256. 841 */ 842 fd_mask s_selbits[howmany(2048, NFDBITS)]; 843 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 844 struct timeval atv, rtv, ttv; 845 int error, timo; 846 u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; 847 848 if (nd < 0) 849 return (EINVAL); 850 fdp = td->td_proc->p_fd; 851 if (nd > fdp->fd_lastfile + 1) 852 nd = fdp->fd_lastfile + 1; 853 854 /* 855 * Allocate just enough bits for the non-null fd_sets. Use the 856 * preallocated auto buffer if possible. 857 */ 858 nfdbits = roundup(nd, NFDBITS); 859 ncpbytes = nfdbits / NBBY; 860 ncpubytes = roundup(nd, abi_nfdbits) / NBBY; 861 nbufbytes = 0; 862 if (fd_in != NULL) 863 nbufbytes += 2 * ncpbytes; 864 if (fd_ou != NULL) 865 nbufbytes += 2 * ncpbytes; 866 if (fd_ex != NULL) 867 nbufbytes += 2 * ncpbytes; 868 if (nbufbytes <= sizeof s_selbits) 869 selbits = &s_selbits[0]; 870 else 871 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 872 873 /* 874 * Assign pointers into the bit buffers and fetch the input bits. 875 * Put the output buffers together so that they can be bzeroed 876 * together. 877 */ 878 sbp = selbits; 879 #define getbits(name, x) \ 880 do { \ 881 if (name == NULL) { \ 882 ibits[x] = NULL; \ 883 obits[x] = NULL; \ 884 } else { \ 885 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 886 obits[x] = sbp; \ 887 sbp += ncpbytes / sizeof *sbp; \ 888 error = copyin(name, ibits[x], ncpubytes); \ 889 if (error != 0) \ 890 goto done; \ 891 bzero((char *)ibits[x] + ncpubytes, \ 892 ncpbytes - ncpubytes); \ 893 } \ 894 } while (0) 895 getbits(fd_in, 0); 896 getbits(fd_ou, 1); 897 getbits(fd_ex, 2); 898 #undef getbits 899 900 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) 901 /* 902 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, 903 * we are running under 32-bit emulation. This should be more 904 * generic. 905 */ 906 #define swizzle_fdset(bits) \ 907 if (abi_nfdbits != NFDBITS && bits != NULL) { \ 908 int i; \ 909 for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ 910 bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ 911 } 912 #else 913 #define swizzle_fdset(bits) 914 #endif 915 916 /* Make sure the bit order makes it through an ABI transition */ 917 swizzle_fdset(ibits[0]); 918 swizzle_fdset(ibits[1]); 919 swizzle_fdset(ibits[2]); 920 921 if (nbufbytes != 0) 922 bzero(selbits, nbufbytes / 2); 923 924 if (tvp != NULL) { 925 atv = *tvp; 926 if (itimerfix(&atv)) { 927 error = EINVAL; 928 goto done; 929 } 930 getmicrouptime(&rtv); 931 timevaladd(&atv, &rtv); 932 } else { 933 atv.tv_sec = 0; 934 atv.tv_usec = 0; 935 } 936 timo = 0; 937 seltdinit(td); 938 /* Iterate until the timeout expires or descriptors become ready. */ 939 for (;;) { 940 error = selscan(td, ibits, obits, nd); 941 if (error || td->td_retval[0] != 0) 942 break; 943 if (atv.tv_sec || atv.tv_usec) { 944 getmicrouptime(&rtv); 945 if (timevalcmp(&rtv, &atv, >=)) 946 break; 947 ttv = atv; 948 timevalsub(&ttv, &rtv); 949 timo = ttv.tv_sec > 24 * 60 * 60 ? 950 24 * 60 * 60 * hz : tvtohz(&ttv); 951 } 952 error = seltdwait(td, timo); 953 if (error) 954 break; 955 error = selrescan(td, ibits, obits); 956 if (error || td->td_retval[0] != 0) 957 break; 958 } 959 seltdclear(td); 960 961 done: 962 /* select is not restarted after signals... */ 963 if (error == ERESTART) 964 error = EINTR; 965 if (error == EWOULDBLOCK) 966 error = 0; 967 968 /* swizzle bit order back, if necessary */ 969 swizzle_fdset(obits[0]); 970 swizzle_fdset(obits[1]); 971 swizzle_fdset(obits[2]); 972 #undef swizzle_fdset 973 974 #define putbits(name, x) \ 975 if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ 976 error = error2; 977 if (error == 0) { 978 int error2; 979 980 putbits(fd_in, 0); 981 putbits(fd_ou, 1); 982 putbits(fd_ex, 2); 983 #undef putbits 984 } 985 if (selbits != &s_selbits[0]) 986 free(selbits, M_SELECT); 987 988 return (error); 989 } 990 /* 991 * Convert a select bit set to poll flags. 992 * 993 * The backend always returns POLLHUP/POLLERR if appropriate and we 994 * return this as a set bit in any set. 995 */ 996 static int select_flags[3] = { 997 POLLRDNORM | POLLHUP | POLLERR, 998 POLLWRNORM | POLLHUP | POLLERR, 999 POLLRDBAND | POLLERR 1000 }; 1001 1002 /* 1003 * Compute the fo_poll flags required for a fd given by the index and 1004 * bit position in the fd_mask array. 1005 */ 1006 static __inline int 1007 selflags(fd_mask **ibits, int idx, fd_mask bit) 1008 { 1009 int flags; 1010 int msk; 1011 1012 flags = 0; 1013 for (msk = 0; msk < 3; msk++) { 1014 if (ibits[msk] == NULL) 1015 continue; 1016 if ((ibits[msk][idx] & bit) == 0) 1017 continue; 1018 flags |= select_flags[msk]; 1019 } 1020 return (flags); 1021 } 1022 1023 /* 1024 * Set the appropriate output bits given a mask of fired events and the 1025 * input bits originally requested. 1026 */ 1027 static __inline int 1028 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) 1029 { 1030 int msk; 1031 int n; 1032 1033 n = 0; 1034 for (msk = 0; msk < 3; msk++) { 1035 if ((events & select_flags[msk]) == 0) 1036 continue; 1037 if (ibits[msk] == NULL) 1038 continue; 1039 if ((ibits[msk][idx] & bit) == 0) 1040 continue; 1041 /* 1042 * XXX Check for a duplicate set. This can occur because a 1043 * socket calls selrecord() twice for each poll() call 1044 * resulting in two selfds per real fd. selrescan() will 1045 * call selsetbits twice as a result. 1046 */ 1047 if ((obits[msk][idx] & bit) != 0) 1048 continue; 1049 obits[msk][idx] |= bit; 1050 n++; 1051 } 1052 1053 return (n); 1054 } 1055 1056 /* 1057 * Traverse the list of fds attached to this thread's seltd and check for 1058 * completion. 1059 */ 1060 static int 1061 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) 1062 { 1063 struct filedesc *fdp; 1064 struct selinfo *si; 1065 struct seltd *stp; 1066 struct selfd *sfp; 1067 struct selfd *sfn; 1068 struct file *fp; 1069 fd_mask bit; 1070 int fd, ev, n, idx; 1071 1072 fdp = td->td_proc->p_fd; 1073 stp = td->td_sel; 1074 n = 0; 1075 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1076 fd = (int)(uintptr_t)sfp->sf_cookie; 1077 si = sfp->sf_si; 1078 selfdfree(stp, sfp); 1079 /* If the selinfo wasn't cleared the event didn't fire. */ 1080 if (si != NULL) 1081 continue; 1082 if ((fp = fget_unlocked(fdp, fd)) == NULL) 1083 return (EBADF); 1084 idx = fd / NFDBITS; 1085 bit = (fd_mask)1 << (fd % NFDBITS); 1086 ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); 1087 fdrop(fp, td); 1088 if (ev != 0) 1089 n += selsetbits(ibits, obits, idx, bit, ev); 1090 } 1091 stp->st_flags = 0; 1092 td->td_retval[0] = n; 1093 return (0); 1094 } 1095 1096 /* 1097 * Perform the initial filedescriptor scan and register ourselves with 1098 * each selinfo. 1099 */ 1100 static int 1101 selscan(td, ibits, obits, nfd) 1102 struct thread *td; 1103 fd_mask **ibits, **obits; 1104 int nfd; 1105 { 1106 struct filedesc *fdp; 1107 struct file *fp; 1108 fd_mask bit; 1109 int ev, flags, end, fd; 1110 int n, idx; 1111 1112 fdp = td->td_proc->p_fd; 1113 n = 0; 1114 for (idx = 0, fd = 0; fd < nfd; idx++) { 1115 end = imin(fd + NFDBITS, nfd); 1116 for (bit = 1; fd < end; bit <<= 1, fd++) { 1117 /* Compute the list of events we're interested in. */ 1118 flags = selflags(ibits, idx, bit); 1119 if (flags == 0) 1120 continue; 1121 if ((fp = fget_unlocked(fdp, fd)) == NULL) 1122 return (EBADF); 1123 selfdalloc(td, (void *)(uintptr_t)fd); 1124 ev = fo_poll(fp, flags, td->td_ucred, td); 1125 fdrop(fp, td); 1126 if (ev != 0) 1127 n += selsetbits(ibits, obits, idx, bit, ev); 1128 } 1129 } 1130 1131 td->td_retval[0] = n; 1132 return (0); 1133 } 1134 1135 #ifndef _SYS_SYSPROTO_H_ 1136 struct poll_args { 1137 struct pollfd *fds; 1138 u_int nfds; 1139 int timeout; 1140 }; 1141 #endif 1142 int 1143 poll(td, uap) 1144 struct thread *td; 1145 struct poll_args *uap; 1146 { 1147 struct pollfd *bits; 1148 struct pollfd smallbits[32]; 1149 struct timeval atv, rtv, ttv; 1150 int error = 0, timo; 1151 u_int nfds; 1152 size_t ni; 1153 1154 nfds = uap->nfds; 1155 if (nfds > maxfilesperproc && nfds > FD_SETSIZE) 1156 return (EINVAL); 1157 ni = nfds * sizeof(struct pollfd); 1158 if (ni > sizeof(smallbits)) 1159 bits = malloc(ni, M_TEMP, M_WAITOK); 1160 else 1161 bits = smallbits; 1162 error = copyin(uap->fds, bits, ni); 1163 if (error) 1164 goto done; 1165 if (uap->timeout != INFTIM) { 1166 atv.tv_sec = uap->timeout / 1000; 1167 atv.tv_usec = (uap->timeout % 1000) * 1000; 1168 if (itimerfix(&atv)) { 1169 error = EINVAL; 1170 goto done; 1171 } 1172 getmicrouptime(&rtv); 1173 timevaladd(&atv, &rtv); 1174 } else { 1175 atv.tv_sec = 0; 1176 atv.tv_usec = 0; 1177 } 1178 timo = 0; 1179 seltdinit(td); 1180 /* Iterate until the timeout expires or descriptors become ready. */ 1181 for (;;) { 1182 error = pollscan(td, bits, nfds); 1183 if (error || td->td_retval[0] != 0) 1184 break; 1185 if (atv.tv_sec || atv.tv_usec) { 1186 getmicrouptime(&rtv); 1187 if (timevalcmp(&rtv, &atv, >=)) 1188 break; 1189 ttv = atv; 1190 timevalsub(&ttv, &rtv); 1191 timo = ttv.tv_sec > 24 * 60 * 60 ? 1192 24 * 60 * 60 * hz : tvtohz(&ttv); 1193 } 1194 error = seltdwait(td, timo); 1195 if (error) 1196 break; 1197 error = pollrescan(td); 1198 if (error || td->td_retval[0] != 0) 1199 break; 1200 } 1201 seltdclear(td); 1202 1203 done: 1204 /* poll is not restarted after signals... */ 1205 if (error == ERESTART) 1206 error = EINTR; 1207 if (error == EWOULDBLOCK) 1208 error = 0; 1209 if (error == 0) { 1210 error = pollout(bits, uap->fds, nfds); 1211 if (error) 1212 goto out; 1213 } 1214 out: 1215 if (ni > sizeof(smallbits)) 1216 free(bits, M_TEMP); 1217 return (error); 1218 } 1219 1220 static int 1221 pollrescan(struct thread *td) 1222 { 1223 struct seltd *stp; 1224 struct selfd *sfp; 1225 struct selfd *sfn; 1226 struct selinfo *si; 1227 struct filedesc *fdp; 1228 struct file *fp; 1229 struct pollfd *fd; 1230 int n; 1231 1232 n = 0; 1233 fdp = td->td_proc->p_fd; 1234 stp = td->td_sel; 1235 FILEDESC_SLOCK(fdp); 1236 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { 1237 fd = (struct pollfd *)sfp->sf_cookie; 1238 si = sfp->sf_si; 1239 selfdfree(stp, sfp); 1240 /* If the selinfo wasn't cleared the event didn't fire. */ 1241 if (si != NULL) 1242 continue; 1243 fp = fdp->fd_ofiles[fd->fd]; 1244 if (fp == NULL) { 1245 fd->revents = POLLNVAL; 1246 n++; 1247 continue; 1248 } 1249 /* 1250 * Note: backend also returns POLLHUP and 1251 * POLLERR if appropriate. 1252 */ 1253 fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); 1254 if (fd->revents != 0) 1255 n++; 1256 } 1257 FILEDESC_SUNLOCK(fdp); 1258 stp->st_flags = 0; 1259 td->td_retval[0] = n; 1260 return (0); 1261 } 1262 1263 1264 static int 1265 pollout(fds, ufds, nfd) 1266 struct pollfd *fds; 1267 struct pollfd *ufds; 1268 u_int nfd; 1269 { 1270 int error = 0; 1271 u_int i = 0; 1272 1273 for (i = 0; i < nfd; i++) { 1274 error = copyout(&fds->revents, &ufds->revents, 1275 sizeof(ufds->revents)); 1276 if (error) 1277 return (error); 1278 fds++; 1279 ufds++; 1280 } 1281 return (0); 1282 } 1283 1284 static int 1285 pollscan(td, fds, nfd) 1286 struct thread *td; 1287 struct pollfd *fds; 1288 u_int nfd; 1289 { 1290 struct filedesc *fdp = td->td_proc->p_fd; 1291 int i; 1292 struct file *fp; 1293 int n = 0; 1294 1295 FILEDESC_SLOCK(fdp); 1296 for (i = 0; i < nfd; i++, fds++) { 1297 if (fds->fd >= fdp->fd_nfiles) { 1298 fds->revents = POLLNVAL; 1299 n++; 1300 } else if (fds->fd < 0) { 1301 fds->revents = 0; 1302 } else { 1303 fp = fdp->fd_ofiles[fds->fd]; 1304 if (fp == NULL) { 1305 fds->revents = POLLNVAL; 1306 n++; 1307 } else { 1308 /* 1309 * Note: backend also returns POLLHUP and 1310 * POLLERR if appropriate. 1311 */ 1312 selfdalloc(td, fds); 1313 fds->revents = fo_poll(fp, fds->events, 1314 td->td_ucred, td); 1315 /* 1316 * POSIX requires POLLOUT to be never 1317 * set simultaneously with POLLHUP. 1318 */ 1319 if ((fds->revents & POLLHUP) != 0) 1320 fds->revents &= ~POLLOUT; 1321 1322 if (fds->revents != 0) 1323 n++; 1324 } 1325 } 1326 } 1327 FILEDESC_SUNLOCK(fdp); 1328 td->td_retval[0] = n; 1329 return (0); 1330 } 1331 1332 /* 1333 * OpenBSD poll system call. 1334 * 1335 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1336 */ 1337 #ifndef _SYS_SYSPROTO_H_ 1338 struct openbsd_poll_args { 1339 struct pollfd *fds; 1340 u_int nfds; 1341 int timeout; 1342 }; 1343 #endif 1344 int 1345 openbsd_poll(td, uap) 1346 register struct thread *td; 1347 register struct openbsd_poll_args *uap; 1348 { 1349 return (poll(td, (struct poll_args *)uap)); 1350 } 1351 1352 /* 1353 * XXX This was created specifically to support netncp and netsmb. This 1354 * allows the caller to specify a socket to wait for events on. It returns 1355 * 0 if any events matched and an error otherwise. There is no way to 1356 * determine which events fired. 1357 */ 1358 int 1359 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) 1360 { 1361 struct timeval atv, rtv, ttv; 1362 int error, timo; 1363 1364 if (tvp != NULL) { 1365 atv = *tvp; 1366 if (itimerfix(&atv)) 1367 return (EINVAL); 1368 getmicrouptime(&rtv); 1369 timevaladd(&atv, &rtv); 1370 } else { 1371 atv.tv_sec = 0; 1372 atv.tv_usec = 0; 1373 } 1374 1375 timo = 0; 1376 seltdinit(td); 1377 /* 1378 * Iterate until the timeout expires or the socket becomes ready. 1379 */ 1380 for (;;) { 1381 selfdalloc(td, NULL); 1382 error = sopoll(so, events, NULL, td); 1383 /* error here is actually the ready events. */ 1384 if (error) 1385 return (0); 1386 if (atv.tv_sec || atv.tv_usec) { 1387 getmicrouptime(&rtv); 1388 if (timevalcmp(&rtv, &atv, >=)) { 1389 seltdclear(td); 1390 return (EWOULDBLOCK); 1391 } 1392 ttv = atv; 1393 timevalsub(&ttv, &rtv); 1394 timo = ttv.tv_sec > 24 * 60 * 60 ? 1395 24 * 60 * 60 * hz : tvtohz(&ttv); 1396 } 1397 error = seltdwait(td, timo); 1398 seltdclear(td); 1399 if (error) 1400 break; 1401 } 1402 /* XXX Duplicates ncp/smb behavior. */ 1403 if (error == ERESTART) 1404 error = 0; 1405 return (error); 1406 } 1407 1408 /* 1409 * Preallocate two selfds associated with 'cookie'. Some fo_poll routines 1410 * have two select sets, one for read and another for write. 1411 */ 1412 static void 1413 selfdalloc(struct thread *td, void *cookie) 1414 { 1415 struct seltd *stp; 1416 1417 stp = td->td_sel; 1418 if (stp->st_free1 == NULL) 1419 stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); 1420 stp->st_free1->sf_td = stp; 1421 stp->st_free1->sf_cookie = cookie; 1422 if (stp->st_free2 == NULL) 1423 stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); 1424 stp->st_free2->sf_td = stp; 1425 stp->st_free2->sf_cookie = cookie; 1426 } 1427 1428 static void 1429 selfdfree(struct seltd *stp, struct selfd *sfp) 1430 { 1431 STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); 1432 mtx_lock(sfp->sf_mtx); 1433 if (sfp->sf_si) 1434 TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); 1435 mtx_unlock(sfp->sf_mtx); 1436 uma_zfree(selfd_zone, sfp); 1437 } 1438 1439 /* 1440 * Record a select request. 1441 */ 1442 void 1443 selrecord(selector, sip) 1444 struct thread *selector; 1445 struct selinfo *sip; 1446 { 1447 struct selfd *sfp; 1448 struct seltd *stp; 1449 struct mtx *mtxp; 1450 1451 stp = selector->td_sel; 1452 /* 1453 * Don't record when doing a rescan. 1454 */ 1455 if (stp->st_flags & SELTD_RESCAN) 1456 return; 1457 /* 1458 * Grab one of the preallocated descriptors. 1459 */ 1460 sfp = NULL; 1461 if ((sfp = stp->st_free1) != NULL) 1462 stp->st_free1 = NULL; 1463 else if ((sfp = stp->st_free2) != NULL) 1464 stp->st_free2 = NULL; 1465 else 1466 panic("selrecord: No free selfd on selq"); 1467 mtxp = sip->si_mtx; 1468 if (mtxp == NULL) 1469 mtxp = mtx_pool_find(mtxpool_select, sip); 1470 /* 1471 * Initialize the sfp and queue it in the thread. 1472 */ 1473 sfp->sf_si = sip; 1474 sfp->sf_mtx = mtxp; 1475 STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); 1476 /* 1477 * Now that we've locked the sip, check for initialization. 1478 */ 1479 mtx_lock(mtxp); 1480 if (sip->si_mtx == NULL) { 1481 sip->si_mtx = mtxp; 1482 TAILQ_INIT(&sip->si_tdlist); 1483 } 1484 /* 1485 * Add this thread to the list of selfds listening on this selinfo. 1486 */ 1487 TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); 1488 mtx_unlock(sip->si_mtx); 1489 } 1490 1491 /* Wake up a selecting thread. */ 1492 void 1493 selwakeup(sip) 1494 struct selinfo *sip; 1495 { 1496 doselwakeup(sip, -1); 1497 } 1498 1499 /* Wake up a selecting thread, and set its priority. */ 1500 void 1501 selwakeuppri(sip, pri) 1502 struct selinfo *sip; 1503 int pri; 1504 { 1505 doselwakeup(sip, pri); 1506 } 1507 1508 /* 1509 * Do a wakeup when a selectable event occurs. 1510 */ 1511 static void 1512 doselwakeup(sip, pri) 1513 struct selinfo *sip; 1514 int pri; 1515 { 1516 struct selfd *sfp; 1517 struct selfd *sfn; 1518 struct seltd *stp; 1519 1520 /* If it's not initialized there can't be any waiters. */ 1521 if (sip->si_mtx == NULL) 1522 return; 1523 /* 1524 * Locking the selinfo locks all selfds associated with it. 1525 */ 1526 mtx_lock(sip->si_mtx); 1527 TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { 1528 /* 1529 * Once we remove this sfp from the list and clear the 1530 * sf_si seltdclear will know to ignore this si. 1531 */ 1532 TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); 1533 sfp->sf_si = NULL; 1534 stp = sfp->sf_td; 1535 mtx_lock(&stp->st_mtx); 1536 stp->st_flags |= SELTD_PENDING; 1537 cv_broadcastpri(&stp->st_wait, pri); 1538 mtx_unlock(&stp->st_mtx); 1539 } 1540 mtx_unlock(sip->si_mtx); 1541 } 1542 1543 static void 1544 seltdinit(struct thread *td) 1545 { 1546 struct seltd *stp; 1547 1548 if ((stp = td->td_sel) != NULL) 1549 goto out; 1550 td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); 1551 mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); 1552 cv_init(&stp->st_wait, "select"); 1553 out: 1554 stp->st_flags = 0; 1555 STAILQ_INIT(&stp->st_selq); 1556 } 1557 1558 static int 1559 seltdwait(struct thread *td, int timo) 1560 { 1561 struct seltd *stp; 1562 int error; 1563 1564 stp = td->td_sel; 1565 /* 1566 * An event of interest may occur while we do not hold the seltd 1567 * locked so check the pending flag before we sleep. 1568 */ 1569 mtx_lock(&stp->st_mtx); 1570 /* 1571 * Any further calls to selrecord will be a rescan. 1572 */ 1573 stp->st_flags |= SELTD_RESCAN; 1574 if (stp->st_flags & SELTD_PENDING) { 1575 mtx_unlock(&stp->st_mtx); 1576 return (0); 1577 } 1578 if (timo > 0) 1579 error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); 1580 else 1581 error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); 1582 mtx_unlock(&stp->st_mtx); 1583 1584 return (error); 1585 } 1586 1587 void 1588 seltdfini(struct thread *td) 1589 { 1590 struct seltd *stp; 1591 1592 stp = td->td_sel; 1593 if (stp == NULL) 1594 return; 1595 if (stp->st_free1) 1596 uma_zfree(selfd_zone, stp->st_free1); 1597 if (stp->st_free2) 1598 uma_zfree(selfd_zone, stp->st_free2); 1599 td->td_sel = NULL; 1600 free(stp, M_SELECT); 1601 } 1602 1603 /* 1604 * Remove the references to the thread from all of the objects we were 1605 * polling. 1606 */ 1607 static void 1608 seltdclear(struct thread *td) 1609 { 1610 struct seltd *stp; 1611 struct selfd *sfp; 1612 struct selfd *sfn; 1613 1614 stp = td->td_sel; 1615 STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) 1616 selfdfree(stp, sfp); 1617 stp->st_flags = 0; 1618 } 1619 1620 static void selectinit(void *); 1621 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); 1622 static void 1623 selectinit(void *dummy __unused) 1624 { 1625 1626 selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, 1627 NULL, NULL, UMA_ALIGN_PTR, 0); 1628 mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); 1629 } 1630