1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/sysproto.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/proc.h> 51 #include <sys/signalvar.h> 52 #include <sys/socketvar.h> 53 #include <sys/uio.h> 54 #include <sys/kernel.h> 55 #include <sys/limits.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sleepqueue.h> 61 #include <sys/syscallsubr.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/vnode.h> 65 #include <sys/bio.h> 66 #include <sys/buf.h> 67 #include <sys/condvar.h> 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 #include <vm/vm.h> 72 #include <vm/vm_page.h> 73 74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 76 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 77 78 static int pollscan(struct thread *, struct pollfd *, u_int); 79 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 80 static int dofileread(struct thread *, int, struct file *, struct uio *, 81 off_t, int); 82 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 83 off_t, int); 84 static void doselwakeup(struct selinfo *, int); 85 86 /* 87 * Read system call. 
88 */ 89 #ifndef _SYS_SYSPROTO_H_ 90 struct read_args { 91 int fd; 92 void *buf; 93 size_t nbyte; 94 }; 95 #endif 96 /* 97 * MPSAFE 98 */ 99 int 100 read(td, uap) 101 struct thread *td; 102 struct read_args *uap; 103 { 104 struct uio auio; 105 struct iovec aiov; 106 int error; 107 108 if (uap->nbyte > INT_MAX) 109 return (EINVAL); 110 aiov.iov_base = uap->buf; 111 aiov.iov_len = uap->nbyte; 112 auio.uio_iov = &aiov; 113 auio.uio_iovcnt = 1; 114 auio.uio_resid = uap->nbyte; 115 auio.uio_segflg = UIO_USERSPACE; 116 error = kern_readv(td, uap->fd, &auio); 117 return(error); 118 } 119 120 /* 121 * Positioned read system call 122 */ 123 #ifndef _SYS_SYSPROTO_H_ 124 struct pread_args { 125 int fd; 126 void *buf; 127 size_t nbyte; 128 int pad; 129 off_t offset; 130 }; 131 #endif 132 /* 133 * MPSAFE 134 */ 135 int 136 pread(td, uap) 137 struct thread *td; 138 struct pread_args *uap; 139 { 140 struct uio auio; 141 struct iovec aiov; 142 int error; 143 144 if (uap->nbyte > INT_MAX) 145 return (EINVAL); 146 aiov.iov_base = uap->buf; 147 aiov.iov_len = uap->nbyte; 148 auio.uio_iov = &aiov; 149 auio.uio_iovcnt = 1; 150 auio.uio_resid = uap->nbyte; 151 auio.uio_segflg = UIO_USERSPACE; 152 error = kern_preadv(td, uap->fd, &auio, uap->offset); 153 return(error); 154 } 155 156 /* 157 * Scatter read system call. 
158 */ 159 #ifndef _SYS_SYSPROTO_H_ 160 struct readv_args { 161 int fd; 162 struct iovec *iovp; 163 u_int iovcnt; 164 }; 165 #endif 166 /* 167 * MPSAFE 168 */ 169 int 170 readv(struct thread *td, struct readv_args *uap) 171 { 172 struct uio *auio; 173 int error; 174 175 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 176 if (error) 177 return (error); 178 error = kern_readv(td, uap->fd, auio); 179 free(auio, M_IOV); 180 return (error); 181 } 182 183 int 184 kern_readv(struct thread *td, int fd, struct uio *auio) 185 { 186 struct file *fp; 187 int error; 188 189 error = fget_read(td, fd, &fp); 190 if (error) 191 return (error); 192 error = dofileread(td, fd, fp, auio, (off_t)-1, 0); 193 fdrop(fp, td); 194 return (error); 195 } 196 197 /* 198 * Scatter positioned read system call. 199 */ 200 #ifndef _SYS_SYSPROTO_H_ 201 struct preadv_args { 202 int fd; 203 struct iovec *iovp; 204 u_int iovcnt; 205 off_t offset; 206 }; 207 #endif 208 /* 209 * MPSAFE 210 */ 211 int 212 preadv(struct thread *td, struct preadv_args *uap) 213 { 214 struct uio *auio; 215 int error; 216 217 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 218 if (error) 219 return (error); 220 error = kern_preadv(td, uap->fd, auio, uap->offset); 221 free(auio, M_IOV); 222 return (error); 223 } 224 225 int 226 kern_preadv(td, fd, auio, offset) 227 struct thread *td; 228 int fd; 229 struct uio *auio; 230 off_t offset; 231 { 232 struct file *fp; 233 int error; 234 235 error = fget_read(td, fd, &fp); 236 if (error) 237 return (error); 238 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 239 error = ESPIPE; 240 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 241 error = EINVAL; 242 else 243 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); 244 fdrop(fp, td); 245 return (error); 246 } 247 248 /* 249 * Common code for readv and preadv that reads data in 250 * from a file using the passed in uio, offset, and flags. 
251 */ 252 static int 253 dofileread(td, fd, fp, auio, offset, flags) 254 struct thread *td; 255 int fd; 256 struct file *fp; 257 struct uio *auio; 258 off_t offset; 259 int flags; 260 { 261 ssize_t cnt; 262 int error; 263 #ifdef KTRACE 264 struct uio *ktruio = NULL; 265 #endif 266 267 /* Finish zero length reads right here */ 268 if (auio->uio_resid == 0) { 269 td->td_retval[0] = 0; 270 return(0); 271 } 272 auio->uio_rw = UIO_READ; 273 auio->uio_offset = offset; 274 auio->uio_td = td; 275 #ifdef KTRACE 276 if (KTRPOINT(td, KTR_GENIO)) 277 ktruio = cloneuio(auio); 278 #endif 279 cnt = auio->uio_resid; 280 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 281 if (auio->uio_resid != cnt && (error == ERESTART || 282 error == EINTR || error == EWOULDBLOCK)) 283 error = 0; 284 } 285 cnt -= auio->uio_resid; 286 #ifdef KTRACE 287 if (ktruio != NULL) { 288 ktruio->uio_resid = cnt; 289 ktrgenio(fd, UIO_READ, ktruio, error); 290 } 291 #endif 292 td->td_retval[0] = cnt; 293 return (error); 294 } 295 296 /* 297 * Write system call 298 */ 299 #ifndef _SYS_SYSPROTO_H_ 300 struct write_args { 301 int fd; 302 const void *buf; 303 size_t nbyte; 304 }; 305 #endif 306 /* 307 * MPSAFE 308 */ 309 int 310 write(td, uap) 311 struct thread *td; 312 struct write_args *uap; 313 { 314 struct uio auio; 315 struct iovec aiov; 316 int error; 317 318 if (uap->nbyte > INT_MAX) 319 return (EINVAL); 320 aiov.iov_base = (void *)(uintptr_t)uap->buf; 321 aiov.iov_len = uap->nbyte; 322 auio.uio_iov = &aiov; 323 auio.uio_iovcnt = 1; 324 auio.uio_resid = uap->nbyte; 325 auio.uio_segflg = UIO_USERSPACE; 326 error = kern_writev(td, uap->fd, &auio); 327 return(error); 328 } 329 330 /* 331 * Positioned write system call 332 */ 333 #ifndef _SYS_SYSPROTO_H_ 334 struct pwrite_args { 335 int fd; 336 const void *buf; 337 size_t nbyte; 338 int pad; 339 off_t offset; 340 }; 341 #endif 342 /* 343 * MPSAFE 344 */ 345 int 346 pwrite(td, uap) 347 struct thread *td; 348 struct pwrite_args *uap; 349 { 350 
struct uio auio; 351 struct iovec aiov; 352 int error; 353 354 if (uap->nbyte > INT_MAX) 355 return (EINVAL); 356 aiov.iov_base = (void *)(uintptr_t)uap->buf; 357 aiov.iov_len = uap->nbyte; 358 auio.uio_iov = &aiov; 359 auio.uio_iovcnt = 1; 360 auio.uio_resid = uap->nbyte; 361 auio.uio_segflg = UIO_USERSPACE; 362 error = kern_pwritev(td, uap->fd, &auio, uap->offset); 363 return(error); 364 } 365 366 /* 367 * Gather write system call 368 */ 369 #ifndef _SYS_SYSPROTO_H_ 370 struct writev_args { 371 int fd; 372 struct iovec *iovp; 373 u_int iovcnt; 374 }; 375 #endif 376 /* 377 * MPSAFE 378 */ 379 int 380 writev(struct thread *td, struct writev_args *uap) 381 { 382 struct uio *auio; 383 int error; 384 385 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 386 if (error) 387 return (error); 388 error = kern_writev(td, uap->fd, auio); 389 free(auio, M_IOV); 390 return (error); 391 } 392 393 int 394 kern_writev(struct thread *td, int fd, struct uio *auio) 395 { 396 struct file *fp; 397 int error; 398 399 error = fget_write(td, fd, &fp); 400 if (error) 401 return (error); 402 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 403 fdrop(fp, td); 404 return (error); 405 } 406 407 /* 408 * Gather positioned write system call 409 */ 410 #ifndef _SYS_SYSPROTO_H_ 411 struct pwritev_args { 412 int fd; 413 struct iovec *iovp; 414 u_int iovcnt; 415 off_t offset; 416 }; 417 #endif 418 /* 419 * MPSAFE 420 */ 421 int 422 pwritev(struct thread *td, struct pwritev_args *uap) 423 { 424 struct uio *auio; 425 int error; 426 427 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 428 if (error) 429 return (error); 430 error = kern_pwritev(td, uap->fd, auio, uap->offset); 431 free(auio, M_IOV); 432 return (error); 433 } 434 435 int 436 kern_pwritev(td, fd, auio, offset) 437 struct thread *td; 438 struct uio *auio; 439 int fd; 440 off_t offset; 441 { 442 struct file *fp; 443 int error; 444 445 error = fget_write(td, fd, &fp); 446 if (error) 447 return (error); 448 if (!(fp->f_ops->fo_flags & 
DFLAG_SEEKABLE)) 449 error = ESPIPE; 450 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 451 error = EINVAL; 452 else 453 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 454 fdrop(fp, td); 455 return (error); 456 } 457 458 /* 459 * Common code for writev and pwritev that writes data to 460 * a file using the passed in uio, offset, and flags. 461 */ 462 static int 463 dofilewrite(td, fd, fp, auio, offset, flags) 464 struct thread *td; 465 int fd; 466 struct file *fp; 467 struct uio *auio; 468 off_t offset; 469 int flags; 470 { 471 ssize_t cnt; 472 int error; 473 #ifdef KTRACE 474 struct uio *ktruio = NULL; 475 #endif 476 477 auio->uio_rw = UIO_WRITE; 478 auio->uio_td = td; 479 auio->uio_offset = offset; 480 #ifdef KTRACE 481 if (KTRPOINT(td, KTR_GENIO)) 482 ktruio = cloneuio(auio); 483 #endif 484 cnt = auio->uio_resid; 485 if (fp->f_type == DTYPE_VNODE) 486 bwillwrite(); 487 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { 488 if (auio->uio_resid != cnt && (error == ERESTART || 489 error == EINTR || error == EWOULDBLOCK)) 490 error = 0; 491 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 492 if (error == EPIPE) { 493 PROC_LOCK(td->td_proc); 494 psignal(td->td_proc, SIGPIPE); 495 PROC_UNLOCK(td->td_proc); 496 } 497 } 498 cnt -= auio->uio_resid; 499 #ifdef KTRACE 500 if (ktruio != NULL) { 501 ktruio->uio_resid = cnt; 502 ktrgenio(fd, UIO_WRITE, ktruio, error); 503 } 504 #endif 505 td->td_retval[0] = cnt; 506 return (error); 507 } 508 509 /* 510 * Ioctl system call 511 */ 512 #ifndef _SYS_SYSPROTO_H_ 513 struct ioctl_args { 514 int fd; 515 u_long com; 516 caddr_t data; 517 }; 518 #endif 519 /* 520 * MPSAFE 521 */ 522 /* ARGSUSED */ 523 int 524 ioctl(struct thread *td, struct ioctl_args *uap) 525 { 526 u_long com; 527 int arg, error; 528 u_int size; 529 caddr_t data; 530 531 if (uap->com > 0xffffffff) { 532 printf( 533 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 534 td->td_proc->p_pid, td->td_proc->p_comm, uap->com); 535 uap->com &= 0xffffffff; 536 } 537 com = uap->com; 538 539 /* 540 * Interpret high order word to find amount of data to be 541 * copied to/from the user's address space. 542 */ 543 size = IOCPARM_LEN(com); 544 if ((size > IOCPARM_MAX) || 545 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 546 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 547 ((com & IOC_OUT) && size == 0) || 548 #else 549 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 550 #endif 551 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 552 return (ENOTTY); 553 554 if (size > 0) { 555 if (!(com & IOC_VOID)) 556 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 557 else { 558 /* Integer argument. */ 559 arg = (intptr_t)uap->data; 560 data = (void *)&arg; 561 size = 0; 562 } 563 } else 564 data = (void *)&uap->data; 565 if (com & IOC_IN) { 566 error = copyin(uap->data, data, (u_int)size); 567 if (error) { 568 if (size > 0) 569 free(data, M_IOCTLOPS); 570 return (error); 571 } 572 } else if (com & IOC_OUT) { 573 /* 574 * Zero the buffer so the user always 575 * gets back something deterministic. 
576 */ 577 bzero(data, size); 578 } 579 580 error = kern_ioctl(td, uap->fd, com, data); 581 582 if (error == 0 && (com & IOC_OUT)) 583 error = copyout(data, uap->data, (u_int)size); 584 585 if (size > 0) 586 free(data, M_IOCTLOPS); 587 return (error); 588 } 589 590 int 591 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 592 { 593 struct file *fp; 594 struct filedesc *fdp; 595 int error; 596 int tmp; 597 598 if ((error = fget(td, fd, &fp)) != 0) 599 return (error); 600 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 601 fdrop(fp, td); 602 return (EBADF); 603 } 604 fdp = td->td_proc->p_fd; 605 switch (com) { 606 case FIONCLEX: 607 FILEDESC_LOCK_FAST(fdp); 608 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; 609 FILEDESC_UNLOCK_FAST(fdp); 610 goto out; 611 case FIOCLEX: 612 FILEDESC_LOCK_FAST(fdp); 613 fdp->fd_ofileflags[fd] |= UF_EXCLOSE; 614 FILEDESC_UNLOCK_FAST(fdp); 615 goto out; 616 case FIONBIO: 617 FILE_LOCK(fp); 618 if ((tmp = *(int *)data)) 619 fp->f_flag |= FNONBLOCK; 620 else 621 fp->f_flag &= ~FNONBLOCK; 622 FILE_UNLOCK(fp); 623 data = (void *)&tmp; 624 break; 625 case FIOASYNC: 626 FILE_LOCK(fp); 627 if ((tmp = *(int *)data)) 628 fp->f_flag |= FASYNC; 629 else 630 fp->f_flag &= ~FASYNC; 631 FILE_UNLOCK(fp); 632 data = (void *)&tmp; 633 break; 634 } 635 636 error = fo_ioctl(fp, com, data, td->td_ucred, td); 637 out: 638 fdrop(fp, td); 639 return (error); 640 } 641 642 /* 643 * sellock and selwait are initialized in selectinit() via SYSINIT. 644 */ 645 struct mtx sellock; 646 struct cv selwait; 647 u_int nselcoll; /* Select collisions since boot */ 648 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 649 650 /* 651 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int nd;
	fd_set *in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	/* Copy in the optional timeout, then defer to kern_select(). */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Common select() implementation.  Copies in the three fd_sets, scans
 * them under the file descriptor lock, and sleeps on selwait until an
 * event, timeout, or signal.  tvp == NULL means wait forever.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Input halves live in the upper half of selbits,
	 * output halves in the lower half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Poll each descriptor whose bit is set in the input sets; set the
 * corresponding bit in the output sets when it is ready.  Returns
 * EBADF if a set bit names a closed descriptor; the ready count is
 * reported through td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int nfds;
	int timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Millisecond timeout -> absolute uptime deadline (INFTIM = forever). */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

/*
 * Poll every entry of the pollfd array, filling in revents.  Bad
 * descriptors are reported via POLLNVAL rather than an error return;
 * the ready count goes to td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	/*
	 * NOTE(review): i is int while nfd is u_int; nfd values above
	 * INT_MAX are assumed to be excluded by poll()'s limit check --
	 * TODO confirm.
	 */
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per the poll() contract. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int nfds;
	int timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* The argument layouts are identical, so just forward to poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Collision: doselwakeup() will broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	/* -1 means "do not adjust the woken thread's priority". */
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Multiple threads were selecting on this object: wake
		 * them all and count the collision for the sysctl.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it directly. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	/* Set up the global select() synchronization primitives. */
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}