/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);

#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

int
freebsd6_pread(td, uap)
	struct thread *td;
	struct freebsd6_pread_args *uap;
{
	struct pread_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pread(td, &oargs));
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return (error);
}

int
freebsd6_pwrite(td, uap)
	struct thread *td;
	struct freebsd6_pwrite_args *uap;
{
	struct pwrite_args oargs;

	oargs.fd = uap->fd;
	oargs.buf = uap->buf;
	oargs.nbyte = uap->nbyte;
	oargs.offset = uap->offset;
	return (pwrite(td, &oargs));
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_name, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
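		 * An ioctl handler is free to fill in only part of the
		 * buffer, so without this the copyout below could hand
		 * stale kernel heap contents back to userland.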
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx sellock;
struct cv selwait;
u_int nselcoll;		/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
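	 *
	 * For example, with nd = 1024 and only fd_in non-NULL, ncpbytes
	 * works out to 1024 / NBBY = 128 bytes, so nbufbytes = 2 * 128 =
	 * 256 bytes, exactly the size of the 2048-bit s_selbits buffer;
	 * with the old FD_SETSIZE of 256 and all three sets non-NULL,
	 * nbufbytes = 6 * 32 = 192 bytes, which also fits.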
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/*
	 * Note: backend also returns POLLHUP/POLLERR if appropriate.
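	 * The read, write and exception sets (msk 0 through 2 in the loop
	 * below) are polled with POLLRDNORM, POLLWRNORM and POLLRDBAND
	 * respectively, and any non-zero answer from fo_poll() marks the
	 * descriptor as ready in the corresponding output set.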
	 */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_SUNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/*
	 * poll is not restarted after signals...
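	 * ERESTART is mapped to EINTR so the caller sees the interruption,
	 * and EWOULDBLOCK (the timed wait above expired) is mapped to a
	 * normal timeout return of 0.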
	 */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{

	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 *
 * This code assumes that the underlying owner of the selinfo structure will
 * hold sellock before it changes it, and that it will unlink itself from our
 * list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone, as we've already pointed it at us and added it to our
	 * list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{

	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{

	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
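 *
 * If a collision was recorded (SI_COLL), every thread sleeping in
 * select() or poll() is woken via cv_broadcastpri(), since any of them
 * may be interested.  The thread recorded in si_thread, if any, is also
 * unlinked from the selinfo and pulled off the selwait sleep queue.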
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED */
static void
selectinit(dummy)
	void *dummy;
{

	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
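
/*
 * An illustrative sketch of how a backend typically pairs with the
 * selrecord()/selwakeup() interface above; mydev_poll() and
 * struct mydev_softc are hypothetical names used only for illustration.
 *
 *	static int
 *	mydev_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct mydev_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if ((events & (POLLIN | POLLRDNORM)) && sc->sc_havedata)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		return (revents);
 *	}
 *
 * When data later arrives, the driver calls selwakeup(&sc->sc_rsel),
 * which lands in doselwakeup() and wakes any thread recorded above.
 */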