/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);

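/*
 * Read system call.
 */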
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return (0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

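/*
 * Write system call.
 */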
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return (error);
}

/*
 * Positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return (error);
}

/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

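/*
 * Ioctl system call.
 */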
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com) {
	case FIONCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_XLOCK(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_XUNLOCK(fdp);
		goto out;
	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx	sellock;
struct cv	selwait;
u_int	nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

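/*
 * Select system call.
 */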
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	FILEDESC_SUNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

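/*
 * Scan the descriptors named in the input bit masks, polling each one
 * with fo_poll() and setting the corresponding output bit for every
 * descriptor that is ready.  The count of ready descriptors is returned
 * in td->td_retval[0].
 */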
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_SLOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_SUNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

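/*
 * Poll system call.
 */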
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	thread_lock(td);
	td->td_flags |= TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	thread_lock(td);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		thread_unlock(td);
		goto retry;
	}
	thread_unlock(td);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

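/*
 * Walk the pollfd array, polling each open file with fo_poll() and
 * recording the returned events in revents.  Descriptors beyond the
 * file table or not open are flagged POLLNVAL; negative descriptors are
 * ignored.  The count of descriptors with non-zero revents is returned
 * in td->td_retval[0].
 */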
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_SLOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_SUNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 *
 * This code assumes that the underlying owner of the selinfo structure will
 * hold sellock before it changes it, and that it will unlink itself from our
 * list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_SELECT;
	thread_unlock(td);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}