1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/sysproto.h> 46 #include <sys/filedesc.h> 47 #include <sys/filio.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/proc.h> 51 #include <sys/signalvar.h> 52 #include <sys/socketvar.h> 53 #include <sys/uio.h> 54 #include <sys/kernel.h> 55 #include <sys/limits.h> 56 #include <sys/malloc.h> 57 #include <sys/poll.h> 58 #include <sys/resourcevar.h> 59 #include <sys/selinfo.h> 60 #include <sys/sleepqueue.h> 61 #include <sys/syscallsubr.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysent.h> 64 #include <sys/vnode.h> 65 #include <sys/bio.h> 66 #include <sys/buf.h> 67 #include <sys/condvar.h> 68 #ifdef KTRACE 69 #include <sys/ktrace.h> 70 #endif 71 #include <vm/vm.h> 72 #include <vm/vm_page.h> 73 74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 76 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 77 78 static int pollscan(struct thread *, struct pollfd *, u_int); 79 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 80 static int dofileread(struct thread *, int, struct file *, struct uio *, 81 off_t, int); 82 static int dofilewrite(struct thread *, int, struct file *, struct uio *, 83 off_t, int); 84 static void doselwakeup(struct selinfo *, int); 85 86 #ifndef _SYS_SYSPROTO_H_ 87 struct read_args { 88 int fd; 89 void *buf; 90 size_t nbyte; 91 }; 92 #endif 93 int 94 read(td, uap) 95 struct thread *td; 96 struct read_args *uap; 97 { 98 struct uio auio; 99 struct iovec aiov; 100 int error; 101 102 if (uap->nbyte > INT_MAX) 103 return (EINVAL); 104 aiov.iov_base = uap->buf; 105 aiov.iov_len = uap->nbyte; 106 auio.uio_iov = &aiov; 107 auio.uio_iovcnt = 1; 108 auio.uio_resid = uap->nbyte; 109 auio.uio_segflg = UIO_USERSPACE; 110 
error = kern_readv(td, uap->fd, &auio); 111 return(error); 112 } 113 114 /* 115 * Positioned read system call 116 */ 117 #ifndef _SYS_SYSPROTO_H_ 118 struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124 }; 125 #endif 126 int 127 pread(td, uap) 128 struct thread *td; 129 struct pread_args *uap; 130 { 131 struct uio auio; 132 struct iovec aiov; 133 int error; 134 135 if (uap->nbyte > INT_MAX) 136 return (EINVAL); 137 aiov.iov_base = uap->buf; 138 aiov.iov_len = uap->nbyte; 139 auio.uio_iov = &aiov; 140 auio.uio_iovcnt = 1; 141 auio.uio_resid = uap->nbyte; 142 auio.uio_segflg = UIO_USERSPACE; 143 error = kern_preadv(td, uap->fd, &auio, uap->offset); 144 return(error); 145 } 146 147 /* 148 * Scatter read system call. 149 */ 150 #ifndef _SYS_SYSPROTO_H_ 151 struct readv_args { 152 int fd; 153 struct iovec *iovp; 154 u_int iovcnt; 155 }; 156 #endif 157 int 158 readv(struct thread *td, struct readv_args *uap) 159 { 160 struct uio *auio; 161 int error; 162 163 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 164 if (error) 165 return (error); 166 error = kern_readv(td, uap->fd, auio); 167 free(auio, M_IOV); 168 return (error); 169 } 170 171 int 172 kern_readv(struct thread *td, int fd, struct uio *auio) 173 { 174 struct file *fp; 175 int error; 176 177 error = fget_read(td, fd, &fp); 178 if (error) 179 return (error); 180 error = dofileread(td, fd, fp, auio, (off_t)-1, 0); 181 fdrop(fp, td); 182 return (error); 183 } 184 185 /* 186 * Scatter positioned read system call. 
187 */ 188 #ifndef _SYS_SYSPROTO_H_ 189 struct preadv_args { 190 int fd; 191 struct iovec *iovp; 192 u_int iovcnt; 193 off_t offset; 194 }; 195 #endif 196 int 197 preadv(struct thread *td, struct preadv_args *uap) 198 { 199 struct uio *auio; 200 int error; 201 202 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 203 if (error) 204 return (error); 205 error = kern_preadv(td, uap->fd, auio, uap->offset); 206 free(auio, M_IOV); 207 return (error); 208 } 209 210 int 211 kern_preadv(td, fd, auio, offset) 212 struct thread *td; 213 int fd; 214 struct uio *auio; 215 off_t offset; 216 { 217 struct file *fp; 218 int error; 219 220 error = fget_read(td, fd, &fp); 221 if (error) 222 return (error); 223 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 224 error = ESPIPE; 225 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 226 error = EINVAL; 227 else 228 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); 229 fdrop(fp, td); 230 return (error); 231 } 232 233 /* 234 * Common code for readv and preadv that reads data in 235 * from a file using the passed in uio, offset, and flags. 
236 */ 237 static int 238 dofileread(td, fd, fp, auio, offset, flags) 239 struct thread *td; 240 int fd; 241 struct file *fp; 242 struct uio *auio; 243 off_t offset; 244 int flags; 245 { 246 ssize_t cnt; 247 int error; 248 #ifdef KTRACE 249 struct uio *ktruio = NULL; 250 #endif 251 252 /* Finish zero length reads right here */ 253 if (auio->uio_resid == 0) { 254 td->td_retval[0] = 0; 255 return(0); 256 } 257 auio->uio_rw = UIO_READ; 258 auio->uio_offset = offset; 259 auio->uio_td = td; 260 #ifdef KTRACE 261 if (KTRPOINT(td, KTR_GENIO)) 262 ktruio = cloneuio(auio); 263 #endif 264 cnt = auio->uio_resid; 265 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { 266 if (auio->uio_resid != cnt && (error == ERESTART || 267 error == EINTR || error == EWOULDBLOCK)) 268 error = 0; 269 } 270 cnt -= auio->uio_resid; 271 #ifdef KTRACE 272 if (ktruio != NULL) { 273 ktruio->uio_resid = cnt; 274 ktrgenio(fd, UIO_READ, ktruio, error); 275 } 276 #endif 277 td->td_retval[0] = cnt; 278 return (error); 279 } 280 281 #ifndef _SYS_SYSPROTO_H_ 282 struct write_args { 283 int fd; 284 const void *buf; 285 size_t nbyte; 286 }; 287 #endif 288 int 289 write(td, uap) 290 struct thread *td; 291 struct write_args *uap; 292 { 293 struct uio auio; 294 struct iovec aiov; 295 int error; 296 297 if (uap->nbyte > INT_MAX) 298 return (EINVAL); 299 aiov.iov_base = (void *)(uintptr_t)uap->buf; 300 aiov.iov_len = uap->nbyte; 301 auio.uio_iov = &aiov; 302 auio.uio_iovcnt = 1; 303 auio.uio_resid = uap->nbyte; 304 auio.uio_segflg = UIO_USERSPACE; 305 error = kern_writev(td, uap->fd, &auio); 306 return(error); 307 } 308 309 /* 310 * Positioned write system call. 
311 */ 312 #ifndef _SYS_SYSPROTO_H_ 313 struct pwrite_args { 314 int fd; 315 const void *buf; 316 size_t nbyte; 317 int pad; 318 off_t offset; 319 }; 320 #endif 321 int 322 pwrite(td, uap) 323 struct thread *td; 324 struct pwrite_args *uap; 325 { 326 struct uio auio; 327 struct iovec aiov; 328 int error; 329 330 if (uap->nbyte > INT_MAX) 331 return (EINVAL); 332 aiov.iov_base = (void *)(uintptr_t)uap->buf; 333 aiov.iov_len = uap->nbyte; 334 auio.uio_iov = &aiov; 335 auio.uio_iovcnt = 1; 336 auio.uio_resid = uap->nbyte; 337 auio.uio_segflg = UIO_USERSPACE; 338 error = kern_pwritev(td, uap->fd, &auio, uap->offset); 339 return(error); 340 } 341 342 /* 343 * Gather write system call. 344 */ 345 #ifndef _SYS_SYSPROTO_H_ 346 struct writev_args { 347 int fd; 348 struct iovec *iovp; 349 u_int iovcnt; 350 }; 351 #endif 352 int 353 writev(struct thread *td, struct writev_args *uap) 354 { 355 struct uio *auio; 356 int error; 357 358 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 359 if (error) 360 return (error); 361 error = kern_writev(td, uap->fd, auio); 362 free(auio, M_IOV); 363 return (error); 364 } 365 366 int 367 kern_writev(struct thread *td, int fd, struct uio *auio) 368 { 369 struct file *fp; 370 int error; 371 372 error = fget_write(td, fd, &fp); 373 if (error) 374 return (error); 375 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); 376 fdrop(fp, td); 377 return (error); 378 } 379 380 /* 381 * Gather positioned write system call. 
382 */ 383 #ifndef _SYS_SYSPROTO_H_ 384 struct pwritev_args { 385 int fd; 386 struct iovec *iovp; 387 u_int iovcnt; 388 off_t offset; 389 }; 390 #endif 391 int 392 pwritev(struct thread *td, struct pwritev_args *uap) 393 { 394 struct uio *auio; 395 int error; 396 397 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 398 if (error) 399 return (error); 400 error = kern_pwritev(td, uap->fd, auio, uap->offset); 401 free(auio, M_IOV); 402 return (error); 403 } 404 405 int 406 kern_pwritev(td, fd, auio, offset) 407 struct thread *td; 408 struct uio *auio; 409 int fd; 410 off_t offset; 411 { 412 struct file *fp; 413 int error; 414 415 error = fget_write(td, fd, &fp); 416 if (error) 417 return (error); 418 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 419 error = ESPIPE; 420 else if (offset < 0 && fp->f_vnode->v_type != VCHR) 421 error = EINVAL; 422 else 423 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); 424 fdrop(fp, td); 425 return (error); 426 } 427 428 /* 429 * Common code for writev and pwritev that writes data to 430 * a file using the passed in uio, offset, and flags. 431 */ 432 static int 433 dofilewrite(td, fd, fp, auio, offset, flags) 434 struct thread *td; 435 int fd; 436 struct file *fp; 437 struct uio *auio; 438 off_t offset; 439 int flags; 440 { 441 ssize_t cnt; 442 int error; 443 #ifdef KTRACE 444 struct uio *ktruio = NULL; 445 #endif 446 447 auio->uio_rw = UIO_WRITE; 448 auio->uio_td = td; 449 auio->uio_offset = offset; 450 #ifdef KTRACE 451 if (KTRPOINT(td, KTR_GENIO)) 452 ktruio = cloneuio(auio); 453 #endif 454 cnt = auio->uio_resid; 455 if (fp->f_type == DTYPE_VNODE) 456 bwillwrite(); 457 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { 458 if (auio->uio_resid != cnt && (error == ERESTART || 459 error == EINTR || error == EWOULDBLOCK)) 460 error = 0; 461 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 462 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { 463 PROC_LOCK(td->td_proc); 464 psignal(td->td_proc, SIGPIPE); 465 PROC_UNLOCK(td->td_proc); 466 } 467 } 468 cnt -= auio->uio_resid; 469 #ifdef KTRACE 470 if (ktruio != NULL) { 471 ktruio->uio_resid = cnt; 472 ktrgenio(fd, UIO_WRITE, ktruio, error); 473 } 474 #endif 475 td->td_retval[0] = cnt; 476 return (error); 477 } 478 479 #ifndef _SYS_SYSPROTO_H_ 480 struct ioctl_args { 481 int fd; 482 u_long com; 483 caddr_t data; 484 }; 485 #endif 486 /* ARGSUSED */ 487 int 488 ioctl(struct thread *td, struct ioctl_args *uap) 489 { 490 u_long com; 491 int arg, error; 492 u_int size; 493 caddr_t data; 494 495 if (uap->com > 0xffffffff) { 496 printf( 497 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 498 td->td_proc->p_pid, td->td_proc->p_comm, uap->com); 499 uap->com &= 0xffffffff; 500 } 501 com = uap->com; 502 503 /* 504 * Interpret high order word to find amount of data to be 505 * copied to/from the user's address space. 506 */ 507 size = IOCPARM_LEN(com); 508 if ((size > IOCPARM_MAX) || 509 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 510 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) 511 ((com & IOC_OUT) && size == 0) || 512 #else 513 ((com & (IOC_IN | IOC_OUT)) && size == 0) || 514 #endif 515 ((com & IOC_VOID) && size > 0 && size != sizeof(int))) 516 return (ENOTTY); 517 518 if (size > 0) { 519 if (!(com & IOC_VOID)) 520 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 521 else { 522 /* Integer argument. */ 523 arg = (intptr_t)uap->data; 524 data = (void *)&arg; 525 size = 0; 526 } 527 } else 528 data = (void *)&uap->data; 529 if (com & IOC_IN) { 530 error = copyin(uap->data, data, (u_int)size); 531 if (error) { 532 if (size > 0) 533 free(data, M_IOCTLOPS); 534 return (error); 535 } 536 } else if (com & IOC_OUT) { 537 /* 538 * Zero the buffer so the user always 539 * gets back something deterministic. 
540 */ 541 bzero(data, size); 542 } 543 544 error = kern_ioctl(td, uap->fd, com, data); 545 546 if (error == 0 && (com & IOC_OUT)) 547 error = copyout(data, uap->data, (u_int)size); 548 549 if (size > 0) 550 free(data, M_IOCTLOPS); 551 return (error); 552 } 553 554 int 555 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) 556 { 557 struct file *fp; 558 struct filedesc *fdp; 559 int error; 560 int tmp; 561 562 if ((error = fget(td, fd, &fp)) != 0) 563 return (error); 564 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 565 fdrop(fp, td); 566 return (EBADF); 567 } 568 fdp = td->td_proc->p_fd; 569 switch (com) { 570 case FIONCLEX: 571 FILEDESC_LOCK_FAST(fdp); 572 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; 573 FILEDESC_UNLOCK_FAST(fdp); 574 goto out; 575 case FIOCLEX: 576 FILEDESC_LOCK_FAST(fdp); 577 fdp->fd_ofileflags[fd] |= UF_EXCLOSE; 578 FILEDESC_UNLOCK_FAST(fdp); 579 goto out; 580 case FIONBIO: 581 FILE_LOCK(fp); 582 if ((tmp = *(int *)data)) 583 fp->f_flag |= FNONBLOCK; 584 else 585 fp->f_flag &= ~FNONBLOCK; 586 FILE_UNLOCK(fp); 587 data = (void *)&tmp; 588 break; 589 case FIOASYNC: 590 FILE_LOCK(fp); 591 if ((tmp = *(int *)data)) 592 fp->f_flag |= FASYNC; 593 else 594 fp->f_flag &= ~FASYNC; 595 FILE_UNLOCK(fp); 596 data = (void *)&tmp; 597 break; 598 } 599 600 error = fo_ioctl(fp, com, data, td->td_ucred, td); 601 out: 602 fdrop(fp, td); 603 return (error); 604 } 605 606 /* 607 * sellock and selwait are initialized in selectinit() via SYSINIT. 
 */
struct mtx	sellock;
struct cv	selwait;
u_int	nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * Select system call: copy in the optional timeout and hand off to
 * kern_select().
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Kernel back end of select(2).
 *
 * Stages the three fd_sets in a single bit buffer (input halves and
 * output halves), then loops: scan the descriptors with selscan(),
 * and if nothing is ready sleep on selwait until a timeout or a
 * doselwakeup() from a driver.  The TDF_SELECT flag and the nselcoll
 * collision counter are rechecked before sleeping because events can
 * arrive while sellock is dropped during the scan.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	/* Clamp nd to the size of the descriptor table. */
	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Input halves live in the upper half of the buffer,
	 * output halves in the lower half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* 0/0 means "no timeout": block until something is ready. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h so the tick count cannot overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (error == 0) means "something may be ready". */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Poll every descriptor whose bit is set in one of the three input
 * sets; set the corresponding output bit for each ready descriptor.
 * The ready count is returned via td->td_retval[0]; EBADF is returned
 * if a set references a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * Poll system call.
 *
 * Same scan/sleep/retry structure as kern_select() (see the comments
 * there), but operates on a copied-in pollfd array via pollscan()
 * instead of fd_set bitmaps.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* 0/0 means "no timeout": block until something is ready. */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h so the tick count cannot overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (error == 0) means "something may be ready". */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents fields back to userland. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

/*
 * Poll each pollfd entry once, filling in its revents field.  Unlike
 * selscan(), a bad descriptor is not an error here: it is reported
 * per-entry as POLLNVAL.  The count of entries with nonzero revents
 * is returned via td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fd means "skip this entry" per poll(2). */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * The argument layouts match, so simply forward to poll().
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects we were
 * polling.
 *
 * This code assumes that the underlying owner of the selinfo structure will
 * hold sellock before it changes it, and that it will unlink itself from our
 * list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Collision: doselwakeup() will broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	/* -1 is passed through to cv_broadcastpri() as the priority. */
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * On a collision every waiter must be woken, since we do not
	 * know which of them the event belongs to; bump the global
	 * collision counter so scanning threads retry.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	/* Clearing TDF_SELECT tells the owner its scan results are stale. */
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * Set up the condition variable and mutex used by select/poll;
 * runs at SI_SUB_LOCK time via the SYSINIT above.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}