1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ktrace.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/sysproto.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/fcntl.h> 48 #include <sys/file.h> 49 #include <sys/proc.h> 50 #include <sys/signalvar.h> 51 #include <sys/socketvar.h> 52 #include <sys/uio.h> 53 #include <sys/kernel.h> 54 #include <sys/limits.h> 55 #include <sys/malloc.h> 56 #include <sys/poll.h> 57 #include <sys/resourcevar.h> 58 #include <sys/selinfo.h> 59 #include <sys/sleepqueue.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysent.h> 63 #include <sys/vnode.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/condvar.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #include <vm/vm.h> 71 #include <vm/vm_page.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 static void doselwakeup(struct selinfo *, int); 84 85 /* 86 * Read system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	/*
	 * Hold a read reference on the descriptor, do the transfer at the
	 * file's implicit offset (offset -1, flags 0), then drop the ref.
	 */
	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
			    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * Pread system call: read from an explicit file offset without
 * disturbing the descriptor's seek position.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;		/* explicit padding for 64-bit offset alignment */
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	/* Positional I/O is meaningless on non-seekable objects (pipes etc). */
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
		/* Negative offsets are allowed only on character devices. */
		error = EINVAL;
	else {
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return(error);
}

/*
 * Code common for read and pread.
 *
 * Builds a single-element userspace uio describing (buf, nbyte) and hands
 * it to the file's fo_read method.  On success td_retval[0] is set to the
 * number of bytes actually transferred.  The caller holds the reference
 * on fp; this function does not drop it.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (nbyte == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* uio_resid is int-sized in consumers; reject oversized requests. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* Snapshot the uio before fo_read consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some data was transferred before the interruption,
		 * report the partial transfer as success rather than the
		 * restartable error.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;	/* bytes actually read */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	/* copyinuio allocates the uio (M_IOV); we must free it on all paths. */
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

/*
 * Kernel-side worker for readv: performs a vectored read described by
 * auio on descriptor fd.  The caller owns auio; this function consumes
 * its resid but does not free it.
 */
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		fdrop(fp, td);
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	/* Snapshot the uio before fo_read consumes it, for ktrace logging. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		/* Partial transfer before interruption counts as success. */
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;	/* bytes actually read */
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	fdrop(fp, td);
	return (error);
}

/*
 * Write system call
 */
282 #ifndef _SYS_SYSPROTO_H_ 283 struct write_args { 284 int fd; 285 const void *buf; 286 size_t nbyte; 287 }; 288 #endif 289 /* 290 * MPSAFE 291 */ 292 int 293 write(td, uap) 294 struct thread *td; 295 struct write_args *uap; 296 { 297 struct file *fp; 298 int error; 299 300 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 301 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 302 (off_t)-1, 0); 303 fdrop(fp, td); 304 } else { 305 error = EBADF; /* XXX this can't be right */ 306 } 307 return(error); 308 } 309 310 /* 311 * Pwrite system call 312 */ 313 #ifndef _SYS_SYSPROTO_H_ 314 struct pwrite_args { 315 int fd; 316 const void *buf; 317 size_t nbyte; 318 int pad; 319 off_t offset; 320 }; 321 #endif 322 /* 323 * MPSAFE 324 */ 325 int 326 pwrite(td, uap) 327 struct thread *td; 328 struct pwrite_args *uap; 329 { 330 struct file *fp; 331 int error; 332 333 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 334 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 335 error = ESPIPE; 336 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 337 error = EINVAL; 338 else { 339 error = dofilewrite(td, fp, uap->fd, uap->buf, 340 uap->nbyte, uap->offset, FOF_OFFSET); 341 } 342 fdrop(fp, td); 343 } else { 344 error = EBADF; /* this can't be right */ 345 } 346 return(error); 347 } 348 349 static int 350 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 351 struct thread *td; 352 struct file *fp; 353 int fd, flags; 354 const void *buf; 355 size_t nbyte; 356 off_t offset; 357 { 358 struct uio auio; 359 struct iovec aiov; 360 ssize_t cnt; 361 long error = 0; 362 #ifdef KTRACE 363 struct uio *ktruio = NULL; 364 #endif 365 366 aiov.iov_base = (void *)(uintptr_t)buf; 367 aiov.iov_len = nbyte; 368 auio.uio_iov = &aiov; 369 auio.uio_iovcnt = 1; 370 auio.uio_offset = offset; 371 if (nbyte > INT_MAX) 372 return (EINVAL); 373 auio.uio_resid = nbyte; 374 auio.uio_rw = UIO_WRITE; 375 auio.uio_segflg = UIO_USERSPACE; 376 auio.uio_td = td; 377 #ifdef KTRACE 378 if (KTRPOINT(td, 
KTR_GENIO)) 379 ktruio = cloneuio(&auio); 380 #endif 381 cnt = nbyte; 382 if (fp->f_type == DTYPE_VNODE) 383 bwillwrite(); 384 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 385 if (auio.uio_resid != cnt && (error == ERESTART || 386 error == EINTR || error == EWOULDBLOCK)) 387 error = 0; 388 /* Socket layer is responsible for issuing SIGPIPE. */ 389 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 390 PROC_LOCK(td->td_proc); 391 psignal(td->td_proc, SIGPIPE); 392 PROC_UNLOCK(td->td_proc); 393 } 394 } 395 cnt -= auio.uio_resid; 396 #ifdef KTRACE 397 if (ktruio != NULL) { 398 ktruio->uio_resid = cnt; 399 ktrgenio(fd, UIO_WRITE, ktruio, error); 400 } 401 #endif 402 td->td_retval[0] = cnt; 403 return (error); 404 } 405 406 /* 407 * Gather write system call 408 */ 409 #ifndef _SYS_SYSPROTO_H_ 410 struct writev_args { 411 int fd; 412 struct iovec *iovp; 413 u_int iovcnt; 414 }; 415 #endif 416 /* 417 * MPSAFE 418 */ 419 int 420 writev(struct thread *td, struct writev_args *uap) 421 { 422 struct uio *auio; 423 int error; 424 425 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 426 if (error) 427 return (error); 428 error = kern_writev(td, uap->fd, auio); 429 free(auio, M_IOV); 430 return (error); 431 } 432 433 int 434 kern_writev(struct thread *td, int fd, struct uio *auio) 435 { 436 struct file *fp; 437 long cnt; 438 int error; 439 #ifdef KTRACE 440 struct uio *ktruio = NULL; 441 #endif 442 443 error = fget_write(td, fd, &fp); 444 if (error) 445 return (EBADF); 446 auio->uio_rw = UIO_WRITE; 447 auio->uio_td = td; 448 #ifdef KTRACE 449 if (KTRPOINT(td, KTR_GENIO)) 450 ktruio = cloneuio(auio); 451 #endif 452 cnt = auio->uio_resid; 453 if (fp->f_type == DTYPE_VNODE) 454 bwillwrite(); 455 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) { 456 if (auio->uio_resid != cnt && (error == ERESTART || 457 error == EINTR || error == EWOULDBLOCK)) 458 error = 0; 459 if (error == EPIPE) { 460 PROC_LOCK(td->td_proc); 461 psignal(td->td_proc, SIGPIPE); 462 
PROC_UNLOCK(td->td_proc); 463 } 464 } 465 cnt -= auio->uio_resid; 466 #ifdef KTRACE 467 if (ktruio != NULL) { 468 ktruio->uio_resid = cnt; 469 ktrgenio(fd, UIO_WRITE, ktruio, error); 470 } 471 #endif 472 td->td_retval[0] = cnt; 473 fdrop(fp, td); 474 return (error); 475 } 476 477 /* 478 * Ioctl system call 479 */ 480 #ifndef _SYS_SYSPROTO_H_ 481 struct ioctl_args { 482 int fd; 483 u_long com; 484 caddr_t data; 485 }; 486 #endif 487 /* 488 * MPSAFE 489 */ 490 /* ARGSUSED */ 491 int 492 ioctl(struct thread *td, struct ioctl_args *uap) 493 { 494 struct file *fp; 495 struct filedesc *fdp; 496 u_long com; 497 int error = 0; 498 u_int size; 499 caddr_t data, memp; 500 int tmp; 501 502 if (uap->com > 0xffffffff) { 503 printf( 504 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 505 td->td_proc->p_pid, td->td_proc->p_comm, uap->com); 506 uap->com &= 0xffffffff; 507 } 508 if ((error = fget(td, uap->fd, &fp)) != 0) 509 return (error); 510 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 511 fdrop(fp, td); 512 return (EBADF); 513 } 514 fdp = td->td_proc->p_fd; 515 switch (com = uap->com) { 516 case FIONCLEX: 517 FILEDESC_LOCK_FAST(fdp); 518 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 519 FILEDESC_UNLOCK_FAST(fdp); 520 fdrop(fp, td); 521 return (0); 522 case FIOCLEX: 523 FILEDESC_LOCK_FAST(fdp); 524 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 525 FILEDESC_UNLOCK_FAST(fdp); 526 fdrop(fp, td); 527 return (0); 528 } 529 530 /* 531 * Interpret high order word to find amount of data to be 532 * copied to/from the user's address space. 
533 */ 534 size = IOCPARM_LEN(com); 535 if ((size > IOCPARM_MAX) || 536 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 537 ((com & IOC_VOID) && size > 0) || 538 ((com & (IOC_IN | IOC_OUT)) && size == 0)) { 539 fdrop(fp, td); 540 return (ENOTTY); 541 } 542 543 if (size > 0) { 544 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 545 data = memp; 546 } else { 547 memp = NULL; 548 data = (void *)&uap->data; 549 } 550 if (com & IOC_IN) { 551 error = copyin(uap->data, data, (u_int)size); 552 if (error) { 553 free(memp, M_IOCTLOPS); 554 fdrop(fp, td); 555 return (error); 556 } 557 } else if (com & IOC_OUT) { 558 /* 559 * Zero the buffer so the user always 560 * gets back something deterministic. 561 */ 562 bzero(data, size); 563 } 564 565 if (com == FIONBIO) { 566 FILE_LOCK(fp); 567 if ((tmp = *(int *)data)) 568 fp->f_flag |= FNONBLOCK; 569 else 570 fp->f_flag &= ~FNONBLOCK; 571 FILE_UNLOCK(fp); 572 data = (void *)&tmp; 573 } else if (com == FIOASYNC) { 574 FILE_LOCK(fp); 575 if ((tmp = *(int *)data)) 576 fp->f_flag |= FASYNC; 577 else 578 fp->f_flag &= ~FASYNC; 579 FILE_UNLOCK(fp); 580 data = (void *)&tmp; 581 } 582 583 error = fo_ioctl(fp, com, data, td->td_ucred, td); 584 585 if (error == 0 && (com & IOC_OUT)) 586 error = copyout(data, uap->data, (u_int)size); 587 588 if (memp != NULL) 589 free(memp, M_IOCTLOPS); 590 fdrop(fp, td); 591 return (error); 592 } 593 594 /* 595 * sellock and selwait are initialized in selectinit() via SYSINIT. 596 */ 597 struct mtx sellock; 598 struct cv selwait; 599 u_int nselcoll; /* Select collisions since boot */ 600 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 601 602 /* 603 * Select system call. 
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	/* Copy in the optional timeout, then defer to kern_select(). */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Kernel-side worker for select(2).  Copies in the three fd_sets,
 * scans them via selscan(), and sleeps on selwait until an event,
 * timeout, or signal.  The TDF_SELECT flag plus the nselcoll counter
 * implement the classic collision/retry protocol: if either changes
 * while sellock is dropped during the scan, an event may have been
 * missed and the scan is repeated.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;	/* input copy + output bits */
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);	/* zero the output halves */

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken up: rescan for the event */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is reported as zero ready fds */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

/*
 * Poll each descriptor present in the three input bitmaps and set the
 * corresponding bit in the output bitmaps for each ready descriptor.
 * td_retval[0] is set to the total number of ready bits.  Returns
 * EBADF if a set bit names a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	/* Use the on-stack array when the request fits; else allocate. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Same collision/retry protocol as kern_select(). */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken up: rescan for the event */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is reported as zero ready fds */
	if (error == 0) {
		/* Copy the revents fields back to the user's array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

/*
 * Poll every entry in the pollfd array, filling in revents and
 * counting ready descriptors into td_retval[0].  Invalid descriptors
 * are reported via POLLNVAL rather than an error return.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll(2) semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts match, so simply forward to native poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* SI_COLL forces doselwakeup() to broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);	/* -1: do not adjust priority */
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/*
		 * Multiple threads were interested in this selinfo; bump
		 * the collision counter (which forces their rescans) and
		 * broadcast to every waiter on selwait.
		 */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it directly. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/*
 * One-time initialization of the select machinery, run via SYSINIT
 * at SI_SUB_LOCK time.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}