1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ktrace.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/sysproto.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/fcntl.h> 48 #include <sys/file.h> 49 #include <sys/proc.h> 50 #include <sys/signalvar.h> 51 #include <sys/socketvar.h> 52 #include <sys/uio.h> 53 #include <sys/kernel.h> 54 #include <sys/limits.h> 55 #include <sys/malloc.h> 56 #include <sys/poll.h> 57 #include <sys/resourcevar.h> 58 #include <sys/selinfo.h> 59 #include <sys/sleepqueue.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysent.h> 63 #include <sys/vnode.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/condvar.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #include <vm/vm.h> 71 #include <vm/vm_page.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 static void doselwakeup(struct selinfo *, int); 84 85 /* 86 * Read system call. 
87 */ 88 #ifndef _SYS_SYSPROTO_H_ 89 struct read_args { 90 int fd; 91 void *buf; 92 size_t nbyte; 93 }; 94 #endif 95 /* 96 * MPSAFE 97 */ 98 int 99 read(td, uap) 100 struct thread *td; 101 struct read_args *uap; 102 { 103 struct file *fp; 104 int error; 105 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 return(error); 112 } 113 114 /* 115 * Pread system call 116 */ 117 #ifndef _SYS_SYSPROTO_H_ 118 struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124 }; 125 #endif 126 /* 127 * MPSAFE 128 */ 129 int 130 pread(td, uap) 131 struct thread *td; 132 struct pread_args *uap; 133 { 134 struct file *fp; 135 int error; 136 137 if ((error = fget_read(td, uap->fd, &fp)) != 0) 138 return (error); 139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 140 error = ESPIPE; 141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 142 error = EINVAL; 143 else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149 } 150 151 /* 152 * Code common for read and pread 153 */ 154 static int 155 dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162 { 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166 #ifdef KTRACE 167 struct uio *ktruio = NULL; 168 #endif 169 170 aiov.iov_base = buf; 171 aiov.iov_len = nbyte; 172 auio.uio_iov = &aiov; 173 auio.uio_iovcnt = 1; 174 auio.uio_offset = offset; 175 if (nbyte > INT_MAX) 176 return (EINVAL); 177 auio.uio_resid = nbyte; 178 auio.uio_rw = UIO_READ; 179 auio.uio_segflg = UIO_USERSPACE; 180 auio.uio_td = td; 181 #ifdef KTRACE 182 if (KTRPOINT(td, KTR_GENIO)) 183 ktruio = cloneuio(&auio); 184 #endif 185 cnt = nbyte; 186 187 if ((error = fo_read(fp, &auio, td->td_ucred, flags, 
td))) { 188 if (auio.uio_resid != cnt && (error == ERESTART || 189 error == EINTR || error == EWOULDBLOCK)) 190 error = 0; 191 } 192 cnt -= auio.uio_resid; 193 #ifdef KTRACE 194 if (ktruio != NULL) { 195 ktruio->uio_resid = cnt; 196 ktrgenio(fd, UIO_READ, ktruio, error); 197 } 198 #endif 199 td->td_retval[0] = cnt; 200 return (error); 201 } 202 203 /* 204 * Scatter read system call. 205 */ 206 #ifndef _SYS_SYSPROTO_H_ 207 struct readv_args { 208 int fd; 209 struct iovec *iovp; 210 u_int iovcnt; 211 }; 212 #endif 213 /* 214 * MPSAFE 215 */ 216 int 217 readv(struct thread *td, struct readv_args *uap) 218 { 219 struct file *fp; 220 struct uio *auio = NULL; 221 long cnt; 222 int error; 223 #ifdef KTRACE 224 struct uio *ktruio = NULL; 225 #endif 226 227 error = fget_read(td, uap->fd, &fp); 228 if (error) 229 return (error); 230 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 231 if (error) { 232 fdrop(fp, td); 233 return (error); 234 } 235 auio->uio_rw = UIO_READ; 236 auio->uio_td = td; 237 #ifdef KTRACE 238 if (KTRPOINT(td, KTR_GENIO)) 239 ktruio = cloneuio(auio); 240 #endif 241 cnt = auio->uio_resid; 242 if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) { 243 if (auio->uio_resid != cnt && (error == ERESTART || 244 error == EINTR || error == EWOULDBLOCK)) 245 error = 0; 246 } 247 cnt -= auio->uio_resid; 248 #ifdef KTRACE 249 if (ktruio != NULL) { 250 ktruio->uio_resid = cnt; 251 ktrgenio(uap->fd, UIO_READ, ktruio, error); 252 } 253 #endif 254 td->td_retval[0] = cnt; 255 free(auio, M_IOV); 256 fdrop(fp, td); 257 return (error); 258 } 259 260 /* 261 * Write system call 262 */ 263 #ifndef _SYS_SYSPROTO_H_ 264 struct write_args { 265 int fd; 266 const void *buf; 267 size_t nbyte; 268 }; 269 #endif 270 /* 271 * MPSAFE 272 */ 273 int 274 write(td, uap) 275 struct thread *td; 276 struct write_args *uap; 277 { 278 struct file *fp; 279 int error; 280 281 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 282 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 283 
(off_t)-1, 0); 284 fdrop(fp, td); 285 } else { 286 error = EBADF; /* XXX this can't be right */ 287 } 288 return(error); 289 } 290 291 /* 292 * Pwrite system call 293 */ 294 #ifndef _SYS_SYSPROTO_H_ 295 struct pwrite_args { 296 int fd; 297 const void *buf; 298 size_t nbyte; 299 int pad; 300 off_t offset; 301 }; 302 #endif 303 /* 304 * MPSAFE 305 */ 306 int 307 pwrite(td, uap) 308 struct thread *td; 309 struct pwrite_args *uap; 310 { 311 struct file *fp; 312 int error; 313 314 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 315 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 316 error = ESPIPE; 317 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 318 error = EINVAL; 319 else { 320 error = dofilewrite(td, fp, uap->fd, uap->buf, 321 uap->nbyte, uap->offset, FOF_OFFSET); 322 } 323 fdrop(fp, td); 324 } else { 325 error = EBADF; /* this can't be right */ 326 } 327 return(error); 328 } 329 330 static int 331 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 332 struct thread *td; 333 struct file *fp; 334 int fd, flags; 335 const void *buf; 336 size_t nbyte; 337 off_t offset; 338 { 339 struct uio auio; 340 struct iovec aiov; 341 long cnt, error = 0; 342 #ifdef KTRACE 343 struct uio *ktruio = NULL; 344 #endif 345 346 aiov.iov_base = (void *)(uintptr_t)buf; 347 aiov.iov_len = nbyte; 348 auio.uio_iov = &aiov; 349 auio.uio_iovcnt = 1; 350 auio.uio_offset = offset; 351 if (nbyte > INT_MAX) 352 return (EINVAL); 353 auio.uio_resid = nbyte; 354 auio.uio_rw = UIO_WRITE; 355 auio.uio_segflg = UIO_USERSPACE; 356 auio.uio_td = td; 357 #ifdef KTRACE 358 if (KTRPOINT(td, KTR_GENIO)) 359 ktruio = cloneuio(&auio); 360 #endif 361 cnt = nbyte; 362 if (fp->f_type == DTYPE_VNODE) 363 bwillwrite(); 364 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 365 if (auio.uio_resid != cnt && (error == ERESTART || 366 error == EINTR || error == EWOULDBLOCK)) 367 error = 0; 368 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 369 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 370 PROC_LOCK(td->td_proc); 371 psignal(td->td_proc, SIGPIPE); 372 PROC_UNLOCK(td->td_proc); 373 } 374 } 375 cnt -= auio.uio_resid; 376 #ifdef KTRACE 377 if (ktruio != NULL) { 378 ktruio->uio_resid = cnt; 379 ktrgenio(fd, UIO_WRITE, ktruio, error); 380 } 381 #endif 382 td->td_retval[0] = cnt; 383 return (error); 384 } 385 386 /* 387 * Gather write system call 388 */ 389 #ifndef _SYS_SYSPROTO_H_ 390 struct writev_args { 391 int fd; 392 struct iovec *iovp; 393 u_int iovcnt; 394 }; 395 #endif 396 /* 397 * MPSAFE 398 */ 399 int 400 writev(struct thread *td, struct writev_args *uap) 401 { 402 struct file *fp; 403 struct uio *auio = NULL; 404 long cnt; 405 int error; 406 #ifdef KTRACE 407 struct uio *ktruio = NULL; 408 #endif 409 410 error = fget_write(td, uap->fd, &fp); 411 if (error) 412 return (EBADF); 413 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 414 if (error) { 415 fdrop(fp, td); 416 return (error); 417 } 418 auio->uio_rw = UIO_WRITE; 419 auio->uio_td = td; 420 #ifdef KTRACE 421 if (KTRPOINT(td, KTR_GENIO)) 422 ktruio = cloneuio(auio); 423 #endif 424 cnt = auio->uio_resid; 425 if (fp->f_type == DTYPE_VNODE) 426 bwillwrite(); 427 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) { 428 if (auio->uio_resid != cnt && (error == ERESTART || 429 error == EINTR || error == EWOULDBLOCK)) 430 error = 0; 431 if (error == EPIPE) { 432 PROC_LOCK(td->td_proc); 433 psignal(td->td_proc, SIGPIPE); 434 PROC_UNLOCK(td->td_proc); 435 } 436 } 437 cnt -= auio->uio_resid; 438 #ifdef KTRACE 439 if (ktruio != NULL) { 440 ktruio->uio_resid = cnt; 441 ktrgenio(uap->fd, UIO_WRITE, ktruio, error); 442 } 443 #endif 444 td->td_retval[0] = cnt; 445 fdrop(fp, td); 446 free(auio, M_IOV); 447 return (error); 448 } 449 450 /* 451 * Ioctl system call 452 */ 453 #ifndef _SYS_SYSPROTO_H_ 454 struct ioctl_args { 455 int fd; 456 u_long com; 457 caddr_t data; 458 }; 459 #endif 460 /* 461 * MPSAFE 462 */ 463 /* ARGSUSED */ 464 int 465 
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
	/* Small ioctl argument buffers live on the stack, long-aligned. */
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	/* XXX: the whole ioctl path still runs under Giant. */
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX manipulate the per-descriptor close-on-exec
	 * flag and never reach the file's fo_ioctl method.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer when it is big enough; else malloc. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* No size encoded: pass the raw pointer value. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync before informing the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx sellock;
struct cv selwait;
u_int nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	/* Copy in the optional timeout, then defer to kern_select(). */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Common select() implementation: copies in the three fd_sets, scans
 * them via selscan(), and sleeps on selwait (retrying on collisions)
 * until an event, timeout, or signal.  Result bits are copied back out.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Output bit sets occupy the first half of the buffer. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/*
	 * Record the collision count and mark ourselves as selecting
	 * before dropping sellock for the scan; a wakeup that races the
	 * scan clears TDF_SELECT or bumps nselcoll, which we re-check
	 * below before sleeping.
	 */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24 hours to avoid tvtohz() overflow. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Poll each descriptor whose bit is set in the input sets and set the
 * corresponding output bit when it is ready.  The count of ready
 * descriptors is returned in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* Unlike poll(), a bad fd aborts select(). */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	/* Small arrays stay on the stack; large ones are malloc'd. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an absolute deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Same scan/sleep/retry protocol as kern_select(). */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy revents back to the user's pollfd array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Fill in revents for each pollfd entry; bad descriptors get POLLNVAL
 * (they do not fail the call, unlike selscan()).  The count of entries
 * with nonzero revents is returned in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored per poll() semantics. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1013 */ 1014 #ifndef _SYS_SYSPROTO_H_ 1015 struct openbsd_poll_args { 1016 struct pollfd *fds; 1017 u_int nfds; 1018 int timeout; 1019 }; 1020 #endif 1021 /* 1022 * MPSAFE 1023 */ 1024 int 1025 openbsd_poll(td, uap) 1026 register struct thread *td; 1027 register struct openbsd_poll_args *uap; 1028 { 1029 return (poll(td, (struct poll_args *)uap)); 1030 } 1031 1032 /* 1033 * Remove the references to the thread from all of the objects 1034 * we were polling. 1035 * 1036 * This code assumes that the underlying owner of the selinfo 1037 * structure will hold sellock before it changes it, and that 1038 * it will unlink itself from our list if it goes away. 1039 */ 1040 void 1041 clear_selinfo_list(td) 1042 struct thread *td; 1043 { 1044 struct selinfo *si; 1045 1046 mtx_assert(&sellock, MA_OWNED); 1047 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1048 si->si_thread = NULL; 1049 TAILQ_INIT(&td->td_selq); 1050 } 1051 1052 /* 1053 * Record a select request. 1054 */ 1055 void 1056 selrecord(selector, sip) 1057 struct thread *selector; 1058 struct selinfo *sip; 1059 { 1060 1061 mtx_lock(&sellock); 1062 /* 1063 * If the selinfo's thread pointer is NULL then take ownership of it. 1064 * 1065 * If the thread pointer is not NULL and it points to another 1066 * thread, then we have a collision. 1067 * 1068 * If the thread pointer is not NULL and points back to us then leave 1069 * it alone as we've already added pointed it at us and added it to 1070 * our list. 1071 */ 1072 if (sip->si_thread == NULL) { 1073 sip->si_thread = selector; 1074 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1075 } else if (sip->si_thread != selector) { 1076 sip->si_flags |= SI_COLL; 1077 } 1078 1079 mtx_unlock(&sellock); 1080 } 1081 1082 /* Wake up a selecting thread. */ 1083 void 1084 selwakeup(sip) 1085 struct selinfo *sip; 1086 { 1087 doselwakeup(sip, -1); 1088 } 1089 1090 /* Wake up a selecting thread, and set its priority. 
 */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * On a collision we cannot tell which waiter cares, so bump the
	 * collision counter (forcing rescans) and broadcast to everyone
	 * sleeping on selwait.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
/*
 * One-time initialization of the select machinery (sellock, selwait),
 * run early via SYSINIT.
 */
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}