/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);

/*
 * Read system call.
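 *
 * The struct read_args below mirrors the userland prototype
 * ssize_t read(int fd, void *buf, size_t nbyte); the number of bytes
 * actually transferred is handed back through td->td_retval[0].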
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
		    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * Pread system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else {
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return(error);
}

/*
 * Code common for read and pread
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (nbyte == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
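 *
 * Like read(), but the data is scattered into the uap->iovcnt buffers
 * described by the iovec array at uap->iovp; copyinuio() below copies
 * that array in and builds a single struct uio for fo_read().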
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct file *fp;
	struct uio *auio = NULL;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, uap->fd, &fp);
	if (error)
		return (error);
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		free(auio, M_IOV);
		fdrop(fp, td);
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(uap->fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	free(auio, M_IOV);
	fdrop(fp, td);
	return (error);
}

/*
 * Write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
		    (off_t)-1, 0);
		fdrop(fp, td);
	} else {
		error = EBADF;	/* XXX this can't be right */
	}
	return(error);
}

/*
 * Pwrite system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
			error = ESPIPE;
		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
			error = EINVAL;
		else {
			error = dofilewrite(td, fp, uap->fd, uap->buf,
			    uap->nbyte, uap->offset, FOF_OFFSET);
		}
		fdrop(fp, td);
	} else {
		error = EBADF;	/* this can't be right */
	}
	return(error);
}

/*
 * Code common for write and pwrite
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Gather write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(struct thread *td, struct writev_args *uap)
{
	struct file *fp;
	struct uio *auio = NULL;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_write(td, uap->fd, &fp);
	if (error)
		return (EBADF);
	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	fdrop(fp, td);
	free(auio, M_IOV);
	return (error);
}

/*
 * Ioctl system call
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error = 0;
	u_int size;
	caddr_t data, memp;
	int tmp;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
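	 *
	 * For example, a command defined as _IOR('f', 127, int) (FIONREAD)
	 * encodes sizeof(int) in its IOCPARM_LEN field and sets IOC_OUT, so
	 * size below becomes sizeof(int) and the result is copied back out
	 * to uap->data once fo_ioctl() returns.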
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
	    ((com & IOC_VOID) && size > 0) ||
	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	if (size > 0) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx sellock;
struct cv selwait;
u_int nselcoll;		/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets. Use the
	 * preallocated auto buffer if possible.
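	 *
	 * For example, with nd = 70 and only an "in" set supplied, nfdbits
	 * is rounded up to 128 on a platform with a 64-bit fd_mask, so
	 * ncpbytes is 16 and nbufbytes is 2 * 16 = 32 bytes, which easily
	 * fits in s_selbits.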
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define getbits(name, x) \
	do { \
		if (name == NULL) \
			ibits[x] = NULL; \
		else { \
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
			obits[x] = sbp; \
			sbp += ncpbytes / sizeof *sbp; \
			error = copyin(name, ibits[x], ncpbytes); \
			if (error != 0) \
				goto done_nosellock; \
		} \
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
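	/*
	 * flag[] maps each of the three descriptor sets onto the poll
	 * event passed to fo_poll(): read -> POLLRDNORM,
	 * write -> POLLWRNORM, except -> POLLRDBAND.
	 */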
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus. We have fd limits, but that is not
	 * really related to the size of the pollfd array. Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits. We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone, as we have already pointed it at us and added it
	 * to our list.
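	 *
	 * On a collision, doselwakeup() later bumps nselcoll and broadcasts
	 * on selwait so that every selecting thread rescans its descriptors.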
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}