1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ktrace.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/sysproto.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/fcntl.h> 48 #include <sys/file.h> 49 #include <sys/proc.h> 50 #include <sys/signalvar.h> 51 #include <sys/socketvar.h> 52 #include <sys/uio.h> 53 #include <sys/kernel.h> 54 #include <sys/limits.h> 55 #include <sys/malloc.h> 56 #include <sys/poll.h> 57 #include <sys/resourcevar.h> 58 #include <sys/selinfo.h> 59 #include <sys/sleepqueue.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysent.h> 63 #include <sys/vnode.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/condvar.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #include <vm/vm.h> 71 #include <vm/vm_page.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 static void doselwakeup(struct selinfo *, int); 84 85 /* 86 * Read system call. 87 */ 88 #ifndef _SYS_SYSPROTO_H_ 89 struct read_args { 90 int fd; 91 void *buf; 92 size_t nbyte; 93 }; 94 #endif 95 /* 96 * MPSAFE 97 */ 98 int 99 read(td, uap) 100 struct thread *td; 101 struct read_args *uap; 102 { 103 struct file *fp; 104 int error; 105 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 return(error); 112 } 113 114 /* 115 * Pread system call 116 */ 117 #ifndef _SYS_SYSPROTO_H_ 118 struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124 }; 125 #endif 126 /* 127 * MPSAFE 128 */ 129 int 130 pread(td, uap) 131 struct thread *td; 132 struct pread_args *uap; 133 { 134 struct file *fp; 135 int error; 136 137 if ((error = fget_read(td, uap->fd, &fp)) != 0) 138 return (error); 139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 140 error = ESPIPE; 141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 142 error = EINVAL; 143 else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149 } 150 151 /* 152 * Code common for read and pread 153 */ 154 static int 155 dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162 { 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166 #ifdef KTRACE 167 struct uio *ktruio = NULL; 168 #endif 169 170 /* Finish zero length reads right here */ 171 if (nbyte == 0) { 172 td->td_retval[0] = 0; 173 return(0); 174 } 175 aiov.iov_base = buf; 176 aiov.iov_len = nbyte; 177 auio.uio_iov = &aiov; 178 auio.uio_iovcnt = 1; 179 auio.uio_offset = offset; 180 if (nbyte > INT_MAX) 181 return (EINVAL); 182 auio.uio_resid = nbyte; 183 auio.uio_rw = UIO_READ; 184 auio.uio_segflg = UIO_USERSPACE; 185 auio.uio_td = td; 186 #ifdef KTRACE 187 if (KTRPOINT(td, KTR_GENIO)) 188 ktruio = cloneuio(&auio); 189 #endif 190 cnt = nbyte; 191 192 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 193 if (auio.uio_resid != cnt && (error == ERESTART || 194 error == EINTR || error == EWOULDBLOCK)) 195 error = 0; 196 } 197 cnt -= auio.uio_resid; 198 #ifdef KTRACE 199 if (ktruio != NULL) { 200 ktruio->uio_resid = cnt; 201 ktrgenio(fd, UIO_READ, ktruio, error); 202 } 203 #endif 204 td->td_retval[0] = cnt; 205 return (error); 206 } 207 208 /* 209 * Scatter read system call. 210 */ 211 #ifndef _SYS_SYSPROTO_H_ 212 struct readv_args { 213 int fd; 214 struct iovec *iovp; 215 u_int iovcnt; 216 }; 217 #endif 218 /* 219 * MPSAFE 220 */ 221 int 222 readv(struct thread *td, struct readv_args *uap) 223 { 224 struct file *fp; 225 struct uio *auio = NULL; 226 long cnt; 227 int error; 228 #ifdef KTRACE 229 struct uio *ktruio = NULL; 230 #endif 231 232 error = fget_read(td, uap->fd, &fp); 233 if (error) 234 return (error); 235 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 236 if (error) { 237 fdrop(fp, td); 238 return (error); 239 } 240 /* Finish zero length reads right here */ 241 if (auio->uio_resid == 0) { 242 td->td_retval[0] = 0; 243 free(auio, M_IOV); 244 fdrop(fp, td); 245 return(0); 246 } 247 auio->uio_rw = UIO_READ; 248 auio->uio_td = td; 249 #ifdef KTRACE 250 if (KTRPOINT(td, KTR_GENIO)) 251 ktruio = cloneuio(auio); 252 #endif 253 cnt = auio->uio_resid; 254 if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) { 255 if (auio->uio_resid != cnt && (error == ERESTART || 256 error == EINTR || error == EWOULDBLOCK)) 257 error = 0; 258 } 259 cnt -= auio->uio_resid; 260 #ifdef KTRACE 261 if (ktruio != NULL) { 262 ktruio->uio_resid = cnt; 263 ktrgenio(uap->fd, UIO_READ, ktruio, error); 264 } 265 #endif 266 td->td_retval[0] = cnt; 267 free(auio, M_IOV); 268 fdrop(fp, td); 269 return (error); 270 } 271 272 /* 273 * Write system call 274 */ 275 #ifndef _SYS_SYSPROTO_H_ 276 struct write_args { 277 int fd; 278 const void *buf; 279 size_t nbyte; 280 }; 281 #endif 282 /* 283 * MPSAFE 284 */ 285 int 286 write(td, uap) 287 struct thread *td; 288 struct write_args *uap; 289 { 290 struct file *fp; 291 int error; 292 293 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 294 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 295 (off_t)-1, 0); 296 fdrop(fp, td); 297 } else { 298 error = EBADF; /* XXX this can't be right */ 299 } 300 return(error); 301 } 302 303 /* 304 * Pwrite system call 305 */ 306 #ifndef _SYS_SYSPROTO_H_ 307 struct pwrite_args { 308 int fd; 309 const void *buf; 310 size_t nbyte; 311 int pad; 312 off_t offset; 313 }; 314 #endif 315 /* 316 * MPSAFE 317 */ 318 int 319 pwrite(td, uap) 320 struct thread *td; 321 struct pwrite_args *uap; 322 { 323 struct file *fp; 324 int error; 325 326 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 327 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 328 error = ESPIPE; 329 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 330 error = EINVAL; 331 else { 332 error = dofilewrite(td, fp, uap->fd, uap->buf, 333 uap->nbyte, uap->offset, FOF_OFFSET); 334 } 335 fdrop(fp, td); 336 } else { 337 error = EBADF; /* this can't be right */ 338 } 339 return(error); 340 } 341 342 static int 343 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 344 struct thread *td; 345 struct file *fp; 346 int fd, flags; 347 const void *buf; 348 size_t nbyte; 349 off_t offset; 350 { 351 struct uio auio; 352 struct iovec aiov; 353 long cnt, error = 0; 354 #ifdef KTRACE 355 struct uio *ktruio = NULL; 356 #endif 357 358 aiov.iov_base = (void *)(uintptr_t)buf; 359 aiov.iov_len = nbyte; 360 auio.uio_iov = &aiov; 361 auio.uio_iovcnt = 1; 362 auio.uio_offset = offset; 363 if (nbyte > INT_MAX) 364 return (EINVAL); 365 auio.uio_resid = nbyte; 366 auio.uio_rw = UIO_WRITE; 367 auio.uio_segflg = UIO_USERSPACE; 368 auio.uio_td = td; 369 #ifdef KTRACE 370 if (KTRPOINT(td, KTR_GENIO)) 371 ktruio = cloneuio(&auio); 372 #endif 373 cnt = nbyte; 374 if (fp->f_type == DTYPE_VNODE) 375 bwillwrite(); 376 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 377 if (auio.uio_resid != cnt && (error == ERESTART || 378 error == EINTR || error == EWOULDBLOCK)) 379 error = 0; 380 /* Socket layer is responsible for issuing SIGPIPE. */ 381 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 382 PROC_LOCK(td->td_proc); 383 psignal(td->td_proc, SIGPIPE); 384 PROC_UNLOCK(td->td_proc); 385 } 386 } 387 cnt -= auio.uio_resid; 388 #ifdef KTRACE 389 if (ktruio != NULL) { 390 ktruio->uio_resid = cnt; 391 ktrgenio(fd, UIO_WRITE, ktruio, error); 392 } 393 #endif 394 td->td_retval[0] = cnt; 395 return (error); 396 } 397 398 /* 399 * Gather write system call 400 */ 401 #ifndef _SYS_SYSPROTO_H_ 402 struct writev_args { 403 int fd; 404 struct iovec *iovp; 405 u_int iovcnt; 406 }; 407 #endif 408 /* 409 * MPSAFE 410 */ 411 int 412 writev(struct thread *td, struct writev_args *uap) 413 { 414 struct file *fp; 415 struct uio *auio = NULL; 416 long cnt; 417 int error; 418 #ifdef KTRACE 419 struct uio *ktruio = NULL; 420 #endif 421 422 error = fget_write(td, uap->fd, &fp); 423 if (error) 424 return (EBADF); 425 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 426 if (error) { 427 fdrop(fp, td); 428 return (error); 429 } 430 auio->uio_rw = UIO_WRITE; 431 auio->uio_td = td; 432 #ifdef KTRACE 433 if (KTRPOINT(td, KTR_GENIO)) 434 ktruio = cloneuio(auio); 435 #endif 436 cnt = auio->uio_resid; 437 if (fp->f_type == DTYPE_VNODE) 438 bwillwrite(); 439 if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) { 440 if (auio->uio_resid != cnt && (error == ERESTART || 441 error == EINTR || error == EWOULDBLOCK)) 442 error = 0; 443 if (error == EPIPE) { 444 PROC_LOCK(td->td_proc); 445 psignal(td->td_proc, SIGPIPE); 446 PROC_UNLOCK(td->td_proc); 447 } 448 } 449 cnt -= auio->uio_resid; 450 #ifdef KTRACE 451 if (ktruio != NULL) { 452 ktruio->uio_resid = cnt; 453 ktrgenio(uap->fd, UIO_WRITE, ktruio, error); 454 } 455 #endif 456 td->td_retval[0] = cnt; 457 fdrop(fp, td); 458 free(auio, M_IOV); 459 return (error); 460 } 461 462 /* 463 * Ioctl system call 464 */ 465 #ifndef _SYS_SYSPROTO_H_ 466 struct ioctl_args { 467 int fd; 468 u_long com; 469 caddr_t data; 470 }; 471 #endif 472 /* 473 * MPSAFE 474 */ 475 /* ARGSUSED */ 476 int 477 ioctl(struct thread *td, struct ioctl_args *uap) 478 { 479 struct file *fp; 480 struct filedesc *fdp; 481 u_long com; 482 int error = 0; 483 u_int size; 484 caddr_t data, memp; 485 int tmp; 486 487 if (uap->com > 0xffffffff) { 488 printf( 489 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", 490 td->td_proc->p_pid, td->td_proc->p_comm, uap->com); 491 uap->com &= 0xffffffff; 492 } 493 if ((error = fget(td, uap->fd, &fp)) != 0) 494 return (error); 495 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 496 fdrop(fp, td); 497 return (EBADF); 498 } 499 fdp = td->td_proc->p_fd; 500 switch (com = uap->com) { 501 case FIONCLEX: 502 FILEDESC_LOCK_FAST(fdp); 503 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 504 FILEDESC_UNLOCK_FAST(fdp); 505 fdrop(fp, td); 506 return (0); 507 case FIOCLEX: 508 FILEDESC_LOCK_FAST(fdp); 509 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 510 FILEDESC_UNLOCK_FAST(fdp); 511 fdrop(fp, td); 512 return (0); 513 } 514 515 /* 516 * Interpret high order word to find amount of data to be 517 * copied to/from the user's address space. 518 */ 519 size = IOCPARM_LEN(com); 520 if ((size > IOCPARM_MAX) || 521 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || 522 ((com & IOC_VOID) && size > 0) || 523 ((com & (IOC_IN | IOC_OUT)) && size == 0)) { 524 fdrop(fp, td); 525 return (ENOTTY); 526 } 527 528 if (size > 0) { 529 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 530 data = memp; 531 } else { 532 memp = NULL; 533 data = (void *)&uap->data; 534 } 535 if (com & IOC_IN) { 536 error = copyin(uap->data, data, (u_int)size); 537 if (error) { 538 free(memp, M_IOCTLOPS); 539 fdrop(fp, td); 540 return (error); 541 } 542 } else if (com & IOC_OUT) { 543 /* 544 * Zero the buffer so the user always 545 * gets back something deterministic. 546 */ 547 bzero(data, size); 548 } 549 550 if (com == FIONBIO) { 551 FILE_LOCK(fp); 552 if ((tmp = *(int *)data)) 553 fp->f_flag |= FNONBLOCK; 554 else 555 fp->f_flag &= ~FNONBLOCK; 556 FILE_UNLOCK(fp); 557 data = (void *)&tmp; 558 } else if (com == FIOASYNC) { 559 FILE_LOCK(fp); 560 if ((tmp = *(int *)data)) 561 fp->f_flag |= FASYNC; 562 else 563 fp->f_flag &= ~FASYNC; 564 FILE_UNLOCK(fp); 565 data = (void *)&tmp; 566 } 567 568 error = fo_ioctl(fp, com, data, td->td_ucred, td); 569 570 if (error == 0 && (com & IOC_OUT)) 571 error = copyout(data, uap->data, (u_int)size); 572 573 if (memp != NULL) 574 free(memp, M_IOCTLOPS); 575 fdrop(fp, td); 576 return (error); 577 } 578 579 /* 580 * sellock and selwait are initialized in selectinit() via SYSINIT. 581 */ 582 struct mtx sellock; 583 struct cv selwait; 584 u_int nselcoll; /* Select collisions since boot */ 585 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 586 587 /* 588 * Select system call. 589 */ 590 #ifndef _SYS_SYSPROTO_H_ 591 struct select_args { 592 int nd; 593 fd_set *in, *ou, *ex; 594 struct timeval *tv; 595 }; 596 #endif 597 /* 598 * MPSAFE 599 */ 600 int 601 select(td, uap) 602 register struct thread *td; 603 register struct select_args *uap; 604 { 605 struct timeval tv, *tvp; 606 int error; 607 608 if (uap->tv != NULL) { 609 error = copyin(uap->tv, &tv, sizeof(tv)); 610 if (error) 611 return (error); 612 tvp = &tv; 613 } else 614 tvp = NULL; 615 616 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); 617 } 618 619 int 620 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 621 fd_set *fd_ex, struct timeval *tvp) 622 { 623 struct filedesc *fdp; 624 /* 625 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 626 * infds with the new FD_SETSIZE of 1024, and more than enough for 627 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 628 * of 256. 629 */ 630 fd_mask s_selbits[howmany(2048, NFDBITS)]; 631 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 632 struct timeval atv, rtv, ttv; 633 int error, timo; 634 u_int ncoll, nbufbytes, ncpbytes, nfdbits; 635 636 if (nd < 0) 637 return (EINVAL); 638 fdp = td->td_proc->p_fd; 639 640 FILEDESC_LOCK_FAST(fdp); 641 642 if (nd > td->td_proc->p_fd->fd_nfiles) 643 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 644 FILEDESC_UNLOCK_FAST(fdp); 645 646 /* 647 * Allocate just enough bits for the non-null fd_sets. Use the 648 * preallocated auto buffer if possible. 649 */ 650 nfdbits = roundup(nd, NFDBITS); 651 ncpbytes = nfdbits / NBBY; 652 nbufbytes = 0; 653 if (fd_in != NULL) 654 nbufbytes += 2 * ncpbytes; 655 if (fd_ou != NULL) 656 nbufbytes += 2 * ncpbytes; 657 if (fd_ex != NULL) 658 nbufbytes += 2 * ncpbytes; 659 if (nbufbytes <= sizeof s_selbits) 660 selbits = &s_selbits[0]; 661 else 662 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 663 664 /* 665 * Assign pointers into the bit buffers and fetch the input bits. 666 * Put the output buffers together so that they can be bzeroed 667 * together. 668 */ 669 sbp = selbits; 670 #define getbits(name, x) \ 671 do { \ 672 if (name == NULL) \ 673 ibits[x] = NULL; \ 674 else { \ 675 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 676 obits[x] = sbp; \ 677 sbp += ncpbytes / sizeof *sbp; \ 678 error = copyin(name, ibits[x], ncpbytes); \ 679 if (error != 0) \ 680 goto done_nosellock; \ 681 } \ 682 } while (0) 683 getbits(fd_in, 0); 684 getbits(fd_ou, 1); 685 getbits(fd_ex, 2); 686 #undef getbits 687 if (nbufbytes != 0) 688 bzero(selbits, nbufbytes / 2); 689 690 if (tvp != NULL) { 691 atv = *tvp; 692 if (itimerfix(&atv)) { 693 error = EINVAL; 694 goto done_nosellock; 695 } 696 getmicrouptime(&rtv); 697 timevaladd(&atv, &rtv); 698 } else { 699 atv.tv_sec = 0; 700 atv.tv_usec = 0; 701 } 702 timo = 0; 703 TAILQ_INIT(&td->td_selq); 704 mtx_lock(&sellock); 705 retry: 706 ncoll = nselcoll; 707 mtx_lock_spin(&sched_lock); 708 td->td_flags |= TDF_SELECT; 709 mtx_unlock_spin(&sched_lock); 710 mtx_unlock(&sellock); 711 712 error = selscan(td, ibits, obits, nd); 713 mtx_lock(&sellock); 714 if (error || td->td_retval[0]) 715 goto done; 716 if (atv.tv_sec || atv.tv_usec) { 717 getmicrouptime(&rtv); 718 if (timevalcmp(&rtv, &atv, >=)) 719 goto done; 720 ttv = atv; 721 timevalsub(&ttv, &rtv); 722 timo = ttv.tv_sec > 24 * 60 * 60 ? 723 24 * 60 * 60 * hz : tvtohz(&ttv); 724 } 725 726 /* 727 * An event of interest may occur while we do not hold 728 * sellock, so check TDF_SELECT and the number of 729 * collisions and rescan the file descriptors if 730 * necessary. 731 */ 732 mtx_lock_spin(&sched_lock); 733 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 734 mtx_unlock_spin(&sched_lock); 735 goto retry; 736 } 737 mtx_unlock_spin(&sched_lock); 738 739 if (timo > 0) 740 error = cv_timedwait_sig(&selwait, &sellock, timo); 741 else 742 error = cv_wait_sig(&selwait, &sellock); 743 744 if (error == 0) 745 goto retry; 746 747 done: 748 clear_selinfo_list(td); 749 mtx_lock_spin(&sched_lock); 750 td->td_flags &= ~TDF_SELECT; 751 mtx_unlock_spin(&sched_lock); 752 mtx_unlock(&sellock); 753 754 done_nosellock: 755 /* select is not restarted after signals... */ 756 if (error == ERESTART) 757 error = EINTR; 758 if (error == EWOULDBLOCK) 759 error = 0; 760 #define putbits(name, x) \ 761 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 762 error = error2; 763 if (error == 0) { 764 int error2; 765 766 putbits(fd_in, 0); 767 putbits(fd_ou, 1); 768 putbits(fd_ex, 2); 769 #undef putbits 770 } 771 if (selbits != &s_selbits[0]) 772 free(selbits, M_SELECT); 773 774 return (error); 775 } 776 777 static int 778 selscan(td, ibits, obits, nfd) 779 struct thread *td; 780 fd_mask **ibits, **obits; 781 int nfd; 782 { 783 int msk, i, fd; 784 fd_mask bits; 785 struct file *fp; 786 int n = 0; 787 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 788 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 789 struct filedesc *fdp = td->td_proc->p_fd; 790 791 FILEDESC_LOCK(fdp); 792 for (msk = 0; msk < 3; msk++) { 793 if (ibits[msk] == NULL) 794 continue; 795 for (i = 0; i < nfd; i += NFDBITS) { 796 bits = ibits[msk][i/NFDBITS]; 797 /* ffs(int mask) not portable, fd_mask is long */ 798 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 799 if (!(bits & 1)) 800 continue; 801 if ((fp = fget_locked(fdp, fd)) == NULL) { 802 FILEDESC_UNLOCK(fdp); 803 return (EBADF); 804 } 805 if (fo_poll(fp, flag[msk], td->td_ucred, 806 td)) { 807 obits[msk][(fd)/NFDBITS] |= 808 ((fd_mask)1 << ((fd) % NFDBITS)); 809 n++; 810 } 811 } 812 } 813 } 814 FILEDESC_UNLOCK(fdp); 815 td->td_retval[0] = n; 816 return (0); 817 } 818 819 /* 820 * Poll system call. 821 */ 822 #ifndef _SYS_SYSPROTO_H_ 823 struct poll_args { 824 struct pollfd *fds; 825 u_int nfds; 826 int timeout; 827 }; 828 #endif 829 /* 830 * MPSAFE 831 */ 832 int 833 poll(td, uap) 834 struct thread *td; 835 struct poll_args *uap; 836 { 837 struct pollfd *bits; 838 struct pollfd smallbits[32]; 839 struct timeval atv, rtv, ttv; 840 int error = 0, timo; 841 u_int ncoll, nfds; 842 size_t ni; 843 844 nfds = uap->nfds; 845 846 /* 847 * This is kinda bogus. We have fd limits, but that is not 848 * really related to the size of the pollfd array. Make sure 849 * we let the process use at least FD_SETSIZE entries and at 850 * least enough for the current limits. We want to be reasonably 851 * safe, but not overly restrictive. 852 */ 853 PROC_LOCK(td->td_proc); 854 if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && 855 (nfds > FD_SETSIZE)) { 856 PROC_UNLOCK(td->td_proc); 857 error = EINVAL; 858 goto done2; 859 } 860 PROC_UNLOCK(td->td_proc); 861 ni = nfds * sizeof(struct pollfd); 862 if (ni > sizeof(smallbits)) 863 bits = malloc(ni, M_TEMP, M_WAITOK); 864 else 865 bits = smallbits; 866 error = copyin(uap->fds, bits, ni); 867 if (error) 868 goto done_nosellock; 869 if (uap->timeout != INFTIM) { 870 atv.tv_sec = uap->timeout / 1000; 871 atv.tv_usec = (uap->timeout % 1000) * 1000; 872 if (itimerfix(&atv)) { 873 error = EINVAL; 874 goto done_nosellock; 875 } 876 getmicrouptime(&rtv); 877 timevaladd(&atv, &rtv); 878 } else { 879 atv.tv_sec = 0; 880 atv.tv_usec = 0; 881 } 882 timo = 0; 883 TAILQ_INIT(&td->td_selq); 884 mtx_lock(&sellock); 885 retry: 886 ncoll = nselcoll; 887 mtx_lock_spin(&sched_lock); 888 td->td_flags |= TDF_SELECT; 889 mtx_unlock_spin(&sched_lock); 890 mtx_unlock(&sellock); 891 892 error = pollscan(td, bits, nfds); 893 mtx_lock(&sellock); 894 if (error || td->td_retval[0]) 895 goto done; 896 if (atv.tv_sec || atv.tv_usec) { 897 getmicrouptime(&rtv); 898 if (timevalcmp(&rtv, &atv, >=)) 899 goto done; 900 ttv = atv; 901 timevalsub(&ttv, &rtv); 902 timo = ttv.tv_sec > 24 * 60 * 60 ? 903 24 * 60 * 60 * hz : tvtohz(&ttv); 904 } 905 /* 906 * An event of interest may occur while we do not hold 907 * sellock, so check TDF_SELECT and the number of collisions 908 * and rescan the file descriptors if necessary. 909 */ 910 mtx_lock_spin(&sched_lock); 911 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 912 mtx_unlock_spin(&sched_lock); 913 goto retry; 914 } 915 mtx_unlock_spin(&sched_lock); 916 917 if (timo > 0) 918 error = cv_timedwait_sig(&selwait, &sellock, timo); 919 else 920 error = cv_wait_sig(&selwait, &sellock); 921 922 if (error == 0) 923 goto retry; 924 925 done: 926 clear_selinfo_list(td); 927 mtx_lock_spin(&sched_lock); 928 td->td_flags &= ~TDF_SELECT; 929 mtx_unlock_spin(&sched_lock); 930 mtx_unlock(&sellock); 931 932 done_nosellock: 933 /* poll is not restarted after signals... */ 934 if (error == ERESTART) 935 error = EINTR; 936 if (error == EWOULDBLOCK) 937 error = 0; 938 if (error == 0) { 939 error = copyout(bits, uap->fds, ni); 940 if (error) 941 goto out; 942 } 943 out: 944 if (ni > sizeof(smallbits)) 945 free(bits, M_TEMP); 946 done2: 947 return (error); 948 } 949 950 static int 951 pollscan(td, fds, nfd) 952 struct thread *td; 953 struct pollfd *fds; 954 u_int nfd; 955 { 956 register struct filedesc *fdp = td->td_proc->p_fd; 957 int i; 958 struct file *fp; 959 int n = 0; 960 961 FILEDESC_LOCK(fdp); 962 for (i = 0; i < nfd; i++, fds++) { 963 if (fds->fd >= fdp->fd_nfiles) { 964 fds->revents = POLLNVAL; 965 n++; 966 } else if (fds->fd < 0) { 967 fds->revents = 0; 968 } else { 969 fp = fdp->fd_ofiles[fds->fd]; 970 if (fp == NULL) { 971 fds->revents = POLLNVAL; 972 n++; 973 } else { 974 /* 975 * Note: backend also returns POLLHUP and 976 * POLLERR if appropriate. 977 */ 978 fds->revents = fo_poll(fp, fds->events, 979 td->td_ucred, td); 980 if (fds->revents != 0) 981 n++; 982 } 983 } 984 } 985 FILEDESC_UNLOCK(fdp); 986 td->td_retval[0] = n; 987 return (0); 988 } 989 990 /* 991 * OpenBSD poll system call. 992 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 993 */ 994 #ifndef _SYS_SYSPROTO_H_ 995 struct openbsd_poll_args { 996 struct pollfd *fds; 997 u_int nfds; 998 int timeout; 999 }; 1000 #endif 1001 /* 1002 * MPSAFE 1003 */ 1004 int 1005 openbsd_poll(td, uap) 1006 register struct thread *td; 1007 register struct openbsd_poll_args *uap; 1008 { 1009 return (poll(td, (struct poll_args *)uap)); 1010 } 1011 1012 /* 1013 * Remove the references to the thread from all of the objects 1014 * we were polling. 1015 * 1016 * This code assumes that the underlying owner of the selinfo 1017 * structure will hold sellock before it changes it, and that 1018 * it will unlink itself from our list if it goes away. 1019 */ 1020 void 1021 clear_selinfo_list(td) 1022 struct thread *td; 1023 { 1024 struct selinfo *si; 1025 1026 mtx_assert(&sellock, MA_OWNED); 1027 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1028 si->si_thread = NULL; 1029 TAILQ_INIT(&td->td_selq); 1030 } 1031 1032 /* 1033 * Record a select request. 1034 */ 1035 void 1036 selrecord(selector, sip) 1037 struct thread *selector; 1038 struct selinfo *sip; 1039 { 1040 1041 mtx_lock(&sellock); 1042 /* 1043 * If the selinfo's thread pointer is NULL then take ownership of it. 1044 * 1045 * If the thread pointer is not NULL and it points to another 1046 * thread, then we have a collision. 1047 * 1048 * If the thread pointer is not NULL and points back to us then leave 1049 * it alone as we've already added pointed it at us and added it to 1050 * our list. 1051 */ 1052 if (sip->si_thread == NULL) { 1053 sip->si_thread = selector; 1054 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1055 } else if (sip->si_thread != selector) { 1056 sip->si_flags |= SI_COLL; 1057 } 1058 1059 mtx_unlock(&sellock); 1060 } 1061 1062 /* Wake up a selecting thread. */ 1063 void 1064 selwakeup(sip) 1065 struct selinfo *sip; 1066 { 1067 doselwakeup(sip, -1); 1068 } 1069 1070 /* Wake up a selecting thread, and set its priority. */ 1071 void 1072 selwakeuppri(sip, pri) 1073 struct selinfo *sip; 1074 int pri; 1075 { 1076 doselwakeup(sip, pri); 1077 } 1078 1079 /* 1080 * Do a wakeup when a selectable event occurs. 1081 */ 1082 static void 1083 doselwakeup(sip, pri) 1084 struct selinfo *sip; 1085 int pri; 1086 { 1087 struct thread *td; 1088 1089 mtx_lock(&sellock); 1090 td = sip->si_thread; 1091 if ((sip->si_flags & SI_COLL) != 0) { 1092 nselcoll++; 1093 sip->si_flags &= ~SI_COLL; 1094 cv_broadcastpri(&selwait, pri); 1095 } 1096 if (td == NULL) { 1097 mtx_unlock(&sellock); 1098 return; 1099 } 1100 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1101 sip->si_thread = NULL; 1102 mtx_lock_spin(&sched_lock); 1103 td->td_flags &= ~TDF_SELECT; 1104 mtx_unlock_spin(&sched_lock); 1105 sleepq_remove(td, &selwait); 1106 mtx_unlock(&sellock); 1107 } 1108 1109 static void selectinit(void *); 1110 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1111 1112 /* ARGSUSED*/ 1113 static void 1114 selectinit(dummy) 1115 void *dummy; 1116 { 1117 cv_init(&selwait, "select"); 1118 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1119 } 1120