1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ktrace.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/sysproto.h> 45 #include <sys/filedesc.h> 46 #include <sys/filio.h> 47 #include <sys/fcntl.h> 48 #include <sys/file.h> 49 #include <sys/proc.h> 50 #include <sys/signalvar.h> 51 #include <sys/socketvar.h> 52 #include <sys/uio.h> 53 #include <sys/kernel.h> 54 #include <sys/limits.h> 55 #include <sys/malloc.h> 56 #include <sys/poll.h> 57 #include <sys/resourcevar.h> 58 #include <sys/selinfo.h> 59 #include <sys/sleepqueue.h> 60 #include <sys/syscallsubr.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysent.h> 63 #include <sys/vnode.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/condvar.h> 67 #ifdef KTRACE 68 #include <sys/ktrace.h> 69 #endif 70 #include <vm/vm.h> 71 #include <vm/vm_page.h> 72 73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 75 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 76 77 static int pollscan(struct thread *, struct pollfd *, u_int); 78 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 79 static int dofileread(struct thread *, struct file *, int, void *, 80 size_t, off_t, int); 81 static int dofilewrite(struct thread *, struct file *, int, 82 const void *, size_t, off_t, int); 83 static void doselwakeup(struct selinfo *, int); 84 85 /* 86 * Read system call. 
87 */ 88 #ifndef _SYS_SYSPROTO_H_ 89 struct read_args { 90 int fd; 91 void *buf; 92 size_t nbyte; 93 }; 94 #endif 95 /* 96 * MPSAFE 97 */ 98 int 99 read(td, uap) 100 struct thread *td; 101 struct read_args *uap; 102 { 103 struct file *fp; 104 int error; 105 106 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 107 error = dofileread(td, fp, uap->fd, uap->buf, 108 uap->nbyte, (off_t)-1, 0); 109 fdrop(fp, td); 110 } 111 return(error); 112 } 113 114 /* 115 * Pread system call 116 */ 117 #ifndef _SYS_SYSPROTO_H_ 118 struct pread_args { 119 int fd; 120 void *buf; 121 size_t nbyte; 122 int pad; 123 off_t offset; 124 }; 125 #endif 126 /* 127 * MPSAFE 128 */ 129 int 130 pread(td, uap) 131 struct thread *td; 132 struct pread_args *uap; 133 { 134 struct file *fp; 135 int error; 136 137 if ((error = fget_read(td, uap->fd, &fp)) != 0) 138 return (error); 139 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 140 error = ESPIPE; 141 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 142 error = EINVAL; 143 else { 144 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 145 uap->offset, FOF_OFFSET); 146 } 147 fdrop(fp, td); 148 return(error); 149 } 150 151 /* 152 * Code common for read and pread 153 */ 154 static int 155 dofileread(td, fp, fd, buf, nbyte, offset, flags) 156 struct thread *td; 157 struct file *fp; 158 int fd, flags; 159 void *buf; 160 size_t nbyte; 161 off_t offset; 162 { 163 struct uio auio; 164 struct iovec aiov; 165 long cnt, error = 0; 166 #ifdef KTRACE 167 struct iovec ktriov; 168 struct uio ktruio; 169 int didktr = 0; 170 #endif 171 172 aiov.iov_base = buf; 173 aiov.iov_len = nbyte; 174 auio.uio_iov = &aiov; 175 auio.uio_iovcnt = 1; 176 auio.uio_offset = offset; 177 if (nbyte > INT_MAX) 178 return (EINVAL); 179 auio.uio_resid = nbyte; 180 auio.uio_rw = UIO_READ; 181 auio.uio_segflg = UIO_USERSPACE; 182 auio.uio_td = td; 183 #ifdef KTRACE 184 /* 185 * if tracing, save a copy of iovec 186 */ 187 if (KTRPOINT(td, KTR_GENIO)) { 188 ktriov = aiov; 189 
ktruio = auio; 190 didktr = 1; 191 } 192 #endif 193 cnt = nbyte; 194 195 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 196 if (auio.uio_resid != cnt && (error == ERESTART || 197 error == EINTR || error == EWOULDBLOCK)) 198 error = 0; 199 } 200 cnt -= auio.uio_resid; 201 #ifdef KTRACE 202 if (didktr && error == 0) { 203 ktruio.uio_iov = &ktriov; 204 ktruio.uio_resid = cnt; 205 ktrgenio(fd, UIO_READ, &ktruio, error); 206 } 207 #endif 208 td->td_retval[0] = cnt; 209 return (error); 210 } 211 212 /* 213 * Scatter read system call. 214 */ 215 #ifndef _SYS_SYSPROTO_H_ 216 struct readv_args { 217 int fd; 218 struct iovec *iovp; 219 u_int iovcnt; 220 }; 221 #endif 222 /* 223 * MPSAFE 224 */ 225 int 226 readv(td, uap) 227 struct thread *td; 228 struct readv_args *uap; 229 { 230 struct file *fp; 231 struct uio auio; 232 struct iovec *iov; 233 struct iovec *needfree; 234 struct iovec aiov[UIO_SMALLIOV]; 235 long i, cnt; 236 int error; 237 u_int iovlen; 238 #ifdef KTRACE 239 struct iovec *ktriov = NULL; 240 struct uio ktruio; 241 #endif 242 243 if ((error = fget_read(td, uap->fd, &fp)) != 0) 244 return (error); 245 needfree = NULL; 246 /* note: can't use iovlen until iovcnt is validated */ 247 iovlen = uap->iovcnt * sizeof (struct iovec); 248 if (uap->iovcnt > UIO_SMALLIOV) { 249 if (uap->iovcnt > UIO_MAXIOV) { 250 error = EINVAL; 251 goto done; 252 } 253 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 254 needfree = iov; 255 } else 256 iov = aiov; 257 auio.uio_iov = iov; 258 auio.uio_iovcnt = uap->iovcnt; 259 auio.uio_rw = UIO_READ; 260 auio.uio_segflg = UIO_USERSPACE; 261 auio.uio_td = td; 262 auio.uio_offset = -1; 263 if ((error = copyin(uap->iovp, iov, iovlen))) 264 goto done; 265 auio.uio_resid = 0; 266 for (i = 0; i < uap->iovcnt; i++) { 267 if (iov->iov_len > INT_MAX - auio.uio_resid) { 268 error = EINVAL; 269 goto done; 270 } 271 auio.uio_resid += iov->iov_len; 272 iov++; 273 } 274 #ifdef KTRACE 275 /* 276 * if tracing, save a copy of iovec 277 */ 
278 if (KTRPOINT(td, KTR_GENIO)) { 279 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 280 bcopy(auio.uio_iov, ktriov, iovlen); 281 ktruio = auio; 282 } 283 #endif 284 cnt = auio.uio_resid; 285 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 286 if (auio.uio_resid != cnt && (error == ERESTART || 287 error == EINTR || error == EWOULDBLOCK)) 288 error = 0; 289 } 290 cnt -= auio.uio_resid; 291 #ifdef KTRACE 292 if (ktriov != NULL) { 293 if (error == 0) { 294 ktruio.uio_iov = ktriov; 295 ktruio.uio_resid = cnt; 296 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 297 } 298 FREE(ktriov, M_TEMP); 299 } 300 #endif 301 td->td_retval[0] = cnt; 302 done: 303 fdrop(fp, td); 304 if (needfree) 305 FREE(needfree, M_IOV); 306 return (error); 307 } 308 309 /* 310 * Write system call 311 */ 312 #ifndef _SYS_SYSPROTO_H_ 313 struct write_args { 314 int fd; 315 const void *buf; 316 size_t nbyte; 317 }; 318 #endif 319 /* 320 * MPSAFE 321 */ 322 int 323 write(td, uap) 324 struct thread *td; 325 struct write_args *uap; 326 { 327 struct file *fp; 328 int error; 329 330 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 331 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 332 (off_t)-1, 0); 333 fdrop(fp, td); 334 } else { 335 error = EBADF; /* XXX this can't be right */ 336 } 337 return(error); 338 } 339 340 /* 341 * Pwrite system call 342 */ 343 #ifndef _SYS_SYSPROTO_H_ 344 struct pwrite_args { 345 int fd; 346 const void *buf; 347 size_t nbyte; 348 int pad; 349 off_t offset; 350 }; 351 #endif 352 /* 353 * MPSAFE 354 */ 355 int 356 pwrite(td, uap) 357 struct thread *td; 358 struct pwrite_args *uap; 359 { 360 struct file *fp; 361 int error; 362 363 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 364 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 365 error = ESPIPE; 366 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 367 error = EINVAL; 368 else { 369 error = dofilewrite(td, fp, uap->fd, uap->buf, 370 uap->nbyte, uap->offset, FOF_OFFSET); 371 } 372 
fdrop(fp, td); 373 } else { 374 error = EBADF; /* this can't be right */ 375 } 376 return(error); 377 } 378 379 static int 380 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 381 struct thread *td; 382 struct file *fp; 383 int fd, flags; 384 const void *buf; 385 size_t nbyte; 386 off_t offset; 387 { 388 struct uio auio; 389 struct iovec aiov; 390 long cnt, error = 0; 391 #ifdef KTRACE 392 struct iovec ktriov; 393 struct uio ktruio; 394 int didktr = 0; 395 #endif 396 397 aiov.iov_base = (void *)(uintptr_t)buf; 398 aiov.iov_len = nbyte; 399 auio.uio_iov = &aiov; 400 auio.uio_iovcnt = 1; 401 auio.uio_offset = offset; 402 if (nbyte > INT_MAX) 403 return (EINVAL); 404 auio.uio_resid = nbyte; 405 auio.uio_rw = UIO_WRITE; 406 auio.uio_segflg = UIO_USERSPACE; 407 auio.uio_td = td; 408 #ifdef KTRACE 409 /* 410 * if tracing, save a copy of iovec and uio 411 */ 412 if (KTRPOINT(td, KTR_GENIO)) { 413 ktriov = aiov; 414 ktruio = auio; 415 didktr = 1; 416 } 417 #endif 418 cnt = nbyte; 419 if (fp->f_type == DTYPE_VNODE) 420 bwillwrite(); 421 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 422 if (auio.uio_resid != cnt && (error == ERESTART || 423 error == EINTR || error == EWOULDBLOCK)) 424 error = 0; 425 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 426 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 427 PROC_LOCK(td->td_proc); 428 psignal(td->td_proc, SIGPIPE); 429 PROC_UNLOCK(td->td_proc); 430 } 431 } 432 cnt -= auio.uio_resid; 433 #ifdef KTRACE 434 if (didktr && error == 0) { 435 ktruio.uio_iov = &ktriov; 436 ktruio.uio_resid = cnt; 437 ktrgenio(fd, UIO_WRITE, &ktruio, error); 438 } 439 #endif 440 td->td_retval[0] = cnt; 441 return (error); 442 } 443 444 /* 445 * Gather write system call 446 */ 447 #ifndef _SYS_SYSPROTO_H_ 448 struct writev_args { 449 int fd; 450 struct iovec *iovp; 451 u_int iovcnt; 452 }; 453 #endif 454 /* 455 * MPSAFE 456 */ 457 int 458 writev(td, uap) 459 struct thread *td; 460 register struct writev_args *uap; 461 { 462 struct file *fp; 463 struct uio auio; 464 register struct iovec *iov; 465 struct iovec *needfree; 466 struct iovec aiov[UIO_SMALLIOV]; 467 long i, cnt, error = 0; 468 u_int iovlen; 469 #ifdef KTRACE 470 struct iovec *ktriov = NULL; 471 struct uio ktruio; 472 #endif 473 474 if ((error = fget_write(td, uap->fd, &fp)) != 0) 475 return (EBADF); 476 needfree = NULL; 477 /* note: can't use iovlen until iovcnt is validated */ 478 iovlen = uap->iovcnt * sizeof (struct iovec); 479 if (uap->iovcnt > UIO_SMALLIOV) { 480 if (uap->iovcnt > UIO_MAXIOV) { 481 error = EINVAL; 482 goto done; 483 } 484 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 485 needfree = iov; 486 } else 487 iov = aiov; 488 auio.uio_iov = iov; 489 auio.uio_iovcnt = uap->iovcnt; 490 auio.uio_rw = UIO_WRITE; 491 auio.uio_segflg = UIO_USERSPACE; 492 auio.uio_td = td; 493 auio.uio_offset = -1; 494 if ((error = copyin(uap->iovp, iov, iovlen))) 495 goto done; 496 auio.uio_resid = 0; 497 for (i = 0; i < uap->iovcnt; i++) { 498 if (iov->iov_len > INT_MAX - auio.uio_resid) { 499 error = EINVAL; 500 goto done; 501 } 502 auio.uio_resid += iov->iov_len; 503 iov++; 504 } 505 #ifdef KTRACE 506 /* 507 * if tracing, save a copy of iovec and uio 508 */ 509 if (KTRPOINT(td, KTR_GENIO)) { 510 MALLOC(ktriov, struct 
iovec *, iovlen, M_TEMP, M_WAITOK); 511 bcopy(auio.uio_iov, ktriov, iovlen); 512 ktruio = auio; 513 } 514 #endif 515 cnt = auio.uio_resid; 516 if (fp->f_type == DTYPE_VNODE) 517 bwillwrite(); 518 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 519 if (auio.uio_resid != cnt && (error == ERESTART || 520 error == EINTR || error == EWOULDBLOCK)) 521 error = 0; 522 if (error == EPIPE) { 523 PROC_LOCK(td->td_proc); 524 psignal(td->td_proc, SIGPIPE); 525 PROC_UNLOCK(td->td_proc); 526 } 527 } 528 cnt -= auio.uio_resid; 529 #ifdef KTRACE 530 if (ktriov != NULL) { 531 if (error == 0) { 532 ktruio.uio_iov = ktriov; 533 ktruio.uio_resid = cnt; 534 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 535 } 536 FREE(ktriov, M_TEMP); 537 } 538 #endif 539 td->td_retval[0] = cnt; 540 done: 541 fdrop(fp, td); 542 if (needfree) 543 FREE(needfree, M_IOV); 544 return (error); 545 } 546 547 /* 548 * Ioctl system call 549 */ 550 #ifndef _SYS_SYSPROTO_H_ 551 struct ioctl_args { 552 int fd; 553 u_long com; 554 caddr_t data; 555 }; 556 #endif 557 /* 558 * MPSAFE 559 */ 560 /* ARGSUSED */ 561 int 562 ioctl(td, uap) 563 struct thread *td; 564 register struct ioctl_args *uap; 565 { 566 struct file *fp; 567 register struct filedesc *fdp; 568 register u_long com; 569 int error = 0; 570 register u_int size; 571 caddr_t data, memp; 572 int tmp; 573 #define STK_PARAMS 128 574 union { 575 char stkbuf[STK_PARAMS]; 576 long align; 577 } ubuf; 578 579 if ((error = fget(td, uap->fd, &fp)) != 0) 580 return (error); 581 mtx_lock(&Giant); 582 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 583 fdrop(fp, td); 584 mtx_unlock(&Giant); 585 return (EBADF); 586 } 587 fdp = td->td_proc->p_fd; 588 switch (com = uap->com) { 589 case FIONCLEX: 590 FILEDESC_LOCK(fdp); 591 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 592 FILEDESC_UNLOCK(fdp); 593 fdrop(fp, td); 594 mtx_unlock(&Giant); 595 return (0); 596 case FIOCLEX: 597 FILEDESC_LOCK(fdp); 598 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 599 FILEDESC_UNLOCK(fdp); 600 
fdrop(fp, td); 601 mtx_unlock(&Giant); 602 return (0); 603 } 604 605 /* 606 * Interpret high order word to find amount of data to be 607 * copied to/from the user's address space. 608 */ 609 size = IOCPARM_LEN(com); 610 if (size > IOCPARM_MAX) { 611 fdrop(fp, td); 612 mtx_unlock(&Giant); 613 return (ENOTTY); 614 } 615 616 memp = NULL; 617 if (size > sizeof (ubuf.stkbuf)) { 618 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 619 data = memp; 620 } else { 621 data = ubuf.stkbuf; 622 } 623 if (com&IOC_IN) { 624 if (size) { 625 error = copyin(uap->data, data, (u_int)size); 626 if (error) { 627 if (memp) 628 free(memp, M_IOCTLOPS); 629 fdrop(fp, td); 630 goto done; 631 } 632 } else { 633 *(caddr_t *)data = uap->data; 634 } 635 } else if ((com&IOC_OUT) && size) { 636 /* 637 * Zero the buffer so the user always 638 * gets back something deterministic. 639 */ 640 bzero(data, size); 641 } else if (com&IOC_VOID) { 642 *(caddr_t *)data = uap->data; 643 } 644 645 switch (com) { 646 647 case FIONBIO: 648 FILE_LOCK(fp); 649 if ((tmp = *(int *)data)) 650 fp->f_flag |= FNONBLOCK; 651 else 652 fp->f_flag &= ~FNONBLOCK; 653 FILE_UNLOCK(fp); 654 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 655 break; 656 657 case FIOASYNC: 658 FILE_LOCK(fp); 659 if ((tmp = *(int *)data)) 660 fp->f_flag |= FASYNC; 661 else 662 fp->f_flag &= ~FASYNC; 663 FILE_UNLOCK(fp); 664 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 665 break; 666 667 default: 668 error = fo_ioctl(fp, com, data, td->td_ucred, td); 669 /* 670 * Copy any data to user, size was 671 * already set and checked above. 672 */ 673 if (error == 0 && (com&IOC_OUT) && size) 674 error = copyout(data, uap->data, (u_int)size); 675 break; 676 } 677 if (memp) 678 free(memp, M_IOCTLOPS); 679 fdrop(fp, td); 680 done: 681 mtx_unlock(&Giant); 682 return (error); 683 } 684 685 /* 686 * sellock and selwait are initialized in selectinit() via SYSINIT. 
 */
struct mtx sellock;		/* Serializes selinfo/td_selq manipulation. */
struct cv selwait;		/* select()/poll() sleepers wait here. */
u_int nselcoll;			/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int nd;
	fd_set *in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 *
 * Thin wrapper: copies in the optional timeout and hands off to
 * kern_select().
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Backend for select(2): copies in the fd_sets, scans the descriptors,
 * sleeps on selwait until an event, timeout, or signal, and copies the
 * result sets back out.  NULL set pointers are permitted and skipped.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	/*
	 * XXX: kern_select() currently requires that we acquire Giant
	 * even if none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Layout: output halves first (at sbp), input copies
	 * in the second half of the buffer.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so we can detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no timeout, no signal) means rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * Scan the requested descriptors once.  For each bit set in the three
 * input masks, poll the file for the corresponding condition and set
 * the output bit if it is ready.  Returns EBADF for a stale descriptor;
 * the count of ready descriptors is left in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 *
 * Same scan/sleep/retry structure as kern_select(), operating on an
 * array of struct pollfd instead of fd_set bitmaps.
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * XXX: poll() currently requires that we acquire Giant even if
	 * none of the file descriptors we poll requires Giant.
	 */
	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert the millisecond timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count so we can detect missed wakeups. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	/* A plain wakeup (no timeout, no signal) means rescan. */
	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Scan the pollfd array once, recording readiness in each entry's
 * revents field.  Invalid descriptors get POLLNVAL (and count as
 * "ready" per POSIX); negative fds are ignored.  The number of
 * entries with nonzero revents is left in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 *
 * The argument layout matches poll_args, so just forward.
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone: we have already pointed it at ourselves and
	 * added it to our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Collision; doselwakeup() will broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/*
 * Wake up a selecting thread, and set its priority.
 */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Multiple threads were selecting; wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread and wake it. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
/* One-time initialization of the select machinery, run via SYSINIT. */
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}