1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ktrace.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/sysproto.h> 49 #include <sys/filedesc.h> 50 #include <sys/filio.h> 51 #include <sys/fcntl.h> 52 #include <sys/file.h> 53 #include <sys/proc.h> 54 #include <sys/signalvar.h> 55 #include <sys/socketvar.h> 56 #include <sys/uio.h> 57 #include <sys/kernel.h> 58 #include <sys/limits.h> 59 #include <sys/malloc.h> 60 #include <sys/poll.h> 61 #include <sys/resourcevar.h> 62 #include <sys/selinfo.h> 63 #include <sys/sleepqueue.h> 64 #include <sys/syscallsubr.h> 65 #include <sys/sysctl.h> 66 #include <sys/sysent.h> 67 #include <sys/vnode.h> 68 #include <sys/bio.h> 69 #include <sys/buf.h> 70 #include <sys/condvar.h> 71 #ifdef KTRACE 72 #include <sys/ktrace.h> 73 #endif 74 #include <vm/vm.h> 75 #include <vm/vm_page.h> 76 77 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 78 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 79 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 80 81 static int pollscan(struct thread *, struct pollfd *, u_int); 82 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 83 static int dofileread(struct thread *, struct file *, int, void *, 84 size_t, off_t, int); 85 static int dofilewrite(struct thread *, struct file *, int, 86 const void *, size_t, off_t, int); 87 static void doselwakeup(struct selinfo *, int); 88 89 /* 90 * Read system call. 91 */ 92 #ifndef _SYS_SYSPROTO_H_ 93 struct read_args { 94 int fd; 95 void *buf; 96 size_t nbyte; 97 }; 98 #endif 99 /* 100 * MPSAFE 101 */ 102 int 103 read(td, uap) 104 struct thread *td; 105 struct read_args *uap; 106 { 107 struct file *fp; 108 int error; 109 110 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 111 error = dofileread(td, fp, uap->fd, uap->buf, 112 uap->nbyte, (off_t)-1, 0); 113 fdrop(fp, td); 114 } 115 return(error); 116 } 117 118 /* 119 * Pread system call 120 */ 121 #ifndef _SYS_SYSPROTO_H_ 122 struct pread_args { 123 int fd; 124 void *buf; 125 size_t nbyte; 126 int pad; 127 off_t offset; 128 }; 129 #endif 130 /* 131 * MPSAFE 132 */ 133 int 134 pread(td, uap) 135 struct thread *td; 136 struct pread_args *uap; 137 { 138 struct file *fp; 139 int error; 140 141 if ((error = fget_read(td, uap->fd, &fp)) != 0) 142 return (error); 143 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 144 error = ESPIPE; 145 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 146 error = EINVAL; 147 else { 148 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 149 uap->offset, FOF_OFFSET); 150 } 151 fdrop(fp, td); 152 return(error); 153 } 154 155 /* 156 * Code common for read and pread 157 */ 158 static int 159 dofileread(td, fp, fd, buf, nbyte, offset, flags) 160 struct thread *td; 161 struct file *fp; 162 int fd, flags; 163 void *buf; 164 size_t nbyte; 165 off_t offset; 166 { 167 struct uio auio; 168 struct iovec aiov; 169 long cnt, error = 0; 170 #ifdef KTRACE 171 struct iovec ktriov; 172 struct uio ktruio; 173 int didktr = 0; 174 #endif 175 176 aiov.iov_base = buf; 177 aiov.iov_len = nbyte; 178 auio.uio_iov = &aiov; 179 auio.uio_iovcnt = 1; 180 auio.uio_offset = offset; 181 if (nbyte > INT_MAX) 182 return (EINVAL); 183 auio.uio_resid = nbyte; 184 auio.uio_rw = UIO_READ; 185 auio.uio_segflg = UIO_USERSPACE; 186 auio.uio_td = td; 187 #ifdef KTRACE 188 /* 189 * if tracing, save a copy of iovec 190 */ 191 if (KTRPOINT(td, KTR_GENIO)) { 192 ktriov = aiov; 193 ktruio = auio; 194 didktr = 1; 195 } 196 #endif 197 cnt = nbyte; 198 199 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 200 if (auio.uio_resid != cnt && (error == ERESTART || 201 error == EINTR || error == EWOULDBLOCK)) 202 error = 0; 203 } 204 cnt -= auio.uio_resid; 205 #ifdef KTRACE 206 if (didktr && error == 0) { 207 ktruio.uio_iov = &ktriov; 208 ktruio.uio_resid = cnt; 209 ktrgenio(fd, UIO_READ, &ktruio, error); 210 } 211 #endif 212 td->td_retval[0] = cnt; 213 return (error); 214 } 215 216 /* 217 * Scatter read system call. 218 */ 219 #ifndef _SYS_SYSPROTO_H_ 220 struct readv_args { 221 int fd; 222 struct iovec *iovp; 223 u_int iovcnt; 224 }; 225 #endif 226 /* 227 * MPSAFE 228 */ 229 int 230 readv(td, uap) 231 struct thread *td; 232 struct readv_args *uap; 233 { 234 struct file *fp; 235 struct uio auio; 236 struct iovec *iov; 237 struct iovec *needfree; 238 struct iovec aiov[UIO_SMALLIOV]; 239 long i, cnt; 240 int error; 241 u_int iovlen; 242 #ifdef KTRACE 243 struct iovec *ktriov = NULL; 244 struct uio ktruio; 245 #endif 246 247 if ((error = fget_read(td, uap->fd, &fp)) != 0) 248 return (error); 249 needfree = NULL; 250 /* note: can't use iovlen until iovcnt is validated */ 251 iovlen = uap->iovcnt * sizeof (struct iovec); 252 if (uap->iovcnt > UIO_SMALLIOV) { 253 if (uap->iovcnt > UIO_MAXIOV) { 254 error = EINVAL; 255 goto done; 256 } 257 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 258 needfree = iov; 259 } else 260 iov = aiov; 261 auio.uio_iov = iov; 262 auio.uio_iovcnt = uap->iovcnt; 263 auio.uio_rw = UIO_READ; 264 auio.uio_segflg = UIO_USERSPACE; 265 auio.uio_td = td; 266 auio.uio_offset = -1; 267 if ((error = copyin(uap->iovp, iov, iovlen))) 268 goto done; 269 auio.uio_resid = 0; 270 for (i = 0; i < uap->iovcnt; i++) { 271 if (iov->iov_len > INT_MAX - auio.uio_resid) { 272 error = EINVAL; 273 goto done; 274 } 275 auio.uio_resid += iov->iov_len; 276 iov++; 277 } 278 #ifdef KTRACE 279 /* 280 * if tracing, save a copy of iovec 281 */ 282 if (KTRPOINT(td, KTR_GENIO)) { 283 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 284 bcopy(auio.uio_iov, ktriov, iovlen); 285 ktruio = auio; 286 } 287 #endif 288 cnt = auio.uio_resid; 289 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 290 if (auio.uio_resid != cnt && (error == ERESTART || 291 error == EINTR || error == EWOULDBLOCK)) 292 error = 0; 293 } 294 cnt -= auio.uio_resid; 295 #ifdef KTRACE 296 if (ktriov != NULL) { 297 if (error == 0) { 298 ktruio.uio_iov = ktriov; 299 ktruio.uio_resid = cnt; 300 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 301 } 302 FREE(ktriov, M_TEMP); 303 } 304 #endif 305 td->td_retval[0] = cnt; 306 done: 307 fdrop(fp, td); 308 if (needfree) 309 FREE(needfree, M_IOV); 310 return (error); 311 } 312 313 /* 314 * Write system call 315 */ 316 #ifndef _SYS_SYSPROTO_H_ 317 struct write_args { 318 int fd; 319 const void *buf; 320 size_t nbyte; 321 }; 322 #endif 323 /* 324 * MPSAFE 325 */ 326 int 327 write(td, uap) 328 struct thread *td; 329 struct write_args *uap; 330 { 331 struct file *fp; 332 int error; 333 334 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 335 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 336 (off_t)-1, 0); 337 fdrop(fp, td); 338 } else { 339 error = EBADF; /* XXX this can't be right */ 340 } 341 return(error); 342 } 343 344 /* 345 * Pwrite system call 346 */ 347 #ifndef _SYS_SYSPROTO_H_ 348 struct pwrite_args { 349 int fd; 350 const void *buf; 351 size_t nbyte; 352 int pad; 353 off_t offset; 354 }; 355 #endif 356 /* 357 * MPSAFE 358 */ 359 int 360 pwrite(td, uap) 361 struct thread *td; 362 struct pwrite_args *uap; 363 { 364 struct file *fp; 365 int error; 366 367 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 368 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 369 error = ESPIPE; 370 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 371 error = EINVAL; 372 else { 373 error = dofilewrite(td, fp, uap->fd, uap->buf, 374 uap->nbyte, uap->offset, FOF_OFFSET); 375 } 376 fdrop(fp, td); 377 } else { 378 error = EBADF; /* this can't be right */ 379 } 380 return(error); 381 } 382 383 static int 384 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 385 struct thread *td; 386 struct file *fp; 387 int fd, flags; 388 const void *buf; 389 size_t nbyte; 390 off_t offset; 391 { 392 struct uio auio; 393 struct iovec aiov; 394 long cnt, error = 0; 395 #ifdef KTRACE 396 struct iovec ktriov; 397 struct uio ktruio; 398 int didktr = 0; 399 #endif 400 401 aiov.iov_base = (void *)(uintptr_t)buf; 402 aiov.iov_len = nbyte; 403 auio.uio_iov = &aiov; 404 auio.uio_iovcnt = 1; 405 auio.uio_offset = offset; 406 if (nbyte > INT_MAX) 407 return (EINVAL); 408 auio.uio_resid = nbyte; 409 auio.uio_rw = UIO_WRITE; 410 auio.uio_segflg = UIO_USERSPACE; 411 auio.uio_td = td; 412 #ifdef KTRACE 413 /* 414 * if tracing, save a copy of iovec and uio 415 */ 416 if (KTRPOINT(td, KTR_GENIO)) { 417 ktriov = aiov; 418 ktruio = auio; 419 didktr = 1; 420 } 421 #endif 422 cnt = nbyte; 423 if (fp->f_type == DTYPE_VNODE) 424 bwillwrite(); 425 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 426 if (auio.uio_resid != cnt && (error == ERESTART || 427 error == EINTR || error == EWOULDBLOCK)) 428 error = 0; 429 /* Socket layer is responsible for issuing SIGPIPE. */ 430 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 431 PROC_LOCK(td->td_proc); 432 psignal(td->td_proc, SIGPIPE); 433 PROC_UNLOCK(td->td_proc); 434 } 435 } 436 cnt -= auio.uio_resid; 437 #ifdef KTRACE 438 if (didktr && error == 0) { 439 ktruio.uio_iov = &ktriov; 440 ktruio.uio_resid = cnt; 441 ktrgenio(fd, UIO_WRITE, &ktruio, error); 442 } 443 #endif 444 td->td_retval[0] = cnt; 445 return (error); 446 } 447 448 /* 449 * Gather write system call 450 */ 451 #ifndef _SYS_SYSPROTO_H_ 452 struct writev_args { 453 int fd; 454 struct iovec *iovp; 455 u_int iovcnt; 456 }; 457 #endif 458 /* 459 * MPSAFE 460 */ 461 int 462 writev(td, uap) 463 struct thread *td; 464 register struct writev_args *uap; 465 { 466 struct file *fp; 467 struct uio auio; 468 register struct iovec *iov; 469 struct iovec *needfree; 470 struct iovec aiov[UIO_SMALLIOV]; 471 long i, cnt, error = 0; 472 u_int iovlen; 473 #ifdef KTRACE 474 struct iovec *ktriov = NULL; 475 struct uio ktruio; 476 #endif 477 478 if ((error = fget_write(td, uap->fd, &fp)) != 0) 479 return (EBADF); 480 needfree = NULL; 481 /* note: can't use iovlen until iovcnt is validated */ 482 iovlen = uap->iovcnt * sizeof (struct iovec); 483 if (uap->iovcnt > UIO_SMALLIOV) { 484 if (uap->iovcnt > UIO_MAXIOV) { 485 error = EINVAL; 486 goto done; 487 } 488 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 489 needfree = iov; 490 } else 491 iov = aiov; 492 auio.uio_iov = iov; 493 auio.uio_iovcnt = uap->iovcnt; 494 auio.uio_rw = UIO_WRITE; 495 auio.uio_segflg = UIO_USERSPACE; 496 auio.uio_td = td; 497 auio.uio_offset = -1; 498 if ((error = copyin(uap->iovp, iov, iovlen))) 499 goto done; 500 auio.uio_resid = 0; 501 for (i = 0; i < uap->iovcnt; i++) { 502 if (iov->iov_len > INT_MAX - auio.uio_resid) { 503 error = EINVAL; 504 goto done; 505 } 506 auio.uio_resid += iov->iov_len; 507 iov++; 508 } 509 #ifdef KTRACE 510 /* 511 * if tracing, save a copy of iovec and uio 512 */ 513 if (KTRPOINT(td, KTR_GENIO)) { 514 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 515 bcopy(auio.uio_iov, ktriov, iovlen); 516 ktruio = auio; 517 } 518 #endif 519 cnt = auio.uio_resid; 520 if (fp->f_type == DTYPE_VNODE) 521 bwillwrite(); 522 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 523 if (auio.uio_resid != cnt && (error == ERESTART || 524 error == EINTR || error == EWOULDBLOCK)) 525 error = 0; 526 if (error == EPIPE) { 527 PROC_LOCK(td->td_proc); 528 psignal(td->td_proc, SIGPIPE); 529 PROC_UNLOCK(td->td_proc); 530 } 531 } 532 cnt -= auio.uio_resid; 533 #ifdef KTRACE 534 if (ktriov != NULL) { 535 if (error == 0) { 536 ktruio.uio_iov = ktriov; 537 ktruio.uio_resid = cnt; 538 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 539 } 540 FREE(ktriov, M_TEMP); 541 } 542 #endif 543 td->td_retval[0] = cnt; 544 done: 545 fdrop(fp, td); 546 if (needfree) 547 FREE(needfree, M_IOV); 548 return (error); 549 } 550 551 /* 552 * Ioctl system call 553 */ 554 #ifndef _SYS_SYSPROTO_H_ 555 struct ioctl_args { 556 int fd; 557 u_long com; 558 caddr_t data; 559 }; 560 #endif 561 /* 562 * MPSAFE 563 */ 564 /* ARGSUSED */ 565 int 566 ioctl(td, uap) 567 struct thread *td; 568 register struct ioctl_args *uap; 569 { 570 struct file *fp; 571 register struct filedesc *fdp; 572 register u_long com; 573 int error = 0; 574 register u_int size; 575 caddr_t data, memp; 576 int tmp; 577 #define STK_PARAMS 128 578 union { 579 char stkbuf[STK_PARAMS]; 580 long align; 581 } ubuf; 582 583 if ((error = fget(td, uap->fd, &fp)) != 0) 584 return (error); 585 mtx_lock(&Giant); 586 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 587 fdrop(fp, td); 588 mtx_unlock(&Giant); 589 return (EBADF); 590 } 591 fdp = td->td_proc->p_fd; 592 switch (com = uap->com) { 593 case FIONCLEX: 594 FILEDESC_LOCK(fdp); 595 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 596 FILEDESC_UNLOCK(fdp); 597 fdrop(fp, td); 598 mtx_unlock(&Giant); 599 return (0); 600 case FIOCLEX: 601 FILEDESC_LOCK(fdp); 602 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 603 FILEDESC_UNLOCK(fdp); 604 fdrop(fp, td); 605 mtx_unlock(&Giant); 606 return (0); 607 } 608 609 /* 610 * Interpret high order word to find amount of data to be 611 * copied to/from the user's address space. 612 */ 613 size = IOCPARM_LEN(com); 614 if (size > IOCPARM_MAX) { 615 fdrop(fp, td); 616 mtx_unlock(&Giant); 617 return (ENOTTY); 618 } 619 620 memp = NULL; 621 if (size > sizeof (ubuf.stkbuf)) { 622 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 623 data = memp; 624 } else { 625 data = ubuf.stkbuf; 626 } 627 if (com&IOC_IN) { 628 if (size) { 629 error = copyin(uap->data, data, (u_int)size); 630 if (error) { 631 if (memp) 632 free(memp, M_IOCTLOPS); 633 fdrop(fp, td); 634 goto done; 635 } 636 } else { 637 *(caddr_t *)data = uap->data; 638 } 639 } else if ((com&IOC_OUT) && size) { 640 /* 641 * Zero the buffer so the user always 642 * gets back something deterministic. 643 */ 644 bzero(data, size); 645 } else if (com&IOC_VOID) { 646 *(caddr_t *)data = uap->data; 647 } 648 649 switch (com) { 650 651 case FIONBIO: 652 FILE_LOCK(fp); 653 if ((tmp = *(int *)data)) 654 fp->f_flag |= FNONBLOCK; 655 else 656 fp->f_flag &= ~FNONBLOCK; 657 FILE_UNLOCK(fp); 658 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 659 break; 660 661 case FIOASYNC: 662 FILE_LOCK(fp); 663 if ((tmp = *(int *)data)) 664 fp->f_flag |= FASYNC; 665 else 666 fp->f_flag &= ~FASYNC; 667 FILE_UNLOCK(fp); 668 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 669 break; 670 671 default: 672 error = fo_ioctl(fp, com, data, td->td_ucred, td); 673 /* 674 * Copy any data to user, size was 675 * already set and checked above. 676 */ 677 if (error == 0 && (com&IOC_OUT) && size) 678 error = copyout(data, uap->data, (u_int)size); 679 break; 680 } 681 if (memp) 682 free(memp, M_IOCTLOPS); 683 fdrop(fp, td); 684 done: 685 mtx_unlock(&Giant); 686 return (error); 687 } 688 689 /* 690 * sellock and selwait are initialized in selectinit() via SYSINIT. 691 */ 692 struct mtx sellock; 693 struct cv selwait; 694 u_int nselcoll; /* Select collisions since boot */ 695 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 696 697 /* 698 * Select system call. 699 */ 700 #ifndef _SYS_SYSPROTO_H_ 701 struct select_args { 702 int nd; 703 fd_set *in, *ou, *ex; 704 struct timeval *tv; 705 }; 706 #endif 707 /* 708 * MPSAFE 709 */ 710 int 711 select(td, uap) 712 register struct thread *td; 713 register struct select_args *uap; 714 { 715 struct timeval tv, *tvp; 716 int error; 717 718 if (uap->tv != NULL) { 719 error = copyin(uap->tv, &tv, sizeof(tv)); 720 if (error) 721 return (error); 722 tvp = &tv; 723 } else 724 tvp = NULL; 725 726 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); 727 } 728 729 int 730 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, 731 fd_set *fd_ex, struct timeval *tvp) 732 { 733 struct filedesc *fdp; 734 /* 735 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 736 * infds with the new FD_SETSIZE of 1024, and more than enough for 737 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 738 * of 256. 739 */ 740 fd_mask s_selbits[howmany(2048, NFDBITS)]; 741 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 742 struct timeval atv, rtv, ttv; 743 int error, timo; 744 u_int ncoll, nbufbytes, ncpbytes, nfdbits; 745 746 if (nd < 0) 747 return (EINVAL); 748 fdp = td->td_proc->p_fd; 749 /* 750 * XXX: kern_select() currently requires that we acquire Giant 751 * even if none of the file descriptors we poll requires Giant. 752 */ 753 mtx_lock(&Giant); 754 FILEDESC_LOCK(fdp); 755 756 if (nd > td->td_proc->p_fd->fd_nfiles) 757 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 758 FILEDESC_UNLOCK(fdp); 759 760 /* 761 * Allocate just enough bits for the non-null fd_sets. Use the 762 * preallocated auto buffer if possible. 763 */ 764 nfdbits = roundup(nd, NFDBITS); 765 ncpbytes = nfdbits / NBBY; 766 nbufbytes = 0; 767 if (fd_in != NULL) 768 nbufbytes += 2 * ncpbytes; 769 if (fd_ou != NULL) 770 nbufbytes += 2 * ncpbytes; 771 if (fd_ex != NULL) 772 nbufbytes += 2 * ncpbytes; 773 if (nbufbytes <= sizeof s_selbits) 774 selbits = &s_selbits[0]; 775 else 776 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); 777 778 /* 779 * Assign pointers into the bit buffers and fetch the input bits. 780 * Put the output buffers together so that they can be bzeroed 781 * together. 782 */ 783 sbp = selbits; 784 #define getbits(name, x) \ 785 do { \ 786 if (name == NULL) \ 787 ibits[x] = NULL; \ 788 else { \ 789 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 790 obits[x] = sbp; \ 791 sbp += ncpbytes / sizeof *sbp; \ 792 error = copyin(name, ibits[x], ncpbytes); \ 793 if (error != 0) \ 794 goto done_nosellock; \ 795 } \ 796 } while (0) 797 getbits(fd_in, 0); 798 getbits(fd_ou, 1); 799 getbits(fd_ex, 2); 800 #undef getbits 801 if (nbufbytes != 0) 802 bzero(selbits, nbufbytes / 2); 803 804 if (tvp != NULL) { 805 atv = *tvp; 806 if (itimerfix(&atv)) { 807 error = EINVAL; 808 goto done_nosellock; 809 } 810 getmicrouptime(&rtv); 811 timevaladd(&atv, &rtv); 812 } else { 813 atv.tv_sec = 0; 814 atv.tv_usec = 0; 815 } 816 timo = 0; 817 TAILQ_INIT(&td->td_selq); 818 mtx_lock(&sellock); 819 retry: 820 ncoll = nselcoll; 821 mtx_lock_spin(&sched_lock); 822 td->td_flags |= TDF_SELECT; 823 mtx_unlock_spin(&sched_lock); 824 mtx_unlock(&sellock); 825 826 error = selscan(td, ibits, obits, nd); 827 mtx_lock(&sellock); 828 if (error || td->td_retval[0]) 829 goto done; 830 if (atv.tv_sec || atv.tv_usec) { 831 getmicrouptime(&rtv); 832 if (timevalcmp(&rtv, &atv, >=)) 833 goto done; 834 ttv = atv; 835 timevalsub(&ttv, &rtv); 836 timo = ttv.tv_sec > 24 * 60 * 60 ? 837 24 * 60 * 60 * hz : tvtohz(&ttv); 838 } 839 840 /* 841 * An event of interest may occur while we do not hold 842 * sellock, so check TDF_SELECT and the number of 843 * collisions and rescan the file descriptors if 844 * necessary. 845 */ 846 mtx_lock_spin(&sched_lock); 847 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 848 mtx_unlock_spin(&sched_lock); 849 goto retry; 850 } 851 mtx_unlock_spin(&sched_lock); 852 853 if (timo > 0) 854 error = cv_timedwait_sig(&selwait, &sellock, timo); 855 else 856 error = cv_wait_sig(&selwait, &sellock); 857 858 if (error == 0) 859 goto retry; 860 861 done: 862 clear_selinfo_list(td); 863 mtx_lock_spin(&sched_lock); 864 td->td_flags &= ~TDF_SELECT; 865 mtx_unlock_spin(&sched_lock); 866 mtx_unlock(&sellock); 867 868 done_nosellock: 869 /* select is not restarted after signals... */ 870 if (error == ERESTART) 871 error = EINTR; 872 if (error == EWOULDBLOCK) 873 error = 0; 874 #define putbits(name, x) \ 875 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 876 error = error2; 877 if (error == 0) { 878 int error2; 879 880 putbits(fd_in, 0); 881 putbits(fd_ou, 1); 882 putbits(fd_ex, 2); 883 #undef putbits 884 } 885 if (selbits != &s_selbits[0]) 886 free(selbits, M_SELECT); 887 888 mtx_unlock(&Giant); 889 return (error); 890 } 891 892 static int 893 selscan(td, ibits, obits, nfd) 894 struct thread *td; 895 fd_mask **ibits, **obits; 896 int nfd; 897 { 898 int msk, i, fd; 899 fd_mask bits; 900 struct file *fp; 901 int n = 0; 902 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 903 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 904 struct filedesc *fdp = td->td_proc->p_fd; 905 906 FILEDESC_LOCK(fdp); 907 for (msk = 0; msk < 3; msk++) { 908 if (ibits[msk] == NULL) 909 continue; 910 for (i = 0; i < nfd; i += NFDBITS) { 911 bits = ibits[msk][i/NFDBITS]; 912 /* ffs(int mask) not portable, fd_mask is long */ 913 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 914 if (!(bits & 1)) 915 continue; 916 if ((fp = fget_locked(fdp, fd)) == NULL) { 917 FILEDESC_UNLOCK(fdp); 918 return (EBADF); 919 } 920 if (fo_poll(fp, flag[msk], td->td_ucred, 921 td)) { 922 obits[msk][(fd)/NFDBITS] |= 923 ((fd_mask)1 << ((fd) % NFDBITS)); 924 n++; 925 } 926 } 927 } 928 } 929 FILEDESC_UNLOCK(fdp); 930 td->td_retval[0] = n; 931 return (0); 932 } 933 934 /* 935 * Poll system call. 936 */ 937 #ifndef _SYS_SYSPROTO_H_ 938 struct poll_args { 939 struct pollfd *fds; 940 u_int nfds; 941 int timeout; 942 }; 943 #endif 944 /* 945 * MPSAFE 946 */ 947 int 948 poll(td, uap) 949 struct thread *td; 950 struct poll_args *uap; 951 { 952 caddr_t bits; 953 char smallbits[32 * sizeof(struct pollfd)]; 954 struct timeval atv, rtv, ttv; 955 int error = 0, timo; 956 u_int ncoll, nfds; 957 size_t ni; 958 959 nfds = uap->nfds; 960 961 /* 962 * XXX: poll() currently requires that we acquire Giant even if 963 * none of the file descriptors we poll requires Giant. 964 */ 965 mtx_lock(&Giant); 966 /* 967 * This is kinda bogus. We have fd limits, but that is not 968 * really related to the size of the pollfd array. Make sure 969 * we let the process use at least FD_SETSIZE entries and at 970 * least enough for the current limits. We want to be reasonably 971 * safe, but not overly restrictive. 972 */ 973 PROC_LOCK(td->td_proc); 974 if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && 975 (nfds > FD_SETSIZE)) { 976 PROC_UNLOCK(td->td_proc); 977 error = EINVAL; 978 goto done2; 979 } 980 PROC_UNLOCK(td->td_proc); 981 ni = nfds * sizeof(struct pollfd); 982 if (ni > sizeof(smallbits)) 983 bits = malloc(ni, M_TEMP, M_WAITOK); 984 else 985 bits = smallbits; 986 error = copyin(uap->fds, bits, ni); 987 if (error) 988 goto done_nosellock; 989 if (uap->timeout != INFTIM) { 990 atv.tv_sec = uap->timeout / 1000; 991 atv.tv_usec = (uap->timeout % 1000) * 1000; 992 if (itimerfix(&atv)) { 993 error = EINVAL; 994 goto done_nosellock; 995 } 996 getmicrouptime(&rtv); 997 timevaladd(&atv, &rtv); 998 } else { 999 atv.tv_sec = 0; 1000 atv.tv_usec = 0; 1001 } 1002 timo = 0; 1003 TAILQ_INIT(&td->td_selq); 1004 mtx_lock(&sellock); 1005 retry: 1006 ncoll = nselcoll; 1007 mtx_lock_spin(&sched_lock); 1008 td->td_flags |= TDF_SELECT; 1009 mtx_unlock_spin(&sched_lock); 1010 mtx_unlock(&sellock); 1011 1012 error = pollscan(td, (struct pollfd *)bits, nfds); 1013 mtx_lock(&sellock); 1014 if (error || td->td_retval[0]) 1015 goto done; 1016 if (atv.tv_sec || atv.tv_usec) { 1017 getmicrouptime(&rtv); 1018 if (timevalcmp(&rtv, &atv, >=)) 1019 goto done; 1020 ttv = atv; 1021 timevalsub(&ttv, &rtv); 1022 timo = ttv.tv_sec > 24 * 60 * 60 ? 1023 24 * 60 * 60 * hz : tvtohz(&ttv); 1024 } 1025 /* 1026 * An event of interest may occur while we do not hold 1027 * sellock, so check TDF_SELECT and the number of collisions 1028 * and rescan the file descriptors if necessary. 1029 */ 1030 mtx_lock_spin(&sched_lock); 1031 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { 1032 mtx_unlock_spin(&sched_lock); 1033 goto retry; 1034 } 1035 mtx_unlock_spin(&sched_lock); 1036 1037 if (timo > 0) 1038 error = cv_timedwait_sig(&selwait, &sellock, timo); 1039 else 1040 error = cv_wait_sig(&selwait, &sellock); 1041 1042 if (error == 0) 1043 goto retry; 1044 1045 done: 1046 clear_selinfo_list(td); 1047 mtx_lock_spin(&sched_lock); 1048 td->td_flags &= ~TDF_SELECT; 1049 mtx_unlock_spin(&sched_lock); 1050 mtx_unlock(&sellock); 1051 1052 done_nosellock: 1053 /* poll is not restarted after signals... */ 1054 if (error == ERESTART) 1055 error = EINTR; 1056 if (error == EWOULDBLOCK) 1057 error = 0; 1058 if (error == 0) { 1059 error = copyout(bits, uap->fds, ni); 1060 if (error) 1061 goto out; 1062 } 1063 out: 1064 if (ni > sizeof(smallbits)) 1065 free(bits, M_TEMP); 1066 done2: 1067 mtx_unlock(&Giant); 1068 return (error); 1069 } 1070 1071 static int 1072 pollscan(td, fds, nfd) 1073 struct thread *td; 1074 struct pollfd *fds; 1075 u_int nfd; 1076 { 1077 register struct filedesc *fdp = td->td_proc->p_fd; 1078 int i; 1079 struct file *fp; 1080 int n = 0; 1081 1082 FILEDESC_LOCK(fdp); 1083 for (i = 0; i < nfd; i++, fds++) { 1084 if (fds->fd >= fdp->fd_nfiles) { 1085 fds->revents = POLLNVAL; 1086 n++; 1087 } else if (fds->fd < 0) { 1088 fds->revents = 0; 1089 } else { 1090 fp = fdp->fd_ofiles[fds->fd]; 1091 if (fp == NULL) { 1092 fds->revents = POLLNVAL; 1093 n++; 1094 } else { 1095 /* 1096 * Note: backend also returns POLLHUP and 1097 * POLLERR if appropriate. 1098 */ 1099 fds->revents = fo_poll(fp, fds->events, 1100 td->td_ucred, td); 1101 if (fds->revents != 0) 1102 n++; 1103 } 1104 } 1105 } 1106 FILEDESC_UNLOCK(fdp); 1107 td->td_retval[0] = n; 1108 return (0); 1109 } 1110 1111 /* 1112 * OpenBSD poll system call. 1113 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1114 */ 1115 #ifndef _SYS_SYSPROTO_H_ 1116 struct openbsd_poll_args { 1117 struct pollfd *fds; 1118 u_int nfds; 1119 int timeout; 1120 }; 1121 #endif 1122 /* 1123 * MPSAFE 1124 */ 1125 int 1126 openbsd_poll(td, uap) 1127 register struct thread *td; 1128 register struct openbsd_poll_args *uap; 1129 { 1130 return (poll(td, (struct poll_args *)uap)); 1131 } 1132 1133 /* 1134 * Remove the references to the thread from all of the objects 1135 * we were polling. 1136 * 1137 * This code assumes that the underlying owner of the selinfo 1138 * structure will hold sellock before it changes it, and that 1139 * it will unlink itself from our list if it goes away. 1140 */ 1141 void 1142 clear_selinfo_list(td) 1143 struct thread *td; 1144 { 1145 struct selinfo *si; 1146 1147 mtx_assert(&sellock, MA_OWNED); 1148 TAILQ_FOREACH(si, &td->td_selq, si_thrlist) 1149 si->si_thread = NULL; 1150 TAILQ_INIT(&td->td_selq); 1151 } 1152 1153 /* 1154 * Record a select request. 1155 */ 1156 void 1157 selrecord(selector, sip) 1158 struct thread *selector; 1159 struct selinfo *sip; 1160 { 1161 1162 mtx_lock(&sellock); 1163 /* 1164 * If the selinfo's thread pointer is NULL then take ownership of it. 1165 * 1166 * If the thread pointer is not NULL and it points to another 1167 * thread, then we have a collision. 1168 * 1169 * If the thread pointer is not NULL and points back to us then leave 1170 * it alone as we've already added pointed it at us and added it to 1171 * our list. 1172 */ 1173 if (sip->si_thread == NULL) { 1174 sip->si_thread = selector; 1175 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); 1176 } else if (sip->si_thread != selector) { 1177 sip->si_flags |= SI_COLL; 1178 } 1179 1180 mtx_unlock(&sellock); 1181 } 1182 1183 /* Wake up a selecting thread. */ 1184 void 1185 selwakeup(sip) 1186 struct selinfo *sip; 1187 { 1188 doselwakeup(sip, -1); 1189 } 1190 1191 /* Wake up a selecting thread, and set its priority. */ 1192 void 1193 selwakeuppri(sip, pri) 1194 struct selinfo *sip; 1195 int pri; 1196 { 1197 doselwakeup(sip, pri); 1198 } 1199 1200 /* 1201 * Do a wakeup when a selectable event occurs. 1202 */ 1203 static void 1204 doselwakeup(sip, pri) 1205 struct selinfo *sip; 1206 int pri; 1207 { 1208 struct thread *td; 1209 1210 mtx_lock(&sellock); 1211 td = sip->si_thread; 1212 if ((sip->si_flags & SI_COLL) != 0) { 1213 nselcoll++; 1214 sip->si_flags &= ~SI_COLL; 1215 cv_broadcastpri(&selwait, pri); 1216 } 1217 if (td == NULL) { 1218 mtx_unlock(&sellock); 1219 return; 1220 } 1221 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); 1222 sip->si_thread = NULL; 1223 mtx_lock_spin(&sched_lock); 1224 td->td_flags &= ~TDF_SELECT; 1225 mtx_unlock_spin(&sched_lock); 1226 sleepq_remove(td, &selwait); 1227 mtx_unlock(&sellock); 1228 } 1229 1230 static void selectinit(void *); 1231 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) 1232 1233 /* ARGSUSED*/ 1234 static void 1235 selectinit(dummy) 1236 void *dummy; 1237 { 1238 cv_init(&selwait, "select"); 1239 mtx_init(&sellock, "sellck", NULL, MTX_DEF); 1240 } 1241