/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ktrace.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/sysproto.h> 49 #include <sys/filedesc.h> 50 #include <sys/filio.h> 51 #include <sys/fcntl.h> 52 #include <sys/file.h> 53 #include <sys/proc.h> 54 #include <sys/signalvar.h> 55 #include <sys/socketvar.h> 56 #include <sys/uio.h> 57 #include <sys/kernel.h> 58 #include <sys/limits.h> 59 #include <sys/malloc.h> 60 #include <sys/poll.h> 61 #include <sys/resourcevar.h> 62 #include <sys/selinfo.h> 63 #include <sys/syscallsubr.h> 64 #include <sys/sysctl.h> 65 #include <sys/sysent.h> 66 #include <sys/vnode.h> 67 #include <sys/bio.h> 68 #include <sys/buf.h> 69 #include <sys/condvar.h> 70 #ifdef KTRACE 71 #include <sys/ktrace.h> 72 #endif 73 #include <vm/vm.h> 74 #include <vm/vm_page.h> 75 76 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int pollscan(struct thread *, struct pollfd *, u_int); 81 static int selscan(struct thread *, fd_mask **, fd_mask **, int); 82 static int dofileread(struct thread *, struct file *, int, void *, 83 size_t, off_t, int); 84 static int dofilewrite(struct thread *, struct file *, int, 85 const void *, size_t, off_t, int); 86 static void doselwakeup(struct selinfo *, 
int); 87 88 /* 89 * Read system call. 90 */ 91 #ifndef _SYS_SYSPROTO_H_ 92 struct read_args { 93 int fd; 94 void *buf; 95 size_t nbyte; 96 }; 97 #endif 98 /* 99 * MPSAFE 100 */ 101 int 102 read(td, uap) 103 struct thread *td; 104 struct read_args *uap; 105 { 106 struct file *fp; 107 int error; 108 109 if ((error = fget_read(td, uap->fd, &fp)) == 0) { 110 error = dofileread(td, fp, uap->fd, uap->buf, 111 uap->nbyte, (off_t)-1, 0); 112 fdrop(fp, td); 113 } 114 return(error); 115 } 116 117 /* 118 * Pread system call 119 */ 120 #ifndef _SYS_SYSPROTO_H_ 121 struct pread_args { 122 int fd; 123 void *buf; 124 size_t nbyte; 125 int pad; 126 off_t offset; 127 }; 128 #endif 129 /* 130 * MPSAFE 131 */ 132 int 133 pread(td, uap) 134 struct thread *td; 135 struct pread_args *uap; 136 { 137 struct file *fp; 138 int error; 139 140 if ((error = fget_read(td, uap->fd, &fp)) != 0) 141 return (error); 142 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 143 error = ESPIPE; 144 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 145 error = EINVAL; 146 else { 147 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 148 uap->offset, FOF_OFFSET); 149 } 150 fdrop(fp, td); 151 return(error); 152 } 153 154 /* 155 * Code common for read and pread 156 */ 157 static int 158 dofileread(td, fp, fd, buf, nbyte, offset, flags) 159 struct thread *td; 160 struct file *fp; 161 int fd, flags; 162 void *buf; 163 size_t nbyte; 164 off_t offset; 165 { 166 struct uio auio; 167 struct iovec aiov; 168 long cnt, error = 0; 169 #ifdef KTRACE 170 struct iovec ktriov; 171 struct uio ktruio; 172 int didktr = 0; 173 #endif 174 175 aiov.iov_base = buf; 176 aiov.iov_len = nbyte; 177 auio.uio_iov = &aiov; 178 auio.uio_iovcnt = 1; 179 auio.uio_offset = offset; 180 if (nbyte > INT_MAX) 181 return (EINVAL); 182 auio.uio_resid = nbyte; 183 auio.uio_rw = UIO_READ; 184 auio.uio_segflg = UIO_USERSPACE; 185 auio.uio_td = td; 186 #ifdef KTRACE 187 /* 188 * if tracing, save a copy of iovec 189 */ 190 if (KTRPOINT(td, 
KTR_GENIO)) { 191 ktriov = aiov; 192 ktruio = auio; 193 didktr = 1; 194 } 195 #endif 196 cnt = nbyte; 197 198 if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) { 199 if (auio.uio_resid != cnt && (error == ERESTART || 200 error == EINTR || error == EWOULDBLOCK)) 201 error = 0; 202 } 203 cnt -= auio.uio_resid; 204 #ifdef KTRACE 205 if (didktr && error == 0) { 206 ktruio.uio_iov = &ktriov; 207 ktruio.uio_resid = cnt; 208 ktrgenio(fd, UIO_READ, &ktruio, error); 209 } 210 #endif 211 td->td_retval[0] = cnt; 212 return (error); 213 } 214 215 /* 216 * Scatter read system call. 217 */ 218 #ifndef _SYS_SYSPROTO_H_ 219 struct readv_args { 220 int fd; 221 struct iovec *iovp; 222 u_int iovcnt; 223 }; 224 #endif 225 /* 226 * MPSAFE 227 */ 228 int 229 readv(td, uap) 230 struct thread *td; 231 struct readv_args *uap; 232 { 233 struct file *fp; 234 struct uio auio; 235 struct iovec *iov; 236 struct iovec *needfree; 237 struct iovec aiov[UIO_SMALLIOV]; 238 long i, cnt; 239 int error; 240 u_int iovlen; 241 #ifdef KTRACE 242 struct iovec *ktriov = NULL; 243 struct uio ktruio; 244 #endif 245 246 if ((error = fget_read(td, uap->fd, &fp)) != 0) 247 return (error); 248 needfree = NULL; 249 /* note: can't use iovlen until iovcnt is validated */ 250 iovlen = uap->iovcnt * sizeof (struct iovec); 251 if (uap->iovcnt > UIO_SMALLIOV) { 252 if (uap->iovcnt > UIO_MAXIOV) { 253 error = EINVAL; 254 goto done; 255 } 256 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 257 needfree = iov; 258 } else 259 iov = aiov; 260 auio.uio_iov = iov; 261 auio.uio_iovcnt = uap->iovcnt; 262 auio.uio_rw = UIO_READ; 263 auio.uio_segflg = UIO_USERSPACE; 264 auio.uio_td = td; 265 auio.uio_offset = -1; 266 if ((error = copyin(uap->iovp, iov, iovlen))) 267 goto done; 268 auio.uio_resid = 0; 269 for (i = 0; i < uap->iovcnt; i++) { 270 if (iov->iov_len > INT_MAX - auio.uio_resid) { 271 error = EINVAL; 272 goto done; 273 } 274 auio.uio_resid += iov->iov_len; 275 iov++; 276 } 277 #ifdef KTRACE 278 /* 279 * if 
tracing, save a copy of iovec 280 */ 281 if (KTRPOINT(td, KTR_GENIO)) { 282 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 283 bcopy(auio.uio_iov, ktriov, iovlen); 284 ktruio = auio; 285 } 286 #endif 287 cnt = auio.uio_resid; 288 if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) { 289 if (auio.uio_resid != cnt && (error == ERESTART || 290 error == EINTR || error == EWOULDBLOCK)) 291 error = 0; 292 } 293 cnt -= auio.uio_resid; 294 #ifdef KTRACE 295 if (ktriov != NULL) { 296 if (error == 0) { 297 ktruio.uio_iov = ktriov; 298 ktruio.uio_resid = cnt; 299 ktrgenio(uap->fd, UIO_READ, &ktruio, error); 300 } 301 FREE(ktriov, M_TEMP); 302 } 303 #endif 304 td->td_retval[0] = cnt; 305 done: 306 fdrop(fp, td); 307 if (needfree) 308 FREE(needfree, M_IOV); 309 return (error); 310 } 311 312 /* 313 * Write system call 314 */ 315 #ifndef _SYS_SYSPROTO_H_ 316 struct write_args { 317 int fd; 318 const void *buf; 319 size_t nbyte; 320 }; 321 #endif 322 /* 323 * MPSAFE 324 */ 325 int 326 write(td, uap) 327 struct thread *td; 328 struct write_args *uap; 329 { 330 struct file *fp; 331 int error; 332 333 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 334 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 335 (off_t)-1, 0); 336 fdrop(fp, td); 337 } else { 338 error = EBADF; /* XXX this can't be right */ 339 } 340 return(error); 341 } 342 343 /* 344 * Pwrite system call 345 */ 346 #ifndef _SYS_SYSPROTO_H_ 347 struct pwrite_args { 348 int fd; 349 const void *buf; 350 size_t nbyte; 351 int pad; 352 off_t offset; 353 }; 354 #endif 355 /* 356 * MPSAFE 357 */ 358 int 359 pwrite(td, uap) 360 struct thread *td; 361 struct pwrite_args *uap; 362 { 363 struct file *fp; 364 int error; 365 366 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 367 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) 368 error = ESPIPE; 369 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR) 370 error = EINVAL; 371 else { 372 error = dofilewrite(td, fp, uap->fd, uap->buf, 373 uap->nbyte, 
uap->offset, FOF_OFFSET); 374 } 375 fdrop(fp, td); 376 } else { 377 error = EBADF; /* this can't be right */ 378 } 379 return(error); 380 } 381 382 static int 383 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 384 struct thread *td; 385 struct file *fp; 386 int fd, flags; 387 const void *buf; 388 size_t nbyte; 389 off_t offset; 390 { 391 struct uio auio; 392 struct iovec aiov; 393 long cnt, error = 0; 394 #ifdef KTRACE 395 struct iovec ktriov; 396 struct uio ktruio; 397 int didktr = 0; 398 #endif 399 400 aiov.iov_base = (void *)(uintptr_t)buf; 401 aiov.iov_len = nbyte; 402 auio.uio_iov = &aiov; 403 auio.uio_iovcnt = 1; 404 auio.uio_offset = offset; 405 if (nbyte > INT_MAX) 406 return (EINVAL); 407 auio.uio_resid = nbyte; 408 auio.uio_rw = UIO_WRITE; 409 auio.uio_segflg = UIO_USERSPACE; 410 auio.uio_td = td; 411 #ifdef KTRACE 412 /* 413 * if tracing, save a copy of iovec and uio 414 */ 415 if (KTRPOINT(td, KTR_GENIO)) { 416 ktriov = aiov; 417 ktruio = auio; 418 didktr = 1; 419 } 420 #endif 421 cnt = nbyte; 422 if (fp->f_type == DTYPE_VNODE) 423 bwillwrite(); 424 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 425 if (auio.uio_resid != cnt && (error == ERESTART || 426 error == EINTR || error == EWOULDBLOCK)) 427 error = 0; 428 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 429 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 430 PROC_LOCK(td->td_proc); 431 psignal(td->td_proc, SIGPIPE); 432 PROC_UNLOCK(td->td_proc); 433 } 434 } 435 cnt -= auio.uio_resid; 436 #ifdef KTRACE 437 if (didktr && error == 0) { 438 ktruio.uio_iov = &ktriov; 439 ktruio.uio_resid = cnt; 440 ktrgenio(fd, UIO_WRITE, &ktruio, error); 441 } 442 #endif 443 td->td_retval[0] = cnt; 444 return (error); 445 } 446 447 /* 448 * Gather write system call 449 */ 450 #ifndef _SYS_SYSPROTO_H_ 451 struct writev_args { 452 int fd; 453 struct iovec *iovp; 454 u_int iovcnt; 455 }; 456 #endif 457 /* 458 * MPSAFE 459 */ 460 int 461 writev(td, uap) 462 struct thread *td; 463 register struct writev_args *uap; 464 { 465 struct file *fp; 466 struct uio auio; 467 register struct iovec *iov; 468 struct iovec *needfree; 469 struct iovec aiov[UIO_SMALLIOV]; 470 long i, cnt, error = 0; 471 u_int iovlen; 472 #ifdef KTRACE 473 struct iovec *ktriov = NULL; 474 struct uio ktruio; 475 #endif 476 477 if ((error = fget_write(td, uap->fd, &fp)) != 0) 478 return (EBADF); 479 needfree = NULL; 480 /* note: can't use iovlen until iovcnt is validated */ 481 iovlen = uap->iovcnt * sizeof (struct iovec); 482 if (uap->iovcnt > UIO_SMALLIOV) { 483 if (uap->iovcnt > UIO_MAXIOV) { 484 error = EINVAL; 485 goto done; 486 } 487 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 488 needfree = iov; 489 } else 490 iov = aiov; 491 auio.uio_iov = iov; 492 auio.uio_iovcnt = uap->iovcnt; 493 auio.uio_rw = UIO_WRITE; 494 auio.uio_segflg = UIO_USERSPACE; 495 auio.uio_td = td; 496 auio.uio_offset = -1; 497 if ((error = copyin(uap->iovp, iov, iovlen))) 498 goto done; 499 auio.uio_resid = 0; 500 for (i = 0; i < uap->iovcnt; i++) { 501 if (iov->iov_len > INT_MAX - auio.uio_resid) { 502 error = EINVAL; 503 goto done; 504 } 505 auio.uio_resid += iov->iov_len; 506 iov++; 507 } 508 #ifdef KTRACE 509 /* 510 * if tracing, save a copy of iovec and uio 511 */ 512 if (KTRPOINT(td, KTR_GENIO)) { 513 MALLOC(ktriov, struct 
iovec *, iovlen, M_TEMP, M_WAITOK); 514 bcopy(auio.uio_iov, ktriov, iovlen); 515 ktruio = auio; 516 } 517 #endif 518 cnt = auio.uio_resid; 519 if (fp->f_type == DTYPE_VNODE) 520 bwillwrite(); 521 if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) { 522 if (auio.uio_resid != cnt && (error == ERESTART || 523 error == EINTR || error == EWOULDBLOCK)) 524 error = 0; 525 if (error == EPIPE) { 526 PROC_LOCK(td->td_proc); 527 psignal(td->td_proc, SIGPIPE); 528 PROC_UNLOCK(td->td_proc); 529 } 530 } 531 cnt -= auio.uio_resid; 532 #ifdef KTRACE 533 if (ktriov != NULL) { 534 if (error == 0) { 535 ktruio.uio_iov = ktriov; 536 ktruio.uio_resid = cnt; 537 ktrgenio(uap->fd, UIO_WRITE, &ktruio, error); 538 } 539 FREE(ktriov, M_TEMP); 540 } 541 #endif 542 td->td_retval[0] = cnt; 543 done: 544 fdrop(fp, td); 545 if (needfree) 546 FREE(needfree, M_IOV); 547 return (error); 548 } 549 550 /* 551 * Ioctl system call 552 */ 553 #ifndef _SYS_SYSPROTO_H_ 554 struct ioctl_args { 555 int fd; 556 u_long com; 557 caddr_t data; 558 }; 559 #endif 560 /* 561 * MPSAFE 562 */ 563 /* ARGSUSED */ 564 int 565 ioctl(td, uap) 566 struct thread *td; 567 register struct ioctl_args *uap; 568 { 569 struct file *fp; 570 register struct filedesc *fdp; 571 register u_long com; 572 int error = 0; 573 register u_int size; 574 caddr_t data, memp; 575 int tmp; 576 #define STK_PARAMS 128 577 union { 578 char stkbuf[STK_PARAMS]; 579 long align; 580 } ubuf; 581 582 if ((error = fget(td, uap->fd, &fp)) != 0) 583 return (error); 584 mtx_lock(&Giant); 585 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 586 fdrop(fp, td); 587 mtx_unlock(&Giant); 588 return (EBADF); 589 } 590 fdp = td->td_proc->p_fd; 591 switch (com = uap->com) { 592 case FIONCLEX: 593 FILEDESC_LOCK(fdp); 594 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; 595 FILEDESC_UNLOCK(fdp); 596 fdrop(fp, td); 597 mtx_unlock(&Giant); 598 return (0); 599 case FIOCLEX: 600 FILEDESC_LOCK(fdp); 601 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; 602 FILEDESC_UNLOCK(fdp); 603 
fdrop(fp, td); 604 mtx_unlock(&Giant); 605 return (0); 606 } 607 608 /* 609 * Interpret high order word to find amount of data to be 610 * copied to/from the user's address space. 611 */ 612 size = IOCPARM_LEN(com); 613 if (size > IOCPARM_MAX) { 614 fdrop(fp, td); 615 mtx_unlock(&Giant); 616 return (ENOTTY); 617 } 618 619 memp = NULL; 620 if (size > sizeof (ubuf.stkbuf)) { 621 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); 622 data = memp; 623 } else { 624 data = ubuf.stkbuf; 625 } 626 if (com&IOC_IN) { 627 if (size) { 628 error = copyin(uap->data, data, (u_int)size); 629 if (error) { 630 if (memp) 631 free(memp, M_IOCTLOPS); 632 fdrop(fp, td); 633 goto done; 634 } 635 } else { 636 *(caddr_t *)data = uap->data; 637 } 638 } else if ((com&IOC_OUT) && size) { 639 /* 640 * Zero the buffer so the user always 641 * gets back something deterministic. 642 */ 643 bzero(data, size); 644 } else if (com&IOC_VOID) { 645 *(caddr_t *)data = uap->data; 646 } 647 648 switch (com) { 649 650 case FIONBIO: 651 FILE_LOCK(fp); 652 if ((tmp = *(int *)data)) 653 fp->f_flag |= FNONBLOCK; 654 else 655 fp->f_flag &= ~FNONBLOCK; 656 FILE_UNLOCK(fp); 657 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 658 break; 659 660 case FIOASYNC: 661 FILE_LOCK(fp); 662 if ((tmp = *(int *)data)) 663 fp->f_flag |= FASYNC; 664 else 665 fp->f_flag &= ~FASYNC; 666 FILE_UNLOCK(fp); 667 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 668 break; 669 670 default: 671 error = fo_ioctl(fp, com, data, td->td_ucred, td); 672 /* 673 * Copy any data to user, size was 674 * already set and checked above. 675 */ 676 if (error == 0 && (com&IOC_OUT) && size) 677 error = copyout(data, uap->data, (u_int)size); 678 break; 679 } 680 if (memp) 681 free(memp, M_IOCTLOPS); 682 fdrop(fp, td); 683 done: 684 mtx_unlock(&Giant); 685 return (error); 686 } 687 688 /* 689 * sellock and selwait are initialized in selectinit() via SYSINIT. 
 */
struct mtx	sellock;	/* protects td_selq lists and selinfo links */
struct cv	selwait;	/* select()/poll() sleepers wait here */
u_int	nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	/* Copy in the optional timeout, then defer to kern_select(). */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

/*
 * Common backend for select(2).  tvp == NULL means wait indefinitely.
 * Returns 0 with the ready count in td->td_retval[0] and the result
 * bitmaps copied back to the (non-NULL) user fd_sets, or an errno.
 *
 * Locking protocol: TDF_SELECT is set under sched_lock before each
 * scan; doselwakeup() clears it (or wakes us) when an event fires, and
 * nselcoll counts collisions.  After a fruitless scan we recheck both
 * under sched_lock and retry instead of sleeping if either changed —
 * the scan itself runs without sellock held.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;	/* input copy + output bitmap */
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);	/* clear all output bitmaps */

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;	/* 0/0 means no timeout */
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;	/* snapshot collision count for this pass */
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* Scan without sellock; missed events are caught by the recheck. */
	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to 24h to keep tvtohz() from overflowing. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken by doselwakeup(); rescan */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is a successful zero-ready return */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}

/*
 * One scan pass for kern_select(): polls every descriptor whose bit is
 * set in the input bitmaps, sets the corresponding output bits, and
 * leaves the ready count in td->td_retval[0].  Returns EBADF if a set
 * bit names a closed descriptor.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {	/* read, write, except sets */
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];	/* avoids malloc for small nfds */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert ms timeout to an absolute uptime deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;	/* 0/0 means no timeout */
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Same TDF_SELECT / nselcoll protocol as kern_select(). */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to 24h to keep tvtohz() from overflowing. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;	/* woken by doselwakeup(); rescan */

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout is a successful zero-ready return */
	if (error == 0) {
		/* Copy the revents back out to the user's pollfd array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * One scan pass for poll(): fills in revents for each pollfd and
 * leaves the count of entries with non-zero revents in
 * td->td_retval[0].  Unlike selscan(), a bad descriptor is reported
 * in-band via POLLNVAL rather than as an error return.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are legal placeholders; ignore. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* Argument layouts are identical, so just alias to poll(). */
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone as we've already added pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	/* -1 priority means "do not boost the woken thread". */
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 *
 * Common backend for selwakeup()/selwakeuppri().  On a collision
 * (SI_COLL) every waiter on selwait is broadcast; otherwise only the
 * owning thread is unlinked from the selinfo and made runnable (or, if
 * it is mid-scan rather than asleep, its TDF_SELECT flag is cleared so
 * it will rescan — see kern_select()).
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Asleep on selwait: pull it off the queue and run it. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
			td->td_priority = pri;	/* boost only, never demote */
		setrunnable(td);
	} else
		td->td_flags &= ~TDF_SELECT;	/* force a rescan */
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* Initialize sellock and selwait early in boot (see SYSINIT above). */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}