/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
		    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * Pread system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else {
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return (error);
}
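/*
 * An illustrative userland sketch (not part of this kernel file, kept out
 * of compilation with #if 0; the path and sizes are arbitrary).  It shows
 * the semantics implemented above: pread(2) supplies an explicit offset
 * via FOF_OFFSET and never moves the descriptor's seek position, and the
 * DFLAG_SEEKABLE check makes it fail with ESPIPE on pipes.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void
example_pread(void)
{
	char buf[64];
	int fd = open("/etc/motd", O_RDONLY);

	if (fd >= 0) {
		(void)pread(fd, buf, sizeof(buf), 16); /* reads at offset 16 */
		(void)read(fd, buf, sizeof(buf));      /* still reads from 0 */
		(void)close(fd);
	}
}
#endif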
/*
 * Code common for read and pread.
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
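/*
 * An illustrative userland sketch (not compiled; buffer names arbitrary)
 * of the scatter read implemented above: readv(2) fills the iovecs in
 * order from a single read, and the validation loop above rejects any
 * iovec total that would overflow INT_MAX with EINVAL.
 */
#if 0
#include <sys/uio.h>

static ssize_t
example_readv(int fd)
{
	char hdr[16], body[512];
	struct iovec iov[2];

	iov[0].iov_base = hdr;
	iov[0].iov_len = sizeof(hdr);
	iov[1].iov_base = body;
	iov[1].iov_len = sizeof(body);
	return (readv(fd, iov, 2));	/* fills hdr first, then body */
}
#endif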
/*
 * Write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
		    (off_t)-1, 0);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * Pwrite system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else {
		error = dofilewrite(td, fp, uap->fd, uap->buf,
		    uap->nbyte, uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return (error);
}

/*
 * Code common for write and pwrite.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec and uio.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
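/*
 * An illustrative userland sketch (not compiled) of the EPIPE handling in
 * dofilewrite() above: a write to a broken pipe raises SIGPIPE in the
 * writer (sockets get it from the socket layer instead), so a caller that
 * wants the plain EPIPE error must ignore the signal first.
 */
#if 0
#include <signal.h>
#include <unistd.h>

static ssize_t
example_write_nosigpipe(int fd, const void *buf, size_t len)
{
	(void)signal(SIGPIPE, SIG_IGN);	/* fail with EPIPE, no signal */
	return (write(fd, buf, len));
}
#endif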
/*
 * Gather write system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(td, uap)
	struct thread *td;
	register struct writev_args *uap;
{
	struct file *fp;
	struct uio auio;
	register struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_write(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec and uio.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
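/*
 * An illustrative userland sketch (not compiled; strings arbitrary) of
 * the gather write implemented above: writev(2) emits several buffers
 * through a single fo_write(), avoiding either a staging copy or two
 * writes that could interleave with output from another process.
 */
#if 0
#include <sys/uio.h>

static ssize_t
example_writev(int fd)
{
	static char hdr[] = "header: ";
	static char body[] = "payload\n";
	struct iovec iov[2];

	iov[0].iov_base = hdr;
	iov[0].iov_len = sizeof(hdr) - 1;	/* omit the NUL */
	iov[1].iov_base = body;
	iov[1].iov_len = sizeof(body) - 1;
	return (writev(fd, iov, 2));
}
#endif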
/*
 * Ioctl system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define	STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com & IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			*(caddr_t *)data = uap->data;
		}
	} else if ((com & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com & IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
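/*
 * An illustrative userland sketch (not compiled) of the command decoding
 * above: FIONBIO is an IOC_IN command carrying an int, so IOCPARM_LEN()
 * yields sizeof(int) and the kernel copies the flag into its `data'
 * buffer before dispatching to the FIONBIO case.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>

static int
example_set_nonblocking(int fd, int on)
{
	return (ioctl(fd, FIONBIO, &on));	/* on != 0 sets FNONBLOCK */
}
#endif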
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx	sellock;
struct cv	selwait;
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
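/*
 * An illustrative userland sketch (not compiled) matching kern_select()
 * above: the caller's fd_sets become the ibits[] input banks, and the
 * ready descriptors are written back through obits[] into the same sets.
 */
#if 0
#include <sys/select.h>

static int
example_select_readable(int fd, int timeout_sec)
{
	fd_set rfds;
	struct timeval tv;

	FD_ZERO(&rfds);
	FD_SET(fd, &rfds);
	tv.tv_sec = timeout_sec;
	tv.tv_usec = 0;
	if (select(fd + 1, &rfds, NULL, NULL, &tv) > 0)
		return (FD_ISSET(fd, &rfds));	/* set was rewritten */
	return (0);
}
#endif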
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, uap->fds, ni);
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
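/*
 * An illustrative userland sketch (not compiled) matching poll() and
 * pollscan() above: results come back per descriptor in revents, and
 * pollscan() reports closed or out-of-range descriptors as POLLNVAL
 * rather than failing the whole call.
 */
#if 0
#include <poll.h>

static int
example_poll_readable(int fd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN;
	pfd.revents = 0;
	if (poll(&pfd, 1, timeout_ms) > 0)
		return ((pfd.revents & POLLIN) != 0);
	return (0);
}
#endif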
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation; OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone, as we've already pointed it at us and added it
	 * to our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
			td->td_priority = pri;
		setrunnable(td);
	} else
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
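/*
 * An illustrative kernel-side sketch (not compiled; the names and the
 * simplified poll signature are hypothetical) of the selrecord()/
 * selwakeup() protocol above: a driver's poll routine reports ready
 * events immediately and otherwise registers with selrecord(), and the
 * code that later makes data available calls selwakeup() on the same
 * selinfo so that selscan()/pollscan() rescan the descriptor.
 */
#if 0
static struct selinfo example_rsel;
static int example_ready;

static int
example_poll(struct thread *td, int events)
{
	int revents = 0;

	if (example_ready)
		revents = events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &example_rsel);	/* caller will sleep */
	return (revents);
}

static void
example_data_arrived(void)
{
	example_ready = 1;
	selwakeup(&example_rsel);	/* wake any selecting thread */
}
#endif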